In [3]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Jul 13 22:15:15 2021

@author: Arbo
"""
import re
from string import punctuation                   # to extract the puntuation symbols
import nltk
from nltk.tokenize import word_tokenize          # to divide strings into tokens
from nltk.stem import WordNetLemmatizer          # to lemmatize the tokens
from nltk.corpus import stopwords                # to remove the stopwords 
import numpy as np
import tensorflow as tf            
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow.keras.layers import Embedding
from tensorflow.keras.callbacks import EarlyStopping


def clean_text(text):
    '''Make text lowercase, remove links,remove punctuation
    and remove words containing numbers.'''
    #text=str(text)
    text = text.lower()
    #get rid of usernames
    tweet_words = text.strip('\r').split(' ')
    for word in [word for word in tweet_words if '@' in word]:
            
            text = text.replace(word, "")
    #get rid of the re-tweet
    tweet_words = text.strip('\r').split(' ')
    for word in [word for word in tweet_words if 'rt' == word]:
            
            text = text.replace(word, "")
            
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('[%s]' % re.escape(punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    return text


def remove_stopwords(text):
    words = [w for w in text if w not in stopwords.words('english')]
    return words


def lemmatize_text(text):
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(w) for w in text]  ##Notice the use of text.

def concatenate_text(text):
    return ' '.join(text)

def makeglove (path_to_glove_file):
    embeddings_index = {}
    f = open(path_to_glove_file, 'r', encoding='utf8')
    for line in f:
        splitLine = line.split(' ')
        word = splitLine[0]                                  # the first entry is the word
        coefs = np.asarray(splitLine[1:], dtype='float32')   # these are the vectors representing word embeddings
        embeddings_index[word] = coefs
    return embeddings_index

def make_embedding_matrix(train_samples, val_samples, embeddings_index):
    
    """
    This function computes the embedding matrix that will be used in the embedding layer
    
    Parameters:
        train_samples: list of strings in the training dataset
        val_samples: list of strings in the validation dataset
        embeddings_index: Python dictionary with word embeddings
    
    Returns:
        embedding_matrix: embedding matrix with the dimensions (num_tokens, embedding_dim), 
        where num_tokens is the vocabulary of the input data, 
        and emdebbing_dim is the number of components in the GloVe vectors (can be 50,100,200,300)
        vectorizer: TextVectorization layer      
    """
    
    vectorizer = TextVectorization(max_tokens=55000, output_sequence_length=50)
    text_ds = tf.data.Dataset.from_tensor_slices(train_samples).batch(128)
    vectorizer.adapt(text_ds)
    
    voc = vectorizer.get_vocabulary()
    word_index = dict(zip(voc, range(len(voc))))
      
    num_tokens = len(voc)
    
    hits = 0
    misses = 0

#   creating an embedding matrix
    embedding_dim = len(embeddings_index['the'])
    embedding_matrix = np.zeros((num_tokens, embedding_dim))
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            # Words not found in embedding index will be all-zeros
            embedding_matrix[i] = embedding_vector
            hits += 1
        else:
            misses += 1

#     print("Converted %d words (%d misses)" % (hits, misses))
    print(f"Converted {hits} words ({misses} misses).")

    return embedding_matrix, vectorizer

def processme(twts):
    procstart =  datetime.datetime.now()
    print("{}: processing ".format(procstart))
    twts['ptext'] = twts['Text'].apply(lambda x: clean_text(x))
    twts['ptext'] = twts['ptext'].apply(lambda x: word_tokenize(x))
    twts['ptext'] = twts['ptext'].apply(lambda x : remove_stopwords(x))
    twts['ptext'] = twts['ptext'].apply(lambda x : lemmatize_text(x))
    twts['ptext'] = twts['ptext'].apply(lambda x : concatenate_text(x))
    procend=  datetime.datetime.now() 
    print("{}: processed in {}".format(procend, procend - procstart))
    return twts

In [4]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Jul 13 22:39:38 2021

@author: Arbo
"""
from mordecai import Geoparser
import numpy as np
import pandas as pd 
import random
import tensorflow as tf            
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow.keras.layers import Embedding
from tensorflow.keras.callbacks import EarlyStopping

def train_val_split(df, validation_split):
    """
    This function generates the training and validation splits from an input dataframe
    
    Parameters:
        dataframe: pandas dataframe with columns "text" and "target" (binary)
        validation_split: should be between 0.0 and 1.0 and represent the proportion of the dataset to include in the validation split
    
    Returns:
        train_samples: list of strings in the training dataset
        val_samples: list of strings in the validation dataset
        train_labels: list of labels (0 or 1) in the training dataset
        val_labels: list of labels (0 or 1) in the validation dataset      
    """
       
    text = df['text'].values.tolist()                         # input text as list
    targets = df['class_label_cat'].values.tolist()                    # targets
    
#   Preparing the training/validation datasets
    
    seed = random.randint(1,50)   # random integer in a range (1, 50)
    rng = np.random.RandomState(seed)
    rng.shuffle(text)
    rng = np.random.RandomState(seed)
    rng.shuffle(targets)

    num_validation_samples = int(validation_split * len(text))

    train_samples = text[:-num_validation_samples]
    val_samples = text[-num_validation_samples:]
    train_labels = targets[:-num_validation_samples]
    val_labels = targets[-num_validation_samples:]
    
    print(f"Total size of the dataset: {df.shape[0]}.")
    print(f"Training dataset: {len(train_samples)}.")
    print(f"Validation dataset: {len(val_samples)}.")
    
    return train_samples, val_samples, train_labels, val_labels

def test_listerine(df):
    """
    This function generates the test x and y from an input dataframe
    
    Parameters:
        dataframe: pandas dataframe with columns "text" and "class_label_cat" (binary)
        
    
    Returns:
        test_samples: list of strings in the training dataset

        test_labels: list of labels (0 or 1) in the training dataset
    
    """
       
    text = df['text'].values.tolist()                         # input text as list
    targets = df['class_label_cat'].values.tolist()                    # targets
    
#   Preparing the training/validation datasets
    
    seed = random.randint(1,50)   # random integer in a range (1, 50)
    rng = np.random.RandomState(seed)
    rng.shuffle(text)
    rng = np.random.RandomState(seed)
    rng.shuffle(targets)

   

    test_samples = text
   
    test_labels = targets
    
    
    print(f"Total size of the dataset: {df.shape[0]}.")

    
    return test_samples, test_labels


def geo_df(df,geo):
    
# =============================================================================
#     geo = Geoparser()
# =============================================================================
    df['geos'] = geo.batch_geoparse(df['Text'])
    df_geo = df[df["geos"].str.len() != 0]
    df_geo = df_geo.explode('geos')
    df_geo = pd.concat([df_geo.drop(['geos'], axis=1), df_geo['geos'].apply(pd.Series)], axis=1)
    df_geo = pd.concat([df_geo.drop(['geo'], axis=1), df_geo['geo'].apply(pd.Series)], axis=1)
    df_geo = df_geo[df_geo['lat'].notnull()]
    df_geo.lat = df_geo.lat.astype(float)
    df_geo.lon =df_geo.lon.astype(float)
    return df_geo

#     df_js = pd.DataFrame()
#     for row in range(len(result_inf)):
#         df_temp = pd.json_normalize(result_inf['geos'], record_path =['spans'], 
#         meta=['word',"country_predicted", "country_conf",['geo',"admin1"],
#               ['geo',"lat"],['geo',"lon"],['geo',"country_code3"],['geo',"geonameid"],
#               ['geo',"place_name"],['geo',"feature_class"],['geo',"feature_code"]],
#         errors='ignore'
#     )
#         df_temp['TweetId']=''
#         for i in range(len(df_temp)):
#             df_temp['TweetId'][i]=tweets_df['TweetId'][row]
#         df_js=df_js.append(df_temp,ignore_index=True)

#     df_js = df_js.rename(columns = {'TweetId':'TweetId', 'start':'start', 'end':'end', 
#                                     'word':'word','country_predicted':'country_predicted', 
#                                     'country_conf': 'country_conf','geo.admin1':'admin1', 
#                                     'geo.lat':'lat', 'geo.lon':'lon', 
#                                     'geo.country_code3':'country_code3','geo.geonameid':'geonameid', 
#                                     'geo.place_name':'place_name', 
#                                     'geo.feature_class':'feature_class','geo.feature_code':'feature_code'})
#     return df_js




def suggest_nn2(df, model, vectorizer):
    """
    This function generates (binary) targets from a dataframe with column "text" using trained Keras model
    
    Parameters:
        df: pandas dataframe with column "text"
        model: Keras model (trained)
    
    Output:
        predictions: list of suggested targets corresponding to string entries from the column "text"
    """
    
    string_input = keras.Input(shape=(1,), dtype="string")
    x = vectorizer(string_input)
    preds = model(x)
    end_to_end_model = keras.Model(string_input, preds)

    probabilities = end_to_end_model.predict(df["ptext"])
    
    predictions = [1 if i > 0.5 else 0 for i in probabilities]
    
    return predictions

def suggest_nn3(df, model, vectorizer):
    """
    This function generates (binary) targets from a dataframe with column "text" using trained Keras model
    
    Parameters:
        df: pandas dataframe with column "text"
        model: Keras model (trained)
    
    Output:
        predictions: list of suggested targets corresponding to string entries from the column "text"
    """
    
    string_input = keras.Input(shape=(1,), dtype="string")
    x = vectorizer(string_input)
    preds = model(x)
    end_to_end_model = keras.Model(string_input, preds)

    probabilities = end_to_end_model.predict(df["text"])
    
    predictions = [1 if i > 0.5 else 0 for i in probabilities]
    
    return predictions

def initialize_nn(embedding_matrix):
    """
    This function initializes Keras model for binary text classification
    
    Parameters:
        embedding matrix with the dimensions (num_tokens, embedding_dim),
         where num_tokens is the vocabulary size of the input data,
          and emdebbing_dim is the number of components in the GloVe vectors
    
    Returns:
        model: Keras model    
    """
    
    num_tokens = embedding_matrix.shape[0]
    embedding_dim = embedding_matrix.shape[1]
    
    embedding_layer = Embedding(
    num_tokens,
    embedding_dim,
    embeddings_initializer=keras.initializers.Constant(embedding_matrix),
    trainable=False,                # we are not going to train the embedding vectors
    )
    
#   Here we define the architecture of the Keras model. 
    int_sequences_input = keras.Input(shape=(None,), dtype="int64")
    x = embedding_layer(int_sequences_input) 
    x = layers.Dropout(.7)(x)
    x = layers.Bidirectional(layers.LSTM(128,                                        
                                          dropout=.4,
                                          return_sequences=True))(x)
# =============================================================================
#     x = layers.Bidirectional(layers.LSTM(32,
#                                           dropout=.5))(x)
# =============================================================================
    x = layers.Dense(128)(x)
    x = layers.Dropout(.5)(x)
    preds = layers.Dense(1, activation='sigmoid')(x)
    model = keras.Model(int_sequences_input, preds)
    
    return model

def train_nn(model, train_samples, val_samples, train_labels, val_labels, vectorizer, stop = True):
    """
    This function fits the training data using validation data to calculate metrics.
    
    Parameters:
        model: preinitialized Keras model
        train_samples: list of strings in the training dataset
        val_samples: list of strings in the validation dataset
        train_labels: list of labels (0 or 1) in the training dataset
        val_labels: list of labels (0 or 1) in the validation dataset
        vectorizer: TextVectorization layer
        stop (Boolean): flag for Early Stopping (aborting training when a monitored metric has stopped improving)
    
    Returns:
        model: trained Keras model
        history: callback that can be used to track the learning process
    """
    
    print('')
    print("Training the model...")
    
    model.compile(loss="binary_crossentropy", 
              optimizer="adam", 
              metrics=["binary_accuracy"])
    
    x_train = vectorizer(np.array([[s] for s in train_samples])).numpy()
    x_val = vectorizer(np.array([[s] for s in val_samples])).numpy()
    
    y_train = np.asarray(train_labels).astype('float32').reshape((-1,1))
    y_val = np.asarray(val_labels).astype('float32').reshape((-1,1))

    
    
    if stop:
        early_stopping = EarlyStopping(monitor='val_loss', patience=1)
        history = model.fit(x_train, y_train, batch_size=32, epochs=50, validation_data=(x_val, y_val), callbacks=[early_stopping], verbose=1)
    else:
        history = model.fit(x_train, y_train, batch_size=32, epochs=10, validation_data=(x_val, y_val), verbose=1)
        
    return model, history

    

In [9]:
# #!/usr/bin/env python3
# # -*- coding: utf-8 -*-
# """
# Created on Tue Aug 10 23:46:58 2021

# @author: Arbo
# """
# import datetime
# overallstart = datetime.datetime.now() 
# print ("started at: {}".format(overallstart))

# import numpy as np                               # linear algebra
# import pandas as pd                              # data processing, CSV file I/O (e.g. pd.read_csv)
# import os
# import re                                        # to handle regular expressions
# from string import punctuation                   # to extract the puntuation symbols
# import nltk
# from nltk.tokenize import word_tokenize          # to divide strings into tokens
# from nltk.stem import WordNetLemmatizer          # to lemmatize the tokens
# from nltk.corpus import stopwords                # to remove the stopwords 

# import random                                    # for generating (pseudo-)random numbers
# import matplotlib.pyplot as plt                  # to plot some visualizations

# import tensorflow as tf            
# from tensorflow import keras
# from tensorflow.keras import layers
# from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
# from tensorflow.keras.layers import Embedding
# from tensorflow.keras.callbacks import EarlyStopping

# from tensorflow.keras import layers
# # =============================================================================
# # from kerastuner.tuners import RandomSearch
# # from kerastuner.tuners import BayesianOptimization
# # from kerastuner.tuners import Hyperband
# # import kerastuner as kt
# # =============================================================================


# # import text_proccessing
# # from text_proccessing import clean_text
# # from text_proccessing import remove_stopwords
# # from text_proccessing import lemmatize_text
# # from text_proccessing import concatenate_text
# # from text_proccessing import makeglove
# # from text_proccessing import make_embedding_matrix
# # from modhelp import train_val_split
# # from modhelp import test_listerine
# # from modhelp import suggest_nn3
# # from modhelp import initialize_nn
# # from modhelp import  train_nn

# starti = datetime.datetime.now() 
print("{}: modules loaded in {} loading data".format(starti, starti-overallstart))
code_dir = os.getcwd()
print("Current working directory: {0}".format(code_dir))
parent_dir = os.getcwd()
print("C working directory: {0}".format(code_dir))
data_dir = os.path.join(parent_dir,"Data")
tweet_dir = os.path.join(parent_dir,"TweetMap")
directory = os.path.join(data_dir,'splits')
dsa= os.path.dirname(code_dir)
# model = tf.keras.models.load_model(os.path.join(dsa,'model1'))
train_data = pd.read_csv(os.path.join(directory, 'InformativenessTrain_Processed.csv'))
test_data  = pd.read_csv(os.path.join(directory, 'InformativenessTest_Processed.csv'))
dtl = datetime.datetime.now()
print("{}: data loaded in {} cleaning text elapsed: {}".format(starti,dtl-starti,dtl-overallstart))


train_data['text'] = train_data['text'].apply(lambda x: clean_text(x))
test_data['text'] = test_data['text'].apply(lambda x: clean_text(x))
train_data['text'] = train_data['text'].apply(lambda x:word_tokenize(x))
test_data['text'] = test_data['text'].apply(lambda x:word_tokenize(x))
train_data['text'] = train_data['text'].apply(lambda x : remove_stopwords(x))
test_data['text'] = test_data['text'].apply(lambda x : remove_stopwords(x))
train_data['text'] = train_data['text'].apply(lambda x : lemmatize_text(x))
test_data['text'] = test_data['text'].apply(lambda x : lemmatize_text(x))
train_data['text'] = train_data['text'].apply(lambda x : concatenate_text(x))
test_data['text'] = test_data['text'].apply(lambda x : concatenate_text(x))

step1 = datetime.datetime.now()
print("{}: text cleand in {} loading vector and spliting samples elapsed: {}".format(step1,dtl-step1, step1-overallstart))
path_to_glove_file = os.path.join(dsa,'WordVector','glove.twitter.27B.200d.txt')
train_samples, val_samples, train_labels, val_labels = train_val_split(train_data, 0.25)
# =============================================================================
# test_samples, test_labels = test_listerine(test_data)
# =============================================================================
print("indexing")
embeddings_index=makeglove(path_to_glove_file)
step2 = datetime.datetime.now()
print("{}: indexed {} matrix and vectorize ellapsed {}".format(step2, step2-step1, step2-overallstart))
embedding_matrix, vectorizer = make_embedding_matrix(train_samples, val_samples, embeddings_index)
step3 = datetime.datetime.now()
print("{}: mtx'd at vct'd in {} loading variables as vectors ellapsed {}".format(step3, step3-step2,  step3-overallstart))

x_train = vectorizer(np.array([[s] for s in train_samples])).numpy()
x_val = vectorizer(np.array([[s] for s in val_samples])).numpy()
# =============================================================================
# x_test = vectorizer(np.array([[s] for s in test_samples])).numpy()
# =============================================================================
y_train = np.asarray(train_labels).astype('float32').reshape((-1,1))
y_val = np.asarray(val_labels).astype('float32').reshape((-1,1))
# =============================================================================
# y_test = np.asarray(test_labels).astype('float32').reshape((-1,1))
# =============================================================================
step4 = datetime.datetime.now()
print("{}: training data ready in {} buildinf initial mod  ellapsed {}".format(step4, step4-step3,  step4-overallstart))




predictions = suggest_nn3(test_data, model,vectorizer)

submission_data = {"ID": test_data['id'].tolist(),"tweet": test_data['text'].tolist(), "target": predictions}

submission_df = pd.DataFrame(submission_data)
result = test_data.join(submission_df.target)

result.to_csv(os.path.join(tweet_dir,"dubblecheck"),index =False)



2021-08-11 17:54:43.173678: modules loaded in 0:00:00.004797 loading data
Current working directory: /Volumes/Elements/DataScience/dsa/capstone
C working directory: /Volumes/Elements/DataScience/dsa/capstone
2021-08-11 17:54:43.173678: data loaded in 0:12:35.725148 cleaning text elapsed: 0:12:35.729945
2021-08-11 18:25:43.805174: text cleand in -1 day, 23:41:35.093652 loading vector and spliting samples elapsed: 0:31:00.636293
Total size of the dataset: 117339.
Training dataset: 88005.
Validation dataset: 29334.
indexing
2021-08-11 18:26:47.999284: indexed 0:01:04.194110 matrix and vectorize ellapsed 0:32:04.830403
Converted 32297 words (22703 misses).
2021-08-11 18:26:53.137032: mtx'd at vct'd in 0:00:05.137748 loading variables as vectors ellapsed 0:32:09.968151
2021-08-11 18:27:08.600511: training data ready in 0:00:15.463479 buildinf initial mod  ellapsed 0:32:25.431630


In [10]:
result.head()

Unnamed: 0,id,event,source,text,lang,lang_confidence,class_label,class_label_cat,processed_txt,target
0,368967734542864384,2013_manila_floods,crisislext26,pagasa yellow advisory metro manila moderatehe...,en,1.0,informative,0,pagasa 12 20pm yellow advisory for metro moder...,0
1,369089159635283968,2013_manila_floods,crisislext26,pagasa red rainfall warning weather system sou...,en,0.72282,informative,0,pagasa red rainfall warning no D weather syste...,0
2,369122655359598592,2013_manila_floods,crisislext26,tropical storm maring continue enhance southwe...,en,1.0,informative,0,tropical storm maring will continue to enhance...,0
3,369139034133524480,2013_manila_floods,crisislext26,cant sleep rain maringph ughh,en,1.0,informative,0,can t sleep because of this rain #maringph #ughh,0
4,369142238564974592,2013_manila_floods,crisislext26,well bad u heard news already waistdeep flood ...,en,1.0,informative,0,well it s not that bad for us here but i heard...,0


In [11]:
from sklearn.metrics import confusion_matrix
y_true = result.class_label_cat
y_pred = result.class_label_cat
confusion_matrix(y_true, y_pred)

array([[23432,     0],
       [    0, 15681]])