In [25]:
import nltk
import pickle
import pandas as pd 
import ahocorasick
import regex as re
import numpy as np 
from nltk import word_tokenize, pos_tag_sents, ngrams
from nltk.stem import PorterStemmer

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split


import warnings
import time
warnings.filterwarnings('ignore')

## stemmer

In [26]:
def stem_sentences(sentence):
    porter_stemmer = PorterStemmer()
    tokens = sentence.split()
    stemmed_tokens = [porter_stemmer.stem(token) for token in tokens]
    return ' '.join(stemmed_tokens)

## char remover

In [27]:
def remove_pattern(input_txt, pattern):
    r = re.findall(pattern, input_txt)
    for i in r:
        input_txt = re.sub(i, '', input_txt)
        
    return input_txt   

In [28]:
contractions = {
"ain't": "am not / are not",
"aren't": "are not / am not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he had / he would",
"he'd've": "he would have",
"he'll": "he shall / he will",
"he'll've": "he shall have / he will have",
"he's": "he has / he is",
"how'd": "how did",
"how'd'y": "how do you",
"how'll": "how will",
"how's": "how has / how is",
"i'd": "I had / I would",
"i'd've": "I would have",
"i'll": "I shall / I will",
"i'll've": "I shall have / I will have",
"i'm": "I am",
"i've": "I have",
"isn't": "is not",
"it'd": "it had / it would",
"it'd've": "it would have",
"it'll": "it shall / it will",
"it'll've": "it shall have / it will have",
"it's": "it has / it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"mightn't've": "might not have",
"must've": "must have",
"mustn't": "must not",
"mustn't've": "must not have",
"needn't": "need not",
"needn't've": "need not have",
"o'clock": "of the clock",
"oughtn't": "ought not",
"oughtn't've": "ought not have",
"shan't": "shall not",
"sha'n't": "shall not",
"shan't've": "shall not have",
"she'd": "she had / she would",
"she'd've": "she would have",
"she'll": "she shall / she will",
"she'll've": "she shall have / she will have",
"she's": "she has / she is",
"should've": "should have",
"shouldn't": "should not",
"shouldn't've": "should not have",
"so've": "so have",
"so's": "so as / so is",
"that'd": "that would / that had",
"that'd've": "that would have",
"that's": "that has / that is",
"there'd": "there had / there would",
"there'd've": "there would have",
"there's": "there has / there is",
"they'd": "they had / they would",
"they'd've": "they would have",
"they'll": "they shall / they will",
"they'll've": "they shall have / they will have",
"they're": "they are",
"they've": "they have",
"to've": "to have",
"wasn't": "was not",
"we'd": "we had / we would",
"we'd've": "we would have",
"we'll": "we will",
"we'll've": "we will have",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what shall / what will",
"what'll've": "what shall have / what will have",
"what're": "what are",
"what's": "what has / what is",
"what've": "what have",
"when's": "when has / when is",
"when've": "when have",
"where'd": "where did",
"where's": "where has / where is",
"where've": "where have",
"who'll": "who shall / who will",
"who'll've": "who shall have / who will have",
"who's": "who has / who is",
"who've": "who have",
"why's": "why has / why is",
"why've": "why have",
"will've": "will have",
"won't": "will not",
"won't've": "will not have",
"would've": "would have",
"wouldn't": "would not",
"wouldn't've": "would not have",
"y'all": "you all",
"y'all'd": "you all would",
"y'all'd've": "you all would have",
"y'all're": "you all are",
"y'all've": "you all have",
"you'd": "you had / you would",
"you'd've": "you would have",
"you'll": "you shall / you will",
"you'll've": "you shall have / you will have",
"you're": "you are",
"you've": "you have"
}

In [29]:
def replace(text):
    for word in text.split():
        if word.lower() in contractions:
            text = text.replace(word, contractions[word.lower()])
    return text

In [30]:
def join(text):
    list = []
    for x in text:
        list.append(' '.join(x))
    return list

## cell preprocessor

In [31]:
def preprocess_dataframe(data):

    data['text'] = data['text'].apply(lambda x: replace(x))
    
    data['clean_text'] = np.vectorize(remove_pattern)(data['text'], "@[\w]*")
    # remove special characters 
    data['clean_text'] = data['clean_text'].str.replace("[^a-zA-Z#]", " ")
    # remove URL's 
    data['clean_text'] = np.vectorize(remove_pattern)(data['clean_text'], "http[\w]*")

    data['clean_text'] = data['clean_text'].str.lower()

    data['stemmed'] = data['clean_text'].apply(stem_sentences)


    # generate bigrams and unigrams  
    data['unigrams'] = data.apply(lambda row: nltk.word_tokenize(row['stemmed']), axis=1)
    
    
    data['bigrams'] = data['stemmed'].apply(lambda row: list(ngrams(row.split(), 2)))
    data['bigrams'] = data['bigrams'].apply(lambda row: join(row))
    
    data['ngrams'] = data.apply(lambda x: list([x['unigrams'] , x['bigrams']]), axis=1)

    #POS tagging 
    clean_text = data['clean_text'].tolist()
    tagged_texts = pos_tag_sents(map(word_tokenize, clean_text))
    data['POS_unigrams'] = tagged_texts
    return data

# Read in data

In [32]:
data = pd.read_csv('2013_Queensland_Floods_dev.tsv', sep='\t')
data = data[['text', 'label']]

data = preprocess_dataframe(data)

#relevant_data = data.loc[data['label'] == 'relevant']
#irelevant_data = data.loc[data['label'] == 'not_relevant']

In [33]:
def aho_corasik(model, lst):
    count = 0
    for string in lst:
        for word in string:
            try:
                model.get(word)
                count +=1
            except:
                count = count
    return count

# Extract features from each tweet

In [34]:
# load the pickled trained aho-corasik models 
pickled_first_person = '/Users/benjaminkolber/Desktop/aho_corasik_trained_models/first_person.pickle'
pickled_current ='/Users/benjaminkolber/Desktop/aho_corasik_trained_models/current.pickle'
pickled_curse = '/Users/benjaminkolber/Desktop/aho_corasik_trained_models/curse.pickle'
pickled_proximity = '/Users/benjaminkolber/Desktop/aho_corasik_trained_models/proximity.pickle'
pickled_disasters = '/Users/benjaminkolber/Desktop/aho_corasik_trained_models/disasters.pickle'
pickled_caution = '/Users/benjaminkolber/Desktop/aho_corasik_trained_models/caution.pickle'
pickled_emotions = '/Users/benjaminkolber/Desktop/aho_corasik_trained_models/emotions.pickle'
pickled_perceptual = '/Users/benjaminkolber/Desktop/aho_corasik_trained_models/perceptual.pickle'
pickled_dmg = '/Users/benjaminkolber/Desktop/aho_corasik_trained_models/dmg.pickle'
pickled_suffering = '/Users/benjaminkolber/Desktop/aho_corasik_trained_models/suffering.pickle'
pickled_distress = '/Users/benjaminkolber/Desktop/aho_corasik_trained_models/distress.pickle'
pickled_emergency = '/Users/benjaminkolber/Desktop/aho_corasik_trained_models/emergency.pickle'
pickled_descriptors = '/Users/benjaminkolber/Desktop/aho_corasik_trained_models/descriptors.pickle'


files = [pickled_first_person, pickled_current, pickled_curse, 
         pickled_proximity, pickled_disasters, pickled_caution, 
         pickled_emotions, pickled_perceptual, pickled_dmg, 
         pickled_suffering, pickled_distress, pickled_emergency, 
         pickled_descriptors]

ahocorasik_models = []
count = 0
for file in files:
    fp_file = open(file , 'rb')
    #ahocorasik_models.append(
    ahocorasik_models.append(pickle.load(fp_file))
    time.sleep(0.5)
    count+=1

In [35]:
def extract_features(data):
    feature_matrix = data[['unigrams' , 'bigrams','ngrams' ,'label']]
    features = ['is_first_person', 'is_current', 'is_curse', 'is_proximitiy', 
                'is_disaster', 'is_caution' , 'is_emotion', 'is_preceptual',
               'is_dmg' , 'is_suffering', 'is_distress', 'is_emergency', 'is_descriptor']

    count = 0
    # extract bigram and unigram features
    for feature in features:
        feature_matrix[feature] = feature_matrix['ngrams'].apply(lambda x: aho_corasik(ahocorasik_models[count],list(x)))
        count += 1
    return feature_matrix

In [36]:
# shuffle data 
feature_matrix = extract_features(data)
feature_matrix = feature_matrix.sample(frac=1).reset_index(drop=True)

In [37]:
X_train = feature_matrix[['is_first_person',
       'is_current', 'is_curse', 'is_proximitiy', 'is_disaster', 'is_caution',
       'is_emotion', 'is_preceptual', 'is_dmg' , 'is_suffering', 'is_distress', 
                          'is_emergency', 'is_descriptor']]

Y_train = feature_matrix['label']

In [38]:
X_train, X_test, y_train, y_test = train_test_split(
    X_train, Y_train, test_size=0.33, random_state=42)

In [69]:
def log_reg(X_train, X_test, y_train, y_test):
    nums = [10, 50, 100, 500, 1000, 10000]
    for num in nums:
        logreg = LogisticRegression(max_iter = num)
        logreg.fit(X_train, y_train)
        Y_pred = logreg.predict(X_test)
        acc_log = round(logreg.score(X_train, y_train) * 100, 2)
        print('accuracy of logistic regression: {} for max_iter = {}'.format(acc_log, num))
        print('-'*30)

In [55]:
def svc(X_train, X_test, y_train, y_test):
    svc = SVC()
    svc.fit(X_train, y_train)
    Y_pred = svc.predict(X_test)
    acc_svc = round(svc.score(X_train, y_train) * 100, 2)
    print('accuracy of support vector: {}'.format(acc_svc))
    print('-'*30)

In [56]:
def knn(X_train, X_test, y_train, y_test):
    knn = KNeighborsClassifier(n_neighbors = 3)
    knn.fit(X_train, y_train)
    Y_pred = knn.predict(X_test)
    acc_knn = round(knn.score(X_train, y_train) * 100, 2)
    print('accuracy of k- nearest neighbors: {}'.format(acc_knn))
    print('-'*30)

In [57]:
def gaussian(X_train, X_test, y_train, y_test):
    gaussian = GaussianNB()
    gaussian.fit(X_train, y_train)
    Y_pred = gaussian.predict(X_test)
    acc_gaussian = round(gaussian.score(X_train, y_train) * 100, 2)
    print('accuracy of guassian: {}'.format(acc_gaussian))
    print('-'*30)

In [58]:
def perceptron(X_train, X_test, y_train, y_test):
    perceptron = Perceptron()
    perceptron.fit(X_train, y_train)
    Y_pred = perceptron.predict(X_test)
    acc_perceptron = round(perceptron.score(X_train, y_train) * 100, 2)
    print('accuracy of perceptron: {}'.format(acc_perceptron))
    print('-'*30)

In [65]:
def random_forest(X_train, x_test, y_train, y_test):
    random_forest = RandomForestClassifier()
    random_forest.fit(X_train, y_train)
    Y_pred = random_forest.predict(X_test)
    acc_rand_forst = round(random_forest.score(X_train, y_train) * 100, 2)
    print('accuracy of random forest: {}'.format(acc_rand_forst))
    print('-'*30)

In [44]:
quake_test = pd.read_csv('2015_Nepal_Earthquake_test.csv')
quake_train = pd.read_csv('2015_Nepal_Earthquake_train.csv')

flood_test = pd.read_csv('2013_Queensland_Floods_test.csv')
flood_train = pd.read_csv('2013_Queensland_Floods_train.csv')

## Testing models on different, bigger more mixed scale

In [45]:
# combine train + test to two dataframe
# preprocess
# test on models 

In [46]:
data_test = quake_test.append(flood_test , ignore_index=True)
data_train = quake_train.append(flood_train , ignore_index=True)

In [47]:
data_test = preprocess_dataframe(data_test)
data_train = preprocess_dataframe(data_train)

In [60]:
train_matrix = extract_features(data_train)
test_matrix = extract_features(data_test)

train_matrix = train_matrix.sample(frac=1).reset_index(drop=True)
test_matrix = test_matrix.sample(frac=1).reset_index(drop=True)

In [61]:
X_train = train_matrix[['is_first_person',
       'is_current', 'is_curse', 'is_proximitiy', 'is_disaster', 'is_caution',
       'is_emotion', 'is_preceptual', 'is_dmg' , 'is_suffering', 'is_distress', 
                          'is_emergency', 'is_descriptor']]

X_test = test_matrix[['is_first_person',
       'is_current', 'is_curse', 'is_proximitiy', 'is_disaster', 'is_caution',
       'is_emotion', 'is_preceptual', 'is_dmg' , 'is_suffering', 'is_distress', 
                          'is_emergency', 'is_descriptor']]

y_train = train_matrix['label']

y_test = test_matrix['label']

In [70]:
log_reg(X_train, X_test, y_train, y_test)
knn(X_train, X_test, y_train, y_test)
perceptron(X_train, X_test, y_train, y_test)
gaussian(X_train, X_test, y_train, y_test)
svc(X_train, X_test, y_train, y_test)
random_forest(X_train, X_test, y_train, y_test)


accuracy of logistic regression: 79.35 for max_iter = 10
------------------------------
accuracy of logistic regression: 79.35 for max_iter = 50
------------------------------
accuracy of logistic regression: 79.35 for max_iter = 100
------------------------------
accuracy of logistic regression: 79.35 for max_iter = 500
------------------------------
accuracy of logistic regression: 79.35 for max_iter = 1000
------------------------------
accuracy of logistic regression: 79.35 for max_iter = 10000
------------------------------
accuracy of k- nearest neighbors: 80.24
------------------------------
accuracy of perceptron: 78.35
------------------------------
accuracy of guassian: 76.74
------------------------------
accuracy of support vector: 79.5
------------------------------
accuracy of random forest: 81.72
------------------------------
