In [82]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from nltk.corpus import wordnet
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import MultinomialNB, BernoulliNB 

In [55]:
training_data = pd.read_csv("train.csv")
test_data = pd.read_csv("test.csv")

In [56]:
training_data.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [57]:
test_data.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


We won't be using keyword, location, and id information in our models so we drop them. We need to retain the id column of test_data since we will be using it to submit our predictions.

In [58]:
training_data = training_data.drop(columns = ["keyword","location","id"])
id_column = test_data["id"].copy()
test_data = test_data.drop(columns = ["keyword","location","id"])

In [59]:
display(training_data.info())
display(test_data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    7613 non-null   object
 1   target  7613 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 119.1+ KB


None

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3263 entries, 0 to 3262
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    3263 non-null   object
dtypes: object(1)
memory usage: 25.6+ KB


None

There are no null-values. So we can proceed. Let's see some of the tweets.

In [60]:
for tweet in training_data["text"].sample(15,random_state = 1):
    print(tweet)

Goulburn man Henry Van Bilsen missing: Emergency services are searching for a Goulburn man who disappeared from hisÛ_ http://t.co/z99pKJzTRp
The things we fear most in organizations--fluctuations disturbances imbalances--are the primary sources of creativity. - Margaret Wheatley
@tsunami_esh ?? hey Esh
@POTUS you until you drown by water entering the lungs. You being alive has caused this great country to fall to shit because you're a pussy
Crawling in my skin
These wounds they will not hea
#np agalloch - the desolation song
Hollywood Movie About Trapped Miners Released in Chile: 'The 33' Hollywood movie about trapped miners starring... http://t.co/tyyfG4qQvM
New roof and hardy up..Windstorm inspection tomorrow http://t.co/kKeH8qCgc3
The Catastrophic Effects of Hiroshima and Nagasaki Atomic Bombings Still Being Felt Today http://t.co/WC8AqXeDF7
tiffanyfrizzell has a crush: http://t.co/RaF732vRtt
Holy fuck QVC bitch just got burned so hard.
I added a video to a @YouTube playlist http:/

Now we want to remove #,@ symbols, quotation marks, punctuations, links, and numbers.

In [61]:
dataset = [training_data,test_data]

for data in dataset:
    data["text"] = data["text"].apply(lambda x: x.lower())
    data["text"] = data["text"].apply(lambda x: re.sub("(@|#)","",x))
    data["text"] = data["text"].apply(lambda x: re.sub("http[^\s]*","",x))
    data["text"] = data["text"].apply(lambda x: re.sub("[0-9]*","",x))
    data["text"] = data["text"].apply(lambda x: re.sub("[^a-z]+"," ",x))
    data["text"] = data["text"].apply(lambda x: re.sub("(?<=\s)[a-z](?=\s)","",x))


In [62]:
for tweet in training_data["text"].sample(15,random_state = 1):
    print(tweet)

goulburn man henry van bilsen missing emergency services are searching for  goulburn man who disappeared from his 
the things we fear most in organizations fluctuations disturbances imbalances are the primary sources of creativity margaret wheatley
tsunami esh hey esh
potus you until you drown by water entering the lungs you being alive has caused this great country to fall to shit because you re  pussy
crawling in my skin these wounds they will not hea
np agalloch the desolation song
hollywood movie about trapped miners released in chile the hollywood movie about trapped miners starring 
new roof and hardy up windstorm inspection tomorrow 
the catastrophic effects of hiroshima and nagasaki atomic bombings still being felt today 
tiffanyfrizzell has  crush 
holy fuck qvc bitch just got burned so hard 
i added  video to  youtube playlist panic at the disco collar full audio 
behindashield wars goddess sweet lord  collapse as my knees buckle 
a laois girl advertised for  new friend to re

Now we will tokenize each tweet so that we can remove stopwords.

In [63]:
for data in dataset:
    data["text"] = data["text"].apply(lambda x: word_tokenize(x))

for tweet in training_data["text"][0:5]:
    print(tweet)

['our', 'deeds', 'are', 'the', 'reason', 'of', 'this', 'earthquake', 'may', 'allah', 'forgive', 'us', 'all']
['forest', 'fire', 'near', 'la', 'ronge', 'sask', 'canada']
['all', 'residents', 'asked', 'to', 'shelter', 'in', 'place', 'are', 'being', 'notified', 'by', 'officers', 'no', 'other', 'evacuation', 'or', 'shelter', 'in', 'place', 'orders', 'are', 'expected']
['people', 'receive', 'wildfires', 'evacuation', 'orders', 'in', 'california']
['just', 'got', 'sent', 'this', 'photo', 'from', 'ruby', 'alaska', 'as', 'smoke', 'from', 'wildfires', 'pours', 'into', 'school']


Removing the stop words:

In [64]:
stop_words = set(stopwords.words("english"))

for data in dataset:
    data["text"] = data["text"].apply(lambda x: [token for token in x if not token in stop_words])

Now we will lemmatize each tweet using the grammatical class of each word so that we can reduce them to their roots. For each word we will find which part of speech it belongs to using pos_tag function and then use this information while lemmatizing words. We also need to define a function that will turn the pos information into one that can be used by the lemmatizer.

In [65]:
 def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)

lemmatizer = WordNetLemmatizer()

for data in dataset:
    data["text"] = data["text"].apply(lambda x: [lemmatizer.lemmatize(token,get_wordnet_pos(token)) for token in x])

In [66]:
for tweet in training_data["text"][0:5]:
    print(tweet)

['deed', 'reason', 'earthquake', 'may', 'allah', 'forgive', 'u']
['forest', 'fire', 'near', 'la', 'ronge', 'sask', 'canada']
['resident', 'ask', 'shelter', 'place', 'notify', 'officer', 'evacuation', 'shelter', 'place', 'order', 'expect']
['people', 'receive', 'wildfire', 'evacuation', 'order', 'california']
['get', 'sent', 'photo', 'ruby', 'alaska', 'smoke', 'wildfire', 'pours', 'school']


Each tweet is turned into a list of words. Now we need to turn the list into a string again so we can use CountVectorizer.

In [67]:
for data in dataset:
    data["text"] = data["text"].apply(lambda x: " ".join(x))

In [68]:
for tweet in training_data["text"][0:5]:
    print(tweet)

deed reason earthquake may allah forgive u
forest fire near la ronge sask canada
resident ask shelter place notify officer evacuation shelter place order expect
people receive wildfire evacuation order california
get sent photo ruby alaska smoke wildfire pours school


Before moving ahead with vectorizer, we will first split our training_data into two parts: training and testing. This is so that we can try multiple machine learning models and decide which one performs better. The test_data doesn't contain the target values; it will be used only for the final prediction, to be submitted to kaggle.

In [70]:
X_train, X_test, y_train, y_test = train_test_split(training_data["text"], training_data["target"], test_size = 0.2
                                                   , random_state = 12)

We will create vectors using all the words, i.e. min_df = 1, and using a one-gram model, i.e ngram_range = 1. Later we will try to optimize our model varying these parameters.

In [71]:
vectorizer = CountVectorizer(min_df = 1, ngram_range = (1,1))
X_train = vectorizer.fit_transform(X_train).toarray()
X_test = vectorizer.transform(X_test).toarray()


The following code gives us the vocabulary used to create the vectors. There are 12133 words. Increasing min_df would decrease the number of words available since most of these words are not meaningful, they are caused by spelling mistakes and they probably appear once throughout the entire tweet collection.

In [87]:
features = vectorizer.get_feature_names_out()
len(features)

12133

A sample of the vocabulary.

In [88]:
for word in features[0:25]:
    print(word)

aa
aaaaaaallll
aaarrrgghhh
aace
aal
aamir
aan
aannnnd
aar
ab
aba
abandon
abandonedpics
abbandoned
abbott
abbruchsimulator
abbswinston
abbyairshow
abc
abcchicago
abceyewitness
abcnews
abcnorio
abe
aberdeen


We try Logistic Regression and Bernoulli Naive Bayes.

In [72]:
LR = LogisticRegression(random_state = 30)
LR.fit(X_train,y_train)
acc_LR = LR.score(X_test,y_test)

In [73]:
acc_LR

0.7931713722915299

In [80]:
bernoulli_nb = BernoulliNB()
bernoulli_nb.fit(X_train,y_train)
acc_bnb = bernoulli_nb.score(X_test,y_test) 

acc_bnb

0.8036769533814839

In [100]:
n_grams = [(1,1),(1,2),(1,3)]
min_df = [1,2,3,4,5]

for gram in n_grams:
    for df in min_df:
        X_train, X_test, y_train, y_test = train_test_split(training_data["text"], training_data["target"], test_size = 0.2
                                                   , random_state = 12)
        
        vectorizer = CountVectorizer(min_df = df, ngram_range = gram)
        X_train = vectorizer.fit_transform(X_train).toarray()
        X_test = vectorizer.transform(X_test).toarray()
        
        LR = LogisticRegression(random_state = 30)
        LR.fit(X_train,y_train)
        acc_LR = LR.score(X_test,y_test)
        
        bernoulli_nb = BernoulliNB()
        bernoulli_nb.fit(X_train,y_train)
        acc_bnb = bernoulli_nb.score(X_test,y_test) 
        
        print("for n_grams = " + str(gram) + " and min_df = " + str(df) + ", the LR acc is " + str(acc_LR) +
              " and " + "Bernoulli NB acc is " + str(acc_bnb) + ".")

for n_grams = (1, 1) and min_df = 1, the LR acc is 0.7931713722915299 and Bernoulli NB acc is 0.8036769533814839.
for n_grams = (1, 1) and min_df = 2, the LR acc is 0.7872619829284307 and Bernoulli NB acc is 0.8023637557452397.
for n_grams = (1, 1) and min_df = 3, the LR acc is 0.7892317793827971 and Bernoulli NB acc is 0.804333552199606.
for n_grams = (1, 1) and min_df = 4, the LR acc is 0.7898883782009193 and Bernoulli NB acc is 0.8049901510177282.
for n_grams = (1, 1) and min_df = 5, the LR acc is 0.7925147734734077 and Bernoulli NB acc is 0.7997373604727511.
for n_grams = (1, 1) and min_df = 6, the LR acc is 0.7892317793827971 and Bernoulli NB acc is 0.7977675640183848.
for n_grams = (1, 1) and min_df = 7, the LR acc is 0.7872619829284307 and Bernoulli NB acc is 0.7951411687458962.
for n_grams = (1, 1) and min_df = 8, the LR acc is 0.7918581746552856 and Bernoulli NB acc is 0.7964543663821405.
for n_grams = (1, 1) and min_df = 9, the LR acc is 0.7905449770190414 and Bernoulli NB ac

Instead of just taking into account the existence/non-existence of words from the vocabulary in each tweet we can also take into account their frequencies within a tweet and accros the collection of tweets. This is provided by Tfidf Vectorizer.

In [83]:
X_train, X_test, y_train, y_test = train_test_split(training_data["text"], training_data["target"], test_size = 0.2
                                                   , random_state = 12)

In [84]:
vectorizer_tfidf = TfidfVectorizer(min_df = 1, ngram_range = (1,1))
X_train = vectorizer_tfidf.fit_transform(X_train).toarray()
X_test = vectorizer_tfidf.transform(X_test).toarray()

This time we try Logistic Regression and Multinomial Naive Bayes.

In [85]:
LR = LogisticRegression(random_state = 30)
LR.fit(X_train,y_train)
acc_LR = LR.score(X_test,y_test)

acc_LR

0.7925147734734077

In [86]:
multinomial_nb = MultinomialNB()
multinomial_nb.fit(X_train,y_train)
acc_mnb = multinomial_nb.score(X_test,y_test) 

acc_mnb

0.7964543663821405

In [101]:
n_grams = [(1,1),(1,2),(1,3)]
min_df = [1,2,3,4,5]

for gram in n_grams:
    for df in min_df:
        X_train, X_test, y_train, y_test = train_test_split(training_data["text"], training_data["target"], test_size = 0.2
                                                   , random_state = 12)
        
        vectorizer_tfidf = TfidfVectorizer(min_df = 1, ngram_range = (1,1))
        X_train = vectorizer_tfidf.fit_transform(X_train).toarray()
        X_test = vectorizer_tfidf.transform(X_test).toarray()
        
        LR = LogisticRegression(random_state = 30)
        LR.fit(X_train,y_train)
        acc_LR = LR.score(X_test,y_test)
        
        multinomial_nb = MultinomialNB()
        multinomial_nb.fit(X_train,y_train)
        acc_mnb = multinomial_nb.score(X_test,y_test) 

        acc_mnb
        
        print("for n_grams = " + str(gram) + " and min_df = " + str(df) + ", the LR acc is " + str(acc_LR) +
              " and " + "Multinomial NB acc is " + str(acc_mnb) + ".")

for n_grams = (1, 1) and min_df = 1, the LR acc is 0.7925147734734077 and Multinomial NB acc is 0.7964543663821405.
for n_grams = (1, 1) and min_df = 2, the LR acc is 0.7925147734734077 and Multinomial NB acc is 0.7964543663821405.
for n_grams = (1, 1) and min_df = 3, the LR acc is 0.7925147734734077 and Multinomial NB acc is 0.7964543663821405.
for n_grams = (1, 1) and min_df = 4, the LR acc is 0.7925147734734077 and Multinomial NB acc is 0.7964543663821405.
for n_grams = (1, 1) and min_df = 5, the LR acc is 0.7925147734734077 and Multinomial NB acc is 0.7964543663821405.
for n_grams = (1, 2) and min_df = 1, the LR acc is 0.7925147734734077 and Multinomial NB acc is 0.7964543663821405.
for n_grams = (1, 2) and min_df = 2, the LR acc is 0.7925147734734077 and Multinomial NB acc is 0.7964543663821405.
for n_grams = (1, 2) and min_df = 3, the LR acc is 0.7925147734734077 and Multinomial NB acc is 0.7964543663821405.
for n_grams = (1, 2) and min_df = 4, the LR acc is 0.7925147734734077 an

One-hot encoding of vectors don't take into account the similarity of words. In this construction every word has the same distance to every other word. However, we would want words that have similar meanings to be close together. This is provided by Word2Vec function. To use this function we must first create a list of lists where each inner list contains one tweet in a tokenized form. Then we feed this list into Word2Vec function which creates the vectors.

In [25]:
w2v_train, w2v_test, y_train, y_test = train_test_split(training_data["text"], training_data["target"], 
                                                        test_size = 0.2, random_state = 12)


w2v_train = w2v_train.apply(lambda x: word_tokenize(x))
w2v_test = w2v_test.apply(lambda x: word_tokenize(x))

w2v_train.reset_index(drop = True, inplace = True)
w2v_test.reset_index(drop = True, inplace = True)
y_train.reset_index(drop = True, inplace = True)
y_test.reset_index(drop = True, inplace = True)

corpus = []

for token_list in w2v_train:
    corpus.append(token_list)
    

In [106]:
vector_model = Word2Vec(sentences = corpus, vector_size = 200, window = 3, min_count = 4, seed = 42, epochs =50)

Word2Vec creates a vector for each word and not for each tweet. Therefore we must find a way to turn them into vectors for tweets. One thing we can do is we can take the average of each vector in the tweet. The following function does exactly that.

In [95]:
def word_vector(token_list,size):
    vec = np.zeros(size).reshape((1,size))
    count = 0
    for token in token_list:
        try:
            vec += vector_model.wv[token].reshape((1,size))
            count += 1.
        except KeyError:  # handling the case where the token is not in vocabulary
            continue
    if count != 0:
        vec /= count
    return vec

In [107]:
X_train = np.zeros((len(w2v_train),200))
X_test = np.zeros((len(w2v_test),200))

for index in range(len(w2v_train)):
    X_train[index,:] = word_vector(w2v_train[index],200)

for index in range(len(w2v_test)):
    X_test[index,:] = word_vector(w2v_test[index],200)
    
X_train = StandardScaler().fit_transform(X_train)
X_test = StandardScaler().fit_transform(X_test)

In [108]:
LR = LogisticRegression(random_state = 30, solver = "newton-cg")
LR.fit(X_train,y_train)
acc_LR = LR.score(X_test,y_test)

In [109]:
acc_LR

0.7590282337491793

Since our copus is very limited, word2vec doesn't perform well.

It is clear from above that we will create our vectors using Vectorizer with a mix of unigrams and bigrams. We shall exclude terms that appear less than 4 times. We choose Naive Bayse over Logistic Regression and train it using all the training data. Finally we predict labels of tweets in the test data and submit our predictions to kaggle.

In [116]:
vectorizer = CountVectorizer(min_df = 4, ngram_range = (1,2))
X_train = vectorizer.fit_transform(training_data["text"]).toarray()
X_test = vectorizer.transform(test_data["text"]).toarray()
y_train = training_data["target"]


LR = LogisticRegression(random_state = 30)
LR.fit(X_train,y_train)
predictions = LR.predict(X_test)

predictions = pd.Series(predictions)
submit = pd.concat([id_column,predictions], axis = 1, keys =["id", "Target"])

In [118]:
submit.to_csv("NLP_with_disaster_tweets.csv", index=False)