In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import nltk 
from nltk.stem import PorterStemmer 
import string
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn import linear_model, feature_extraction, metrics
import re

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\bls24\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [3]:
print(len(train), len(test)) #number of training and testing data points

7613 3263


In [4]:
len(train[train["target"] == 1]) / len(train) #percentage of real disasters

0.4296597924602653

In [5]:
train, developement = train_test_split(train, train_size = .7, test_size = .3)

In [6]:
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
from nltk.stem import PorterStemmer
ps = PorterStemmer()

def lemmatize_text(text):
    words_array = w_tokenizer.tokenize(str(text))
    return [ps.stem(w) for w in words_array]

In [7]:
def stem(dataframe):
    dataframe.text.fillna(' ', inplace=True)
    dataframe['text'] = dataframe.text.apply(lemmatize_text)
    dataframe['text'] = dataframe.text.apply(" ".join)

In [8]:
def clean(dataframe):
    tweets=[]
    for tweet in dataframe["text"]:
        tweet = re.sub(r"http\S+","",tweet)
        tweet = re.sub(r"@[/\w/]*","",tweet)
        tweets.append(tweet)
    dataframe["text"] = tweets

In [9]:
stem(train)
stem(developement)
stem(test)

In [10]:
clean(train)
clean(developement)
clean(test)

In [11]:
def print_word_count(word):
    print("'"+word+"' count:",train['text'].str.contains(word).sum())
test_words = ["emergency","sinkhole","earthquake","fire","mudslide",\
              "lightning","flood","tsunami","hurricane","tornado"]
for word in test_words:
    print_word_count(word)

'emergency' count: 8
'sinkhole' count: 6
'earthquake' count: 4
'fire' count: 260
'mudslide' count: 11
'lightning' count: 5
'flood' count: 111
'tsunami' count: 27
'hurricane' count: 4
'tornado' count: 29


In [21]:
count_vect = feature_extraction.text.CountVectorizer(binary=True, min_df = 4, \
                                                     stop_words = 'english',\
                                                    lowercase=True,\
                                                    strip_accents='ascii')
X_train = count_vect.fit_transform(train["text"])
y_train = train.target
X_developement = count_vect.transform(developement["text"])
y_developement = developement.target
X_test = count_vect.transform(test["text"])

In [22]:
X_train, X_developement

(<5329x2225 sparse matrix of type '<class 'numpy.int64'>'
 	with 33607 stored elements in Compressed Sparse Row format>,
 <2284x2225 sparse matrix of type '<class 'numpy.int64'>'
 	with 13584 stored elements in Compressed Sparse Row format>)

In [23]:
clf_no = linear_model.LogisticRegression(penalty='none', max_iter=1500, solver='saga').fit(X_train, y_train)
train_pred_no = clf_no.predict(X_train)
devel_pred_no = clf_no.predict(X_developement)
train_score_no = metrics.f1_score(y_train,train_pred_no)
devel_score_no = metrics.f1_score(y_developement,devel_pred_no)
print(train_score_no, devel_score_no)

0.9655475093263112 0.6761277242777496


In [24]:
clf_l1 = linear_model.LogisticRegression(penalty='l1',max_iter = 1500, solver='saga').fit(X_train, y_train)
train_pred_l1 = clf_l1.predict(X_train)
devel_pred_l1 = clf_l1.predict(X_developement)
train_score_l1 = metrics.f1_score(y_train,train_pred_l1)
devel_score_l1 = metrics.f1_score(y_developement,devel_pred_l1)
print(train_score_l1, devel_score_l1)

0.8415199258572751 0.7319195214790646


In [25]:
clf_l2 = linear_model.LogisticRegression(penalty='l2',max_iter = 1500).fit(X_train, y_train)
train_pred_l2 = clf_l2.predict(X_train)
devel_pred_l2 = clf_l2.predict(X_developement)
train_score_l2 = metrics.f1_score(y_train,train_pred_l2)
devel_score_l2 = metrics.f1_score(y_developement,devel_pred_l2)
print(train_score_l2, devel_score_l2)

0.8699676075890791 0.7342047930283224


In [26]:
max_index2 = np.argpartition(clf_l1.coef_[0], -10)[-10:]
vocab = [(k, v) for k, v in count_vect.vocabulary_.items()]
for num in max_index2:
    print(vocab[num])

('prepared', 1520)
('mlb', 1289)
('feel', 788)
('dublin', 656)
('hostages', 991)
('exp', 735)
('tire', 1989)
('daughter', 547)
('caught', 392)
('memori', 1251)


In [27]:
X_train = X_train.toarray()
X_developement = X_developement.toarray()
n = X_train.shape[0] #tweets
d = X_train.shape[1] #words
K = 2
psis = np.zeros([K,d])
phis = np.zeros([K])

for k in range(K):
    X_k = X_train[y_train == k]
    psis[k] = (X_k.sum(axis=0)+1)/(X_k.shape[0]+2)
    phis[k] = X_k.shape[0] / float(n)

In [30]:
def nb_predictions(x, psis, phis):
    n, d = x.shape
    x = np.reshape(x, (1, n, d))
    psis = np.reshape(psis, (K, 1, d))
    psis = psis.clip(1e-14, 1-1e-14)
    logpy = np.log(phis).reshape([K,1])
    logpxy = x * np.log(psis) + (1-x) * np.log(1-psis)
    logpyx = logpxy.sum(axis=2) + logpy
    return logpyx.argmax(axis=0).flatten(), logpyx.reshape([K,n])

In [31]:
idx_train, logpyx_train = nb_predictions(X_train, psis, phis)
idx_devel, logpyx_devel = nb_predictions(X_developement, psis, phis)

train_score_nb = metrics.f1_score(y_train,idx_train)
devel_score_nb = metrics.f1_score(y_developement,idx_devel)

print("Train f1 score:", train_score_nb, "\nDevel f1 score:", devel_score_nb)

Train f1 score: 0.7979846449136276 
Devel f1 score: 0.7461318051575931


In [33]:
count_vectorizer = feature_extraction.text.CountVectorizer(binary = True,\
                                                           ngram_range = (1,2),\
                                                           min_df = 4,\
                                                           strip_accents='ascii',\
                                                           lowercase=True,\
                                                           stop_words = 'english')
X_train = count_vectorizer.fit_transform(train["text"]).toarray()
X_devel = count_vectorizer.transform(developement["text"]).toarray()

In [35]:
unigram_count = 0
bigram_count = 0
counter = 0
example_bigrams = []
for index, key in enumerate(count_vectorizer.vocabulary_.keys()):
    if(' ' in key):
        bigram_count += 1
        if (counter <= 10):
            example_bigrams.append(key)
            counter +=1
    else:
        unigram_count += 1
print(bigram_count, unigram_count, bigram_count+unigram_count)
print('ratio:',bigram_count/(unigram_count+bigram_count))
print('Example Bigrams:', example_bigrams)

867 2225 3092
ratio: 0.28040103492884866
Example Bigrams: ['polic offic', 'dust storm', 'view download', 'download video', 'panic disco', 'downtown emerg', 'emerg servic', 'servic center', 'chemic depend', 'depend counselor', 'counselor intern']


In [37]:
clf_l2.fit(X_train, y_train)
d = X_train.shape[1] #words
psis = np.zeros([K,d])
phis = np.zeros([K])
for k in range(K):
    X_k = X_train[y_train == k]
    psis[k] = (X_k.sum(axis=0)+1)/(X_k.shape[0]+2)
    phis[k] = X_k.shape[0] / float(n)

In [38]:
train_predictions_l2 = clf_l2.predict(X_train)
devel_predictions_l2 = clf_l2.predict(X_devel)
idx_train, logpyx_train = nb_predictions(X_train, psis, phis)
idx_devel, logpyx_devel = nb_predictions(X_devel, psis, phis)

In [40]:
train_score_l2 = metrics.f1_score(y_train,train_predictions_l2)
devel_score_l2 = metrics.f1_score(y_developement,devel_predictions_l2)
train_score_nb = metrics.f1_score(y_train,idx_train)
devel_score_nb = metrics.f1_score(y_developement,idx_devel)

In [41]:
scores = pd.DataFrame([['l2',train_score_l2,devel_score_l2],['NB',train_score_nb,devel_score_nb]],columns = ["technique","train f1 score","devel f1 score"])
scores.head()

Unnamed: 0,technique,train f1 score,devel f1 score
0,l2,0.874449,0.740579
1,NB,0.768215,0.705374


In [42]:
train_df = pd.read_csv("train.csv")
train_df.text.fillna(' ', inplace=True)
train_df['text'] = train_df.text.apply(lemmatize_text)
train_df['text'] = train_df.text.apply(" ".join)

In [43]:
count_vectorizer = feature_extraction.text.CountVectorizer(binary = True,\
                                                           min_df = 10,\
                                                           strip_accents='ascii',\
                                                           lowercase=True,\
                                                           stop_words = 'english')
train_vectors = count_vectorizer.fit_transform(train_df["text"])
test_vectors = count_vectorizer.transform(test["text"])

In [44]:
X_train = train_vectors
y_train = train_df.target
X_test = test_vectors

In [45]:
clf_l2.fit(X_train, y_train)
train_predictions_l2 = clf_l2.predict(X_train)
test_predictions_l2 = clf_l2.predict(X_test)
train_score_l2 = metrics.f1_score(y_train,train_predictions_l2)



In [48]:
print(train_score_l2)

0.8264571054354943


In [47]:
submission = pd.read_csv("sample_submission.csv")
submission["target"] = test_predictions_l2
submission.to_csv("submission.csv", index = False)