In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/nlp-getting-started/sample_submission.csv
/kaggle/input/nlp-getting-started/train.csv
/kaggle/input/nlp-getting-started/test.csv


In [2]:
training_df = pd.read_csv('/kaggle/input/nlp-getting-started/train.csv')
test_df = pd.read_csv('/kaggle/input/nlp-getting-started/test.csv')
pd.set_option('display.max_colwidth', None)

In [3]:
training_df.info()
test_df.info()
training_df.describe()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3263 entries, 0 to 3262
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        3263 non-null   int64 
 1   keyword   3237 non-null   object
 2   location  2158 non-null   object
 3   text      3263 non-null   object
dtypes: int64(1), object(3)
memory usage: 102.1+ KB


Unnamed: 0,id,target
count,7613.0,7613.0
mean,5441.934848,0.42966
std,3137.11609,0.49506
min,1.0,0.0
25%,2734.0,0.0
50%,5408.0,0.0
75%,8146.0,1.0
max,10873.0,1.0


Question 1a

There are 7613 training data points, and 3263 test data points.
42.966% of the training tweets are real disasters, and 57.034% are not.

Question 1c

Preprocessing the full training data before splitting, because eventually we would need to preprocess all of the training and test data anyway.
The preprocessing measures taken are:
1) converting all words to lowercase -> to avoid duplicate features of the same word due to different casing

2) removing the URLs - many url's are not comprising meaningful English words from which we can deduce the context of the tweet, and so removing the URLs should help the prediction.

   The URLs are removed before punctuation removal because the '/' are needed to identify them
   
3) removing the usernames - same reason as removing URLs. This is also done before stripping punctuation because the '@' symbol is used for the identifications

4) strip the punctuation - for easier lemmatizing later

5) removing the numbers - because in our models used we cannot place the numbers well into context, so removing them as unusable data (for the purpose of the predictions).

6) removing stopwords - stopwords would not add valuable information which would help the predictions.

7) lemmatize all words - reduce the number of features so the model can use them better.

In [4]:
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
 
stop_words = set(stopwords.words('english'))

def lowercase(text):
    #lowercase
    return text.lower()

def remove_URL(sample):
    """Remove URLs from a sample string"""
    return re.sub(r"http\S+", "", sample)

def remove_usernames(tweet):
    return re.sub('@[^\s]+','',tweet)

def remove_punctuation_translate(text):
    translator = str.maketrans('', '', string.punctuation)
    return text.translate(translator)

def remove_numbers(text):
    translator = str.maketrans('', '', '0123456789')
    return text.translate(translator)

def remove_stopwords(text):
    str=''
    word_tokens = word_tokenize(text)
 
    for w in word_tokens:
        if w not in stop_words:
            str = str + w + ' '
 
    return str

def clean_text(text):
    output_str = lowercase(text)
    output_str = remove_URL(output_str)
    output_str = remove_usernames(output_str)
    output_str = remove_punctuation_translate(output_str)
    output_str = remove_numbers(output_str)
    output_str = remove_stopwords(output_str)
    return output_str
    

In [5]:
training_df['text'] = training_df['text'].apply(clean_text)

In [6]:
training_df['text'].tail(10)

7603                                  officials say quarantine place alabama home possible ebola case developing symptoms 
7604                                     worldnews fallen powerlines glink tram update fire crews evacuated passengers tr 
7605                                                          flip side im walmart bomb everyone evacuate stay tuned blow 
7606                                   suicide bomber kills saudi security site mosque reuters via world google news wall 
7607                                           stormchase violent record breaking ef el reno oklahoma tornado nearly runs 
7608                                                                two giant cranes holding bridge collapse nearby homes 
7609                                                     control wild fires california even northern part state troubling 
7610                                                                                                 utckm volcano hawaii 
7611    police i

In [7]:
import spacy
nlp = spacy.load('en_core_web_sm')

def lemmatizer(text):        
    sent = []
    doc = nlp(text)
    for word in doc:
        sent.append(word.lemma_)
    return " ".join(sent)

In [8]:
training_df['text'] = training_df['text'].apply(lemmatizer)

In [9]:
training_df['text'].tail(10)

7603                             official say quarantine place alabama home possible ebola case develop symptom
7604                                  worldnew fall powerline glink tram update fire crew evacuate passenger tr
7605                                                flip side I m walmart bomb everyone evacuate stay tune blow
7606                           suicide bomber kill saudi security site mosque reuter via world google news wall
7607                                     stormchase violent record break ef el reno oklahoma tornado nearly run
7608                                                           two giant crane hold bridge collapse nearby home
7609                                            control wild fire california even northern part state troubling
7610                                                                                       utckm volcano hawaii
7611    police investigate ebike collide car little portugal ebike rider suffer serious nonlife threaten

In [10]:
training_target = training_df['target']
training_df.drop('target', axis=1, inplace=True)

Question 1b:
The next cell splits the training data.

In [11]:
from sklearn.model_selection import train_test_split
x_train, x_dev, y_train, y_dev = train_test_split(training_df,training_target,random_state=104,test_size=0.3,shuffle=True)

In [12]:
x_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5329 entries, 6922 to 69
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        5329 non-null   int64 
 1   keyword   5285 non-null   object
 2   location  3568 non-null   object
 3   text      5329 non-null   object
dtypes: int64(1), object(3)
memory usage: 208.2+ KB


In [13]:
x_dev.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2284 entries, 7071 to 3689
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        2284 non-null   int64 
 1   keyword   2267 non-null   object
 2   location  1512 non-null   object
 3   text      2284 non-null   object
dtypes: int64(1), object(3)
memory usage: 89.2+ KB


Question 1d from here

In [14]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(min_df = 10, binary=True)
x_train_bow = vectorizer.fit_transform(x_train['text'])
print(x_train_bow.toarray().shape)

features = list(vectorizer.get_feature_names_out())
print(len(features))
print(features[:15])

transformed_x_train = vectorizer.transform(x_train['text'])
print(transformed_x_train.shape)

pd.DataFrame(transformed_x_train.toarray()).head(5)

(5329, 1004)
1004
['abandon', 'abc', 'ablaze', 'absolutely', 'accident', 'across', 'act', 'action', 'actually', 'add', 'affect', 'aftershock', 'ago', 'agree', 'air']
(5329, 1004)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,994,995,996,997,998,999,1000,1001,1002,1003
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
transformed_x_dev = vectorizer.transform(x_dev['text'])
print(transformed_x_dev.shape)

(2284, 1004)


In [16]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.naive_bayes import BernoulliNB

Question 1e part i here

In [17]:
model1 = LogisticRegression(penalty = 'none', max_iter=1500)
model1.fit(transformed_x_train, y_train)
f1_score1_train = f1_score(y_train, model1.predict(transformed_x_train))
print(f1_score1_train)

f1_score1_dev = f1_score(y_dev, model1.predict(transformed_x_dev))
print(f1_score1_dev)



0.8446755595749491
0.704652378463147


Question 1e part ii here

In [18]:
model_L1_train = LogisticRegression(penalty='l1', solver='liblinear')
model_L1_train.fit(transformed_x_train, y_train)
f1_score_L1_train = f1_score(y_train, model_L1_train.predict(transformed_x_train))
print(f1_score_L1_train)
f1_score1_L1_dev = f1_score(y_dev, model_L1_train.predict(transformed_x_dev))
print(f1_score1_L1_dev)

0.8121637426900585
0.7236624379481522


In [19]:
model_L2_train = LogisticRegression(penalty='l2', solver='lbfgs')

model_L2_train.fit(transformed_x_train, y_train)

f1_score_L2_train = f1_score(y_train, model_L2_train.predict(transformed_x_train))
print(f1_score_L2_train)
f1_score1_L2_dev = f1_score(y_dev, model_L2_train.predict(transformed_x_dev))
print(f1_score1_L2_dev)

0.8198930978387171
0.7268746579091406


In [20]:
print(model_L1_train.coef_[0])

model_L1_coef_df = pd.DataFrame()
model_L1_coef_df['words'] = features
model_L1_coef_df['weight'] = model_L1_train.coef_[0]
print(model_L1_coef_df.sort_values(by='weight', ascending = False).head(10))
print(model_L1_coef_df.sort_values(by='weight').head(10))

[0.         0.         0.         ... 0.         0.36711927 0.        ]
          words    weight
623    outbreak  4.616913
799       spill  4.335677
907     typhoon  4.127481
963    wildfire  4.047786
110     bombing  3.969742
545     migrant  3.363149
259  earthquake  3.328956
263       ebola  3.184166
407   hiroshima  3.078705
569      murder  3.046417
        words    weight
982     write -2.427214
127       buy -2.050464
554      mode -1.785238
618    online -1.764904
175  complete -1.618294
758      self -1.609962
790      song -1.519384
129      cake -1.416140
507      long -1.410451
863     throw -1.398161


In [21]:
n = transformed_x_train.shape[0] # size of the dataset
d = transformed_x_train.shape[1] # number of features in our dataset
print(n)
print(d)

K = 2 # number of clases

# these are the shapes of the parameters
psis = np.zeros([K,d])
phis = np.zeros([K])

# we now compute the parameters
for k in range(K):
    X_k = transformed_x_train[y_train == k]
    psis[k] = np.mean(X_k, axis=0)
    phis[k] = X_k.shape[0] / float(n)

# print out the class proportions
print(phis)

5329
1004
[0.56802402 0.43197598]


In [22]:
def nb_predictions(x, psis, phis):
    """This returns class assignments and scores under the NB model.
    
    We compute \arg\max_y p(y|x) as \arg\max_y p(x|y)p(y)
    """
    # adjust shapes
    n, d = x.shape
    print(n)
    print(d)
    x = np.reshape(x.toarray(), (1, n, d))
    psis = np.reshape(psis, (K, 1, d))
    
    # clip probabilities to avoid log(0)
    psis = psis.clip(1e-14, 1-1e-14)
    
    # compute log-probabilities
    logpy = np.log(phis).reshape([K,1])
    logpxy = x * np.log(psis) + (1-x) * np.log(1-psis)
    logpyx = logpxy.sum(axis=2) + logpy

    return logpyx.argmax(axis=0).flatten(), logpyx.reshape([K,n])

idx, logpyx = nb_predictions(transformed_x_dev, psis, phis)
print(idx[:10])

2284
1004
[0 1 0 1 1 0 0 1 0 0]


In [23]:
(idx==y_dev).mean()

0.7850262697022767

In [24]:
f1_score_BNB = f1_score(y_dev, idx)
print(f1_score_BNB)

0.7291781577495863


In [25]:
vectorizer_ngram = CountVectorizer(ngram_range=(2,2), min_df=3)
x_train_ngram = vectorizer_ngram.fit_transform(x_train['text'])
print(x_train_ngram.toarray().shape)

features_ngram = list(vectorizer_ngram.get_feature_names_out())
print(len(features_ngram))
print(features_ngram[:15])

transformed_x_train_ngram = vectorizer_ngram.transform(x_train['text'])
print(transformed_x_train_ngram.shape)

pd.DataFrame(transformed_x_train_ngram.toarray()).head(10)

(5329, 1592)
1592
['aba woman', 'abandon aircraft', 'abbswinston zionist', 'abc news', 'access secret', 'accident expert', 'accident indian', 'accident man', 'accident property', 'accuse nema', 'act mass', 'action hostage', 'action year', 'activate municipal', 'add video']
(5329, 1592)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1582,1583,1584,1585,1586,1587,1588,1589,1590,1591
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [26]:
transformed_x_dev_ngram = vectorizer_ngram.transform(x_dev['text'])
print(transformed_x_dev_ngram.shape)

(2284, 1592)


In [27]:
# Logistic Regression Classifier
model1_ngram = LogisticRegression(penalty = 'none', max_iter=10000)
model1_ngram.fit(transformed_x_train_ngram, y_train)
f1_score1_train_ngram = f1_score(y_train, model1_ngram.predict(transformed_x_train_ngram))
print(f1_score1_train_ngram)



0.6767040788393102


In [28]:
model1_ngram_L1 = LogisticRegression(penalty='l1', solver='liblinear')

model1_ngram_L1.fit(transformed_x_train_ngram, y_train)
f1_score1_train_ngram_L1 = f1_score(y_train, model1_ngram_L1.predict(transformed_x_train_ngram))
print(f1_score1_train_ngram_L1)

0.6285714285714287


In [29]:
model1_ngram_L2 = LogisticRegression(penalty='l2', solver='lbfgs')

model1_ngram_L2.fit(transformed_x_train_ngram, y_train)
f1_score1_train_ngram_L2 = f1_score(y_train, model1_ngram_L2.predict(transformed_x_train_ngram))
print(f1_score1_train_ngram_L2)

0.6525303929884083


In [30]:
f1_score1_dev_ngram = f1_score(y_dev, model1_ngram.predict(transformed_x_dev_ngram))
print(f1_score1_dev_ngram)

f1_score1_dev_ngram_L1 = f1_score(y_dev, model1_ngram_L1.predict(transformed_x_dev_ngram))
print(f1_score1_dev_ngram_L1)

f1_score1_dev_ngram_L2 = f1_score(y_dev, model1_ngram_L2.predict(transformed_x_dev_ngram))
print(f1_score1_dev_ngram_L2)

0.5667796610169492
0.5440229062276306
0.5532212885154062


In [31]:
model_ngram_L1_coef_df = pd.DataFrame()
model_ngram_L1_coef_df['words'] = features_ngram
model_ngram_L1_coef_df['weight'] = model1_ngram_L1.coef_[0]
model_ngram_L1_coef_df.sort_values(by='weight', ascending = False).head(10)

Unnamed: 0,words,weight
1337,suicide bomber,3.968303
1023,oil spill,3.649493
1445,typhoon soudelor,3.49277
1338,suicide bombing,3.487778
1247,severe thunderstorm,3.381132
82,atomic bomb,3.353852
1411,train derailment,3.298614
1410,train derail,3.269008
611,helicopter crash,3.167348
268,confirm mh,3.114723


In [32]:
# Bernoulli Classifier
n_ngram = transformed_x_train_ngram.shape[0] # size of the dataset
d_ngram = transformed_x_train_ngram.shape[1] # number of features in our dataset
print(n_ngram)
print(d_ngram)

K_ngram = 2 # number of clases

# these are the shapes of the parameters
psis_ngram = np.zeros([K_ngram,d_ngram])
phis_ngram = np.zeros([K_ngram])

# we now compute the parameters
for k in range(K_ngram):
    X_k_ngram = transformed_x_train_ngram[y_train == k]
    psis_ngram[k] = np.mean(X_k_ngram, axis=0)
    phis_ngram[k] = X_k_ngram.shape[0] / float(n_ngram)

# print out the class proportions
print(phis_ngram)

5329
1592
[0.56802402 0.43197598]


In [33]:
idx_ngram, logpyx_ngram = nb_predictions(transformed_x_dev_ngram, psis_ngram, phis_ngram)
print(idx_ngram[:10])

2284
1592
[0 0 0 0 0 0 0 1 0 0]


In [34]:
(idx_ngram==y_dev).mean()

0.7140980735551664

In [35]:
f1_score_BNB_ngram = f1_score(y_dev, idx_ngram)
print(f1_score_BNB_ngram)

0.520909757887014


In [36]:
test_df['text'] = test_df['text'].apply(clean_text)
test_df['text'] = test_df['text'].apply(lemmatizer)

In [37]:
test_df['text'].head(10)

0                                  happen terrible car crash
1          hear earthquake different city stay safe everyone
2        forest fire spot pond geese flee across street save
3                          apocalypse light spokane wildfire
4                        typhoon soudelor kills china taiwan
5                                       shakingit earthquake
6    they d probably still show life arsenal yesterday eh eh
7                                                        hey
8                                                   nice hat
9                                                       fuck
Name: text, dtype: object

In [38]:
vectorizer_fulltraining_ngram = CountVectorizer(ngram_range=(2,2), min_df=3)
training_df_ngram = vectorizer_fulltraining_ngram.fit_transform(training_df['text'])
print(training_df_ngram.toarray().shape)

features_fulltraining_ngram = list(vectorizer_fulltraining_ngram.get_feature_names_out())
print(len(features_fulltraining_ngram))
print(features_fulltraining_ngram[:15])

transformed_training_df_ngram = vectorizer_fulltraining_ngram.transform(training_df['text'])
print(transformed_training_df_ngram.shape)

pd.DataFrame(transformed_training_df_ngram.toarray()).head(10)


(7613, 2422)
2422
['aba woman', 'abandon aircraft', 'abbswinston zionist', 'abc news', 'abcnews obama', 'access secret', 'accident expert', 'accident indian', 'accident man', 'accident property', 'account hiroshima', 'accuse nema', 'act mass', 'action hostage', 'action year']
(7613, 2422)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2412,2413,2414,2415,2416,2417,2418,2419,2420,2421
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [39]:
# Bernoulli Classifier
n_ft_ngram = transformed_training_df_ngram.shape[0] # size of the dataset
d_ft_ngram = transformed_training_df_ngram.shape[1] # number of features in our dataset
print(n_ft_ngram)
print(d_ft_ngram)

K_ft_ngram = 2 # number of clases

# these are the shapes of the parameters
psis_ft_ngram = np.zeros([K_ft_ngram,d_ft_ngram])
phis_ft_ngram = np.zeros([K_ft_ngram])

# we now compute the parameters
for k in range(K_ft_ngram):
    X_k_ft_ngram = transformed_training_df_ngram[training_target == k]
    psis_ft_ngram[k] = np.mean(X_k_ft_ngram, axis=0)
    phis_ft_ngram[k] = X_k_ft_ngram.shape[0] / float(n_ft_ngram)

# print out the class proportions
print(phis_ft_ngram)

7613
2422
[0.57034021 0.42965979]


In [40]:
transformed_test_ngram = vectorizer_fulltraining_ngram.transform(test_df['text'])
print(transformed_test_ngram.shape)

(3263, 2422)


In [41]:
idx_ft_ngram, logpyx_ft_ngram = nb_predictions(transformed_test_ngram, psis_ft_ngram, phis_ft_ngram)
print(idx_ft_ngram[:10])


3263
2422
[0 0 0 0 1 0 0 0 0 0]


In [42]:
y_test_pred_df = pd.DataFrame(idx_ft_ngram, columns = ['target'])
test_ids = test_df['id']
submission_list = pd.concat([test_ids, y_test_pred_df], axis=1, join='inner')
submission_list.info()
submission_list.to_csv('/kaggle/working/predictions_for_submission', index=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3263 entries, 0 to 3262
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   id      3263 non-null   int64
 1   target  3263 non-null   int64
dtypes: int64(2)
memory usage: 51.1 KB
