In [38]:
import pandas as pd
import numpy as np 

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier


### Import training data

In [2]:
# Load the labelled training tweets into a DataFrame.
# NOTE(review): absolute, machine-specific path — the notebook cannot run on
# another machine without editing this line; consider a configurable data dir.
train = pd.read_csv('E:/Arvind/Kaggle/Real or Not NLP with Disaster Tweets/Input/train.csv')

In [3]:
train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


### Column wise EDA

1) keyword

In [6]:
# Relative frequency of every keyword, listed alphabetically.
# `normalize` is a boolean flag: the string 'True' only worked because any
# non-empty string is truthy (the string 'False' would have normalized too).
train['keyword'].value_counts(normalize=True).sort_index()

ablaze                 0.004767
accident               0.004635
aftershock             0.004502
airplane%20accident    0.004635
ambulance              0.005032
                         ...   
wounded                0.004899
wounds                 0.004370
wreck                  0.004899
wreckage               0.005164
wrecked                0.005164
Name: keyword, Length: 221, dtype: float64

We will not use the `keyword` column; instead we will rely on the `text` column, since the keyword information is redundant with the text and sometimes misleading.

2) location  

In [7]:
# Relative frequency of every location string, listed alphabetically.
# `normalize` is a boolean flag: the string 'True' only worked because any
# non-empty string is truthy (the string 'False' would have normalized too).
train['location'].value_counts(normalize=True).sort_index()

                          0.000197
  Glasgow                 0.000197
  Melbourne, Australia    0.000197
  News                    0.000197
  å_                      0.000197
                            ...   
å_: ?? ÌÑ ? : ?           0.000197
å_å_Los Mina Cityã¢      0.000197
å¡å¡Midwest Û¢Û¢        0.000197
åÊ(?Û¢`?Û¢å«)??         0.000197
åø\_(?)_/åø               0.000197
Name: location, Length: 3341, dtype: float64

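We will not use the `location` column: it is missing for about a third of the rows (5,080 of 7,613 are non-null), it has 3,341 distinct free-form values, and many of them contain mis-encoded characters.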

### Text Column Processing
1) convert to lower
2) tokenization
3) remove stop words 
4) stemming

In [8]:
train['text'] = train['text'].str.lower()

In [9]:
train['text'].head()

0    our deeds are the reason of this #earthquake m...
1               forest fire near la ronge sask. canada
2    all residents asked to 'shelter in place' are ...
3    13,000 people receive #wildfires evacuation or...
4    just got sent this photo from ruby #alaska as ...
Name: text, dtype: object

In [10]:
def tokenize(row):
    """Split a text string into word tokens, keeping alphabetic tokens only.

    The string is tokenized with ``nltk.word_tokenize``; a token is kept only
    when ``str.isalpha()`` is true for it, so punctuation marks, numbers and
    tokens mixing letters with other characters are discarded.

    Returns the kept tokens as a list, in their original order.
    """
    alphabetic_tokens = []
    for token in nltk.word_tokenize(row):
        # keep pure-letter tokens only (drops punctuation, digits, ...)
        if token.isalpha():
            alphabetic_tokens.append(token)
    return alphabetic_tokens

In [11]:
train['text_tokenized'] = train['text'].apply(tokenize)

In [12]:
train.head()

Unnamed: 0,id,keyword,location,text,target,text_tokenized
0,1,,,our deeds are the reason of this #earthquake m...,1,"[our, deeds, are, the, reason, of, this, earth..."
1,4,,,forest fire near la ronge sask. canada,1,"[forest, fire, near, la, ronge, sask, canada]"
2,5,,,all residents asked to 'shelter in place' are ...,1,"[all, residents, asked, to, in, place, are, be..."
3,6,,,"13,000 people receive #wildfires evacuation or...",1,"[people, receive, wildfires, evacuation, order..."
4,7,,,just got sent this photo from ruby #alaska as ...,1,"[just, got, sent, this, photo, from, ruby, ala..."


In [13]:

stops = set(stopwords.words("english"))   

def remove_stops(row, stop_words=None):
    """Drop stop words from a list of tokens.

    Parameters
    ----------
    row : iterable of str
        Tokens of one document, in order.
    stop_words : collection of str, optional
        Words to discard. When omitted, the module-level ``stops`` set is
        used, so existing ``Series.apply(remove_stops)`` calls behave exactly
        as before.

    Returns
    -------
    list of str
        The tokens of ``row`` that are not stop words, in original order.
    """
    if stop_words is None:
        # Fall back to the notebook-wide stop-word set defined in this cell.
        stop_words = stops
    return [word for word in row if word not in stop_words]


In [14]:
train['text_tokenized_stopremoved'] = train['text_tokenized'].apply(remove_stops)

In [15]:
train.head()

Unnamed: 0,id,keyword,location,text,target,text_tokenized,text_tokenized_stopremoved
0,1,,,our deeds are the reason of this #earthquake m...,1,"[our, deeds, are, the, reason, of, this, earth...","[deeds, reason, earthquake, may, allah, forgiv..."
1,4,,,forest fire near la ronge sask. canada,1,"[forest, fire, near, la, ronge, sask, canada]","[forest, fire, near, la, ronge, sask, canada]"
2,5,,,all residents asked to 'shelter in place' are ...,1,"[all, residents, asked, to, in, place, are, be...","[residents, asked, place, notified, officers, ..."
3,6,,,"13,000 people receive #wildfires evacuation or...",1,"[people, receive, wildfires, evacuation, order...","[people, receive, wildfires, evacuation, order..."
4,7,,,just got sent this photo from ruby #alaska as ...,1,"[just, got, sent, this, photo, from, ruby, ala...","[got, sent, photo, ruby, alaska, smoke, wildfi..."


In [16]:

stemming = PorterStemmer()

def stem(row):
    """Return the stems of the words in ``row`` as a list, in the same order.

    Each word is passed through ``stemming.stem`` (the module-level stemmer
    instance created in this cell).
    """
    return list(map(stemming.stem, row))


In [17]:
train['text_tokenized_stopremoved_stemmed'] = train['text_tokenized_stopremoved'].apply(stem)

In [18]:
train.head()

Unnamed: 0,id,keyword,location,text,target,text_tokenized,text_tokenized_stopremoved,text_tokenized_stopremoved_stemmed
0,1,,,our deeds are the reason of this #earthquake m...,1,"[our, deeds, are, the, reason, of, this, earth...","[deeds, reason, earthquake, may, allah, forgiv...","[deed, reason, earthquak, may, allah, forgiv, us]"
1,4,,,forest fire near la ronge sask. canada,1,"[forest, fire, near, la, ronge, sask, canada]","[forest, fire, near, la, ronge, sask, canada]","[forest, fire, near, la, rong, sask, canada]"
2,5,,,all residents asked to 'shelter in place' are ...,1,"[all, residents, asked, to, in, place, are, be...","[residents, asked, place, notified, officers, ...","[resid, ask, place, notifi, offic, evacu, shel..."
3,6,,,"13,000 people receive #wildfires evacuation or...",1,"[people, receive, wildfires, evacuation, order...","[people, receive, wildfires, evacuation, order...","[peopl, receiv, wildfir, evacu, order, califor..."
4,7,,,just got sent this photo from ruby #alaska as ...,1,"[just, got, sent, this, photo, from, ruby, ala...","[got, sent, photo, ruby, alaska, smoke, wildfi...","[got, sent, photo, rubi, alaska, smoke, wildfi..."


In [19]:
def rejoin_words(row):
    """Concatenate the tokens in ``row`` into one space-separated string."""
    separator = " "
    return separator.join(row)

In [20]:
train['text_processed'] = train['text_tokenized_stopremoved_stemmed'].apply(rejoin_words)

In [21]:
train.head()

Unnamed: 0,id,keyword,location,text,target,text_tokenized,text_tokenized_stopremoved,text_tokenized_stopremoved_stemmed,text_processed
0,1,,,our deeds are the reason of this #earthquake m...,1,"[our, deeds, are, the, reason, of, this, earth...","[deeds, reason, earthquake, may, allah, forgiv...","[deed, reason, earthquak, may, allah, forgiv, us]",deed reason earthquak may allah forgiv us
1,4,,,forest fire near la ronge sask. canada,1,"[forest, fire, near, la, ronge, sask, canada]","[forest, fire, near, la, ronge, sask, canada]","[forest, fire, near, la, rong, sask, canada]",forest fire near la rong sask canada
2,5,,,all residents asked to 'shelter in place' are ...,1,"[all, residents, asked, to, in, place, are, be...","[residents, asked, place, notified, officers, ...","[resid, ask, place, notifi, offic, evacu, shel...",resid ask place notifi offic evacu shelter pla...
3,6,,,"13,000 people receive #wildfires evacuation or...",1,"[people, receive, wildfires, evacuation, order...","[people, receive, wildfires, evacuation, order...","[peopl, receiv, wildfir, evacu, order, califor...",peopl receiv wildfir evacu order california
4,7,,,just got sent this photo from ruby #alaska as ...,1,"[just, got, sent, this, photo, from, ruby, ala...","[got, sent, photo, ruby, alaska, smoke, wildfi...","[got, sent, photo, rubi, alaska, smoke, wildfi...",got sent photo rubi alaska smoke wildfir pour ...


### Import Test Data

In [22]:
# Load the unlabelled test tweets into a DataFrame.
# NOTE(review): absolute, machine-specific path — the notebook cannot run on
# another machine without editing this line; consider a configurable data dir.
test = pd.read_csv('E:/Arvind/Kaggle/Real or Not NLP with Disaster Tweets/Input/test.csv')

In [23]:
test.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [24]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3263 entries, 0 to 3262
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        3263 non-null   int64 
 1   keyword   3237 non-null   object
 2   location  2158 non-null   object
 3   text      3263 non-null   object
dtypes: int64(1), object(3)
memory usage: 102.1+ KB


In [25]:
test['text'] = test['text'].str.lower()

In [26]:
test['text'].head()

0                   just happened a terrible car crash
1    heard about #earthquake is different cities, s...
2    there is a forest fire at spot pond, geese are...
3             apocalypse lighting. #spokane #wildfires
4        typhoon soudelor kills 28 in china and taiwan
Name: text, dtype: object

In [27]:
test['text_tokenized'] = test['text'].apply(tokenize)

In [28]:
test.head()

Unnamed: 0,id,keyword,location,text,text_tokenized
0,0,,,just happened a terrible car crash,"[just, happened, a, terrible, car, crash]"
1,2,,,"heard about #earthquake is different cities, s...","[heard, about, earthquake, is, different, citi..."
2,3,,,"there is a forest fire at spot pond, geese are...","[there, is, a, forest, fire, at, spot, pond, g..."
3,9,,,apocalypse lighting. #spokane #wildfires,"[apocalypse, lighting, spokane, wildfires]"
4,11,,,typhoon soudelor kills 28 in china and taiwan,"[typhoon, soudelor, kills, in, china, and, tai..."


In [29]:
test['text_tokenized_stopremoved'] = test['text_tokenized'].apply(remove_stops)

In [30]:
test.head()

Unnamed: 0,id,keyword,location,text,text_tokenized,text_tokenized_stopremoved
0,0,,,just happened a terrible car crash,"[just, happened, a, terrible, car, crash]","[happened, terrible, car, crash]"
1,2,,,"heard about #earthquake is different cities, s...","[heard, about, earthquake, is, different, citi...","[heard, earthquake, different, cities, stay, s..."
2,3,,,"there is a forest fire at spot pond, geese are...","[there, is, a, forest, fire, at, spot, pond, g...","[forest, fire, spot, pond, geese, fleeing, acr..."
3,9,,,apocalypse lighting. #spokane #wildfires,"[apocalypse, lighting, spokane, wildfires]","[apocalypse, lighting, spokane, wildfires]"
4,11,,,typhoon soudelor kills 28 in china and taiwan,"[typhoon, soudelor, kills, in, china, and, tai...","[typhoon, soudelor, kills, china, taiwan]"


In [31]:
test['text_tokenized_stopremoved_stemmed'] = test['text_tokenized_stopremoved'].apply(stem)

In [32]:
test.head()

Unnamed: 0,id,keyword,location,text,text_tokenized,text_tokenized_stopremoved,text_tokenized_stopremoved_stemmed
0,0,,,just happened a terrible car crash,"[just, happened, a, terrible, car, crash]","[happened, terrible, car, crash]","[happen, terribl, car, crash]"
1,2,,,"heard about #earthquake is different cities, s...","[heard, about, earthquake, is, different, citi...","[heard, earthquake, different, cities, stay, s...","[heard, earthquak, differ, citi, stay, safe, e..."
2,3,,,"there is a forest fire at spot pond, geese are...","[there, is, a, forest, fire, at, spot, pond, g...","[forest, fire, spot, pond, geese, fleeing, acr...","[forest, fire, spot, pond, gees, flee, across,..."
3,9,,,apocalypse lighting. #spokane #wildfires,"[apocalypse, lighting, spokane, wildfires]","[apocalypse, lighting, spokane, wildfires]","[apocalyps, light, spokan, wildfir]"
4,11,,,typhoon soudelor kills 28 in china and taiwan,"[typhoon, soudelor, kills, in, china, and, tai...","[typhoon, soudelor, kills, china, taiwan]","[typhoon, soudelor, kill, china, taiwan]"


In [33]:
test['text_processed'] = test['text_tokenized_stopremoved_stemmed'].apply(rejoin_words)

In [36]:
test.head()

Unnamed: 0,id,keyword,location,text,text_tokenized,text_tokenized_stopremoved,text_tokenized_stopremoved_stemmed,text_processed
0,0,,,just happened a terrible car crash,"[just, happened, a, terrible, car, crash]","[happened, terrible, car, crash]","[happen, terribl, car, crash]",happen terribl car crash
1,2,,,"heard about #earthquake is different cities, s...","[heard, about, earthquake, is, different, citi...","[heard, earthquake, different, cities, stay, s...","[heard, earthquak, differ, citi, stay, safe, e...",heard earthquak differ citi stay safe everyon
2,3,,,"there is a forest fire at spot pond, geese are...","[there, is, a, forest, fire, at, spot, pond, g...","[forest, fire, spot, pond, geese, fleeing, acr...","[forest, fire, spot, pond, gees, flee, across,...",forest fire spot pond gees flee across street ...
3,9,,,apocalypse lighting. #spokane #wildfires,"[apocalypse, lighting, spokane, wildfires]","[apocalypse, lighting, spokane, wildfires]","[apocalyps, light, spokan, wildfir]",apocalyps light spokan wildfir
4,11,,,typhoon soudelor kills 28 in china and taiwan,"[typhoon, soudelor, kills, in, china, and, tai...","[typhoon, soudelor, kills, china, taiwan]","[typhoon, soudelor, kill, china, taiwan]",typhoon soudelor kill china taiwan


In [39]:
# TF-IDF features over word uni-, bi- and tri-grams that occur in at least
# three documents. The on/off switches are real booleans: recent scikit-learn
# releases validate parameter types and reject the integer 1 for them.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 3),
    min_df=3,
    strip_accents='unicode',
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,
    max_features=None,
)
# NOTE(review): vocabulary and IDF weights are learned from train + test text
# together, so test-set statistics influence the training features — confirm
# this is intended rather than fitting on the training text only.
vectorizer.fit(list(train['text_processed']) + list(test['text_processed']))
print('vocab length', len(vectorizer.vocabulary_))

vocab length 10794


In [40]:
# Keep the TF-IDF matrices sparse. `.todense()` produced `numpy.matrix`
# objects (which recent scikit-learn releases refuse) and materialised every
# zero of a documents x vocabulary matrix in memory; the random forest fitted
# below accepts sparse input directly.
X_train = vectorizer.transform(train['text_processed'])
X_test = vectorizer.transform(test['text_processed'])

In [41]:
# Random forest whose split criterion is chosen by a 3-fold cross-validated
# grid search. `random_state` is fixed so that the fitted forest, the reported
# CV score and the resulting predictions are reproducible across kernel restarts.
RF = RandomForestClassifier(n_estimators=201, random_state=42)
param_grid = {"criterion": ["gini", "entropy"]}
gs = GridSearchCV(estimator=RF, param_grid=param_grid, cv=3)

In [42]:
gs.fit(X_train,train['target'])

GridSearchCV(cv=3, estimator=RandomForestClassifier(n_estimators=200),
             param_grid={'criterion': ['gini', 'entropy']})

In [224]:
print(gs.best_score_)
print(gs.best_params_)

0.6548016076022852
{'criterion': 'entropy'}


In [225]:
submission = gs.predict(X_test)

In [226]:
pd.DataFrame(submission).head()

Unnamed: 0,0
0,0
1,1
2,1
3,1
4,1


In [227]:
# Pair each test id with its predicted label, side by side (axis=1).
# Rows line up because both pieces carry the default 0..n-1 index.
# NOTE(review): this rebinds `submission` from the raw predictions to a
# DataFrame, so the name means different things before and after this cell.
submission = pd.concat((test['id'],pd.DataFrame(submission,columns = ['target'])),axis =1)

In [228]:
submission.head()

Unnamed: 0,id,target
0,0,0
1,2,1
2,3,1
3,9,1
4,11,1


In [229]:
submission.shape

(3263, 2)

In [230]:
# Write the submission file with exactly the two columns `id,target`.
# index=False keeps pandas from prepending the row index as an extra,
# unnamed first column, which is not part of the expected submission format.
submission.to_csv('E:/Arvind/Kaggle/Real or Not NLP with Disaster Tweets/Output/submission_v5.csv', index=False)