In [1]:
import pandas as pd
import numpy as np
import warnings
from sklearn.model_selection import train_test_split
warnings.filterwarnings("ignore")

In [2]:
train = pd.read_csv('./train.csv')
test = pd.read_csv('./test.csv')

In [3]:
train

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,,Police investigating after an e-bike collided ...,1


## Text analysis

In [4]:
import re
import string

In [5]:
# tweets that has link
train.loc[train['text'].str.contains(r'http:\S+')]

Unnamed: 0,id,keyword,location,text,target
31,48,ablaze,Birmingham,@bbcmtd Wholesale Markets ablaze http://t.co/l...,1
32,49,ablaze,Est. September 2012 - Bristol,We always try to bring the heavy. #metal #RT h...,0
33,50,ablaze,AFRICA,#AFRICANBAZE: Breaking news:Nigeria flag set a...,1
35,53,ablaze,"London, UK",On plus side LOOK AT THE SKY LAST NIGHT IT WAS...,0
37,55,ablaze,World Wide!!,INEC Office in Abia Set Ablaze - http://t.co/3...,1
...,...,...,...,...,...
7606,10866,,,Suicide bomber kills 15 in Saudi security site...,1
7607,10867,,,#stormchase Violent Record Breaking EF-5 El Re...,1
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1


In [6]:
train['link'] = train['text'].str.contains(r'http:\S+').astype(int)

In [7]:
train

Unnamed: 0,id,keyword,location,text,target,link
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,0
1,4,,,Forest fire near La Ronge Sask. Canada,1,0
2,5,,,All residents asked to 'shelter in place' are ...,1,0
3,6,,,"13,000 people receive #wildfires evacuation or...",1,0
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,0
...,...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1,0
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1,1
7611,10872,,,Police investigating after an e-bike collided ...,1,0


In [8]:
train.corr()

Unnamed: 0,id,target,link
id,1.0,0.060781,0.039645
target,0.060781,1.0,0.269713
link,0.039645,0.269713,1.0


Low correlation between link and target

In [9]:
# remove @
train['text'] = train['text'].replace(r'@\S+', '', regex=True)

In [10]:
# remove link
train['text'] = train['text'].replace(r'http:\S+', '', regex=True)

In [11]:
# remove punctuation
train['text'] = train['text'].replace('[^\w\s]','', regex=True)

In [12]:
train

Unnamed: 0,id,keyword,location,text,target,link
0,1,,,Our Deeds are the Reason of this earthquake Ma...,1,0
1,4,,,Forest fire near La Ronge Sask Canada,1,0
2,5,,,All residents asked to shelter in place are be...,1,0
3,6,,,13000 people receive wildfires evacuation orde...,1,0
4,7,,,Just got sent this photo from Ruby Alaska as s...,1,0
...,...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1,1
7609,10870,,,The out of control wild fires in California ...,1,0
7610,10871,,,M194 0104 UTC5km S of Volcano Hawaii,1,1
7611,10872,,,Police investigating after an ebike collided w...,1,0


## Machine learning

In [13]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

In [14]:
X = train['text']
y = train['target']

In [15]:
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=42)

In [16]:
# 2-gram
vect = CountVectorizer(min_df=5, ngram_range=(1,2)).fit(X_train)
X_train_vectorized = vect.transform(X_train)

In [17]:
vect

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=5,
        ngram_range=(1, 2), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [18]:
len(vect.get_feature_names())

3713

In [19]:
# parameters for logistic regression
lr_param_grid = {'C':[0.01,0.1,1,10,20,100],
                'penalty':['l1','l2'],
                'fit_intercept':[True,False]}
lr_grid = GridSearchCV(LogisticRegression(),
                      lr_param_grid,
                      cv=5,
                      scoring='roc_auc',
                      return_train_score=True)
lr_grid.fit(X_train_vectorized, y_train)
lr_grid.best_params_

{'C': 0.1, 'fit_intercept': True, 'penalty': 'l2'}

In [20]:
model = lr_grid.best_estimator_
model.fit(X_train_vectorized, y_train)

LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [21]:
feature_names = np.array(vect.get_feature_names())

sorted_coef_index = model.coef_[0].argsort()

print('Smallest Coefs:' )
print(feature_names[sorted_coef_index[:10]])
      
print('\n Largest Coefs:')      
print(feature_names[sorted_coef_index[:-11:-1]])

Smallest Coefs:
['my' 'new' 'you' 'full' 'im' 'body' 'your' 'love' 'screaming' 'or']

 Largest Coefs:
['hiroshima' 'fires' 'suicide' 'wildfire' 'california' 'storm'
 'earthquake' 'train' 'disaster' 'bombing']


In [22]:
# evaluation
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report

predictions = model.predict(vect.transform(X_val))
y_proba = model.predict_proba(vect.transform(X_val))
                              
print('AUC: ', roc_auc_score(y_val, y_proba[:,1])) 

AUC:  0.8543444462858927


## Model predict with test dataset

In [23]:
test['text'] = test['text'].replace(r'@\S+', '', regex=True)
test['text'] = test['text'].replace(r'http:\S+', '', regex=True)
test['text'] = test['text'].replace('[^\w\s]','', regex=True)

In [30]:
test

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,Heard about earthquake is different cities sta...
2,3,,,there is a forest fire at spot pond geese are ...
3,9,,,Apocalypse lighting Spokane wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan
...,...,...,...,...
3258,10861,,,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTEN...
3259,10865,,,Storm in RI worse than last hurricane My citya...
3260,10868,,,Green Line derailment in Chicago
3261,10874,,,MEG issues Hazardous Weather Outlook HWO


In [24]:
X_test = test['text']

In [25]:
len(X_test)

3263

In [26]:
y_pred = model.predict(vect.transform(X_test))
y_proba = model.predict_proba(vect.transform(X_test))

In [27]:
len(y_pred)

3263

In [28]:
pred = pd.DataFrame(y_pred)

In [29]:
df = pd.read_csv('./sample_submission.csv')
output = pd.concat([df['id'],pred], axis=1)
output.columns = ['id','target']
output.to_csv('./submission.csv', index=False)
output

Unnamed: 0,id,target
0,0,1
1,2,0
2,3,1
3,9,0
4,11,1
...,...,...
3258,10861,1
3259,10865,0
3260,10868,1
3261,10874,1
