In [46]:
import numpy as np
import pandas as pd 

import preprocessing as pu

import seaborn as sns
import matplotlib.pyplot as plt
sns.set_theme()

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split 

from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.svm import SVC
import lightgbm as lgb

from sklearn.metrics import accuracy_score, confusion_matrix,classification_report,roc_auc_score,roc_curve,f1_score
from sklearn.pipeline import Pipeline

import pickle

In [47]:
df=pd.read_csv('dataset/train.csv')

In [48]:
df.head(2)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1


In [49]:
def Preprocess_text(df,col,text=True):
    df=pu.DealMissing(df,col)
    df=pu.LowerCase(df,col)
    df=pu.cont2expansion(df,col)
    df=pu.Remove_Emails(df,col)
    df=pu.Remove_urls(df,col)
    df=pu.Remove_rt(df,col)
    df=pu.Remove_SpecialChar(df,col)
    df=pu.Remove_Numeric(df,col)
    df=pu.Remove_ExtraSpaces(df,col)
    df=pu.Remove_HTMLTags(df,col)
    df=pu.Remove_AccentedChar(df,col)
    df=pu.Remove_StopWords(df,col)
    df=pu.Convert2Base(df,col)
    if text:
        df=pu.Remove_MostOccuring(df,col)   #Vary with data
        df=pu.Remove_RarelyOccuring(df,col)  #Vary with data
        df=pu.Remove_Blank(df,col)
        print('Main Text')
    return df

In [50]:
df=Preprocess_text(df,col='text')

Main Text


In [51]:
df=Preprocess_text(df,col='keyword',text=False)

In [52]:
df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,reason earthquake allah forgive,1
1,4,,,forest fire near la canada,1
2,5,,,resident ask shelter place officer evacuation ...,1
3,6,,,people receive wildfire evacuation order calif...,1
4,7,,,got send photo alaska smoke wildfire pour school,1


In [53]:
df['combined']=df['text']+' '+df['keyword']

In [54]:
df.head()

Unnamed: 0,id,keyword,location,text,target,combined
0,1,,,reason earthquake allah forgive,1,reason earthquake allah forgive null
1,4,,,forest fire near la canada,1,forest fire near la canada null
2,5,,,resident ask shelter place officer evacuation ...,1,resident ask shelter place officer evacuation ...
3,6,,,people receive wildfire evacuation order calif...,1,people receive wildfire evacuation order calif...
4,7,,,got send photo alaska smoke wildfire pour school,1,got send photo alaska smoke wildfire pour scho...


## Generating Independent and dependent feature

In [99]:
tf_idf=TfidfVectorizer()
X=tf_idf.fit_transform(df['text'])
X=X.toarray()

In [100]:
y=df['target']

In [95]:
# train_test split
X_train, X_test, y_train, y_test = train_test_split(\
X, y, test_size=0.2, random_state=42)

## Model building
### RandomForestClassifer

In [89]:
model_rf=RandomForestClassifier()
model_rf.fit(X_train,y_train)

RandomForestClassifier()

#### Making Prediction

In [90]:
pred_prob_rf=model_rf.predict_proba(X_test)[:,1]

In [91]:
fpr, tpr, thresholds = roc_curve(y_test, pred_prob_rf)

#### To find the best threshold based on roc

In [92]:
f1_score_ = []
for thres in thresholds:
    y_pred = np.where(pred_prob_rf>thres,1,0)
    f1_score_.append(f1_score(y_test, y_pred))
    
f1_score_ = pd.concat([pd.Series(thresholds), pd.Series(f1_score_)],
                        axis=1)
f1_score_.columns = ['thresholds', 'f1_score']
f1_score_.sort_values(by='f1_score', ascending=False, inplace=True)
f1_score_.head()

Unnamed: 0,thresholds,f1_score
198,0.319238,0.739785
199,0.313333,0.739099
200,0.31,0.738571
197,0.32,0.736311
196,0.321429,0.7354


In [63]:
pred_rf=np.where(pred_prob_rf>0.310667,1,0)

#### Metrics

In [64]:
print("Test roc_auc_score = {}".format(roc_auc_score(y_test,pred_rf)))
print("Test f1_score = {}".format(f1_score(y_test,pred_rf)))
print("Test Accuracy= {}".format(accuracy_score(y_test,pred_rf)))
pd.DataFrame(confusion_matrix(y_test,pred_rf),\
             columns=['Predicted Negative', 'Predicted Positive'],\
             index=['Actual Negative', 'Actual Positive'])  

Test roc_auc_score = 0.7632697533260402
Test f1_score = 0.7411598302687411
Test Accuracy= 0.7590520078999342


Unnamed: 0,Predicted Negative,Predicted Positive
Actual Negative,629,231
Actual Positive,135,524


In [27]:
model_rf.feature_importances_

array([3.12841410e-05, 2.44260340e-04, 4.81202750e-04, ...,
       2.83582806e-04, 1.41052678e-09, 1.42862945e-08])

# ---------------------------------------------------------------------------------------------------------------

### ExtratreeClassifer

In [65]:
model_et=ExtraTreesClassifier()
model_et.fit(X_train,y_train)

ExtraTreesClassifier()

#### Making Prediction

In [66]:
pred_prob_et=model_et.predict_proba(X_test)[:,1]

In [67]:
fpr, tpr, thresholds = roc_curve(y_test, pred_prob_et)

#### To find the best threshold based on roc

In [69]:
f1_score_ = []
for thres in thresholds:
    y_pred = np.where(pred_prob_et>thres,1,0)
    f1_score_.append(f1_score(y_test, y_pred))
    
f1_score_ = pd.concat([pd.Series(thresholds), pd.Series(f1_score_)],
                        axis=1)
f1_score_.columns = ['thresholds', 'f1_score']
f1_score_.sort_values(by='f1_score', ascending=False, inplace=True)
f1_score_.head()

Unnamed: 0,thresholds,f1_score
134,0.4,0.739195
133,0.405,0.738806
129,0.418571,0.737397
135,0.391667,0.737389
136,0.39,0.736842


In [70]:
pred_et=np.where(pred_prob_et>0.400000,1,0)

#### Metrics

In [71]:
print("Test roc_auc_score = {}".format(roc_auc_score(y_test,pred_et)))
print("Test f1_score = {}".format(f1_score(y_test,pred_et)))
print("Test Accuracy= {}".format(accuracy_score(y_test,pred_et)))
pd.DataFrame(confusion_matrix(y_test,pred_et),\
             columns=['Predicted Negative', 'Predicted Positive'],\
             index=['Actual Negative', 'Actual Positive'])  

Test roc_auc_score = 0.7676068391149381
Test f1_score = 0.7391952309985097
Test Accuracy= 0.7695852534562212


Unnamed: 0,Predicted Negative,Predicted Positive
Actual Negative,673,187
Actual Positive,163,496


# ---------------------------------------------------------------------------------------------------------------

### Light GBM

In [72]:
model_lgbm = lgb.LGBMClassifier()
model_lgbm.fit(X_train,y_train)

LGBMClassifier()

#### Making Prediction

In [73]:
pred_prob_lgbm=model_lgbm.predict_proba(X_test)[:,1]

In [74]:
fpr, tpr, thresholds = roc_curve(y_test, pred_prob_lgbm)

#### To find the best threshold based on roc

In [75]:
f1_score_ = []
for thres in thresholds:
    y_pred = np.where(pred_prob_lgbm>thres,1,0)
    f1_score_.append(f1_score(y_test, y_pred))
    
f1_score_ = pd.concat([pd.Series(thresholds), pd.Series(f1_score_)],
                        axis=1)
f1_score_.columns = ['thresholds', 'f1_score']
f1_score_.sort_values(by='f1_score', ascending=False, inplace=True)
f1_score_.head()

Unnamed: 0,thresholds,f1_score
305,0.357041,0.736689
304,0.357791,0.735766
303,0.361088,0.73538
302,0.362936,0.734993
301,0.364775,0.734604


In [76]:
pred_lgbm=np.where(pred_prob_lgbm>0.357041,1,0)

#### Metrics

In [77]:
print("Test roc_auc_score = {}".format(roc_auc_score(y_test,pred_lgbm)))
print("Test f1_score = {}".format(f1_score(y_test,pred_lgbm)))
print("Test Accuracy= {}".format(accuracy_score(y_test,pred_lgbm)))
pd.DataFrame(confusion_matrix(y_test,pred_lgbm),\
             columns=['Predicted Negative', 'Predicted Positive'],\
             index=['Actual Negative', 'Actual Positive'])  

Test roc_auc_score = 0.7628074602110315
Test f1_score = 0.7366885485047411
Test Accuracy= 0.7623436471362739


Unnamed: 0,Predicted Negative,Predicted Positive
Actual Negative,653,207
Actual Positive,154,505


# ---------------------------------------------------------------------------------------------------------------

### SVC

In [78]:
model_svc=SVC()
model_svc.fit(X_train,y_train)

SVC()

#### Making Prediction

In [79]:
pred_svc=model_svc.predict(X_test)

#### Metrics

In [80]:
print("Test roc_auc_score = {}".format(roc_auc_score(y_test,pred_svc)))
print("Test f1_score = {}".format(f1_score(y_test,pred_svc)))
print("Test Accuracy= {}".format(accuracy_score(y_test,pred_svc)))
print("Classification report\n{}".format(classification_report(y_test,pred_svc)))
pd.DataFrame(confusion_matrix(y_test,pred_svc),\
             columns=['Predicted Negative', 'Predicted Positive'],\
             index=['Actual Negative', 'Actual Positive'])

Test roc_auc_score = 0.7769903306630908
Test f1_score = 0.7363184079601992
Test Accuracy= 0.7906517445687953
Classification report
              precision    recall  f1-score   support

           0       0.78      0.88      0.83       860
           1       0.81      0.67      0.74       659

    accuracy                           0.79      1519
   macro avg       0.80      0.78      0.78      1519
weighted avg       0.79      0.79      0.79      1519



Unnamed: 0,Predicted Negative,Predicted Positive
Actual Negative,757,103
Actual Positive,215,444


# ---------------------------------------------------------------------------------------------------------------

## Submission

In [101]:
model_final=SVC()
model_final.fit(X,y)

SVC()

In [102]:
### Saving Model and Vectorizer
pickle.dump(model_final, open('model_SVC.pkl', 'wb'))
pickle.dump(tf_idf,open('Vectorizer_tfidf.pkl', 'wb'))