In [16]:
import numpy as np
import pandas as pd 

import seaborn as sns
import matplotlib.pyplot as plt
sns.set_theme()

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split 

from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

from sklearn.metrics import accuracy_score, confusion_matrix,classification_report,roc_auc_score,roc_curve
from sklearn.pipeline import Pipeline

import pickle

In [9]:
df=pd.read_csv('dataset/train_preprocessed_prelim.csv')

In [10]:
df.head(2)

Unnamed: 0,keyword,location,text,target,Word_Count,Character_count,StopWord_Count,Mention_count,Hashtag_count,Numerics_Count,Upper_Count,emails,email_Count,url_flag,Retweet_flag
0,Missing,Missing,deed reason earthquake allah forgive,1,13,57,6,0,1,0,1,[],0,0,0
1,Missing,Missing,forest fire near la canada,1,7,32,0,0,0,0,0,[],0,0,0


## Generating Independent and dependent feature

In [11]:
tf_idf=TfidfVectorizer()
X=tf_idf.fit_transform(df['text'])
X=X.toarray()

In [12]:
y=df['target']

In [13]:
# train_test split
X_train, X_test, y_train, y_test = train_test_split(\
X, y, test_size=0.2, random_state=42)

## Model building
### RandomForestClassifer

In [10]:
model_rf=RandomForestClassifier()
model_rf.fit(X_train,y_train)

RandomForestClassifier()

#### Making Prediction

In [11]:
pred_prob_rf=model_rf.predict_proba(X_test)[:,1]

In [12]:
fpr, tpr, thresholds = roc_curve(y_test, pred_prob_rf)

#### To find the best threshold based on roc

In [13]:
roc_auc_score_ = []
for thres in thresholds:
    y_pred = np.where(pred_prob_rf>thres,1,0)
    roc_auc_score_.append(roc_auc_score(y_test, y_pred))
    
roc_auc_score_ = pd.concat([pd.Series(thresholds), pd.Series(roc_auc_score_)],
                        axis=1)
roc_auc_score_.columns = ['thresholds', 'roc_auc_score']
roc_auc_score_.sort_values(by='roc_auc_score', ascending=False, inplace=True)
roc_auc_score_.head()

Unnamed: 0,thresholds,roc_auc_score
125,0.54,0.763202
123,0.546667,0.763004
124,0.545,0.762431
122,0.5475,0.762232
120,0.554,0.760866


In [16]:
pred_rf=np.where(pred_prob_rf>0.540000,1,0)

#### Metrics

In [17]:
print("Test roc_auc_score = {}".format(roc_auc_score(y_test,pred_rf)))
print("Test Accuracy= {}".format(accuracy_score(y_test,pred_rf)))
pd.DataFrame(confusion_matrix(y_test,pred_rf),\
             columns=['Predicted Negative', 'Predicted Positive'],\
             index=['Actual Negative', 'Actual Positive'])  

Test roc_auc_score = 0.7632022312832711
Test Accuracy= 0.7769736842105263


Unnamed: 0,Predicted Negative,Predicted Positive
Actual Negative,747,125
Actual Positive,214,434


### SVC

In [13]:
model_svc=SVC()
model_svc.fit(X_train,y_train)

SVC()

#### Making Prediction

In [18]:
pred_svc=model_svc.predict(X_test)

#### Metrics

In [19]:
print("Test roc_auc_score = {}".format(roc_auc_score(y_test,pred_svc)))
print("Test Accuracy= {}".format(accuracy_score(y_test,pred_svc)))
print("Classification report\n{}".format(classification_report(y_test,pred_svc)))
pd.DataFrame(confusion_matrix(y_test,pred_svc),\
             columns=['Predicted Negative', 'Predicted Positive'],\
             index=['Actual Negative', 'Actual Positive'])  

Test roc_auc_score = 0.7812889341941329
Test Accuracy= 0.8
Classification report
              precision    recall  f1-score   support

           0       0.78      0.91      0.84       872
           1       0.84      0.65      0.74       648

    accuracy                           0.80      1520
   macro avg       0.81      0.78      0.79      1520
weighted avg       0.81      0.80      0.80      1520



Unnamed: 0,Predicted Negative,Predicted Positive
Actual Negative,792,80
Actual Positive,224,424


## Submission

In [14]:
model_final=SVC()
model_final.fit(X,y)

SVC()

In [17]:
### Saving Model and Vectorizer
pickle.dump(model_final, open('model_SVC.pkl', 'wb'))
pickle.dump(tf_idf,open('Vectorizer_tfidf.pkl', 'wb'))