# Disaster Tweet Classification #2

This notebook consists of TFIDF vectorization and modelling and tuning of the TFIDF models.

The BERT model is complex and requires its own specific preprocessing and settings. Training it took over 2 hours.

I didn't want to risk tampering with the results of that model.
Therefore, this separate notebook is maintained.

In [None]:
# Imports
import string
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
import seaborn as sns
color = sns.color_palette()

%matplotlib inline

from plotly import tools
import plotly.offline as py
import plotly.figure_factory as ff
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go

from sklearn import model_selection, preprocessing, metrics, ensemble, naive_bayes, linear_model
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD
from statistics import *
import concurrent.futures
import time
import pyLDAvis.sklearn
from pylab import bone, pcolor, colorbar, plot, show, rcParams, savefig
import textstat

import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English

import pandas as pd
pd.options.mode.chained_assignment = None
pd.options.display.max_columns = 999

import numpy as np

from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
import folium 
from folium import plugins 

import re


#### 1. Utility functions

In [None]:

#utility functions:
def plot_readability(a,b,title,bins=0.1,colors=['#3A4750', '#F64E8B']):
  """Show a distribution plot and a summary-statistics table for two samples.

  Parameters
  ----------
  a, b : sequences of numbers
      The two samples to compare.
  title : str
      Title applied to the distribution plot.
  bins : number
      Forwarded to plotly's ``create_distplot`` as ``bin_size``.
  colors : list of str
      Forwarded to ``create_distplot`` as the two trace colours.

  NOTE(review): the distribution plot labels ``a`` as "Real" and ``b`` as
  "Not real", while the table header labels ``a`` as "Not real" and ``b`` as
  "real". One of the two is mislabeled; confirm the argument order used by
  callers and align the labels.
  """
  dist_fig = ff.create_distplot(
      [a, b],
      [" Real disaster tweets", "Not real disaster tweets"],
      bin_size=bins,
      colors=colors,
      show_rug=False,
  )
  dist_fig['layout'].update(title=title)
  py.iplot(dist_fig, filename='Distplot')

  # (row label, statistic) pairs, applied to `a` then `b` for every row.
  summary_stats = (
      ("Mean", mean),
      ("Standard Deviation", pstdev),
      ("Variance", pvariance),
      ("Median", median),
      ("Maximum value", max),
      ("Minimum value", min),
  )
  rows = [["Statistical Measures", " Not real disaster tweets", "real disaster tweets"]]
  for row_label, stat in summary_stats:
    rows.append([row_label, stat(a), stat(b)])

  table_fig = ff.create_table(rows)
  py.iplot(table_fig, filename='Table')

# Characters treated as punctuation (a single str) and spaCy's English stop words.
punctuations = string.punctuation
stopwords = list(STOP_WORDS)

# Language object used by spacy_tokenizer to tokenize text.
parser = English()
def spacy_tokenizer(sentence):
    """Lemmatize, lowercase and filter a sentence; return it as one string.

    Each token is replaced by its lowercased, stripped lemma, except tokens
    whose lemma is "-PRON-" (presumably spaCy's pronoun placeholder), which
    keep their lowercased surface form. Stop words and punctuation are then
    dropped and the remaining pieces are joined with single spaces.
    """
    normalized = []
    for token in parser(sentence):
        if token.lemma_ == "-PRON-":
            normalized.append(token.lower_)
        else:
            normalized.append(token.lemma_.lower().strip())

    # `punctuations` is a str, so this is a substring test: single punctuation
    # characters are removed, and so is the empty string.
    kept = [piece for piece in normalized
            if not (piece in stopwords or piece in punctuations)]
    return " ".join(kept)


def cleanhtml(raw_html):
  """Return `raw_html` with every `<...>` tag removed (non-greedy match)."""
  tag_pattern = re.compile('<.*?>')
  return tag_pattern.sub('', raw_html)

def removeurl(raw_text):
  """Return `raw_text` with every http:// or https:// URL removed.

  The previous pattern was anchored with `^`, so it only matched URLs at the
  very start of a line, and its trailing `.*` then deleted the rest of that
  line as well. URLs in tweets usually sit in the middle or at the end of the
  text, so the pattern now matches a URL anywhere and removes only the URL
  itself (up to the next whitespace character).
  """
  return re.sub(r'https?://\S+', '', raw_text)

### 2. Import Data

We work on the preprocessed data that we had created previously while working on the BERT model.

In [None]:
# Load the cleaned train/test sets created earlier for the BERT model, plus the
# sample submission file that is later filled with predictions.
# NOTE(review): the saved head() output shows 'Unnamed: 0' and 'Unnamed: 0.1'
# columns, i.e. previously written index columns are being read back as data.
# The cells visible in this notebook only use 'text_cleaned' and 'target_relabeled'.
train_df = pd.read_csv('data/preprocessed_train.csv')
test_df = pd.read_csv('data/preprocessed_test.csv')
sub_df = pd.read_csv('data/sample_submission.csv')


In [None]:
train_df.head()

Unnamed: 0.1,Unnamed: 0,id,keyword,location,text,target,word_count,unique_word_count,stopword_count,url_count,mean_word_length,char_count,punctuation_count,hashtag_count,mention_count,text_cleaned,target_relabeled
0,0,1,no_keyword,no_location,Our Deeds are the Reason of this #earthquake M...,1,13,13,5,0,4.384615,69,1,1,0,Our Deeds are the Reason of this # earthquake...,1
1,1,4,no_keyword,no_location,Forest fire near La Ronge Sask. Canada,1,7,7,0,0,4.571429,38,1,0,0,Forest fire near La Ronge Sask . Canada,1
2,2,5,no_keyword,no_location,All residents asked to 'shelter in place' are ...,1,22,20,9,0,5.090909,133,3,0,0,All residents asked to ' shelter in place ' ...,1
3,3,6,no_keyword,no_location,"13,000 people receive #wildfires evacuation or...",1,8,8,1,0,7.125,65,2,1,0,"13,000 people receive # wildfires evacuation ...",1
4,4,7,no_keyword,no_location,Just got sent this photo from Ruby #Alaska as ...,1,16,15,6,0,4.5,88,2,2,0,Just got sent this photo from Ruby # Alaska a...,1


#### 3. TFIDF Vectorization

In [None]:
# Build the text lists once; they are needed for both fitting and transforming.
train_texts = train_df['text_cleaned'].values.tolist()
test_texts = test_df['text_cleaned'].values.tolist()

# Word 1- to 3-grams with English stop words removed.
tfidf_vec = TfidfVectorizer(stop_words='english', ngram_range=(1,3))

# Learn vocabulary and IDF weights on train + test text. Only `fit` is needed
# here: the previous `fit_transform` built a full matrix that was thrown away.
# NOTE(review): the vectorizer is fitted before the train/validation split made
# later in the notebook (and on the unlabeled test text), so validation scores
# may be slightly optimistic.
tfidf_vec.fit(train_texts + test_texts)

train_tfidf = tfidf_vec.transform(train_texts)
test_tfidf = tfidf_vec.transform(test_texts)

In [None]:
train_tfidf

<7613x139122 sparse matrix of type '<class 'numpy.float64'>'
	with 174644 stored elements in Compressed Sparse Row format>

#### 4. Building Classification models

##### 4.1 Generic Classification with default parameters

##### 4.1.1 Define function

In [None]:
import pandas as pd
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split

def getClassifierObj(classifier, rs):
    '''Build an unfitted classifier for the given name.

    Parameters
    ----------
    classifier : str
        One of 'XGBoost', 'LogisticRegression', 'KNN', 'SVM', 'Kernel SVM',
        'NB', 'DecisionTree' or 'RandomForest'.
    rs : int
        Seed passed as ``random_state`` to every estimator that accepts one.

    Returns
    -------
    A new, unfitted estimator object.

    Raises
    ------
    ValueError
        If ``classifier`` is not one of the supported names. Previously the
        name string itself was returned and the failure only surfaced later
        as an AttributeError on ``.fit``.
    '''
    # Imports stay inside the branches so that an optional dependency such as
    # xgboost is only required when that model is actually requested.
    if classifier == 'XGBoost':
        from xgboost import XGBClassifier
        # Seeded like the other stochastic models (rs used to be ignored here).
        return XGBClassifier(random_state = rs)

    if classifier == 'LogisticRegression':
        from sklearn.linear_model import LogisticRegression
        return LogisticRegression(random_state = rs)

    if classifier == 'KNN':
        from sklearn.neighbors import KNeighborsClassifier
        return KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)

    if classifier == 'SVM':
        from sklearn.svm import SVC
        return SVC(kernel = 'linear', random_state = rs)

    if classifier == 'Kernel SVM':
        from sklearn.svm import SVC
        return SVC(kernel = 'rbf', random_state = rs)

    if classifier == 'NB':
        # NOTE(review): GaussianNB expects dense input; the sparse TF-IDF
        # matrix would need converting before this option can be used.
        from sklearn.naive_bayes import GaussianNB
        return GaussianNB()

    if classifier == 'DecisionTree':
        from sklearn.tree import DecisionTreeClassifier
        return DecisionTreeClassifier(criterion = 'entropy', random_state = rs)

    if classifier == 'RandomForest':
        from sklearn.ensemble import RandomForestClassifier
        return RandomForestClassifier(n_estimators = 100, criterion = 'entropy', random_state = rs)

    raise ValueError('Unknown classifier name: {!r}'.format(classifier))
            
def fitAndPredict(estimator,X_train,y_train,X_test):
    '''Train `estimator` on (X_train, y_train) and return its predictions for X_test.

    The estimator is fitted in place, so the caller's object is trained afterwards.
    '''
    estimator.fit(X_train,y_train)
    return estimator.predict(X_test)

def getModelAccuracy(y_test, y_pred):
    '''Return the accuracy of `y_pred` against `y_test` as a percentage (0-100).'''
    from sklearn.metrics import accuracy_score
    fraction_correct = accuracy_score(y_test, y_pred)
    return fraction_correct*100

def getFinalPredictions(classifiers, X_train,X_test, y_train, y_test):
    '''Fit every named classifier and keep the one with the highest accuracy.

    Parameters
    ----------
    classifiers : iterable of str
        Names understood by getClassifierObj.
    X_train, y_train
        Data each classifier is fitted on.
    X_test, y_test
        Held-out data used to score each classifier.

    Returns
    -------
    (best_predictions, accuracy, bestClassifier)
        Predictions on X_test, accuracy in percent, and the fitted estimator
        of the best-scoring classifier. For an empty `classifiers` the result
        is (None, 0, None).
    '''
    accuracy = 0
    best_predictions = None
    bestClassifierName = ''
    bestClassifier = None

    for classifierName in classifiers:
        print('Evaluation started for ', classifierName)
        classifier = getClassifierObj(classifierName,8)
        y_pred = fitAndPredict(classifier,X_train,y_train,X_test)

        classifierAccuracy = getModelAccuracy(y_test,y_pred)
        print('Average accuracy of {} is {:.2f}%'.format(classifierName,classifierAccuracy))
        # The `is None` check guarantees a winner is recorded even when the
        # first model scores exactly 0.
        if bestClassifier is None or classifierAccuracy > accuracy :
            accuracy = classifierAccuracy
            bestClassifierName = classifierName
            best_predictions = y_pred
            bestClassifier = classifier

    print('Classifier with highest accuracy is {}'.format(bestClassifierName))
    # Return the best estimator; the loop variable `classifier` was returned
    # before, which is simply the last model evaluated.
    return best_predictions, accuracy, bestClassifier

def classify(classifiers, X_train,X_test, y_train, y_test):
    '''Run getFinalPredictions, print the confusion matrix of the returned
    predictions against y_test, and pass its three results through unchanged.
    '''
    predictions, score, estimator = getFinalPredictions(classifiers, X_train,X_test, y_train, y_test)
    print(confusion_matrix(y_test, predictions))
    return predictions, score, estimator

##### 4.1.2 Classification using default parameters for the following classifiers:-

1.   LogisticRegression
2.   KNN
3.   Kernel SVM
4.   Random Forest
5.   XGBoost


In [None]:

import pandas as pd
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split

# Models to compare using the settings hard-coded in getClassifierObj.
# NOTE(review): the saved output of this cell also lists DecisionTree, which is
# not in this list, so that output appears to come from an earlier run.
classifiers = ['LogisticRegression','KNN', 'Kernel SVM','RandomForest','XGBoost']
# classifiers = ['RandomForest']


# Hold out 20% of the labelled tweets for validation; the fixed seed keeps the split repeatable.
train_y = train_df.target_relabeled.values
X_train, X_test, y_train, y_test = train_test_split(train_tfidf, train_y, random_state=8, test_size = 0.2)
# classify() prints per-model accuracy (via getFinalPredictions) and a confusion matrix.
# NOTE(review): `bestClassifier` is whatever getFinalPredictions returns as its
# third value; confirm it is the best-scoring model before reusing it.
final_pred, final_accuracy, bestClassifier = classify(classifiers,X_train, X_test, y_train, y_test)

Evaluation started for  LogisticRegression
Average accuracy of LogisticRegression is 74.85%
Evaluation started for  KNN
Average accuracy of KNN is 75.71%
Evaluation started for  Kernel SVM
Average accuracy of Kernel SVM is 73.08%
Evaluation started for  DecisionTree
Average accuracy of DecisionTree is 73.01%
Evaluation started for  RandomForest
Average accuracy of RandomForest is 75.77%
Evaluation started for  XGBoost
Average accuracy of XGBoost is 70.78%
Classifier with highest accuracy is RandomForest
[[812  49]
 [320 342]]


In [None]:
import pandas as pd
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split

# Re-run the comparison with only the Random Forest model.
# classifiers = ['LogisticRegression','KNN', 'Kernel SVM','DecisionTree','RandomForest','XGBoost']
classifiers = ['RandomForest']

# Same seed and test_size as the previous cell, so this re-creates the same split.
train_y = train_df.target_relabeled.values
X_train, X_test, y_train, y_test = train_test_split(train_tfidf, train_y, random_state=8, test_size = 0.2)
# Overwrites final_pred / final_accuracy / bestClassifier from the previous cell.
final_pred, final_accuracy, bestClassifier = classify(classifiers,X_train, X_test, y_train, y_test)

Evaluation started for  RandomForest
Average accuracy of RandomForest is 76.43%
Classifier with highest accuracy is RandomForest
[[818  43]
 [316 346]]


#### 4.2 Tuning of hyperparameters

##### 4.2.1 Random Forest classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import validation_curve
from sklearn.model_selection import GridSearchCV
# Search over the number of trees: 100, 150, 200, 250.
parameters = [{'n_estimators' : list(range(100,300,50))}]

# Tune the same kind of forest that is used elsewhere in this notebook
# (criterion='entropy', seed 8). Without a seed the reported best score and
# best n_estimators change from run to run, and the default criterion (gini)
# is not the one the baseline and final Random Forest models use.
grid_search = GridSearchCV(estimator = RandomForestClassifier(criterion = 'entropy', random_state = 8),
                           param_grid = parameters,
                           scoring = 'accuracy',
                           cv = 3,
                           n_jobs = -1)

grid_search.fit(X_train, y_train)
best_accuracy = grid_search.best_score_
best_parameters = grid_search.best_params_

print('Best Accuracy: {:.2f}%'.format(best_accuracy*100))
print('Best Parameters:', best_parameters)

Best Accuracy: 76.58%
Best Parameters: {'n_estimators': 150}


##### 4.2.2 Kernel SVM classifier

In [None]:
from sklearn.model_selection import validation_curve
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
# Base estimator; the kernel, C and gamma values come from the grid below.
classifier = SVC()

# Two sub-grids: a linear kernel searched over C only, and an RBF kernel
# searched over every C x gamma combination.
parameters = [{'C': [1, 10, 100], 'kernel': ['linear']},
              {'C': [1, 10, 100], 'kernel': ['rbf'], 'gamma': [0.1, 0.5, 0.9]}]
# 3-fold cross-validated accuracy on the training split, run on all cores.
grid_search = GridSearchCV(estimator = classifier,
                           param_grid = parameters,
                           scoring = 'accuracy',
                           cv = 3,
                           n_jobs = -1)
grid_search = grid_search.fit(X_train, y_train)
best_accuracy = grid_search.best_score_
best_parameters = grid_search.best_params_

print('Best Accuracy: {:.2f}%'.format(best_accuracy*100))
print('Best Parameters:', best_parameters)


Best Accuracy: 79.59%
Best Parameters: {'C': 10, 'gamma': 0.1, 'kernel': 'rbf'}


##### 4.2.3 K nearest neighbors classifier

In [None]:
from sklearn.model_selection import validation_curve
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier()

# Only n_neighbors is actually searched.
# - leaf_size is only used by the BallTree/KDTree algorithms; X_train is a
#   sparse TF-IDF matrix, for which scikit-learn falls back to brute-force
#   search, so leaf_size cannot change the result.
# - 'minkowski' with the default p=2 is the Euclidean distance, so listing
#   both metrics evaluated every model twice.
# The old grid therefore ran 230 candidates for 23 distinct models. The two
# fixed entries are kept as single values so best_params_ keeps the same keys.
parameters = [{'n_neighbors' : list(range(5,50,2)),
               'leaf_size': [30],
               'metric': ['minkowski']
               }]
grid_search = GridSearchCV(estimator = classifier,
                           param_grid = parameters,
                           scoring = 'accuracy',
                           cv = 3,
                           n_jobs = -1)
grid_search = grid_search.fit(X_train, y_train)
best_accuracy = grid_search.best_score_
best_parameters = grid_search.best_params_

print('Best Accuracy: {:.2f}%'.format(best_accuracy*100))
print('Best Parameters:', best_parameters)

Best Accuracy: 77.54%
Best Parameters: {'leaf_size': 30, 'metric': 'minkowski', 'n_neighbors': 43}


##### 4.2.4 Logistic regression classifier

In [None]:
from sklearn.model_selection import validation_curve
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
# Seeded because the 'sag' and 'saga' solvers are stochastic.
classifier = LogisticRegression(random_state = 8)

C_values = [1, 10, 100, 1000]

# Only solver/penalty pairs that LogisticRegression can actually fit are
# listed. The previous full cross-product also contained combinations that
# always fail (e.g. 'l1' with newton-cg/lbfgs/sag, 'none' with liblinear, and
# 'elasticnet', which needs an l1_ratio that was never supplied). Those fits
# only produced errors that the global warnings filter hides.
parameters = [
    # L2 is supported by every solver.
    {'C': C_values,
     'penalty': ['l2'],
     'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']},
    # L1 is only supported by liblinear and saga.
    {'C': C_values,
     'penalty': ['l1'],
     'solver': ['liblinear', 'saga']},
    # No regularisation: C is ignored, so it is not searched.
    # NOTE(review): newer scikit-learn releases spell this penalty None
    # instead of the string 'none'; adjust when upgrading.
    {'penalty': ['none'],
     'solver': ['newton-cg', 'lbfgs', 'sag', 'saga']},
]
grid_search = GridSearchCV(estimator = classifier,
                           param_grid = parameters,
                           scoring = 'accuracy',
                           cv = 3,
                           n_jobs = -1)
grid_search = grid_search.fit(X_train, y_train)
best_accuracy = grid_search.best_score_
best_parameters = grid_search.best_params_

print('Best Accuracy: {:.2f}%'.format(best_accuracy*100))
print('Best Parameters:', best_parameters)

Best Accuracy: 79.67%
Best Parameters: {'C': 100, 'penalty': 'l1', 'solver': 'saga'}


##### 4.2.5 XGBoost classifier

In [None]:
from sklearn.model_selection import validation_curve
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, StratifiedKFold
from datetime import datetime
from xgboost import XGBClassifier

def timer(start_time=None):
  """Start or stop a simple wall-clock timer.

  Called without a start time, returns the current ``datetime`` to be used as
  the starting point. Called with that value, prints the time elapsed since
  then as hours / minutes / seconds and returns None.
  """
  if not start_time:
    return datetime.now()

  elapsed_seconds = (datetime.now() - start_time).total_seconds()
  thour, remainder = divmod(elapsed_seconds, 3600)
  tmin, tsec = divmod(remainder, 60)
  print('\n Time taken: %i hours %i minutes and %s seconds.' % (thour, tmin, round(tsec, 2)))


# Hyperparameter values sampled by the randomized search.
params = {'min_child_weight': [1, 5, 10],
        'gamma': [0.5, 1, 1.5, 2, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [3, 4, 5]
        }

# `silent` and `nthread` are deprecated (see the warnings in the saved output
# of this cell); `verbosity` and `n_jobs` are their replacements. Each model
# uses one thread because the search itself runs its fits in parallel.
xgb = XGBClassifier(learning_rate=0.02, n_estimators=600, objective='binary:logistic',
                    verbosity=0, n_jobs=1)

folds = 3
param_comb = 5

skf = StratifiedKFold(n_splits=folds, shuffle = True, random_state = 1001)

# Pass the splitter itself rather than skf.split(...): the generator form is
# exhausted after one use, so the search object could not be fitted again.
# The folds are the same because the splitter is seeded.
random_search = RandomizedSearchCV(xgb, param_distributions=params, n_iter=param_comb, scoring='roc_auc', n_jobs=-1, cv=skf, verbose=3, random_state=1001 )

# Here we go
start_time = timer(None) # timing starts from this point for "start_time" variable
random_search.fit(X_train, y_train)
timer(start_time) # timing ends here for "start_time" variable

# Full per-candidate results go to CSV; only a compact summary is printed
# instead of dumping the raw cv_results_ dictionary.
results = pd.DataFrame(random_search.cv_results_)
results.to_csv('xgb-random-grid-search-results-01.csv', index=False)

print('\n All results:')
print(results[['params', 'mean_test_score', 'std_test_score', 'rank_test_score']])
print('\n Best estimator:')
print(random_search.best_estimator_)
# With scoring='roc_auc', 2 * AUC - 1 is the normalized Gini coefficient.
print('\n Best normalized gini score for %d-fold search with %d parameter combinations:' % (folds, param_comb))
print(random_search.best_score_ * 2 - 1)
print('\n Best hyperparameters:')
print(random_search.best_params_)

Fitting 3 folds for each of 5 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed: 26.0min finished

The nthread parameter is deprecated as of version .6.Please use n_jobs instead.nthread is deprecated.


The silent parameter is deprecated.Please use verbosity instead.silent is depreated




 Time taken: 0 hours 28 minutes and 0.62 seconds.

 All results:
{'mean_fit_time': array([164.6867017 , 234.52644324, 235.00639017, 177.8218534 ,
       189.56150119]), 'std_fit_time': array([ 0.61276134,  0.45626824,  1.82746491,  2.45172681, 47.9650434 ]), 'mean_score_time': array([0.14637272, 0.1670723 , 0.18501226, 0.16493742, 0.13338113]), 'std_score_time': array([0.00312982, 0.00222792, 0.00682904, 0.00268828, 0.02753328]), 'param_subsample': masked_array(data=[1.0, 0.6, 0.8, 1.0, 0.8],
             mask=[False, False, False, False, False],
       fill_value='?',
            dtype=object), 'param_min_child_weight': masked_array(data=[5, 1, 5, 5, 1],
             mask=[False, False, False, False, False],
       fill_value='?',
            dtype=object), 'param_max_depth': masked_array(data=[3, 5, 5, 5, 4],
             mask=[False, False, False, False, False],
       fill_value='?',
            dtype=object), 'param_gamma': masked_array(data=[5, 1.5, 1, 5, 1],
             mask=[

#### 4.3 Reclassify using the best parameters

In [None]:
# LogisticRegression with the best parameters reported by the grid search.
# The 'saga' solver is stochastic, so a fixed random_state is set to make the
# submission reproducible.
# NOTE(review): the model is fitted on X_train (the 80% split), not on the
# full labelled training set.
final_classifier = LogisticRegression(C =100, penalty ='l1', solver= 'saga', random_state = 8)
final_classifier.fit(X_train,y_train)
final_pred = final_classifier.predict(test_tfidf)
# Overwrites the 'target' column; only the most recently run model cell is kept in sub_df.
sub_df['target'] = final_pred


In [None]:

# KNN with the parameters reported as best by the grid search above.
# NOTE(review): fitted on X_train (the 80% split), not the full labelled set.
final_knn_classifier = KNeighborsClassifier(leaf_size= 30, metric= 'minkowski', n_neighbors= 43)
final_knn_classifier.fit(X_train,y_train)
final_knn_pred = final_knn_classifier.predict(test_tfidf)
# Overwrites the 'target' column; only the most recently run model cell is kept in sub_df.
sub_df['target'] = final_knn_pred


In [None]:
# Random Forest with n_estimators=150 as reported by the grid search above.
# NOTE(review): fitted on X_train (the 80% split), not the full labelled set.
from sklearn.ensemble import RandomForestClassifier

final_rf_classifier = RandomForestClassifier(n_estimators = 150, criterion = 'entropy', random_state = 8)
final_rf_classifier.fit(X_train,y_train)
final_rf_pred = final_rf_classifier.predict(test_tfidf)
# Overwrites the 'target' column; only the most recently run model cell is kept in sub_df.
sub_df['target'] = final_rf_pred


In [None]:
# Kernel SVM with the parameters reported as best by the grid search above.
# NOTE(review): fitted on X_train (the 80% split), not the full labelled set.
from sklearn.svm import SVC

final_svm_classifier = SVC(C = 10, gamma = 0.1, kernel = 'rbf')
final_svm_classifier.fit(X_train,y_train)
final_svm_pred = final_svm_classifier.predict(test_tfidf)
# Overwrites the 'target' column; only the most recently run model cell is kept in sub_df.
sub_df['target'] = final_svm_pred


In [None]:
# XGB with the hyperparameters of the best estimator found by the randomized search.
# The deprecated `silent` and `nthread` arguments (see the warnings in the
# saved output) are replaced by `verbosity=0` and `n_jobs=1`. `seed=None` and
# `missing=None` were leftovers from the pasted estimator repr and are dropped:
# `random_state` already sets the seed and `missing` keeps its default.
# NOTE(review): fitted on X_train (the 80% split), not the full labelled set.
final_xgb_classifier = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.8, gamma=1.5,
              learning_rate=0.02, max_delta_step=0, max_depth=5,
              min_child_weight=1, n_estimators=600, n_jobs=1,
              objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
              subsample=0.6, verbosity=0)

final_xgb_classifier.fit(X_train,y_train)
final_xgb_pred = final_xgb_classifier.predict(test_tfidf)
# Overwrites the 'target' column; only the most recently run model cell is kept in sub_df.
sub_df['target'] = final_xgb_pred


The nthread parameter is deprecated as of version .6.Please use n_jobs instead.nthread is deprecated.


The silent parameter is deprecated.Please use verbosity instead.silent is depreated



In [None]:
# Export one submission file per model.
# Every model cell above overwrites sub_df['target'], so writing sub_df as
# 'submission_rf.csv' saved whichever model happened to run last (XGBoost
# under Run All), not the Random Forest. Each file is now built from that
# model's own prediction array.
import csv

submission_predictions = {
    'submission_lr.csv': final_pred,
    'submission_knn.csv': final_knn_pred,
    'submission_rf.csv': final_rf_pred,
    'submission_svm.csv': final_svm_pred,
    'submission_xgb.csv': final_xgb_pred,
}
for submission_path, predictions in submission_predictions.items():
    # index=False keeps only the columns of the sample submission; the
    # DataFrame index is not part of the expected format.
    sub_df.assign(target=predictions).to_csv(submission_path, index=False)

## 5. Results

|Classifier           | Accuracy |
|---------------------|----------|
|Random Forest        | 77.51%   |
|KNN                  | 77.11%   |
|Logistic Regression  | 79.93%   |
|Kernel   SVM         | 79.99%   |
|XGBoost              | 73.95%   |


### Kernel SVM achieved the best score, 79.99%, when submitted to the Kaggle competition.