<a href="https://colab.research.google.com/github/YangJiao85/disaster-tweets-kaggle/blob/master/disaster_tweets_MLmodel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Disaster Tweets 

This notebook will build a machine learning model that predicts which Tweets are about real disasters and which one's aren't. 

## Set up

Set up Kaggle environment and essential modules.

In [0]:
# Load essential modules
import numpy as np 
import pandas as pd
import os

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set(style="whitegrid")

import sklearn.model_selection as ms
import sklearn.metrics as skm

In [0]:
# Google colab Kaggle setting

os.environ['KAGGLE_USERNAME'] =  <kaggle_username>
os.environ['KAGGLE_KEY'] = <kaggle_key>

!kaggle competitions download -c 'nlp-getting-started'


Downloading sample_submission.csv to /content
  0% 0.00/22.2k [00:00<?, ?B/s]
100% 22.2k/22.2k [00:00<00:00, 37.1MB/s]
Downloading train.csv to /content
  0% 0.00/965k [00:00<?, ?B/s]
100% 965k/965k [00:00<00:00, 63.7MB/s]
Downloading test.csv to /content
  0% 0.00/411k [00:00<?, ?B/s]
100% 411k/411k [00:00<00:00, 56.9MB/s]


In [0]:
# load data
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')
df_sub = pd.read_csv('sample_submission.csv')


## Extract feature vectors


- id
- keyword
- location
- text
- target

In [24]:
df_train.duplicated().sum()

0

In [0]:
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.decomposition import TruncatedSVD

# reshape 2D to 1D
class trans_ravel(TransformerMixin, BaseEstimator):
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        return X.ravel()

# keyword
kw_vec = Pipeline([
    ('kw_imp', SimpleImputer(strategy='constant')),
    ('kw_ravel', trans_ravel()),
    ('kw_vect', CountVectorizer())                   
    ], verbose = True)

# location
loc_vec = Pipeline([
    ('loc_imp', SimpleImputer(strategy = 'constant')),
    ('loc_ohe', OneHotEncoder(handle_unknown='ignore'))                    
    ], verbose = True)

# text
text_vec = Pipeline([
    ('text_tfidf', TfidfVectorizer())                     
    ], verbose = True)

# 
transformer = ColumnTransformer([
    ('kw_vec', kw_vec, ['keyword']),
    ('loc_vec', loc_vec, ['location']),
    ('text_vec', text_vec, 'text')                             
    ], remainder = 'drop')


## Model

### Support Vector Classifier (SVC)

In [0]:
from sklearn.svm import LinearSVC
pip_clf = Pipeline([
    ('trans', transformer),
    ('clf', LinearSVC())                    
    ], verbose = True)



In [0]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    df_train[['keyword', 'location', 'text']], df_train['target'],
    test_size = 0.3, random_state = 1234
)

pip_fit = pip_clf.fit(X_train, y_train)


[Pipeline] ............ (step 1 of 3) Processing kw_imp, total=   0.0s
[Pipeline] .......... (step 2 of 3) Processing kw_ravel, total=   0.0s
[Pipeline] ........... (step 3 of 3) Processing kw_vect, total=   0.0s
[Pipeline] ........... (step 1 of 2) Processing loc_imp, total=   0.0s
[Pipeline] ........... (step 2 of 2) Processing loc_ohe, total=   0.0s
[Pipeline] ........ (step 1 of 1) Processing text_tfidf, total=   0.1s
[Pipeline] ............. (step 1 of 2) Processing trans, total=   0.1s
[Pipeline] ............... (step 2 of 2) Processing clf, total=   0.1s


In [0]:
from sklearn.metrics import classification_report

y_pred = pip_clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.80      0.84      0.82      1288
           1       0.78      0.73      0.76       996

    accuracy                           0.79      2284
   macro avg       0.79      0.79      0.79      2284
weighted avg       0.79      0.79      0.79      2284



### Gradient Boosting Decision Trees (GBDT)

In [0]:
from sklearn.ensemble import GradientBoostingClassifier

clf_gbc = GradientBoostingClassifier(
    n_estimators=100, 
    learning_rate=.1, 
    max_leaf_nodes=10,
    subsample=0.5, 
    random_state=0
    )

pip_gbc = Pipeline([
    ('trans', transformer),
    ('clf_gbc', clf_gbc)                    
    ], verbose = True)

In [0]:
X_train, X_test, y_train, y_test = ms.train_test_split(
    df_train[['keyword', 'location', 'text']], df_train['target'],
    test_size = 0.3, random_state = 1234
    )

pip_gbc_fit = pip_gbc.fit(X_train, y_train)

[Pipeline] ............ (step 1 of 3) Processing kw_imp, total=   0.0s
[Pipeline] .......... (step 2 of 3) Processing kw_ravel, total=   0.0s
[Pipeline] ........... (step 3 of 3) Processing kw_vect, total=   0.0s
[Pipeline] ........... (step 1 of 2) Processing loc_imp, total=   0.0s
[Pipeline] ........... (step 2 of 2) Processing loc_ohe, total=   0.0s
[Pipeline] ........ (step 1 of 1) Processing text_tfidf, total=   0.1s
[Pipeline] ............. (step 1 of 2) Processing trans, total=   0.1s
[Pipeline] ........... (step 2 of 2) Processing clf_gbc, total=   2.7s


In [0]:
from sklearn.metrics import classification_report

y_pred = pip_gbc.predict(X_test)
print(classification_report(y_test, y_pred))
print(pip_gbc.score(X_test, y_test))

              precision    recall  f1-score   support

           0       0.75      0.85      0.80      1288
           1       0.76      0.63      0.69       996

    accuracy                           0.75      2284
   macro avg       0.76      0.74      0.74      2284
weighted avg       0.76      0.75      0.75      2284

0.7543782837127846


#### Grid search with cross validation

In [22]:
from pprint import pprint
from time import time

tuned_parameters = {
    'clf_gbc__n_estimators': (10, 100, 200),
    'clf_gbc__learning_rate': (1., 0.5, 0.1),
    'clf_gbc__max_leaf_nodes': (2, 5, 10, 20),
}

if __name__ == "__main__":
    # multiprocessing requires the fork to happen in a __main__ protected block

    # find the best parameters
    grid_search = ms.GridSearchCV(pip_gbc, tuned_parameters, n_jobs=-1, 
                                  verbose=1)

    print("Performing grid search...")
    print("pipeline:", [name for name, _ in pipeline.steps])
    print("parameters to be tuned:")
    pprint(tuned_paramerters)
    t0 = time()
    grid_search.fit()

Performing grid search...
pipeline: ['trans', 'clf_gbc']
parameters to be tuned:
{'clf_gbc__learning_rate': (1.0, 0.5, 0.1),
 'clf_gbc__max_leaf_nodes': (1, 5, 10, 20),
 'clf_gbc__n_estimators': (10, 100, 200)}
Fitting 5 folds for each of 36 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:   55.0s
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:  4.2min finished


[Pipeline] ............ (step 1 of 3) Processing kw_imp, total=   0.0s
[Pipeline] .......... (step 2 of 3) Processing kw_ravel, total=   0.0s
[Pipeline] ........... (step 3 of 3) Processing kw_vect, total=   0.0s
[Pipeline] ........... (step 1 of 2) Processing loc_imp, total=   0.0s
[Pipeline] ........... (step 2 of 2) Processing loc_ohe, total=   0.0s
[Pipeline] ........ (step 1 of 1) Processing text_tfidf, total=   0.1s
[Pipeline] ............. (step 1 of 2) Processing trans, total=   0.2s
[Pipeline] ........... (step 2 of 2) Processing clf_gbc, total=   5.1s
done in 258.752s

Best score: 0.765
Best parameter set:
	clf_gbc__learning_rate: 0.1
	clf_gbc__max_leaf_nodes: 5
	clf_gbc__n_estimators: 200
nan (+/-nan for {'clf_gbc__learning_rate': 1.0, 'clf_gbc__max_leaf_nodes': 1, 'clf_gbc__n_estimators': 10}
nan (+/-nan for {'clf_gbc__learning_rate': 1.0, 'clf_gbc__max_leaf_nodes': 1, 'clf_gbc__n_estimators': 100}
nan (+/-nan for {'clf_gbc__learning_rate': 1.0, 'clf_gbc__max_leaf_nodes': 1