<a href="https://colab.research.google.com/github/YangJiao85/disaster-tweets-kaggle/blob/master/disaster_tweets_MLmodel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Disaster Tweets 

This notebook builds a machine learning model that predicts which Tweets are about real disasters and which ones aren't.

## Set up

Set up Kaggle environment and essential modules.

In [2]:
# Load essential modules
import numpy as np 
import pandas as pd
import os

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set(style="whitegrid")

import sklearn.model_selection as ms
import sklearn.metrics as skm

  import pandas.util.testing as tm


In [3]:
# Google colab Kaggle setting

os.environ['KAGGLE_USERNAME'] =  <kaggle_username>
os.environ['KAGGLE_KEY'] =  <kaggle_key>

!kaggle competitions download -c 'nlp-getting-started'


Downloading train.csv to /content
  0% 0.00/965k [00:00<?, ?B/s]
100% 965k/965k [00:00<00:00, 64.4MB/s]
Downloading sample_submission.csv to /content
  0% 0.00/22.2k [00:00<?, ?B/s]
100% 22.2k/22.2k [00:00<00:00, 19.6MB/s]
Downloading test.csv to /content
  0% 0.00/411k [00:00<?, ?B/s]
100% 411k/411k [00:00<00:00, 133MB/s]


In [0]:
# Load the three competition CSV files from the current working directory,
# in the order: training set, test set, sample submission.
df_train, df_test, df_sub = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)


## Extract feature vectors


- id
- keyword
- location
- text
- target

In [5]:
# Count training rows that exactly repeat an earlier row (all columns equal)
df_train.duplicated().sum()

0

In [0]:
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.decomposition import TruncatedSVD

# reshape 2D to 1D
class trans_ravel(TransformerMixin, BaseEstimator):
    """Stateless transformer that flattens its input to a 1-D array.

    Used in the keyword pipeline between the imputer, which produces a 2-D
    single-column array, and the count vectorizer, which iterates over a 1-D
    sequence of strings.
    """

    def fit(self, X, y=None):
        """Nothing is learned; return ``self`` so the step works in a Pipeline."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` flattened to one dimension.

        ``np.asarray`` lets DataFrames and nested lists through as well as
        ndarrays; an ndarray input is passed to ``ravel`` unchanged.
        """
        return np.asarray(X).ravel()

# keyword: fill missing values, flatten to 1-D, then bag-of-words counts
kw_vec = Pipeline(
    steps=[
        ('kw_imp', SimpleImputer(strategy='constant')),
        ('kw_ravel', trans_ravel()),
        ('kw_vect', CountVectorizer()),
    ],
    verbose=True,
)

# location: fill missing values, then one-hot encode
# (categories not seen during fit are ignored at transform time)
loc_vec = Pipeline(
    steps=[
        ('loc_imp', SimpleImputer(strategy='constant')),
        ('loc_ohe', OneHotEncoder(handle_unknown='ignore')),
    ],
    verbose=True,
)

# text: TF-IDF weights computed from the text column
text_vec = Pipeline(
    steps=[('text_tfidf', TfidfVectorizer())],
    verbose=True,
)

# Combine the three per-column pipelines; any other column is dropped.
# 'text' is given as a scalar (not a list) so the vectorizer receives a 1-D
# column, whereas the imputers need the 2-D selection produced by a list.
transformer = ColumnTransformer(
    transformers=[
        ('kw_vec', kw_vec, ['keyword']),
        ('loc_vec', loc_vec, ['location']),
        ('text_vec', text_vec, 'text'),
    ],
    remainder='drop',
)


## Model

### Some functions

In [0]:
def perform_grid_search(pip_clf, tuned_parameters,
                        X_train=None, y_train=None, X_test=None, y_test=None):
    """Grid-search ``pip_clf`` with cross-validation and print a report.

    Prints the pipeline steps, the grid, the fit time, the best parameters,
    the mean/std CV score of every candidate and a classification report on
    the test data.

    Parameters
    ----------
    pip_clf : Pipeline
        Pipeline to tune; must expose ``steps``.
    tuned_parameters : dict
        Parameter grid handed to ``GridSearchCV``.
    X_train, y_train, X_test, y_test : optional
        Data to fit on and to report on. Any argument left as ``None`` falls
        back to the notebook-level variable of the same name, so existing
        two-argument calls keep working.

    Returns
    -------
    GridSearchCV or None
        The fitted search (so ``best_params_`` / ``best_estimator_`` can be
        read programmatically), or ``None`` when not running as ``__main__``.

    Raises
    ------
    NameError
        If a data argument is omitted and no notebook-level variable of that
        name exists.
    """
    from pprint import pprint
    from time import time

    if __name__ != "__main__":
        # multiprocessing requires the fork to happen in a __main__ protected block
        return None

    # Resolve any omitted data argument from the notebook namespace.
    notebook_ns = globals()
    resolved = []
    for label, value in (('X_train', X_train), ('y_train', y_train),
                         ('X_test', X_test), ('y_test', y_test)):
        if value is None:
            if label not in notebook_ns:
                raise NameError(
                    "name '{}' is not defined; pass it explicitly".format(label))
            value = notebook_ns[label]
        resolved.append(value)
    X_train, y_train, X_test, y_test = resolved

    # find the best parameters
    grid_search = ms.GridSearchCV(pip_clf, tuned_parameters, n_jobs=-1,
                                  verbose=1)

    print("Performing grid search...")
    print("pipeline:", [name for name, _ in pip_clf.steps])
    print("parameters to be tuned:")
    pprint(tuned_parameters)
    t0 = time()
    grid_search.fit(X_train, y_train)
    # The start time was recorded but never reported in the original.
    print("done in {:.3f}s\n".format(time() - t0))

    print("Best parameters set found on train set:\n")
    print(grid_search.best_params_)
    print("Grid scores on train set:\n")
    results = grid_search.cv_results_
    for mean, std, params in zip(results['mean_test_score'],
                                 results['std_test_score'],
                                 results['params']):
        print("{:.3f} (+/-{:.3f}) for {}".format(mean, std, params))

    print("\nDetailed classification report:\n")
    y_pred = grid_search.predict(X_test)
    print(skm.classification_report(y_test, y_pred))
    return grid_search
    


In [0]:
def print_sub_csv(pip_clf, sub_file = 'submission.csv', ldl = False):
    """Predict on the competition test set and write a submission CSV.

    Parameters
    ----------
    pip_clf : fitted estimator
        Object with a ``predict`` method; it is called on the notebook-level
        ``df_test`` frame.
    sub_file : str
        Path of the CSV to write. The file has the columns ``id`` and
        ``target`` and no index column.
    ldl : bool
        When true, also hand the written file to ``google.colab.files.download``.
    """
    y_pred = pip_clf.predict(df_test)
    submission = pd.DataFrame(data={
        'id': df_test['id'],
        'target': y_pred
    })
    # A stray `df.head()` used to sit here; `df` is not defined in the notebook
    # as shown, so the call would raise NameError before the file was written.
    submission.to_csv(sub_file, index=False)

    if ldl:
        # Imported lazily: the module is only needed for the download.
        from google.colab import files
        files.download(sub_file)

    return

In [0]:
# 70/30 train / hold-out split of the labelled data; the fixed seed makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = ms.train_test_split(
    df_train[['keyword', 'location', 'text']], df_train['target'],
    test_size=0.3, random_state=1234,
)

### Support Vector Classifier (SVC)

In [0]:
from sklearn.svm import LinearSVC

# Linear SVM classifier on top of the column transformer
pip_svc = Pipeline(
    steps=[('trans', transformer), ('clf', LinearSVC())],
    verbose=True,
)



In [21]:
from sklearn.model_selection import train_test_split

# Re-create the 70/30 split (fixed seed, so it is reproducible) and fit the
# SVC pipeline on the training part.
X_train, X_test, y_train, y_test = train_test_split(
    df_train[['keyword', 'location', 'text']],
    df_train['target'],
    test_size=0.3,
    random_state=1234,
)

# Pipeline.fit returns the pipeline itself, so pip_fit is pip_svc.
pip_fit = pip_svc.fit(X_train, y_train)


[Pipeline] ............ (step 1 of 3) Processing kw_imp, total=   0.0s
[Pipeline] .......... (step 2 of 3) Processing kw_ravel, total=   0.0s
[Pipeline] ........... (step 3 of 3) Processing kw_vect, total=   0.0s
[Pipeline] ........... (step 1 of 2) Processing loc_imp, total=   0.0s
[Pipeline] ........... (step 2 of 2) Processing loc_ohe, total=   0.0s
[Pipeline] ........ (step 1 of 1) Processing text_tfidf, total=   0.1s
[Pipeline] ............. (step 1 of 2) Processing trans, total=   0.2s
[Pipeline] ............... (step 2 of 2) Processing clf, total=   0.1s


In [22]:
# Evaluate the fitted SVC pipeline on the held-out split
y_pred = pip_svc.predict(X_test)
print(skm.classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.80      0.84      0.82      1288
           1       0.78      0.73      0.76       996

    accuracy                           0.79      2284
   macro avg       0.79      0.79      0.79      2284
weighted avg       0.79      0.79      0.79      2284



#### Grid search with cross validation

In [0]:
# TODO: placeholder grid; no parameters have been chosen for the SVC pipeline yet.
# NOTE(review): '' is not a pipeline parameter name and () holds no candidate
# values, so this dict presumably cannot be passed to perform_grid_search as-is.
# Fill it in (e.g. with 'clf__...' keys) before use.
tuned_parameters = ({
    '': (),
})

### Gradient Boosting Decision Trees (GBDT)

In [0]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient-boosted trees: 100 trees of at most 10 leaves, each fitted on a
# random half of the training rows (subsample=0.5), with a fixed seed.
clf_gbc = GradientBoostingClassifier(
    n_estimators=100, learning_rate=0.1,
    max_leaf_nodes=10, subsample=0.5,
    random_state=0,
)

# NOTE: `transformer` is the same object used by pip_svc (Pipeline does not
# clone its steps), so fitting this pipeline refits the shared transformer.
pip_gbc = Pipeline(
    steps=[('trans', transformer), ('clf_gbc', clf_gbc)],
    verbose=True,
)

In [7]:
# Re-create the 70/30 split (fixed seed, so it is reproducible) and fit the
# GBDT pipeline on the training part.
X_train, X_test, y_train, y_test = ms.train_test_split(
    df_train[['keyword', 'location', 'text']],
    df_train['target'],
    test_size=0.3,
    random_state=1234,
)

# Pipeline.fit returns the pipeline itself, so pip_gbc_fit is pip_gbc.
pip_gbc_fit = pip_gbc.fit(X_train, y_train)

[Pipeline] ............ (step 1 of 3) Processing kw_imp, total=   0.0s
[Pipeline] .......... (step 2 of 3) Processing kw_ravel, total=   0.0s
[Pipeline] ........... (step 3 of 3) Processing kw_vect, total=   0.0s
[Pipeline] ........... (step 1 of 2) Processing loc_imp, total=   0.0s
[Pipeline] ........... (step 2 of 2) Processing loc_ohe, total=   0.0s
[Pipeline] ........ (step 1 of 1) Processing text_tfidf, total=   0.1s
[Pipeline] ............. (step 1 of 2) Processing trans, total=   0.2s
[Pipeline] ........... (step 2 of 2) Processing clf_gbc, total=   2.8s


In [9]:
from sklearn.metrics import classification_report

# Hold-out evaluation of the GBDT pipeline: per-class report first, then the
# pipeline's own score (the accuracy on the same data).
y_pred = pip_gbc.predict(X_test)
print(classification_report(y_test, y_pred))
print(pip_gbc.score(X_test, y_test))

              precision    recall  f1-score   support

           0       0.75      0.85      0.80      1288
           1       0.76      0.63      0.69       996

    accuracy                           0.75      2284
   macro avg       0.76      0.74      0.74      2284
weighted avg       0.76      0.75      0.75      2284

0.7543782837127846


#### Grid search with cross validation

In [9]:
# Grid over the boosting hyper-parameters: 3 x 3 x 4 = 36 candidates.
# Keys use the '<step>__<param>' form, addressing the 'clf_gbc' pipeline step.
tuned_parameters = dict(
    clf_gbc__n_estimators=(10, 100, 200),
    clf_gbc__learning_rate=(1., 0.5, 0.1),
    clf_gbc__max_leaf_nodes=(2, 5, 10, 20),
)

perform_grid_search(pip_gbc, tuned_parameters)

Performing grid search...
pipeline: ['trans', 'clf_gbc']
parameters to be tuned:
{'clf_gbc__learning_rate': (1.0, 0.5, 0.1),
 'clf_gbc__max_leaf_nodes': (2, 5, 10, 20),
 'clf_gbc__n_estimators': (10, 100, 200)}
Fitting 5 folds for each of 36 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:  5.2min finished


[Pipeline] ............ (step 1 of 3) Processing kw_imp, total=   0.0s
[Pipeline] .......... (step 2 of 3) Processing kw_ravel, total=   0.0s
[Pipeline] ........... (step 3 of 3) Processing kw_vect, total=   0.0s
[Pipeline] ........... (step 1 of 2) Processing loc_imp, total=   0.0s
[Pipeline] ........... (step 2 of 2) Processing loc_ohe, total=   0.0s
[Pipeline] ........ (step 1 of 1) Processing text_tfidf, total=   0.1s
[Pipeline] ............. (step 1 of 2) Processing trans, total=   0.2s
[Pipeline] ........... (step 2 of 2) Processing clf_gbc, total=   5.3s
Best parameters set found on train set:

{'clf_gbc__learning_rate': 0.1, 'clf_gbc__max_leaf_nodes': 5, 'clf_gbc__n_estimators': 200}
Grid scores on train set:

0.663 (+/-0.016) for {'clf_gbc__learning_rate': 1.0, 'clf_gbc__max_leaf_nodes': 2, 'clf_gbc__n_estimators': 10}
0.727 (+/-0.020) for {'clf_gbc__learning_rate': 1.0, 'clf_gbc__max_leaf_nodes': 2, 'clf_gbc__n_estimators': 100}
0.724 (+/-0.014) for {'clf_gbc__learning_rate':

In [13]:
# List every parameter name of the pipeline, i.e. the valid keys for
# set_params and for a grid-search parameter dict
pip_gbc.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'trans', 'clf_gbc', 'trans__n_jobs', 'trans__remainder', 'trans__sparse_threshold', 'trans__transformer_weights', 'trans__transformers', 'trans__verbose', 'trans__kw_vec', 'trans__loc_vec', 'trans__text_vec', 'trans__kw_vec__memory', 'trans__kw_vec__steps', 'trans__kw_vec__verbose', 'trans__kw_vec__kw_imp', 'trans__kw_vec__kw_ravel', 'trans__kw_vec__kw_vect', 'trans__kw_vec__kw_imp__add_indicator', 'trans__kw_vec__kw_imp__copy', 'trans__kw_vec__kw_imp__fill_value', 'trans__kw_vec__kw_imp__missing_values', 'trans__kw_vec__kw_imp__strategy', 'trans__kw_vec__kw_imp__verbose', 'trans__kw_vec__kw_vect__analyzer', 'trans__kw_vec__kw_vect__binary', 'trans__kw_vec__kw_vect__decode_error', 'trans__kw_vec__kw_vect__dtype', 'trans__kw_vec__kw_vect__encoding', 'trans__kw_vec__kw_vect__input', 'trans__kw_vec__kw_vect__lowercase', 'trans__kw_vec__kw_vect__max_df', 'trans__kw_vec__kw_vect__max_features', 'trans__kw_vec__kw_vect__min_df', 'trans__kw_vec__kw_vec

In [17]:
# Pin the classifier to the best setting reported by the previous grid search.
pip_gbc.set_params(
    clf_gbc__learning_rate=0.1,
    clf_gbc__max_leaf_nodes=5,
    clf_gbc__n_estimators=200,
)

# Now tune the TF-IDF vectorizer of the text column: 3 x 2 x 3 = 18 candidates.
tuned_parameters = {
    'trans__text_vec__text_tfidf__max_df': (0.5, 0.8, 1.0),
    'trans__text_vec__text_tfidf__min_df': (0.0, 0.1),
    'trans__text_vec__text_tfidf__ngram_range': ((1, 1), (1, 2), (2, 2)),
}

perform_grid_search(pip_gbc, tuned_parameters)


  






Performing grid search...
pipeline: ['trans', 'clf_gbc']
parameters to be tuned:
{'trans__text_vec__text_tfidf__max_df': (0.5, 0.8, 1.0),
 'trans__text_vec__text_tfidf__min_df': (0.0, 0.1),
 'trans__text_vec__text_tfidf__ngram_range': ((1, 1), (1, 2), (2, 2))}
Fitting 5 folds for each of 18 candidates, totalling 90 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:  3.4min
[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed:  5.3min finished


[Pipeline] ............ (step 1 of 3) Processing kw_imp, total=   0.0s
[Pipeline] .......... (step 2 of 3) Processing kw_ravel, total=   0.0s
[Pipeline] ........... (step 3 of 3) Processing kw_vect, total=   0.0s
[Pipeline] ........... (step 1 of 2) Processing loc_imp, total=   0.0s
[Pipeline] ........... (step 2 of 2) Processing loc_ohe, total=   0.0s
[Pipeline] ........ (step 1 of 1) Processing text_tfidf, total=   0.1s
[Pipeline] ............. (step 1 of 2) Processing trans, total=   0.1s
[Pipeline] ........... (step 2 of 2) Processing clf_gbc, total=   5.1s
Best parameters set found on train set:

{'trans__text_vec__text_tfidf__max_df': 0.5, 'trans__text_vec__text_tfidf__min_df': 0.0, 'trans__text_vec__text_tfidf__ngram_range': (1, 1)}
Grid scores on train set:

0.766 (+/-0.020) for {'trans__text_vec__text_tfidf__max_df': 0.5, 'trans__text_vec__text_tfidf__min_df': 0.0, 'trans__text_vec__text_tfidf__ngram_range': (1, 1)}
0.763 (+/-0.016) for {'trans__text_vec__text_tfidf__max_df': 