In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick 
import matplotlib.dates as mdates
from matplotlib.ticker import PercentFormatter, FuncFormatter
%matplotlib inline

import seaborn as sns
sns.set()

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import GridSearchCV

from sklearn.model_selection import train_test_split

from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC, SVC
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier

from feature_engine.encoding import OneHotEncoder as fe_OneHotEncoder
from sklearn.preprocessing import OneHotEncoder

from  sklearn.metrics  import accuracy_score
from sklearn import metrics
from sklearn.metrics import confusion_matrix


import os 
# environment settings
pd.set_option('display.max_column',None)
pd.set_option('display.max_rows',None)

We need to perform the following steps in the pipeline:
    - split into Train-Test - I will leave 10% as final Test set, while within the GridSearch CV we will use the validation set
    - tfidf vectorizer - directly transforms tokens into numerical features
    - one-hot encoder - on the following columns: genre
    - estimator
  
Due to the fact that all columns are binary, or text features, there is no need to scale them. 

In [2]:
os.getcwd()

'/Users/asyagadzhalova/Documents/GitHub/disaster_messages_classification/notebooks'

In [3]:
os.chdir('..')

In [4]:
df=pd.read_pickle(os.getcwd()+'/data/data_after_text_processing.pkl')

In [5]:
df.head()

Unnamed: 0,id,message,genre,trans_ind,related,request,offer,aid_related,medical_help,medical_products,search_and_rescue,security,military,water,food,shelter,clothing,money,missing_people,refugees,death,other_aid,infrastructure_related,transport,buildings,electricity,tools,hospitals,shops,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report,message_clean,lang,tokens
0,2,Weather update - a cold front from Cuba that c...,direct,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,weather update a cold front from cuba that co...,en,weather update cold front cuba could pas haiti
1,7,Is the Hurricane over or is it not over,direct,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,is the hurricane over or is it not over,en,hurricane
2,8,Looking for someone but no name,direct,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,looking for someone but no name,en,looking someone name
3,9,UN reports Leogane 80-90 destroyed. Only Hospi...,direct,0,1,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,un reports leogane destroyed only hospital st...,en,un report leogane destroyed hospital st croix ...
4,12,"says: west side of Haiti, rest of the country ...",direct,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,says west side of haiti rest of the country to...,en,say west side haiti rest country today tonight


In [6]:
df['genre'].value_counts()

news      13001
direct    10431
social     2317
Name: genre, dtype: int64

In [7]:
df.columns

Index(['id', 'message', 'genre', 'trans_ind', 'related', 'request', 'offer',
       'aid_related', 'medical_help', 'medical_products', 'search_and_rescue',
       'security', 'military', 'water', 'food', 'shelter', 'clothing', 'money',
       'missing_people', 'refugees', 'death', 'other_aid',
       'infrastructure_related', 'transport', 'buildings', 'electricity',
       'tools', 'hospitals', 'shops', 'aid_centers', 'other_infrastructure',
       'weather_related', 'floods', 'storm', 'fire', 'earthquake', 'cold',
       'other_weather', 'direct_report', 'message_clean', 'lang', 'tokens'],
      dtype='object')

In [9]:
#I will have to deal with this in the previous notebooks
df.loc[df['related']==2,'related']=1

In [11]:
X = df[['tokens','genre']].copy()
y = df[['related', 'request', 'offer',
       'aid_related', 'medical_help', 'medical_products', 'search_and_rescue',
       'security', 'military', 'water', 'food', 'shelter', 'clothing', 'money',
       'missing_people', 'refugees', 'death', 'other_aid',
       'infrastructure_related', 'transport', 'buildings', 'electricity',
       'tools', 'hospitals', 'shops', 'aid_centers', 'other_infrastructure',
       'weather_related', 'floods', 'storm', 'fire', 'earthquake', 'cold',
       'other_weather', 'direct_report']].copy()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10,random_state=42)

In [1]:
# GridSearch Pipeline - to find the best parameters

preprocessor = ColumnTransformer(
    [('genre_cat', OneHotEncoder(dtype='int', handle_unknown='ignore'), ['genre']),
     ('description_tfidf', TfidfVectorizer(), 'tokens')],
    remainder='passthrough')

pipeline = Pipeline([
    ('preprocessor',preprocessor),
    ('classifier',MultiOutputClassifier(LinearSVC()))
    ])     


param_grid = {
    # try different feature engineering parameters
    'preprocessor__description_tfidf__ngram_range':[(1,1),(1,2),(1,3)],
    'classifier__estimator__penalty': ['l2','l1'],
    'classifier__estimator__C': [1,10,100],
}

grid_search = GridSearchCV(pipeline, param_grid,
                           cv=5, n_jobs=6)

grid_search.fit(X_train, y_train)

y_pred = grid_search.predict(X_test)


# train classifier
#pipeline.fit(X_train,y_train)

# predict on test data
#y_pred = pipeline.predict(X_test)

NameError: name 'ColumnTransformer' is not defined

In [43]:
grid_search.best_params_

{'classifier__estimator__C': 1,
 'classifier__estimator__penalty': 'l2',
 'preprocessor__description_tfidf__ngram_range': (1, 2)}

In [49]:
print(accuracy_score(y_test,y_pred))

0.2928155339805825


In [50]:
print(metrics.accuracy_score(y_test,y_pred))

0.2928155339805825


In [47]:
print(metrics.classification_report(y_test, y_pred,target_names= y_test.columns.values))

                        precision    recall  f1-score   support

               related       0.86      0.93      0.89      1965
               request       0.69      0.66      0.68       426
                 offer       0.00      0.00      0.00        12
           aid_related       0.70      0.76      0.72      1064
          medical_help       0.58      0.37      0.45       217
      medical_products       0.71      0.31      0.44       150
     search_and_rescue       0.73      0.26      0.38        74
              security       1.00      0.03      0.06        34
              military       0.52      0.29      0.37        76
                 water       0.74      0.74      0.74       159
                  food       0.82      0.74      0.78       298
               shelter       0.72      0.63      0.67       235
              clothing       0.88      0.39      0.54        36
                 money       0.61      0.20      0.31        83
        missing_people       0.40      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Final model

In [12]:
preprocessor = ColumnTransformer(
    [('genre_cat', OneHotEncoder(dtype='int', handle_unknown='ignore'), ['genre']),
     ('description_tfidf', TfidfVectorizer(ngram_range=(1,2)), 'tokens')],
    remainder='passthrough')

pipeline = Pipeline([
    ('preprocessor',preprocessor),
    ('classifier',MultiOutputClassifier(LinearSVC()))
    ])     

# train classifier
pipeline.fit(X_train,y_train)

# predict on test data
y_pred = pipeline.predict(X_test)

In [13]:
print(metrics.accuracy_score(y_test,y_pred))

0.2998058252427184


In [14]:
print(metrics.classification_report(y_test, y_pred,target_names= y_test.columns.values))

                        precision    recall  f1-score   support

               related       0.86      0.93      0.89      1967
               request       0.76      0.66      0.70       461
                 offer       0.00      0.00      0.00         9
           aid_related       0.70      0.78      0.74      1080
          medical_help       0.65      0.36      0.47       214
      medical_products       0.55      0.31      0.39       117
     search_and_rescue       0.73      0.13      0.23        60
              security       0.00      0.00      0.00        58
              military       0.55      0.32      0.40        92
                 water       0.67      0.69      0.68       156
                  food       0.80      0.77      0.78       309
               shelter       0.72      0.64      0.68       238
              clothing       0.73      0.49      0.59        45
                 money       0.62      0.28      0.38        54
        missing_people       0.50      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
