# ML Pipeline Preparation

## 1. Import libraries and load data from database
* Import Python libraries
* Load dataset from database 
* Define feature and target variables X and Y

In [1]:
# import libraries
import numpy as np
import pandas as pd
from sqlalchemy import create_engine

In [2]:
# download nltk needed packages
import nltk
nltk.download(['punkt', 'wordnet',  'stopwords', 'averaged_perceptron_tagger'])

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ahmed\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ahmed\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ahmed\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\ahmed\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [3]:
# load data from database
engine = create_engine('sqlite:///disaster_response.db')
df = pd.read_sql_table('labeled_messages', engine)
df.head()

Unnamed: 0,id,message,original,genre,related,request,offer,aid_related,medical_help,medical_products,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,2,Weather update - a cold front from Cuba that c...,Un front froid se retrouve sur Cuba ce matin. ...,direct,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,7,Is the Hurricane over or is it not over,Cyclone nan fini osinon li pa fini,direct,1,0,0,1,0,0,...,0,0,1,0,1,0,0,0,0,0
2,8,Looking for someone but no name,"Patnm, di Maryani relem pou li banm nouvel li ...",direct,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,9,UN reports Leogane 80-90 destroyed. Only Hospi...,UN reports Leogane 80-90 destroyed. Only Hospi...,direct,1,1,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
4,12,"says: west side of Haiti, rest of the country ...",facade ouest d Haiti et le reste du pays aujou...,direct,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
X = df['message']
y = df.drop(['message', 'genre', 'id', 'original'], axis=1)

## 2. Tokenization function to process text data

In [5]:
from nltk.tokenize import word_tokenize, RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from string import punctuation

In [6]:
def tokenize(text):

    # normalize case and remove punctuation
    remove_punc_table = str.maketrans('', '', punctuation)
    text = text.translate(remove_punc_table).lower()
    
    # tokenize text
    tokens = word_tokenize(text)
    
    # lemmatize and remove stop words
    stop_words = nltk.corpus.stopwords.words("english")
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(word).lower().strip() for word in tokens if word not in stop_words]

In [7]:
tokenize(df['message'][0])

['weather', 'update', 'cold', 'front', 'cuba', 'could', 'pas', 'haiti']

## 3. Build a machine learning pipeline

In [8]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import Pipeline

In [9]:
forest_clf = MultiOutputClassifier(RandomForestClassifier())

pipeline = Pipeline([
                    ('cvect', CountVectorizer(tokenizer = tokenize)),
                    ('tfidf', TfidfTransformer()),
                    ('clf', forest_clf )
                    ])

## 4. Train pipeline

In [10]:
from sklearn.model_selection import train_test_split

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [12]:
pipeline.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('cvect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=<function tokenize a...
                                                                        ccp_alpha=0.0,
                                                                        class_weight=None,
                                                                        criterion='gini',
                                                                   

## 5. Test the model
Report the f1 score, precision and recall for each output category of the dataset.

In [12]:
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score

In [13]:
def evaluate_model(y_pred, y_test, full_report=False):
    
    if ( full_report ):
        print(classification_report(y_test.values, y_pred, target_names = y.columns.values))
        
    else:
        print('Accuracy: ', accuracy_score(y_test, y_pred))
        print('Precision: ', precision_score(y_test, y_pred, average='weighted'))
        print('Recall: ', recall_score(y_test, y_pred, average='weighted'))
        print('f-score: ', f1_score(y_test, y_pred, average='weighted'))

In [15]:
y_pred = pipeline.predict(X_test)

In [16]:
evaluate_model(y_pred, y_test, full_report=True)

                        precision    recall  f1-score   support

               related       0.84      0.94      0.89      4937
               request       0.82      0.50      0.62      1087
                 offer       0.00      0.00      0.00        28
           aid_related       0.76      0.70      0.73      2686
          medical_help       0.70      0.06      0.11       497
      medical_products       0.83      0.08      0.15       349
     search_and_rescue       0.69      0.07      0.13       152
              security       0.33      0.01      0.02       107
              military       0.80      0.05      0.10       221
           child_alone       0.00      0.00      0.00         0
                 water       0.89      0.37      0.52       422
                  food       0.85      0.64      0.73       798
               shelter       0.80      0.29      0.43       567
              clothing       0.77      0.17      0.28        99
                 money       0.83      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## 6. Improve the model
Use grid search to find better parameters.

In [14]:
from sklearn.model_selection import GridSearchCV

In [15]:
pipeline.get_params()

{'memory': None,
 'steps': [('cvect',
   CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                   dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                   lowercase=True, max_df=1.0, max_features=None, min_df=1,
                   ngram_range=(1, 1), preprocessor=None, stop_words=None,
                   strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                   tokenizer=<function tokenize at 0x0000028DC5FC31F8>,
                   vocabulary=None)),
  ('tfidf',
   TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)),
  ('clf',
   MultiOutputClassifier(estimator=RandomForestClassifier(bootstrap=True,
                                                          ccp_alpha=0.0,
                                                          class_weight=None,
                                                          criterion='gini',
                                                          max_dep

In [16]:
parameters = {
     #'cvect__max_df': (0.5, 0.75, 1.0),
     #'cvect__max_features': (None, 5000, 10000),
     #'cvect__ngram_range': ((1, 1), (1, 2)), 
     #'tfidf__use_idf': (True, False),
     #'tfidf__norm': ('l1', 'l2'),
     'clf__estimator__min_samples_leaf': [2, 5, 10],
     'clf__estimator__max_depth': [10, 50, None]
}

cv = GridSearchCV(pipeline, parameters, cv = 3, n_jobs=4, verbose = 5)

In [20]:
cv.fit(X_train, y_train)

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 3 folds for each of 9 candidates, totalling 27 fits


[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:  1.7min
[Parallel(n_jobs=4)]: Done  27 out of  27 | elapsed:  6.8min finished


GridSearchCV(cv=3, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('cvect',
                                        CountVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                        pre

In [21]:
cv.best_estimator_

Pipeline(memory=None,
         steps=[('cvect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=<function tokenize a...
                                                                        ccp_alpha=0.0,
                                                                        class_weight=None,
                                                                        criterion='gini',
                                                                   

## 7. Test the model

In [22]:
y_pred = cv.predict(X_test)

In [23]:
evaluate_model(y_pred, y_test)

Accuracy:  0.2419896246566982
Precision:  0.767881163014764
Recall:  0.4909828454132252
f-score:  0.5202376528064439


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  average, "true nor predicted", 'F-score is', len(true_sum)


## 8. Try improving the model further.
* try other machine learning algorithms
* add other features besides the TF-IDF

In [17]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import FeatureUnion

In [18]:
class StartingVerbExtractor(BaseEstimator, TransformerMixin):

    def starting_verb(self, text):
        sentence_list = nltk.sent_tokenize(text)
        for sentence in sentence_list:
            pos_tags = nltk.pos_tag(tokenize(sentence))
            
            if not pos_tags: continue
            first_word, first_tag = pos_tags[0]
            if first_tag in ['VB', 'VBP'] or first_word == 'RT':
                return True

        return False

    def fit(self, x, y=None):
        return self

    def transform(self, X):
        X_tagged = pd.Series(X).apply(self.starting_verb)
        return pd.DataFrame(X_tagged)

In [19]:
pipeline_v2 = Pipeline([
        
        ('features', FeatureUnion([
        
            ('text_pipeline', Pipeline([
                ('cvect', CountVectorizer(tokenizer=tokenize)),
                ('tfidf', TfidfTransformer())
            ])),

            ('starting_verb', StartingVerbExtractor())
        ])),

        ('clf', MultiOutputClassifier(RandomForestClassifier()) )
    ])

In [20]:
pipeline_v2.get_params()

{'memory': None,
 'steps': [('features', FeatureUnion(n_jobs=None,
                transformer_list=[('text_pipeline',
                                   Pipeline(memory=None,
                                            steps=[('cvect',
                                                    CountVectorizer(analyzer='word',
                                                                    binary=False,
                                                                    decode_error='strict',
                                                                    dtype=<class 'numpy.int64'>,
                                                                    encoding='utf-8',
                                                                    input='content',
                                                                    lowercase=True,
                                                                    max_df=1.0,
                                                                    max_f

In [23]:
parameters_v2 = {
     'features__text_pipeline__cvect__max_df': (0.5, 0.75, 1.0),
     #'features__text_pipeline__cvect__max_features': (None, 5000, 10000),
     'features__text_pipeline__cvect__ngram_range': ((1, 1), (1, 2)), 
     #'features__text_pipeline__tfidf__use_idf': (True, False),
     #'features__text_pipeline__tfidf__norm': ('l1', 'l2'),
     'clf__estimator__min_samples_leaf': [2, 5, 10],
     'clf__estimator__max_depth': [10, 50, None]
}

cv_v2 = GridSearchCV(pipeline_v2, parameters_v2, cv = 3, n_jobs=4, verbose = 5)

In [None]:
cv_v2.fit(X_train, y_train)

Fitting 3 folds for each of 108 candidates, totalling 324 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:  3.5min


In [None]:
cv_v2.best_estimator_

In [None]:
y_pred_v2 = cv_v2.predict(X_test)

In [None]:
evaluate_model(y_pred_v2, y_test)

## 9. Export the model as a pickle file

In [None]:
import joblib

In [None]:
joblib.dump(cv_v2.best_estimator_, 'classifier.pkl')