# ML Pipeline Preparation

## 1. Import libraries and load data from database
* Import Python libraries
* Load dataset from database 
* Define feature and target variables X and Y

In [1]:
# import libraries
import numpy as np
import pandas as pd
from sqlalchemy import create_engine

In [2]:
# download nltk needed packages
import nltk
nltk.download(['punkt', 'wordnet',  'stopwords', 'averaged_perceptron_tagger'])

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ahmed\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ahmed\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ahmed\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\ahmed\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [3]:
# load data from database
engine = create_engine('sqlite:///../data/labeled_messages_db.sqlite3')
df = pd.read_sql_table('labeled_messages', engine)
df.head()

Unnamed: 0,id,message,original,genre,related,request,offer,aid_related,medical_help,medical_products,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,2,Weather update - a cold front from Cuba that c...,Un front froid se retrouve sur Cuba ce matin. ...,direct,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,7,Is the Hurricane over or is it not over,Cyclone nan fini osinon li pa fini,direct,1,0,0,1,0,0,...,0,0,1,0,1,0,0,0,0,0
2,8,Looking for someone but no name,"Patnm, di Maryani relem pou li banm nouvel li ...",direct,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,9,UN reports Leogane 80-90 destroyed. Only Hospi...,UN reports Leogane 80-90 destroyed. Only Hospi...,direct,1,1,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
4,12,"says: west side of Haiti, rest of the country ...",facade ouest d Haiti et le reste du pays aujou...,direct,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
X = df['message']
y = df.drop(['message', 'genre', 'id', 'original'], axis=1)
category_names = list(df.columns[4:])

## 2. Tokenization function to process text data

In [5]:
from nltk.tokenize import word_tokenize, RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from string import punctuation

In [6]:
def tokenize(text):

    # normalize case and remove punctuation
    remove_punc_table = str.maketrans('', '', punctuation)
    text = text.translate(remove_punc_table).lower()
    
    # tokenize text
    tokens = word_tokenize(text)
    
    # lemmatize and remove stop words
    stop_words = nltk.corpus.stopwords.words("english")
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(word).lower().strip() for word in tokens if word not in stop_words]

In [7]:
tokenize(df['message'][0])

['weather', 'update', 'cold', 'front', 'cuba', 'could', 'pas', 'haiti']

## 3. Build a machine learning pipeline

In [8]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import Pipeline

In [9]:
forest_clf = MultiOutputClassifier(RandomForestClassifier())

pipeline = Pipeline([
                    ('cvect', CountVectorizer(tokenizer = tokenize)),
                    ('tfidf', TfidfTransformer()),
                    ('clf', forest_clf )
                    ])

## 4. Train pipeline

In [10]:
from sklearn.model_selection import train_test_split

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [12]:
pipeline.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('cvect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=<function tokenize a...
                                                                        ccp_alpha=0.0,
                                                                        class_weight=None,
                                                                        criterion='gini',
                                                                   

## 5. Test the model
Report the f1 score, precision and recall for each output category of the dataset.

In [15]:
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score

In [16]:
def evaluate_model(y_pred, y_test):
    for i in range(len(category_names)):
        print('Category: {}'.format(category_names[i]))
        print(classification_report(y_test.iloc[:, i].values, y_pred[:, i]))
        print('Accuracy: {}\n\n'.format(accuracy_score(y_test.iloc[:, i].values, y_pred[:, i])))

In [17]:
y_pred = pipeline.predict(X_test)

In [18]:
evaluate_model(y_pred, y_test)

Category: related
              precision    recall  f1-score   support

           0       0.69      0.49      0.57      1549
           1       0.86      0.93      0.89      5005

    accuracy                           0.83      6554
   macro avg       0.77      0.71      0.73      6554
weighted avg       0.82      0.83      0.82      6554

Accuracy: 0.8265181568507781


Category: request
              precision    recall  f1-score   support

           0       0.90      0.98      0.94      5438
           1       0.81      0.49      0.61      1116

    accuracy                           0.89      6554
   macro avg       0.85      0.73      0.77      6554
weighted avg       0.89      0.89      0.88      6554

Accuracy: 0.893042416844675


Category: offer
              precision    recall  f1-score   support

           0       0.99      1.00      1.00      6520
           1       0.00      0.00      0.00        34

    accuracy                           0.99      6554
   macro avg   

  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.96      1.00      0.98      6240
           1       0.80      0.18      0.29       314

    accuracy                           0.96      6554
   macro avg       0.88      0.59      0.64      6554
weighted avg       0.95      0.96      0.95      6554

Accuracy: 0.9584986267927983


Category: other_aid
              precision    recall  f1-score   support

           0       0.87      1.00      0.93      5674
           1       0.61      0.03      0.06       880

    accuracy                           0.87      6554
   macro avg       0.74      0.52      0.50      6554
weighted avg       0.83      0.87      0.81      6554

Accuracy: 0.8674092157461093


Category: infrastructure_related
              precision    recall  f1-score   support

           0       0.93      1.00      0.97      6117
           1       0.22      0.00      0.01       437

    accuracy                           0.93      6554
   macro avg 

## 6. Improve the model
Use grid search to find better parameters.

In [19]:
from sklearn.model_selection import GridSearchCV

In [20]:
pipeline.get_params()

{'memory': None,
 'steps': [('cvect',
   CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                   dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                   lowercase=True, max_df=1.0, max_features=None, min_df=1,
                   ngram_range=(1, 1), preprocessor=None, stop_words=None,
                   strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                   tokenizer=<function tokenize at 0x000001F3882F3D38>,
                   vocabulary=None)),
  ('tfidf',
   TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)),
  ('clf',
   MultiOutputClassifier(estimator=RandomForestClassifier(bootstrap=True,
                                                          ccp_alpha=0.0,
                                                          class_weight=None,
                                                          criterion='gini',
                                                          max_dep

In [21]:
parameters = {
     #'cvect__max_df': (0.5, 0.75, 1.0),
     #'cvect__max_features': (None, 5000, 10000),
     #'cvect__ngram_range': ((1, 1), (1, 2)), 
     #'tfidf__use_idf': (True, False),
     #'tfidf__norm': ('l1', 'l2'),
     'clf__estimator__min_samples_leaf': [2, 5, 10],
     'clf__estimator__max_depth': [10, 50, None]
}

cv = GridSearchCV(pipeline, parameters, cv = 3, n_jobs=4, verbose = 5)

In [20]:
cv.fit(X_train, y_train)

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 3 folds for each of 9 candidates, totalling 27 fits


[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:  1.7min
[Parallel(n_jobs=4)]: Done  27 out of  27 | elapsed:  6.8min finished


GridSearchCV(cv=3, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('cvect',
                                        CountVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                        pre

In [21]:
cv.best_estimator_

Pipeline(memory=None,
         steps=[('cvect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=<function tokenize a...
                                                                        ccp_alpha=0.0,
                                                                        class_weight=None,
                                                                        criterion='gini',
                                                                   

## 7. Test the model

In [22]:
y_pred = cv.predict(X_test)

In [None]:
evaluate_model(y_pred, y_test)

## 8. Try improving the model further.
* try other machine learning algorithms
* add other features besides the TF-IDF

In [22]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import FeatureUnion

In [23]:
class StartingVerbExtractor(BaseEstimator, TransformerMixin):

    def starting_verb(self, text):
        sentence_list = nltk.sent_tokenize(text)
        for sentence in sentence_list:
            pos_tags = nltk.pos_tag(tokenize(sentence))
            
            if not pos_tags: continue
            first_word, first_tag = pos_tags[0]
            if first_tag in ['VB', 'VBP'] or first_word == 'RT':
                return True

        return False

    def fit(self, x, y=None):
        return self

    def transform(self, X):
        X_tagged = pd.Series(X).apply(self.starting_verb)
        return pd.DataFrame(X_tagged)

In [24]:
pipeline_v2 = Pipeline([
        
        ('features', FeatureUnion([
        
            ('text_pipeline', Pipeline([
                ('cvect', CountVectorizer(tokenizer=tokenize)),
                ('tfidf', TfidfTransformer())
            ])),

            ('starting_verb', StartingVerbExtractor())
        ])),

        ('clf', MultiOutputClassifier(RandomForestClassifier()) )
    ])

In [25]:
pipeline_v2.get_params()

{'memory': None,
 'steps': [('features', FeatureUnion(n_jobs=None,
                transformer_list=[('text_pipeline',
                                   Pipeline(memory=None,
                                            steps=[('cvect',
                                                    CountVectorizer(analyzer='word',
                                                                    binary=False,
                                                                    decode_error='strict',
                                                                    dtype=<class 'numpy.int64'>,
                                                                    encoding='utf-8',
                                                                    input='content',
                                                                    lowercase=True,
                                                                    max_df=1.0,
                                                                    max_f

In [26]:
parameters_v2 = {
     'features__text_pipeline__cvect__max_df': (0.5, 0.75, 1.0),
     #'features__text_pipeline__cvect__max_features': (None, 5000, 10000),
     'features__text_pipeline__cvect__ngram_range': ((1, 1), (1, 2)), 
     #'features__text_pipeline__tfidf__use_idf': (True, False),
     #'features__text_pipeline__tfidf__norm': ('l1', 'l2'),
     'clf__estimator__min_samples_leaf': [2, 5, 10],
     'clf__estimator__max_depth': [10, 50, None]
}

cv_v2 = GridSearchCV(pipeline_v2, parameters_v2, cv = 3, n_jobs=4, verbose = 5)

In [None]:
cv_v2.fit(X_train, y_train)

In [None]:
cv_v2.best_estimator_

In [None]:
y_pred_v2 = cv_v2.predict(X_test)

In [None]:
evaluate_model(y_pred_v2, y_test)

## 9. Export the model as a pickle file

In [None]:
import joblib

In [None]:
joblib.dump(cv_v2.best_estimator_, 'classifier.pkl')