# ML Pipeline Preparation
Follow the instructions below to help you create your ML pipeline.
### 1. Import libraries and load data from database.
- Import Python libraries
- Load dataset from database with [`read_sql_table`](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_sql_table.html)
- Define feature and target variables X and Y

In [1]:
# import libraries
import pandas as pd
import numpy as np
import seaborn as sns
import sqlite3
from sqlalchemy import create_engine
import matplotlib.pyplot as plt

# import statements
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

import re
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.multioutput import MultiOutputClassifier
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
#from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import FunctionTransformer

from sklearn.decomposition import TruncatedSVD
from scipy.sparse import random as sparse_random
from tqdm import tqdm
import pickle

%matplotlib inline 



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# load data from database
engine = create_engine('sqlite:///DisasterResponse.db')
df = pd.read_sql_table('DisasterResponse', 'sqlite:///DisasterResponse.db')
#df.head()

# Extract messages
X = df[['id','message','original','genre']]

columns_categories = df.columns.drop(['id','message','original','genre'])
print(columns_categories)

# Extract categories columns
Y = df[columns_categories]


Index(['related', 'request', 'offer', 'aid_related', 'medical_help',
       'medical_products', 'search_and_rescue', 'security', 'military',
       'child_alone', 'water', 'food', 'shelter', 'clothing', 'money',
       'missing_people', 'refugees', 'death', 'other_aid',
       'infrastructure_related', 'transport', 'buildings', 'electricity',
       'tools', 'hospitals', 'shops', 'aid_centers', 'other_infrastructure',
       'weather_related', 'floods', 'storm', 'fire', 'earthquake', 'cold',
       'other_weather', 'direct_report'],
      dtype='object')


In [3]:
df.head()

Unnamed: 0,id,message,original,genre,related,request,offer,aid_related,medical_help,medical_products,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,2,Weather update - a cold front from Cuba that c...,Un front froid se retrouve sur Cuba ce matin. ...,direct,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,7,Is the Hurricane over or is it not over,Cyclone nan fini osinon li pa fini,direct,1,0,0,1,0,0,...,0,0,1,0,1,0,0,0,0,0
2,8,Looking for someone but no name,"Patnm, di Maryani relem pou li banm nouvel li ...",direct,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,9,UN reports Leogane 80-90 destroyed. Only Hospi...,UN reports Leogane 80-90 destroyed. Only Hospi...,direct,1,1,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
4,12,"says: west side of Haiti, rest of the country ...",facade ouest d Haiti et le reste du pays aujou...,direct,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# Print the dataframe shape

print(X.shape)
X.head()


(26216, 4)


Unnamed: 0,id,message,original,genre
0,2,Weather update - a cold front from Cuba that c...,Un front froid se retrouve sur Cuba ce matin. ...,direct
1,7,Is the Hurricane over or is it not over,Cyclone nan fini osinon li pa fini,direct
2,8,Looking for someone but no name,"Patnm, di Maryani relem pou li banm nouvel li ...",direct
3,9,UN reports Leogane 80-90 destroyed. Only Hospi...,UN reports Leogane 80-90 destroyed. Only Hospi...,direct
4,12,"says: west side of Haiti, rest of the country ...",facade ouest d Haiti et le reste du pays aujou...,direct


In [5]:
Y.head()
#print(Y.loc[0])

Unnamed: 0,related,request,offer,aid_related,medical_help,medical_products,search_and_rescue,security,military,child_alone,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,1,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,1,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### 2. Write a tokenization function to process your text data

In [6]:
# This function obtains all the tokens from a string (text parameter) and return them in a list

def tokenize(text):
    """
    This function obtain the tokens from a string
    
    Parameters: text is the string from the tokens are obtained
    Return:     clean_tokens as the list of tokens 
    """
    
    # Normalize text and remove punctuation
    text = re.sub(r"[^a-zA-Z0-9]", " ", text.lower())
    
    stop_words = stopwords.words("english")
    lemmatizer = WordNetLemmatizer()

    # Tokenize text
    tokens = word_tokenize(text)
    #print("\nTokens=", tokens)
    
    # Remove stop words and lemmatize
    #tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    
    clean_tokens = []
    for tok in tokens:
        clean_tok = lemmatizer.lemmatize(tok).lower().strip()
        clean_tokens.append(clean_tok)
    
    #print("\nStop words removal and lematizer =", clean_tokens)
    
    return clean_tokens

In [7]:
# Code for testing the tokenize function

text = "Weather update - a cold front from Cuba that couldn't pass over Haiti"
#print(text)

#print(tokenize(text))

### 3. Build a machine learning pipeline
This machine pipeline should take in the `message` column as input and output classification results on the other 36 categories in the dataset. You may find the [MultiOutputClassifier](http://scikit-learn.org/stable/modules/generated/sklearn.multioutput.MultiOutputClassifier.html) helpful for predicting multiple target variables.

In [8]:
# Create a pipeline for extracting the tokens, get the tfidf matrix and to create a multiouput classifier for training 
# a multi-levels vector y=[y1,y1,y3,...]

pipeline = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', MultiOutputClassifier(RandomForestClassifier(max_depth=3, n_estimators= 20))) 
    ])


### 4. Train pipeline
- Split data into train and test sets
- Train pipeline

In [9]:
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X.message, Y)

#print(X_train.shape,y_train.shape)

# train classifier
pipeline.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...oob_score=False, random_state=None, verbose=0,
            warm_start=False),
           n_jobs=1))])

### 5. Test your model
Report the f1 score, precision and recall for each output category of the dataset. You can do this by iterating through the columns and calling sklearn's `classification_report` on each.

In [10]:
# predict on test data through the previosly created pipeline

y_pred = pipeline.predict(X_test)

In [11]:
# Function to display the predicted values with the pipeline, and reports the scores for 
# the multiclass-multioutput-multilabel classifier: f1 score, precision and recall

def display_results(y_test, y_pred):
    """
    Function to display the predicted values with the pipeline, and reports the scores for the
    multiclass-multioutput-multilabel classifier: f1 score, precision and recall
    
    Parameters: y_test is the vector with the testing labels
                y_pred is the vector with the predicted labels
    """
    labels = np.unique(y_pred)
    y_test2 = np.array(y_test)
    y_pred2 = np.array(y_pred)
    
    columnsTest = y_test.shape[1]
    columnsPred = y_pred.shape[1]
    
    # Obtain the precision, recall, and F1 metrics for each feature in y_pred
        
    if columnsTest == columnsPred:
        for i in range(columnsTest):
            print(metrics.classification_report(y_test2[:,i],y_pred2[:,i]))
            print(i)
    else:
        print("Columns number in y_test and y_pred are different.")
    return

In [12]:
# Display results, reporting the f1 score, precision and recall for each 
# output category of the dataset

display_results(y_test, y_pred)


             precision    recall  f1-score   support

          0       0.00      0.00      0.00      1561
          1       0.75      1.00      0.86      4939
          2       0.00      0.00      0.00        54

avg / total       0.57      0.75      0.65      6554

0
             precision    recall  f1-score   support

          0       0.84      1.00      0.91      5490
          1       0.00      0.00      0.00      1064

avg / total       0.70      0.84      0.76      6554

1
             precision    recall  f1-score   support

          0       0.99      1.00      1.00      6521
          1       0.00      0.00      0.00        33

avg / total       0.99      0.99      0.99      6554

2
             precision    recall  f1-score   support

          0       0.60      1.00      0.75      3911
          1       0.96      0.02      0.03      2643

avg / total       0.74      0.60      0.46      6554

3
             precision    recall  f1-score   support

          0       0.92   

  'precision', 'predicted', average, warn_for)


### 6. Improve your model
Use grid search to find better parameters. 

In [None]:
# The next code defines a pipeline for extracting the tokens, get the tfidf matrix and to create a multiouput classifier for training 
# a multi-levels vector y=[y1,y1,y3,...] and test the hyperparameters with the GridSearchCV() function to find the best ones

mclf = MultiOutputClassifier(RandomForestClassifier())

pipeline = Pipeline(steps=[
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('mclf', mclf) 
    ])


parameters = {
    'vect__ngram_range':[(1,1), (1,2)], # Allow unigrams, bigrams or both.
    'tfidf__norm':('l1', 'l2'), # Test if l1, l2 or None train better
    "mclf__estimator__max_depth": [4], # Tree depth in the forest
    "mclf__estimator__n_estimators": [10, 20], # Number of trees in the forest
}

cv = GridSearchCV(estimator=pipeline, param_grid=parameters,refit=True,verbose=2,n_jobs=-1)

cv.fit(X_train, y_train)

Fitting 3 folds for each of 8 candidates, totalling 24 fits
[CV] mclf__estimator__max_depth=4, mclf__estimator__n_estimators=10, tfidf__norm=l1, vect__ngram_range=(1, 1) 
[CV]  mclf__estimator__max_depth=4, mclf__estimator__n_estimators=10, tfidf__norm=l1, vect__ngram_range=(1, 1), total=  15.7s
[CV] mclf__estimator__max_depth=4, mclf__estimator__n_estimators=10, tfidf__norm=l1, vect__ngram_range=(1, 1) 


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:   25.4s remaining:    0.0s


[CV]  mclf__estimator__max_depth=4, mclf__estimator__n_estimators=10, tfidf__norm=l1, vect__ngram_range=(1, 1), total=  15.8s
[CV] mclf__estimator__max_depth=4, mclf__estimator__n_estimators=10, tfidf__norm=l1, vect__ngram_range=(1, 1) 
[CV]  mclf__estimator__max_depth=4, mclf__estimator__n_estimators=10, tfidf__norm=l1, vect__ngram_range=(1, 1), total=  15.8s
[CV] mclf__estimator__max_depth=4, mclf__estimator__n_estimators=10, tfidf__norm=l1, vect__ngram_range=(1, 2) 
[CV]  mclf__estimator__max_depth=4, mclf__estimator__n_estimators=10, tfidf__norm=l1, vect__ngram_range=(1, 2), total=  18.9s
[CV] mclf__estimator__max_depth=4, mclf__estimator__n_estimators=10, tfidf__norm=l1, vect__ngram_range=(1, 2) 
[CV]  mclf__estimator__max_depth=4, mclf__estimator__n_estimators=10, tfidf__norm=l1, vect__ngram_range=(1, 2), total=  18.9s
[CV] mclf__estimator__max_depth=4, mclf__estimator__n_estimators=10, tfidf__norm=l1, vect__ngram_range=(1, 2) 
[CV]  mclf__estimator__max_depth=4, mclf__estimator_

In [None]:
print("Best parameter (CV score=%0.3f):" % cv.best_score_)
print(cv.best_params_)

# Make the predictions with the gridsearch pipeline

y_pred = cv.predict(X_test)

### 7. Test your model
Show the accuracy, precision, and recall of the tuned model.  

Since this project focuses on code quality, process, and  pipelines, there is no minimum performance metric needed to pass. However, make sure to fine tune your models for accuracy, precision and recall to make your project stand out - especially for your portfolio!

In [None]:
# Display results, reporting the f1 score, precision and recall for each 
# output category of the dataset using the best hyperparameters which resulted from the previous classifier training

display_results(y_test, y_pred)

### 8. Try improving your model further. Here are a few ideas:
* try other machine learning algorithms
* add other features besides the TF-IDF

In [None]:
#print(svd_2000.shape)

#print(X_train.shape,y_train.shape)
#print(X_test.shape,y_test.shape)

In [None]:
def textLenghExtractor(text):
    """
    Function to calculate the lenght of an array of strings
    
    Parameters: text is the array of strings
    Return:     Array of lenghts
    """
    print(len(text))
    
    #mylen = np.empty([len(text)], dtype=int)
    #print("len 1", mylen)
    
    mylen = np.vectorize(len)(text)
    
    print("mylen", mylen.shape)
    
    mylen2 = mylen.reshape(-1,1)
    print("mylen2", mylen2)
    print("mylen2 shape", mylen2.shape)
    
    return mylen2

#arr = np.array(['Hello', 'foo', 'and', 'whatsoever']) 

#print(textLenghExtractor(arr))

#messagesLens = FunctionTransformer(textLenghExtractor)


In [None]:
# The following class defines and extractor for calculating the lenghts of a strings array.

class TextLenghExtractor(BaseEstimator, TransformerMixin):
    """
    Class for defining an extractor for calculating the lenghts of a strings array
    
    """

    def fit(self, X, y=None):
        """
        Function for fitting the transformer

        Parameters: self is the same extractor
                    X is an array of strings
        Return:     The extractor
        """
        return self

    def transform(self, text):
        """
        Function for calculating the lenghts over and array of strings

        Parameters: self is the same extractor
                    X is an 2D array
        Return:     The extractor
        """
        #print(text.shape)
        lens = []
        for x in text:
            lens.append(len(x))
            #print(len(x), lens)
        lengths = np.asarray(lens).reshape(-1,1)
        return lengths

textlen = TextLenghExtractor()
textlen.fit(X_train)

#X = np.array(['Implementing', 'a', 'Custom', 'Transformer', 'from', 'SCIKIT-LEARN'])

lengs = textlen.transform(X_train)

#print(lengs.shape)
#print(len(lengs))

In [None]:
# The following defines a feature union transformer with a pipeline for obtaining the tokens and the tfidf, 
# then to calculate the lenghts of the original X_train, in parallel

fu = FeatureUnion([ 
               ('nlp_pipeline', Pipeline([ ('vect', CountVectorizer(tokenizer=tokenize)),
                            ('tfidf', TfidfTransformer()) ]) ), 
               ('textlen', textlen )
             ]) #End of duplas list for FeatureUnion
fu.fit(X_train)
x = fu.transform(X_train)

#print("feature union= ",x.shape)

In [None]:
# The next code defines a pipeline for extracting the tokens, get the tfidf matrix and to create a multiouput classifier for training 
# a multi-levels vector y=[y1,y1,y3,...] and test the hyperparameters with the GridSearchCV() function to find the best ones
# then, it tests two different classifiers and different hyperparamenters for each of them, and finds the best ones

pipeline3 = Pipeline([
            ('features', FeatureUnion([ 
               ('nlp_pipeline', Pipeline([ ('vect', CountVectorizer(tokenizer=tokenize)),
                            ('tfidf', TfidfTransformer()) ]) ), 
               ('textlen', textlen ),
                ]) #End of duplas list for FeatureUnion
            ),
            ('estimator', MultiOutputClassifier(ExtraTreesClassifier(random_state=0, bootstrap=True, max_depth=3))),
            #('estimator', MultiOutputClassifier(RandomForestClassifier())),
            
            ])

#pipeline3.fit(X_train, y_train)

parameters = [
                {
                'features__nlp_pipeline__vect__ngram_range':[(1,1),(1,2)],          # Allow unigrams, bigrams or both.
                'features__nlp_pipeline__tfidf__norm':['l2'],                 # Test if l1, l2 or None train better
                'estimator':[MultiOutputClassifier(ExtraTreesClassifier(random_state=0, bootstrap=True, max_depth=3))],
                'estimator__estimator__n_estimators': [10],
                },
                {
                'features__nlp_pipeline__vect__ngram_range':[(1,1),(1,2)],          # Allow unigrams, bigrams or both.
                'features__nlp_pipeline__tfidf__norm':['l2'],                 # Test if l1, l2 or None train better
                'estimator':[MultiOutputClassifier(KNeighborsClassifier())],
                'estimator__estimator__n_neighbors': [3],
                }
            ]

        
cv = GridSearchCV(estimator=pipeline3, param_grid=parameters,refit=True,verbose=2,n_jobs=-1)

cv.fit(X_train, y_train)

In [None]:
# Print out the best hyperparameters results
print("Best parameter (CV score=%0.3f):" % cv.best_score_)
print(cv.best_params_)

# Make the prediction for the testing data
y_pred = cv.predict(X_test)

# Display results, reporting the f1 score, precision and recall for each 
# output category of the dataset
display_results(y_test, y_pred)

### 9. Export your model as a pickle file

In [None]:
# Exporting the model to a file

pickle.dump(cv, open('BestModelAndGridSearch.pkl','wb'))

### 10. Use this notebook to complete `train.py`
Use the template file attached in the Resources folder to write a script that runs the steps above to create a database and export a model based on a new dataset specified by the user.