In [1]:
# import libraries
from sqlalchemy import create_engine
import pandas as pd
import numpy as np
import pickle
import re

import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

from sklearn.metrics import precision_score, recall_score, f1_score, make_scorer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.multioutput import MultiOutputClassifier
from joblib import parallel_backend

from imblearn.ensemble import BalancedRandomForestClassifier

import warnings
warnings.simplefilter('ignore')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\yvesd\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\yvesd\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\yvesd\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Read the cleaned messages table from the SQLite database.
engine = create_engine('sqlite:///data/DisasterResponse.db', pool_pre_ping=True)
df = pd.read_sql_table(table_name='CleanData', con=engine)

# Features: the message text. Targets: every column from position 4 onwards.
X = df['message']
Y = df[df.columns[4:]]

In [3]:
# Remove the 'child_alone' target column. Rebinding Y (instead of dropping
# in place) avoids mutating a frame that was derived from `df`, which is what
# triggers pandas' SettingWithCopyWarning.
Y = Y.drop(columns='child_alone')

In [4]:
def tokenize(text):
    """
    Normalize and tokenize message strings.

    The text is lower-cased, every non-word character is replaced by a space,
    the result is split into word tokens, English stop words are removed and
    the remaining tokens are reduced to their Porter stems.

    Args:
    text: String - message text to process
    Returns:
    clean_tokens: list of strings - list of tokens from the message
    """
    # normalize case and remove punctuation (raw string: '\W' is a regex class,
    # not a Python string escape)
    text = re.sub(r'\W', ' ', text.lower())

    tokens = word_tokenize(text)

    # A set gives constant-time membership tests; the stemmer is created once
    # per call instead of once per token.
    stop_words = set(stopwords.words("english"))
    stemmer = PorterStemmer()

    # Reduce words to their stems
    clean_tokens = [stemmer.stem(tok).strip() for tok in tokens if tok not in stop_words]

    return clean_tokens

In [5]:
def report_results(Y_test, Y_pred):
    """Report precision, recall and f1_score for the Machine Learning Model.

    Scores are macro-averaged and computed separately for every column of
    ``Y_test``; a final row labelled ``'median_values'`` holds the median of
    each score over all categories.

    Args:
    Y_test: DataFrame - true labels, one column per category
    Y_pred: 2-D array - predicted labels, indexed as ``Y_pred[:, i]`` with
        columns in the same order as ``Y_test.columns``
    Returns:
    results: DataFrame with columns 'category', 'precision', 'recall',
        'f1-score' - one row per category followed by the median row
    """
    columns = ['category', 'precision', 'recall', 'f1-score']

    # Collect one record per category and build the frame once;
    # DataFrame.append was removed in pandas 2.0 and was quadratic anyway.
    rows = []
    for i, category in enumerate(Y_test.columns):
        y_true = Y_test.iloc[:, i].values
        y_pred = Y_pred[:, i]

        rows.append({
            'category': category,
            'precision': precision_score(y_true, y_pred, zero_division=0, average='macro'),
            'recall': recall_score(y_true, y_pred, zero_division=0, average='macro'),
            'f1-score': f1_score(y_true, y_pred, zero_division=0, average='macro'),
        })

    results = pd.DataFrame(rows, columns=columns)

    # Summary row: median of each metric over the per-category rows.
    median_values = {'category': 'median_values',
                     'precision': results['precision'].median(),
                     'recall': results['recall'].median(),
                     'f1-score': results['f1-score'].median()}
    results = pd.concat([results, pd.DataFrame([median_values], columns=columns)],
                        ignore_index=True)

    return results

In [6]:
# Fix the shuffle seed so the train/test partition (and therefore the reported
# metrics) is the same on every Restart & Run All.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=42)

In [7]:
# Token counts -> TF-IDF weighting -> multi-output wrapper around a balanced
# random forest (all CPU cores via n_jobs=-1).
balanced_pipeline = Pipeline(steps=[
    ('vect', CountVectorizer(tokenizer=tokenize)),
    ('tfidf', TfidfTransformer()),
    ('clf', MultiOutputClassifier(estimator=BalancedRandomForestClassifier(n_jobs=-1))),
])

In [8]:
# Hyper-parameters for the vectorizer and for the forest wrapped by the
# multi-output classifier; the call is the cell's last expression, so the
# configured pipeline is displayed.
balanced_pipeline.set_params(**{
    'vect__ngram_range': (1, 1),
    'clf__estimator__min_samples_leaf': 5,
    'clf__estimator__class_weight': 'balanced',
    'clf__estimator__n_estimators': 200,
})

Pipeline(steps=[('vect',
                 CountVectorizer(tokenizer=<function tokenize at 0x0000020ECDE1B9D0>)),
                ('tfidf', TfidfTransformer()),
                ('clf',
                 MultiOutputClassifier(estimator=BalancedRandomForestClassifier(class_weight='balanced',
                                                                                min_samples_leaf=5,
                                                                                n_estimators=200,
                                                                                n_jobs=-1)))])

In [9]:
# Train on the training split, then predict labels for the held-out messages
# (Pipeline.fit returns the fitted pipeline, so the calls can be chained).
Y_pred = balanced_pipeline.fit(X_train, Y_train).predict(X_test)

In [11]:
# Store the evaluation table in the database, replacing any previous run.
print('Writing results of BalancedPipeline to DB in table "BalancedPipeline".')
report_results(Y_test, Y_pred).to_sql('BalancedPipeline', engine, index=False, if_exists='replace')

# Serialize the fitted pipeline; the context manager guarantees the file is
# flushed and closed even if pickling fails.
print('Saving model in pickle files.')
with open('balanced_model.pkl', 'wb') as model_file:
    pickle.dump(balanced_pipeline, model_file)

Writing results of BalancedPipeline to DB in table "BalancedPipeline".
Saving model in pickle files.
