In [None]:
nltk.download(['punkt', 'wordnet', 'averaged_perceptron_tagger', 'stopwords'])

In [54]:
import sys
# import libraries
import numpy as np
import pandas as pd
import nltk
import numpy as np
import pandas as pd
from sqlalchemy import create_engine
import pickle

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.externals import joblib 
from sklearn.metrics import f1_score, classification_report, make_scorer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.multioutput import MultiOutputClassifier
from sklearn.svm import LinearSVC
from sklearn.preprocessing import StandardScaler

class BasicTextAnalytics(BaseEstimator, TransformerMixin):
    '''
    Class for returning some basic numerical data for text analysis to include in 
    modelling. Such as: 
    - Number of sentences
    - Number of words
    - Number of nouns
    - Number of verbs
    - Number of adjectives
    A lot of the above were taken from ideas found here: 
    https://www.analyticsvidhya.com/blog/2018/04/a-comprehensive-guide-to-understand-and-implement-text-classification-in-python/
    '''
    pos_family = {
    'noun' : ['NN','NNS','NNP','NNPS'],
    'pron' : ['PRP','PRP$','WP','WP$'],
    'verb' : ['VB','VBD','VBG','VBN','VBP','VBZ'],
    'adj' :  ['JJ','JJR','JJS'],
    'adv' : ['RB','RBR','RBS','WRB']
    }

    # function to check and get the part of speech tag count of a words in a given sentence
    def check_pos_tag(self, text, flag):
        '''
        Returns the count of a given NL pos_tag, based on user selection. E.g. number of nouns.
        INPUTS
        text - the given text to analyse
        flag - pos family to analyse, one of 'noun', 'pron' , 'verb', 'adj' or 'adv'
        '''
        count = 0
        try:
            wiki = textblob.TextBlob(text)
            for tup in wiki.tags:
                ppo = list(tup)[1]
                if ppo in pos_family[flag]:
                    count += 1
        except:
            pass
        return count
    
    def fit(self, x, y=None):
        return self

    def transform(self, X):
        trainDF = pd.DataFrame()
        trainDF['text'] = X
        trainDF['word_count'] = trainDF['text'].apply(lambda x: len(x.split()))
        trainDF['title_word_count'] = trainDF['text'].apply(lambda x: len([wrd for wrd in x.split() if wrd.istitle()]))
        trainDF['noun_count'] = trainDF['text'].apply(lambda x: self.check_pos_tag(x, 'noun'))
        trainDF['verb_count'] = trainDF['text'].apply(lambda x: self.check_pos_tag(x, 'verb'))
        trainDF['adj_count'] = trainDF['text'].apply(lambda x: self.check_pos_tag(x, 'adj'))
        trainDF['adv_count'] = trainDF['text'].apply(lambda x: self.check_pos_tag(x, 'adv'))
        trainDF['pron_count'] = trainDF['text'].apply(lambda x: self.check_pos_tag(x, 'pron'))
        
        return trainDF.drop('text',axis=1)

def load_data(database_filepath):
    '''Imports the "InsertTableName" table from a specified database file
    Returns X and Y datasets as pandas DataFrames as well as a list of column names
    (column names not currently in use but may be useful for later analyses of 
    feature importance, etc.)'''
    #  load from database
    engine = create_engine('sqlite:///{}'.format(database_filepath))
    df = pd.read_sql_table('InsertTableName', engine)

    # split input and response variables
    category_names = set(df.columns) - set({'id',
                                            'message',
                                            'original',
                                            'related',
                                            'genre',
                                            'genre_direct',
                                            'genre_news',
                                            'genre_social'})
    category_names = list(category_names)
    print(category_names)
    X = df['message']
    Y = df[category_names]
    return X, Y, category_names

def tokenize(text):
    '''tokenizing function which splits given text into words, removing stop words and spaces 
    as well as outputting in lower case'''
    tokens = word_tokenize(text)
    lemmatizer = WordNetLemmatizer()

    # lemmatize and remove stop words
    stop_words = stopwords.words("english")
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    clean_tokens = [tok.lower().strip() for tok in tokens]

    return clean_tokens


def build_model():
    ''' Build and scale engineered features separate to Natural Language transformations
    Creates preprocessing pipelines for both numeric and text data, then completes a quick
    Grid search over RandomForestClassifier key parameters.
    Note: gridsearch has not been applied to text transformation hyperparameters based on previous
    searches showing minimal impacts in tuning these. 
    '''
    pipeline_model = Pipeline([
        ('features', FeatureUnion([
            ('text_pipeline', Pipeline([
                ('vect', CountVectorizer(tokenizer=tokenize, 
                                        ngram_range=(1, 2),
                                        max_features=5000,
                                        max_df=0.5)),
                ('tfidf', TfidfTransformer())
            ])),

            ('numerical_pipeline', Pipeline([
                ('analytics', BasicTextAnalytics()),
                ('norm', StandardScaler())
                ]))
        ])),

        ('clf', MultiOutputClassifier(RandomForestClassifier(n_estimators=100, random_state=42, max_depth=100)))
    ])
    # specify parameters for grid search
    parameters = {
        'clf__estimator__min_samples_split' : [8,16],
    }

    # create grid search object
    # using f1 score rather than auc because of the significant imbalance in class distributions
    # per https://machinelearningmastery.com/roc-curves-and-precision-recall-curves-for-classification-in-python/
    scorer = make_scorer(f1_score, average = 'weighted')
    cv = GridSearchCV(pipeline_model, parameters, scoring = scorer, verbose=3)

    return cv

def evaluate_model(model, X_test, Y_test):
    '''
    Evalutes the model using sklearns 'classification report' function to return precision and recall for each class
    Inputs: model - model or pipeline contructed through sklearn (or other ML tool with a 
                        "predict" function avaialable for the object type)
            X_test - test set data [pandas dataframe/series]
            Y_test - correct categories for the test data [pandas dataframe/series]
    '''
    y_pred = model.predict(X_test)
    
    for index, feature in enumerate(Y_test.columns):
        print(feature)
        print(classification_report(Y_test[feature], y_pred[:, index]))
        print('f1 score: {}'.format(f1_score(Y_test[feature], y_pred[:, index], average='weighted')))
    
    pass


def save_model(model, model_filepath):
    '''
    Saves the model trained within the "main" function.
    '''
    pickle.dump(model, open(model_filepath, 'wb'))


In [55]:
database_filepath = 'data/DisasterResponse.db'
model_filepath = 'models/clasisfier3.pkl'
print('Loading data...\n    DATABASE: {}'.format(database_filepath))
X, Y, category_names = load_data(database_filepath)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.4)

Loading data...
    DATABASE: data/DisasterResponse.db
['buildings', 'other_weather', 'search_and_rescue', 'floods', 'clothing', 'hospitals', 'water', 'food', 'shops', 'medical_help', 'aid_centers', 'storm', 'direct_report', 'offer', 'military', 'electricity', 'aid_related', 'tools', 'cold', 'missing_people', 'earthquake', 'money', 'shelter', 'fire', 'infrastructure_related', 'other_infrastructure', 'security', 'request', 'weather_related', 'medical_products', 'refugees', 'other_aid', 'death', 'transport']


In [56]:
print('Building model...')
model = build_model()

Building model...


In [57]:
print('Training model...')
model.fit(X_train, Y_train)

print('Evaluating model...')
evaluate_model(model, X_test, Y_test)
# Note: used to also include parameter category_names

print('Saving model...\n    MODEL: {}'.format(model_filepath))
save_model(model, model_filepath)

print('Trained model saved!')


Training model...
Fitting 3 folds for each of 2 candidates, totalling 6 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] clf__estimator__min_samples_split=8 .............................


  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  'precision', 'predicted', average, warn_for)
  Xt = transform.transform(Xt)


[CV]  clf__estimator__min_samples_split=8, score=0.4961378476499129, total= 2.6min
[CV] clf__estimator__min_samples_split=8 .............................

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  3.1min remaining:    0.0s





  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  'precision', 'predicted', average, warn_for)
  Xt = transform.transform(Xt)
  'precision', 'predicted', average, warn_for)
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  5.7min remaining:    0.0s


[CV]  clf__estimator__min_samples_split=8, score=0.4929992408288853, total= 2.0min
[CV] clf__estimator__min_samples_split=8 .............................

  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  'precision', 'predicted', average, warn_for)
  Xt = transform.transform(Xt)



[CV]  clf__estimator__min_samples_split=8, score=0.49172140606020626, total= 2.0min
[CV] clf__estimator__min_samples_split=16 ............................


  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  'precision', 'predicted', average, warn_for)
  Xt = transform.transform(Xt)
  'precision', 'predicted', average, warn_for)


[CV]  clf__estimator__min_samples_split=16, score=0.4960727136651752, total= 1.9min
[CV] clf__estimator__min_samples_split=16 ............................


  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  'precision', 'predicted', average, warn_for)
  Xt = transform.transform(Xt)
  'precision', 'predicted', average, warn_for)


[CV]  clf__estimator__min_samples_split=16, score=0.4940717577506172, total= 1.9min
[CV] clf__estimator__min_samples_split=16 ............................


  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)
  'precision', 'predicted', average, warn_for)
  Xt = transform.transform(Xt)
  'precision', 'predicted', average, warn_for)
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed: 15.1min finished


[CV]  clf__estimator__min_samples_split=16, score=0.4883931812596443, total= 1.8min


  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)


Evaluating model...


  Xt = transform.transform(Xt)


buildings
              precision    recall  f1-score   support

           0       0.95      1.00      0.98      9924
           1       0.83      0.15      0.26       563

   micro avg       0.95      0.95      0.95     10487
   macro avg       0.89      0.57      0.62     10487
weighted avg       0.95      0.95      0.94     10487

f1 score: 0.9369736709983834
other_weather
              precision    recall  f1-score   support

           0       0.95      1.00      0.97      9911
           1       0.51      0.03      0.07       576

   micro avg       0.95      0.95      0.95     10487
   macro avg       0.73      0.52      0.52     10487
weighted avg       0.92      0.95      0.92     10487

f1 score: 0.9219554372267377
search_and_rescue
              precision    recall  f1-score   support

           0       0.98      1.00      0.99     10196
           1       0.65      0.10      0.18       291

   micro avg       0.97      0.97      0.97     10487
   macro avg       0.81     

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)



f1 score: 0.9855768435912041
water
              precision    recall  f1-score   support

           0       0.97      0.99      0.98      9853
           1       0.80      0.59      0.68       634

   micro avg       0.97      0.97      0.97     10487
   macro avg       0.89      0.79      0.83     10487
weighted avg       0.96      0.97      0.96     10487

f1 score: 0.9638424841098511
food
              precision    recall  f1-score   support

           0       0.97      0.98      0.97      9344
           1       0.81      0.71      0.76      1143

   micro avg       0.95      0.95      0.95     10487
   macro avg       0.89      0.85      0.87     10487
weighted avg       0.95      0.95      0.95     10487

f1 score: 0.949526828907321
shops
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     10442
           1       0.00      0.00      0.00        45

   micro avg       1.00      1.00      1.00     10487
   macro avg       0.50 

f1 score: 0.9508204171281999
refugees
              precision    recall  f1-score   support

           0       0.97      1.00      0.98     10150
           1       0.68      0.08      0.14       337

   micro avg       0.97      0.97      0.97     10487
   macro avg       0.82      0.54      0.56     10487
weighted avg       0.96      0.97      0.96     10487

f1 score: 0.9572899130827973
other_aid
              precision    recall  f1-score   support

           0       0.87      1.00      0.93      9120
           1       0.60      0.04      0.07      1367

   micro avg       0.87      0.87      0.87     10487
   macro avg       0.73      0.52      0.50     10487
weighted avg       0.84      0.87      0.82     10487

f1 score: 0.8184516987751452
death
              precision    recall  f1-score   support

           0       0.97      1.00      0.98     10021
           1       0.80      0.38      0.51       466

   micro avg       0.97      0.97      0.97     10487
   macro avg    

In [53]:
# load model
pickle.dump(model2, open(model_filepath,'wb'))

PicklingError: Can't pickle <function tokenize at 0x0000018ED7D4B268>: it's not the same object as __main__.tokenize