In [1]:
import sys
# import libraries
import numpy as np
import pandas as pd
import nltk
import numpy as np
import pandas as pd
import sqlalchemy
import pickle

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.externals import joblib 
from sklearn.metrics import f1_score, roc_auc_score, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.multioutput import MultiOutputClassifier
from sklearn.svm import LinearSVC
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.compose import ColumnTransformer

numeric_features = ['word_count','title_word_count','noun_count','verb_count','adj_count',
                    'adv_count','pron_count','genre_direct','genre_news','genre_social']


In [2]:
def load_data(database_filepath):
    # load from database
    engine = sqlalchemy.create_engine('sqlite:///{}'.format(database_filepath))
    df = pd.read_sql_table('InsertTableName', engine)
    
    # split input and response variables
    X = df[['message'] + numeric_features]
    #X = df['message']
    Y = df.drop(['id','message','original','genre'] + numeric_features, axis=1)
    category_names = Y.columns
    return X, Y, category_names


In [3]:
def tokenize(text):
    tokens = word_tokenize(text)
    lemmatizer = WordNetLemmatizer()

    # lemmatize and remove stop words
    stop_words = stopwords.words("english")
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    clean_tokens = [tok.lower().strip() for tok in tokens]

    return clean_tokens


In [4]:
def build_model():
    # Scale engineered features separate to Natural Language functions
    # We create the preprocessing pipelines for both numeric and text data.
    
    text_transformer = Pipeline([
                ('vect', CountVectorizer(tokenizer=tokenize, 
                                         ngram_range=(1, 2), 
                                         max_features=5000, 
                                         max_df=0.5,
                                        )),
                ('tfidf', TfidfTransformer(use_idf= True))
            ])
    
    preprocessor = ColumnTransformer(
                    transformers=[
                        ('num', StandardScaler(), numeric_features),
                        ('txt', text_transformer, 'message')
                        ])

    # Append classifier to preprocessing pipeline.
    # Now we have a full prediction pipeline.
    pipeline_model = Pipeline([
        ('preprocessor', preprocessor),
        #('clf', MultiOutputClassifier(LinearSVC(C=1.0, multi_class='crammer_singer', dual=False, random_state=42,max_iter = 10000)))
        ('clf', MultiOutputClassifier(RandomForestClassifier(min_samples_split=8, n_estimators=100, random_state=42)))
        ])

    return pipeline_model


In [9]:
def evaluate_model(model, X_test, Y_test):
    # Note: this function used to have a variable "category_names" per the default
    # I'm not sure it was necessary so I have removed it.

    y_pred = model.predict(X_test)
    
    for index, column in enumerate(Y_test.columns):
        print(column)
        print('f1 score: {}'.format(f1_score(Y_test[column].values, y_pred[:,index], average='weighted')))
        print('AUC score: {}'.format(roc_auc_score(Y_test[column].values, y_pred[:,index], average='weighted')))
        print('Class report: {}'.format(classification_report(Y_test[column].values, y_pred[:,index])))
    pass

In [6]:
def save_model(model, model_filepath):
    joblib.dump(model, model_filepath) 

In [7]:
def main(database_filepath, model_filepath):
    print('Loading data...\n    DATABASE: {}'.format(database_filepath))
    X, Y, category_names = load_data(database_filepath)
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)

    print('Building model...')
    model = build_model()

    print('Training model...')
    #print(X_train.head())
    model.fit(X_train, Y_train)

    print('Evaluating model...')
    evaluate_model(model, X_test, Y_test)
    # Note: used to also include parameter category_names

    print('Saving model...\n    MODEL: {}'.format(model_filepath))
    save_model(model, model_filepath)

    print('Trained model saved!')

def failstmt():
        print('Please provide the filepath of the disaster messages database '\
              'as the first argument and the filepath of the pickle file to '\
              'save the model to as the second argument. \n\nExample: python '\
              'train_classifier.py /data/DisasterResponse.db classifier.pkl')


In [56]:
main('data/DisasterResponse.db', 'models/model.pkl')

Loading data...
    DATABASE: data/DisasterResponse.db
Building model...
Training model...
                                                 message  word_count  \
6618    Good morning! I would like to congratulate you.            8   
23819  KRCS plans to support those who have been affe...          37   
21840  At Mahadampa School, children crowd round the ...          26   
20100  The long-term plan will focus on groundwater e...          29   
23707  A long-running separatist rebellion had thinne...          11   

       title_word_count  noun_count  verb_count  adj_count  adv_count  \
6618                  2           0           0          0          0   
23819                 0           0           0          0          0   
21840                 3           0           0          0          0   
20100                 3           0           0          0          0   
23707                 1           0           0          0          0   

       pron_count  
6618            0

  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)


Evaluating model...


  res = transformer.transform(X)


ValueError: multiclass-multioutput is not supported

In [8]:
database_filepath = 'data/DisasterResponse.db'
model_filepath = 'models/model.pkl'

print('Loading data...\n    DATABASE: {}'.format(database_filepath))
X, Y, category_names = load_data(database_filepath)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)
#print(category_names)

print('Building model...')
model = build_model()

print('Training model...')
#print(X_train.head())
model.fit(X_train, Y_train)

print('Evaluating model...')
evaluate_model(model, X_test, Y_test)
# Note: used to also include parameter category_names

print('Saving model...\n    MODEL: {}'.format(model_filepath))
save_model(model, model_filepath)

print('Trained model saved!')

Loading data...
    DATABASE: data/DisasterResponse.db
Building model...
Training model...


  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)


Evaluating model...


  res = transformer.transform(X)


0.6628122832128838
f1 score: 0.7875380604683766
AUC score: 0.6628122832128838
Class report:               precision    recall  f1-score   support

           0       0.75      0.37      0.49      1299
           1       0.82      0.96      0.88      3945

   micro avg       0.81      0.81      0.81      5244
   macro avg       0.78      0.66      0.69      5244
weighted avg       0.80      0.81      0.79      5244

0.7639821681526541
f1 score: 0.8956037339363213
AUC score: 0.7639821681526541
Class report:               precision    recall  f1-score   support

           0       0.92      0.97      0.94      4364
           1       0.81      0.55      0.66       880

   micro avg       0.90      0.90      0.90      5244
   macro avg       0.86      0.76      0.80      5244
weighted avg       0.90      0.90      0.90      5244

0.5
f1 score: 0.9939971522135932
AUC score: 0.5
Class report:               precision    recall  f1-score   support

           0       1.00      1.00      1.00  

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)



0.7510873131879604
f1 score: 0.94143679434714
AUC score: 0.7510873131879604
Class report:               precision    recall  f1-score   support

           0       0.96      0.99      0.97      4789
           1       0.80      0.51      0.63       455

   micro avg       0.95      0.95      0.95      5244
   macro avg       0.88      0.75      0.80      5244
weighted avg       0.94      0.95      0.94      5244

0.6910494654397094
f1 score: 0.9867049307620095
AUC score: 0.6910494654397094
Class report:               precision    recall  f1-score   support

           0       0.99      1.00      0.99      5166
           1       0.70      0.38      0.50        78

   micro avg       0.99      0.99      0.99      5244
   macro avg       0.84      0.69      0.74      5244
weighted avg       0.99      0.99      0.99      5244

0.5302398128290116
f1 score: 0.9701652164549394
AUC score: 0.5302398128290116
Class report:               precision    recall  f1-score   support

           0    

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)



0.849852092647644
f1 score: 0.8893908220839805
AUC score: 0.849852092647644
Class report:               precision    recall  f1-score   support

           0       0.91      0.94      0.93      3808
           1       0.83      0.76      0.79      1436

   micro avg       0.89      0.89      0.89      5244
   macro avg       0.87      0.85      0.86      5244
weighted avg       0.89      0.89      0.89      5244

0.7359619398328885
f1 score: 0.9447324659236892
AUC score: 0.7359619398328885
Class report:               precision    recall  f1-score   support

           0       0.95      1.00      0.97      4801
           1       0.91      0.48      0.63       443

   micro avg       0.95      0.95      0.95      5244
   macro avg       0.93      0.74      0.80      5244
weighted avg       0.95      0.95      0.94      5244

0.8206134231837259
f1 score: 0.9456789243727683
AUC score: 0.8206134231837259
Class report:               precision    recall  f1-score   support

           0    

In [59]:
y_pred = model.predict(X_test)

  res = transformer.transform(X)


In [72]:
y_pred.shape

(5236, 10)

In [74]:
Y_test.shape

(5236, 10)

In [75]:
for index, column in enumerate(Y_test.columns):
    print(column)
    print(roc_auc_score(Y_test[column].values, y_pred[:,index]))

genre_direct
0.9429103662331009
genre_news
0.9531169295040262
genre_social
0.953631261691956
word_count


ValueError: multiclass format is not supported

In [76]:
Y_test.columns

Index(['genre_direct', 'genre_news', 'genre_social', 'word_count',
       'title_word_count', 'noun_count', 'verb_count', 'adj_count',
       'adv_count', 'pron_count'],
      dtype='object')