In [29]:
import sys
# import libraries
import pandas as pd
from sqlalchemy import create_engine
import nltk
import re
nltk.download(['punkt', 'wordnet','stopwords'])
import joblib
 
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report


def load_data(database_filepath):
    """Read the ``Message`` table of a SQLite file and split it into features and labels.

    Args:
        database_filepath: Path of the SQLite database file to read.

    Returns:
        tuple: ``(X, y, category_names)`` where ``X`` is the ``message``
        column, ``y`` is the table without the ``id``, ``message``,
        ``original`` and ``genre`` columns, and ``category_names`` is the
        list of ``y``'s column names.
    """
    connection_url = 'sqlite:///{}'.format(database_filepath)
    messages = pd.read_sql("SELECT * FROM Message", create_engine(connection_url))

    # Everything that is not one of these columns is treated as a label.
    non_label_columns = ['id', 'message', 'original', 'genre']
    labels = messages.drop(columns=non_label_columns)

    return messages['message'], labels, list(labels.columns)


def tokenize(text):
    """Turn a raw message into a list of normalised word tokens.

    Every URL in ``text`` is replaced by the literal token
    ``urlplaceholder``; the result is split with NLTK's ``word_tokenize`` and
    each token is lemmatised with ``WordNetLemmatizer``, lower-cased and
    stripped of surrounding whitespace.

    Args:
        text: The message to tokenise.

    Returns:
        list of str: The cleaned tokens, in order of appearance.
    """
    # Raw string: as a plain literal, "\(" and "\)" are invalid string escapes
    # that Python warns about. The compiled pattern itself is unchanged.
    url_regex = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'

    # Substitute each match where it occurs. Replacing matched strings one by
    # one with str.replace() mangles a URL that extends another matched URL
    # ("http://a.co" and "http://a.co/x" would leave "urlplaceholder/x").
    text = re.sub(url_regex, "urlplaceholder", text)

    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(tok).lower().strip() for tok in word_tokenize(text)]
    



def build_model():
    """Assemble the text-classification pipeline wrapped in a grid search.

    The pipeline chains a ``CountVectorizer`` (using ``tokenize``), a
    ``TfidfTransformer`` and a ``MultiOutputClassifier`` around a
    ``RandomForestClassifier``. The grid varies the vectorizer's n-gram range
    and the forest's ``min_samples_split``.

    Returns:
        GridSearchCV: The unfitted search object.
    """
    vectorizer = CountVectorizer(tokenizer=tokenize)
    classifier = MultiOutputClassifier(RandomForestClassifier(n_jobs=10))

    steps = [
        ('vect', vectorizer),
        ('tfidf', TfidfTransformer()),
        ('clf', classifier),
    ]

    # Keys follow the "<step>__<param>" convention of the pipeline above.
    search_space = {
        'vect__ngram_range': ((1, 1), (1, 2)),
        'clf__estimator__min_samples_split': [2, 3, 4],
    }

    return GridSearchCV(Pipeline(steps), param_grid=search_space)

def evaluate_model(model, X_test, Y_test, category_names):
    """Print a classification report for every category of the test set.

    Args:
        model: Fitted search object exposing ``best_estimator_``.
        X_test: Test messages passed to ``predict``.
        Y_test: DataFrame of true labels; its columns name the prediction
            columns.
        category_names: Names of the label columns to report on.

    Returns:
        None. The reports are written to stdout.
    """
    y_pred = model.best_estimator_.predict(X_test)
    y_pred_df = pd.DataFrame(y_pred, columns=Y_test.columns)

    for col in category_names:
        # Head each report with its category: the report text alone does not
        # say which label column it describes.
        print('Category: {}'.format(col))
        print(classification_report(Y_test[col], y_pred_df[col]))


def save_model(model, model_filepath):
    """Persist the best pipeline found by the grid search.

    Args:
        model: Fitted search object exposing ``best_estimator_``.
        model_filepath: Destination path handed to ``joblib.dump``.
    """
    # NOTE: the pipeline holds a reference to the module-level ``tokenize``
    # function, and pickle stores functions by name. If ``tokenize`` is
    # re-defined after fitting, dumping fails with a PicklingError.
    best_pipeline = model.best_estimator_
    joblib.dump(best_pipeline, model_filepath)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\bruno\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\bruno\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\bruno\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# Notebook counterpart of models/train_classifier.py.
# Both paths are relative, so they are resolved against the current working
# directory at the time they are used.
database_filepath ='data/DisasterResponse.db'
model_filepath ='models/classifier.pkl'

In [5]:
import os

# Step up to the parent directory only when the database is not reachable from
# the current one. An unconditional chdir('..') climbs one more level every
# time this cell is re-run, which breaks the relative data/model paths.
if not os.path.exists(database_filepath):
    os.chdir('..')

In [6]:
print('Loading data...\n    DATABASE: {}'.format(database_filepath))
X, Y, category_names = load_data(database_filepath)
# Fixed seed so that a fresh "Restart & Run All" reproduces the same train/test
# partition; without it every run splits differently and scores are not
# comparable between runs.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

Loading data...
    DATABASE: data/DisasterResponse.db


In [7]:
# Peek at the first few training messages to sanity-check the split.
X_train.head()

16873    Vaccination coverage in Afghanistan has been l...
18123    IOM Banda Aceh's truck fleet currently compris...
164      Hello, we live in La Plaine, close to Route Na...
8549     God bless you, I don't understand what you mean. 
19733    Instead, he said Kashmiris should be allowed t...
Name: message, dtype: object

In [8]:

# Create the (still unfitted) grid-search object returned by build_model().
print('Building model...')
model = build_model()


Building model...


In [9]:
# Run the grid search on the training split: every parameter combination
# defined in build_model() is fitted, so this is presumably the slowest cell.
print('Training model...')
model.fit(X_train, Y_train)
        

Training model...


GridSearchCV(estimator=Pipeline(steps=[('vect',
                                        CountVectorizer(tokenizer=<function tokenize at 0x0000021CEF660828>)),
                                       ('tfidf', TfidfTransformer()),
                                       ('clf',
                                        MultiOutputClassifier(estimator=RandomForestClassifier(n_jobs=10)))]),
             param_grid={'clf__estimator__min_samples_split': [2, 3, 4],
                         'vect__ngram_range': ((1, 1), (1, 2))})

In [12]:
# Parameter combination selected by the grid search.
model.best_params_

{'clf__estimator__min_samples_split': 2, 'vect__ngram_range': (1, 2)}

In [18]:
# Predict the test-set labels with the best pipeline found by the search.
y_pred = model.best_estimator_.predict(X_test)

In [19]:
# Inspect the raw prediction matrix (a 2-D integer array).
y_pred

array([[1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       ...,
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [None]:
# Print one classification report per category for the held-out test split.
print('Evaluating model...')
evaluate_model(model, X_test, Y_test, category_names)

In [32]:
# Confirm the destination path before saving the model.
model_filepath

'models/classifier.pkl'

In [35]:
# NOTE(review): in the recorded run this raised PicklingError because the
# fitted pipeline still references the `tokenize` object that existed when
# fit() ran, while the cell defining `tokenize` was executed again afterwards.
# Re-fit after redefining `tokenize` (or keep it in an importable module)
# before dumping.
joblib.dump(model.best_estimator_, model_filepath)

PicklingError: Can't pickle <function tokenize at 0x0000021CEF660828>: it's not the same object as __main__.tokenize

In [34]:
import pickle

# The context manager closes the file handle even when dump() raises (as it
# did in the recorded run); a bare open() call left the handle open.
with open(model_filepath, 'wb') as model_file:
    pickle.dump(model.best_estimator_, model_file)

PicklingError: Can't pickle <function tokenize at 0x0000021CEF660828>: it's not the same object as __main__.tokenize

In [30]:
# Persist the best pipeline through save_model(); in the recorded run this
# failed with the same PicklingError as the direct joblib/pickle attempts.
print('Saving model...\n    MODEL: {}'.format(model_filepath))
save_model(model, model_filepath)

Saving model...
    MODEL: models/classifier.pkl


PicklingError: Can't pickle <function tokenize at 0x0000021CEF660828>: it's not the same object as __main__.tokenize

In [None]:
        print('Saving model...\n    MODEL: {}'.format(model_filepath))
        save_model(model, model_filepath)

        print('Trained model saved!')

    else:
        print('Please provide the filepath of the disaster messages database '\
              'as the first argument and the filepath of the pickle file to '\
              'save the model to as the second argument. \n\nExample: python '\
              'train_classifier.py ../data/DisasterResponse.db classifier.pkl')


if __name__ == '__main__':
    main()