In [None]:
!pip install nltk
!pip install plotly

In [None]:
import nltk
# NLTK data packages fetched for the tokenizing, lemmatizing, POS-tagging
# and stop-word filtering done further down
nltk_resources = ['punkt', 'wordnet', 'averaged_perceptron_tagger', 'stopwords']
nltk.download(nltk_resources)

In [None]:
import sys
import os
import re
from sqlalchemy import create_engine
import pickle
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', 500)

from nltk.corpus import stopwords

from sklearn.metrics import confusion_matrix
from sklearn.metrics import fbeta_score, make_scorer
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.multioutput import MultiOutputClassifier

In [None]:
# Read the messages table out of the SQLite database.
# The table is named after the database file: "<file name without .db>_table"
database_filepath = "../data/disaster_response_db.db"
db_name = os.path.basename(database_filepath).replace(".db", "")
table_name = db_name + "_table"
engine = create_engine('sqlite:///' + database_filepath)
df = pd.read_sql_table(table_name, engine)

In [None]:
# Load the CSV export that sits next to the database file, instead of a
# user-specific absolute path that only exists on one machine.
# NOTE(review): this replaces the frame read from the database in the previous
# cell -- confirm which of the two sources is the intended one.
csv_filepath = os.path.join(os.path.dirname(database_filepath), 'disaster_response_table.csv')
df = pd.read_csv(csv_filepath)

In [None]:
# Preview the first rows of the loaded frame
df.head()

In [None]:
# Summary statistics of the numeric columns
df.describe()

In [None]:
# Remove child alone column as it has all zeros only.
# errors='ignore' keeps this cell re-runnable: without it a second run raises
# KeyError because the column is already gone.
df = df.drop(columns=['child_alone'], errors='ignore')

In [None]:
# Number of rows for each distinct value of the 'related' column
df.groupby("related").size()

In [None]:
# Look at a few of the rows whose 'related' value is 2
df[df['related']==2].head()

In [None]:
# The messages with related == 2 were reviewed (count and content) and are
# considered valid messages, so the value 2 is folded into 1
df['related'] = df['related'].replace({2: 1})
df.groupby("related").size()

In [None]:
# Separate the modelling inputs from the targets:
#   y -> every column from position 4 onwards
#        (presumably the category indicator columns -- TODO confirm column layout)
#   X -> the raw message text
y = df.iloc[:, 4:]
X = df['message']

In [None]:
# Show the English stop-word list that tokenize() filters against
print(stopwords.words("english"))

In [None]:
# Build a function to normalize, tokenize and lemmatize the text data
def tokenize(text, url_place_holder_string="urlplaceholder"):
    """
    Normalize, tokenize and lemmatize a message

    Arguments:
        text -> Message to be tokenized and lemmatized
        url_place_holder_string -> Text substituted for every URL found in the message
    Output:
        cleaned_tokens -> List of lower-cased, lemmatized tokens with the
                          English stop words removed
    """

    # Raw string, so the escaped parentheses are not read as (invalid) string escapes
    url_regex = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    # Replace every url with the placeholder in a single pass; replacing the
    # matches one by one with str.replace mangles a url that starts with another url
    text = re.sub(url_regex, url_place_holder_string, text)

    # Remove punctuation (everything that is not a letter or a digit)
    text = re.sub(r'[^a-zA-Z0-9]', " ", text)

    # Extract the word tokens from the input text
    tokens = nltk.word_tokenize(text)

    # Load the stop words once per call instead of once per token
    stop_words = set(stopwords.words("english"))

    # Lemmatizer to map the words back to its root
    lemmatizer = nltk.WordNetLemmatizer()

    cleaned_tokens = []
    for token in tokens:
        # Lower-case first: the stop-word test and the lemmatizer both work on
        # lower-case words, so "The" or "Houses" would otherwise slip through untouched
        word = token.lower()
        if word in stop_words:
            continue
        cleaned_tokens.append(lemmatizer.lemmatize(word).strip())
    return cleaned_tokens


In [None]:
# Build a custom transformer which extract the starting verb of a sentence
class StartingVerbExtractor(BaseEstimator, TransformerMixin):
    """
    Starting verb extractor

    Produces one boolean feature per message: True when at least one sentence
    of the message starts with a verb or with the retweet marker "RT".
    "Starts" refers to the first token that survives tokenize(), i.e. after
    stop words and punctuation have been removed.

    The transformer is stateless; fit() does not learn anything.
    """
    def starting_verb(self, text):
        """
        Tell whether any sentence of the message starts with a verb

        Arguments:
            text -> Message to inspect
        Output:
            bool -> True if the first token of some sentence is tagged as a
                    verb or is the retweet marker, False otherwise
        """
        verb_tags = ('VB', 'VBP', 'VBD', 'VBG', 'VBZ', 'VBN')
        for sentence in nltk.sent_tokenize(text):
            pos_tags = nltk.pos_tag(tokenize(sentence))
            # tokenize() can return nothing (sentence made only of stop words or
            # punctuation); indexing [0] would then raise IndexError
            if not pos_tags:
                continue
            first_word, first_tag = pos_tags[0]
            # tokenize() lower-cases its tokens, so a comparison against the
            # upper-case 'RT' could never match
            if first_tag in verb_tags or first_word.lower() == 'rt':
                return True
        return False

    def fit(self, x, y=None):
        """Nothing to learn; returns the transformer itself."""
        return self

    def transform(self, X):
        """
        Compute the starting-verb flag for every message

        Arguments:
            X -> Iterable of messages
        Output:
            DataFrame -> Single column holding the boolean flag of each message
        """
        X_tagged = pd.Series(X).apply(self.starting_verb)
        return pd.DataFrame(X_tagged)


In [None]:
# Build a machine learning pipeline

# Fixed seed so that repeated runs fit the same ensembles
RANDOM_STATE = 42


def build_pipeline(classifier):
    """
    Build the message-classification pipeline around a base classifier

    Arguments:
        classifier -> Unfitted scikit-learn classifier; it is wrapped in a
                      MultiOutputClassifier
    Output:
        Pipeline -> Unfitted pipeline whose 'features' step joins the
                    CountVectorizer + TfidfTransformer text features with the
                    StartingVerbExtractor flag, followed by the 'clf' step
    """
    return Pipeline([
        ('features', FeatureUnion([

            ('text_pipeline', Pipeline([
                ('vect', CountVectorizer(tokenizer=tokenize)),
                ('tfidf', TfidfTransformer())
            ])),

            ('starting_verb', StartingVerbExtractor())
        ])),

        ('clf', MultiOutputClassifier(classifier))
    ])


# Same feature extraction for both candidates; only the base classifier differs
pipeline_1 = build_pipeline(RandomForestClassifier(random_state=RANDOM_STATE))
pipeline_2 = build_pipeline(AdaBoostClassifier(random_state=RANDOM_STATE))

parameters_1 = {
    'clf__estimator__min_samples_split': [2, 4]
    # Further candidates, currently not searched:
    #'features__text_pipeline__vect__ngram_range': ((1, 1), (1, 2)),
    #'clf__estimator__n_estimators': [50, 100, 200],
    #'clf__estimator__min_samples_split': [2, 3, 4]
}

parameters_2 = {
    'clf__estimator__learning_rate': [0.01, 0.02, 0.05],
    'clf__estimator__n_estimators': [10, 20, 40]
}

# Both searches score with the micro-averaged F1 and use all available cores
grid_1 = GridSearchCV(pipeline_1, param_grid=parameters_1, scoring='f1_micro', n_jobs=-1)
grid_2 = GridSearchCV(pipeline_2, param_grid=parameters_2, scoring='f1_micro', n_jobs=-1)


In [None]:
# Names of the parameters that can be tuned through the grid search.
# A cell only displays its last bare expression, so the first listing was
# silently discarded; print both explicitly.
print(list(pipeline_1.get_params().keys()))
print(list(pipeline_2.get_params().keys()))

In [None]:
# Fixed random_state so that the split (and therefore every score reported
# below) is the same on each run
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
# Run the grid search for the random-forest pipeline on the training split
grid_1.fit(X_train, y_train)
#grid_2.fit(X_train, y_train)

In [None]:
# grid_scores_ was removed from GridSearchCV in scikit-learn 0.20;
# cv_results_ holds the per-candidate cross-validation scores
pd.DataFrame(grid_1.cv_results_)[['params', 'mean_test_score', 'std_test_score', 'rank_test_score']]
#pd.DataFrame(grid_2.cv_results_)[['params', 'mean_test_score', 'std_test_score', 'rank_test_score']]

In [None]:
# Find the best parameters based on the grid search
print(grid_1.best_params_)
#print(grid_2.best_params_)

In [None]:
# Keep the best pipeline found by the grid search as the final model
optimised_model_1 = grid_1.best_estimator_
#optimised_model_2 = grid_2.best_estimator_

In [None]:
# Predict the labels of the held-out test messages with the selected model
#y_prediction_train_1 = optimised_model_1.predict(X_train)
y_prediction_test_1 = optimised_model_1.predict(X_test)
#y_prediction_train_2 = optimised_model_2.predict(X_train)
#y_prediction_test_2 = optimised_model_2.predict(X_test)

In [None]:
# Classification report on the test split, one row per target column of y
print(classification_report(y_test.values, y_prediction_test_1, target_names=y.columns.values))
#print(classification_report(y_train.values, y_prediction_train_1, target_names=y.columns.values))
#print(classification_report(y_test.values, y_prediction_test_2, target_names=y.columns.values))
#print(classification_report(y_train.values, y_prediction_train_2, target_names=y.columns.values))

In [None]:
# Serialize the selected model to a pickle file in the working directory
model_path = 'clf.pkl'
with open(model_path, 'wb') as model_file:
    pickle.dump(optimised_model_1, model_file)