In [1]:
# import libraries
import sys
import pandas as pd
from sqlalchemy import create_engine


def load_dataset(path, index_col):
    '''
    Loads a dataset from a CSV file

            Parameters:
                    path(str): CSV file path
                    index_col (str): name of column to use as index

            Returns:
                    df (pd.DataFrame): DataFrame of the dataset
    '''
    df = None
    try:
        df = pd.read_csv(path, index_col=index_col)
    except:
        print('Error while reading dataset: ' + path)
    return df


def clean_categories(categories: pd.DataFrame):
    '''
    Clean categories dataset columns and values

            Parameters:
                    categories(pd.DataFrame): CSV file path

            Returns:
                    categories (pd.DataFrame): DataFrame of the dataset
    '''
    # create a dataframe of the 36 individual category columns
    categories = categories.categories.str.split(pat=';', expand=True)

    # select the first row of the categories dataframe
    row = categories.iloc[0]

    # Removing the numerical part of the column name using str split
    category_colnames = list(row.str.split('-').str[0])

    # rename the columns of `categories`
    categories.columns = category_colnames

    # Convert category values to numeric for the newly created columns
    for column in categories:
        # set each value to be the last character of the string
        categories[column] = categories[column].str.split('-').str[1]

        # convert column from string to numeric
        categories[column] = pd.to_numeric(categories[column])
    
    # Remove rows that have value 2 in related column
    # There are 193 valuess. which is insignificant if we drop, 
    # given that they don't hold a special meaning
    categories = categories.drop(index=categories[categories.related == 2].index)
    
    return categories


def create_dataframe(messages, categories):
    '''
    Create full dataset combining messages and categories

            Parameters:
                    messages(pd.DataFrame): the messages dataset
                    categories(pd.DataFrame): the categories dataset

            Returns:
                    df (pd.DataFrame): DataFrame containing the full dataset
    '''
    # merge datasets
    df = pd.merge(messages, categories, how='inner', left_on='id', right_on='id')

    print('Removing duplicates')
    # check number of duplicates
    dups = df.shape[0] - df.drop_duplicates().shape[0]
    if dups > 0:
        # drop duplicates
        df = df.drop_duplicates()
        print('Removed {} duplicates'.format(dups))
    else:
        print('No duplicates found!')
    return df


def write_to_db(df: pd.DataFrame, db_name):
    '''
    Write dataframe to database

            Parameters:
                    df(pd.DataFrame): the dataframe to be written to db
                    db_name(str): name of the database
    '''
    try:
        engine = create_engine('sqlite:///' + db_name)
        df.to_sql('messages_and_categories', engine, index=False, if_exists='replace')
    except:
        print('Failed to write to database')


def main():
    # default paths in case no arguments were provided
    messages_path = './data/messages.csv'
    categories_path = './data/categories.csv'
    database_name = './data/DisasterResponse.db'

    if len(sys.argv) == 4:
        messages_path = sys.argv[1]
        categories_path = sys.argv[2]
        database_name = sys.argv[3]

    print('Starting ETL pipeline')

    # load messages dataset
    print('Loading "{}"'.format(messages_path))
    messages = load_dataset(messages_path, 'id')
    # load categories dataset
    print('Loading "{}"'.format((categories_path)))
    categories = load_dataset(categories_path, 'id')

    # clean categories
    print('Cleaning categories')
    categories = clean_categories(categories)

    # create dataset dataframe
    print('Creating dataset dataframe')
    df = create_dataframe(messages, categories)

    # write to database
    print('Writing to database')
    write_to_db(df, database_name)

    print('Finished ETL pipeline')

In [2]:
if __name__ == '__main__':
    main()

Starting ETL pipeline
Loading "./data/messages.csv"
Loading "./data/categories.csv"
Cleaning categories
Creating dataset dataframe
Removing duplicates
Removed 154 duplicates
Writing to database
Finished ETL pipeline


In [4]:
# Import libraries
import pandas as pd
import numpy as np
import pickle
import sys
from sqlalchemy import create_engine
from sklearn.pipeline import Pipeline
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

stopwords_list = stopwords.words("english")

# Load data from database
def load_from_db(database_name):
    '''
    Load the dataset generated from ETL pipeline

            Returns:
                    X (pd.DataFrame): DataFrame containing the dataset
                    y (pd.DataFrame): Labels of the data
    '''
    engine = create_engine('sqlite:///' + database_name)
    df = pd.read_sql('messages_and_categories', engine)
    X = df.message
    y = df.drop(['message', 'original', 'genre'], axis=1)
    return X, y



def tokenize(text):
    '''
    Tokenize function performs multiple preprocessing steps:
        - Converts text to lower case
        - Replaces any special character
        - Tokenizes text
        - Removes stop words
        - Lemmatize
        - Stem

            Parameters:
                    text (str): a message to be tokenized

            Returns:
                    text (str): Preprocessed and tokenized text
    '''
    # Normalize
    text = text.lower()
    text = text.replace(r"([^a-zA-Z0-9])", ' ')

    # Tokenize
    text = word_tokenize(text)

    # Remove stop words, stem and lemmatize.
    # Those were combined in order not to iterate multiple times
    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()
    text = [stemmer.stem(lemmatizer.lemmatize(w.strip()))
            for w in text if w not in stopwords_list]
    return text


# Display class-level performance metrics for the best model
def display_results(cv, y_test, y_pred):
    '''
    Display the model metrics; Accuracy, F1 score, Precision, Recall
    In case the pipeline is the result of some search algorithm the best_params are displayed.

            Parameters:
                    cv (Pipeline): The pipeline containing the model
                    y_test (numpy.array): Ground truth labels for test data
                    y_pred (numpy.array): Model output predictions
    '''

    for i in range(y_test.shape[1]):
        col = y_test.iloc[:, i]

        print('Label name: ', col.name)
        print('Label values:', np.unique(col))
        print('Accuracy: ', accuracy_score(col, y_pred[:, i]))
        print('F1 score: ', f1_score(col, y_pred[:, i], zero_division=0))
        print('Precision:', precision_score(col, y_pred[:, i], zero_division=0))
        print('recall:', recall_score(col, y_pred[:, i], zero_division=0))
        print('-----------------------')

    # This is in case what was being passed is just a model not as a result of gridserch
    if hasattr(cv, 'best_params_'):
        print("\nBest Model Parameters:", cv.best_params_)


def build_model(search_method=None):
    '''
    Build ml pipeline. In case a search method was specified it will be used to fit the pipeline

            Parameters:
                    search_method (str): The search method to be used if any (default: None)
                        'grid': GridSearchCV
                        'randomized': RandomizedSearchCV
                        None: no search method is to be used

            Returns:
                    cv (Pipeline): The pipeline containing the model
    '''

    pipeline = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', MultiOutputClassifier(RandomForestClassifier(n_jobs=-1), n_jobs=-1)),
    ])

    # The parameters were commented out because the execution time was very long on my machine
    # Only kept one to showcase that the implementation works
    parameters = {
        # 'vect__ngram_range': ((1, 1), (1, 2)),
        'vect__max_df': (0.5, 0.75, 1.0),
        # 'vect__max_features': (None, 5000, 10000),
        'tfidf__use_idf': (True, False)
    }

    if search_method == 'grid':
        print('Using GridSearchCV')
        cv = GridSearchCV(pipeline, param_grid=parameters, scoring='f1_micro', verbose=3)
    elif search_method == 'randomized':
        print('Using RandomizedSearchCV')
        cv = RandomizedSearchCV(pipeline, param_distributions=parameters, verbose=2)
    else:
        print('Using the pipeline without a search method')
        # if no grid search is needed, just return the pipeline
        cv = pipeline

    return cv


# Save model to file
def save_model(model, path):
    '''
    Save specified model to path

            Parameters:
                    model (Pipeline): The pipeline containing the model
                    path (str): path to write the pickle file

    '''
    with open(path, 'wb') as file:
        pickle.dump(model, file)


def main():
    # default paths in case no arguments were provided
    database_name = './data/DisasterResponse.db'
    model_file_path = './best_model.pkl'


    if len(sys.argv) == 4:
        database_name = sys.argv[1]
        model_file_path = sys.argv[2]

    print('Starting ML pipeline')

    # Load data from database
    print('Loading data from database')
    X, y = load_from_db(database_name)

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    # build classifier
    print('Fitting model')
    model = build_model(search_method='grid')
    model.fit(X_train, y_train)

    # predict on test data
    print('Generating predictions')
    y_pred = model.predict(X_test)

    # display results
    print('Model scores per class:')
    display_results(model, y_test, y_pred)

    # Get the best model, in case GridSearch/RandomizedSearch was used. Which is the best_estimator_.
    # Otherwise, it is just the model itself
    best_model = model
    if hasattr(model, 'best_estimator_'):
        best_model = model.best_estimator_

    # Save model
    print('Saving model')
    save_model(best_model, model_file_path)

    print('Finished ML pipeline')

if __name__ == '__main__':
    main()


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\alien\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\alien\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\alien\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Starting ML pipeline
Loading data from database
Fitting model
Using GridSearchCV
Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV] tfidf__use_idf=True, vect__max_df=0.5 ...........................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  tfidf__use_idf=True, vect__max_df=0.5, score=0.638, total=  27.0s
[CV] tfidf__use_idf=True, vect__max_df=0.5 ...........................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   26.9s remaining:    0.0s


[CV]  tfidf__use_idf=True, vect__max_df=0.5, score=0.647, total=  27.5s
[CV] tfidf__use_idf=True, vect__max_df=0.5 ...........................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   54.4s remaining:    0.0s


[CV]  tfidf__use_idf=True, vect__max_df=0.5, score=0.638, total=  27.6s
[CV] tfidf__use_idf=True, vect__max_df=0.5 ...........................
[CV]  tfidf__use_idf=True, vect__max_df=0.5, score=0.640, total=  27.7s
[CV] tfidf__use_idf=True, vect__max_df=0.5 ...........................
[CV]  tfidf__use_idf=True, vect__max_df=0.5, score=0.637, total=  27.7s
[CV] tfidf__use_idf=True, vect__max_df=0.75 ..........................
[CV]  tfidf__use_idf=True, vect__max_df=0.75, score=0.632, total=  27.6s
[CV] tfidf__use_idf=True, vect__max_df=0.75 ..........................
[CV]  tfidf__use_idf=True, vect__max_df=0.75, score=0.640, total=  27.7s
[CV] tfidf__use_idf=True, vect__max_df=0.75 ..........................
[CV]  tfidf__use_idf=True, vect__max_df=0.75, score=0.640, total=  27.7s
[CV] tfidf__use_idf=True, vect__max_df=0.75 ..........................
[CV]  tfidf__use_idf=True, vect__max_df=0.75, score=0.635, total=  27.8s
[CV] tfidf__use_idf=True, vect__max_df=0.75 ......................

[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed: 14.1min finished


Generating predictions
Model scores per class:
Label name:  related
Label values: [0 1]
Accuracy:  0.8192879098360656
F1 score:  0.8894806924101198
Precision: 0.8309673642616713
recall: 0.9568587799123695
-----------------------
Label name:  request
Label values: [0 1]
Accuracy:  0.897797131147541
F1 score:  0.6246472248353716
Precision: 0.8258706467661692
recall: 0.5022692889561271
-----------------------
Label name:  offer
Label values: [0 1]
Accuracy:  0.9966700819672131
F1 score:  0.0
Precision: 0.0
recall: 0.0
-----------------------
Label name:  aid_related
Label values: [0 1]
Accuracy:  0.7784323770491803
F1 score:  0.7215963952365625
Precision: 0.7460898502495841
recall: 0.6986600186974136
-----------------------
Label name:  medical_help
Label values: [0 1]
Accuracy:  0.9216188524590164
F1 score:  0.10263929618768328
Precision: 0.6363636363636364
recall: 0.05582137161084529
-----------------------
Label name:  medical_products
Label values: [0 1]
Accuracy:  0.953125
F1 score: 