In [1]:
# import libraries
import pandas as pd
import numpy as np
import re

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.base import BaseEstimator, TransformerMixin


from sqlalchemy import create_engine



class DenseTfIdf(BaseEstimator, TransformerMixin):
    '''
    Docstring for DenseTfIdf
    '''
    def __init__(self, smooth_idf=True):
        self.smooth_idf = smooth_idf
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        return TfidfTransformer(smooth_idf=self.smooth_idf).fit_transform(X).toarray()


def show_results(column, predictions, actual_label):
    print('Results for ', column)
    accuracy = accuracy_score(actual_label, predictions)
    precision = precision_score(actual_label, predictions)
    recall = recall_score(actual_label, predictions)
    f1 = f1_score(actual_label, predictions)
    print('Accuracy Score: {:.2f}    Precision: {:.2f}    Recall: {:.2f}    F1-Score: {:.2f}\n'.format(accuracy, precision, recall, f1))

def tokenize(text):
    '''
    Input:
        - text = Text data (responses from disasters)
        - stop_words = List of stop words from the text language (English in this exercise)
        - lemmatizer = Lemmatizer object from nltk library to reduce words to their stem forms
    Ouput: 
        - A list of word tokens with preprocessings. Preprocessings done: 
            1. Punctuations removed (only alphanumeric characters left)
            2. Forced lowercase
            3. Leading/trailing space removed
            4. Exclude english stopwords
            5. Lemmatized to its stem word
    '''
    stop_words = stopwords.words('english')
    lemmatizer = WordNetLemmatizer()
    
    alphanumeric_pattern = re.compile('[^a-zA-Z0-9 ]')
    alphanumeric_text = re.sub(alphanumeric_pattern, ' ', text)
    alphanumeric_text_lower = alphanumeric_text.lower()
    
    tokens = word_tokenize(alphanumeric_text_lower)
    
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    
    return tokens
    

# load data from database
engine = create_engine('sqlite:///disasterpipeline.db')
df = pd.read_sql('SELECT * FROM messages', engine)
X = df['message']
y = df[df.columns[4:]]



[nltk_data] Downloading package punkt to
[nltk_data]     /home/aldistefanus/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/aldistefanus/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/aldistefanus/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:
y_train

Unnamed: 0,categories,related,request,offer,aid_related,medical_help,medical_products,search_and_rescue,security,military,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
15272,related-1;request-0;offer-0;aid_related-1;medi...,1,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15797,related-1;request-0;offer-0;aid_related-1;medi...,1,0,0,1,0,0,0,0,0,...,0,0,1,1,1,0,0,0,0,0
7072,related-0;request-0;offer-0;aid_related-0;medi...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3261,related-1;request-0;offer-0;aid_related-1;medi...,1,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
23655,related-1;request-0;offer-0;aid_related-1;medi...,1,0,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
9539,related-0;request-0;offer-0;aid_related-0;medi...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7174,related-0;request-0;offer-0;aid_related-0;medi...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15219,related-1;request-0;offer-0;aid_related-1;medi...,1,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15259,related-0;request-0;offer-0;aid_related-0;medi...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20312,related-0;request-0;offer-0;aid_related-0;medi...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [2]:
pipeline = Pipeline([
    ('vect', CountVectorizer(tokenizer=tokenize))
    ,('tf-idf', DenseTfIdf())
    ,('multi_output_clf', MultiOutputClassifier(estimator=GaussianNB()))  
])


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
pipeline.fit(X_train, y_train)

y_preds = pipeline.predict(X_test)

for index, col in enumerate(y.columns):

    predictions = y_preds[:, index]
    actual_label = y_test[:, index]

    show_results(col, predictions, actual_label)

TypeError: '<' not supported between instances of 'int' and 'str'