In [2]:
import nltk
nltk.download('stopwords')
nltk.download(['punkt', 'wordnet'])
nltk.download('omw-1.4')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ai2318\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ai2318\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ai2318\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\ai2318\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [3]:
from sqlalchemy import create_engine
import pandas as pd
import sqlite3
import numpy as np


In [4]:
# load data from database
engine = create_engine(
    'sqlite:///H:\\disaster-response-pipeline\\data\\disaster_record.db')

#read table and separate X and Y features
df = pd.read_sql_table('disaster_table', engine)

X = df.iloc[:, 1].values
y = df.iloc[:,5:40].values

In [5]:
X

array(['Weather update - a cold front from Cuba that could pass over Haiti',
       'Is the Hurricane over or is it not over',
       'Looking for someone but no name', ...,
       "Proshika, operating in Cox's Bazar municipality and 5 other unions, Ramu and Chokoria, assessment, 5 kg rice, 1,5 kg lentils to 700 families.",
       'Some 2,000 women protesting against the conduct of the elections were teargassed as they tried to converge on the local electoral commission offices in the southern oil city of Port Harcourt.',
       'A radical shift in thinking came about as a result of this meeting, recognizing that HIV/AIDS is at the core of the humanitarian crisis and identifying the crisis itself as a function of the HIV/AIDS pandemic.'],
      dtype=object)

In [10]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split, validation_curve
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
import re
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import classification_report, multilabel_confusion_matrix
from sklearn.model_selection import GridSearchCV

stop_words = stopwords.words("english")
lemmatizer = WordNetLemmatizer()

#Tokenize message
def tokenize(text):
    #remove punctuation
    text = re.sub(r"[^a-zA-Z0-9]"," ", text.lower())

    #tokenize text
    tokens = word_tokenize(text)


    #lemmatize and remove stopwords
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]

    return tokens

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y)


In [8]:
pipeline = Pipeline([
    ('vect', CountVectorizer(tokenizer=tokenize)),
    ('tfidf', TfidfTransformer()),
    ('clf', MultiOutputClassifier(RandomForestClassifier()))
])


In [9]:
pipeline.fit(X_train, y_train)
y_pred1 = pipeline.predict(X_test)


In [17]:
from sklearn.metrics import classification_report
labels = list(df.iloc[:,4:39].columns)

print(classification_report(y_test, y_pred1, output_dict=False, target_names =labels))


# for index, label in enumerate(labels):
#     classification = classification_report(y_test[:,index], y_pred1[:,index]);
#     print(label,"\n",classification)

                        precision    recall  f1-score   support

               related       0.83      0.52      0.64      1125
               request       0.00      0.00      0.00        32
                 offer       0.76      0.70      0.73      2664
           aid_related       0.62      0.09      0.16       483
          medical_help       0.68      0.08      0.14       320
      medical_products       0.77      0.06      0.11       175
     search_and_rescue       0.00      0.00      0.00        98
              security       0.71      0.05      0.09       203
              military       0.00      0.00      0.00         0
           child_alone       0.91      0.37      0.53       427
                 water       0.83      0.59      0.69       691
                  food       0.84      0.38      0.53       560
               shelter       0.57      0.09      0.15        91
              clothing       0.67      0.03      0.06       139
                 money       1.00      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
#get parameters for pipeline
pipeline.get_params().keys()


In [29]:
#create gridsearch to improve params
from sklearn.model_selection import GridSearchCV
parameters = {
    'clf__estimator__max_depth': [2,3,4,5,6],
    'clf__estimator__min_samples_split': [2, 3,4,5],
    'clf__estimator__n_estimators': [100, 200, 500]
}

cv = GridSearchCV(pipeline, param_grid=parameters)


In [30]:
cv.fit(X_train, y_train)
y_pred = cv.predict(X_test)


In [None]:
cv.best_params_


In [None]:
cv.best_score_


# Compare original to HP

In [None]:
pipeline_HP = Pipeline([
    ('vect', CountVectorizer(tokenizer=tokenize)),
    ('tfidf', TfidfTransformer()),
    ('clf', MultiOutputClassifier(RandomForestClassifier()))
])


In [None]:
pipeline_HP.fit(X_train, y_train,
                clf__estimator__max_depth=,
                clf__estimator__min_samples_split=,
                clf__estimator__n_estimators=)
y_pred_HP = pipeline_HP.predict(X_test)


In [None]:
for index, label in enumerate(labels):
    classification = classification_report(y_test[:,index], y_pred[:,index]);
    print(label,"\n",classification)

In [None]:
labels = list(df.iloc[:,4:40].columns)

for index, label in enumerate(labels):
    classification = classification_report(y_test[:,index], y_pred1[:,index]);
    print(label,"\n",classification)

# Model Improvement

## Algorithm Change

In [None]:
#Kneighborclassifier
from sklearn.neighbors import KNeighborsClassifier

pipeline_KN = Pipeline([
    ('vect', CountVectorizer(tokenizer=tokenize)),
    ('tfidf', TfidfTransformer()),
    ('kne', MultiOutputClassifier(KNeighborsClassifier()))
])


In [None]:
#Naive-bayes
from sklearn.naive_bayes import MultinomialNB

pipeline_nb = Pipeline([
    ('vect', CountVectorizer(tokenizer=tokenize)),
    ('tfidf', TfidfTransformer()),
    ('nb', MultiOutputClassifier(MultinomialNB()))
])


In [12]:
#XGBoost Classifier
from xgboost import XGBClassifier

pipeline_xg = Pipeline([
    ('vect', CountVectorizer(tokenizer=tokenize)),
    ('tfidf', TfidfTransformer()),
    ('xg', MultiOutputClassifier(XGBClassifier()))
])


In [None]:
#Check Score
pipeline_KN.fit(X_train, y_train)
y_pred_KN = pipeline_KN.predict(X_test)

print(classification_report(y_test, y_pred_KN))


In [None]:
pipeline_nb.fit(X_train, y_train)
y_pred_nb = pipeline_nb.predict(X_test)

print(classification_report(y_test, y_pred_nb))


In [None]:
pipeline_xg.fit(X_train, y_train)
y_pred_xg = pipeline_nb.predict(X_test)

print(classification_report(y_test, y_pred_xg))


Naive Bayes doesn't have tuning capabilities like XGBoost does, so we will go with XG. We use the sampling avg for multi-label

In [13]:
parameters = {
    'xg__estimator__min_child_weight': [0, 1, 5],
    'xg__estimator__n_estimators': np.arange(100, 1000, 100),
    'xg__estimator__learning_rate': [0.001, 0.01, 0.1, 0.2, 0.3],
    'xg__estimator__gamma': [0, 1, 2],
    'xg__estimator__max_depth': [1, 2, 4]
}

cv = GridSearchCV(pipeline_xg, param_grid=parameters,
                  scoring='f1_macro', cv=2, n_jobs=-1)


In [None]:
cv.fit(X_train, y_train)
y_pred = cv.predict(X_test)
