In [1]:
# import libraries
import numpy as np
import pandas as pd
from sqlalchemy import create_engine

import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
#from sklearn.metrics import f1_score

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\SEAY\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\SEAY\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [2]:
# load data from database
engine = create_engine('sqlite:///InsertDatabaseName.db')
df = pd.read_sql_table('clean_message_dataset', engine)
X = df['message'] # the message is the predictor
Y = df.iloc[:,4:] # the 36 categories is what we are trying to predict the message falls in.

In [3]:
def tokenize(text):
    words = word_tokenize(text)
    words = [w for w in words if w not in stopwords.words("english")] # removing stop words
    return words

In [4]:
pipeline = Pipeline([
    ('vect', CountVectorizer(tokenizer=tokenize)),
    ('tfidf', TfidfTransformer()),
    ('clf', MultiOutputClassifier(RandomForestClassifier()))
])

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, Y)
pipeline.fit(X_train, y_train)
predicted = pipeline.predict(X_test)

In [6]:
for col in range(0,36,1):
    print(y_test.columns[col], col)
    print(classification_report(y_test.iloc[:,col], predicted[:,col]))

related 0
              precision    recall  f1-score   support

           0       0.74      0.35      0.48      1516
           1       0.82      0.96      0.89      4987
           2       0.57      0.08      0.14        51

    accuracy                           0.81      6554
   macro avg       0.71      0.46      0.50      6554
weighted avg       0.80      0.81      0.79      6554

request 1
              precision    recall  f1-score   support

           0       0.91      0.98      0.94      5453
           1       0.85      0.50      0.63      1101

    accuracy                           0.90      6554
   macro avg       0.88      0.74      0.78      6554
weighted avg       0.90      0.90      0.89      6554

offer 2
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      6522
           1       0.00      0.00      0.00        32

    accuracy                           1.00      6554
   macro avg       0.50      0.50      0.50   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.97      1.00      0.98      6351
           1       0.82      0.04      0.08       203

    accuracy                           0.97      6554
   macro avg       0.89      0.52      0.53      6554
weighted avg       0.97      0.97      0.96      6554

death 17
              precision    recall  f1-score   support

           0       0.96      1.00      0.98      6237
           1       0.91      0.10      0.18       317

    accuracy                           0.96      6554
   macro avg       0.94      0.55      0.58      6554
weighted avg       0.95      0.96      0.94      6554

other_aid 18
              precision    recall  f1-score   support

           0       0.87      1.00      0.93      5704
           1       0.61      0.02      0.04       850

    accuracy                           0.87      6554
   macro avg       0.74      0.51      0.48      6554
weighted avg       0.84      0.87      0.82      655

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [7]:
parameters = {
    'vect__strip_accents': ['ascii','unicode', None],
    'vect__lowercase': [True, False],
    'tfidf__sublinear_tf': [True, False],
}

cv = GridSearchCV(pipeline, param_grid=parameters)
cv.fit(X_train, y_train)
predicted_grid = cv.predict(X_test)

In [8]:
for col in range(0,36,1):
    print(y_test.columns[col], col)
    print(classification_report(y_test.iloc[:,col], predicted_grid[:,col]))

related 0
              precision    recall  f1-score   support

           0       0.74      0.35      0.48      1516
           1       0.82      0.96      0.89      4987
           2       0.44      0.08      0.13        51

    accuracy                           0.81      6554
   macro avg       0.67      0.46      0.50      6554
weighted avg       0.80      0.81      0.79      6554

request 1
              precision    recall  f1-score   support

           0       0.91      0.98      0.94      5453
           1       0.85      0.49      0.62      1101

    accuracy                           0.90      6554
   macro avg       0.88      0.74      0.78      6554
weighted avg       0.90      0.90      0.89      6554

offer 2
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      6522
           1       0.00      0.00      0.00        32

    accuracy                           1.00      6554
   macro avg       0.50      0.50      0.50   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.96      1.00      0.98      6128
           1       0.93      0.33      0.49       426

    accuracy                           0.95      6554
   macro avg       0.94      0.67      0.73      6554
weighted avg       0.95      0.95      0.94      6554

food 11
              precision    recall  f1-score   support

           0       0.94      0.99      0.96      5814
           1       0.85      0.48      0.61       740

    accuracy                           0.93      6554
   macro avg       0.89      0.73      0.79      6554
weighted avg       0.93      0.93      0.92      6554

shelter 12
              precision    recall  f1-score   support

           0       0.94      0.99      0.97      6012
           1       0.83      0.33      0.47       542

    accuracy                           0.94      6554
   macro avg       0.89      0.66      0.72      6554
weighted avg       0.93      0.94      0.93      6554



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.97      0.99      0.98      5939
           1       0.89      0.75      0.81       615

    accuracy                           0.97      6554
   macro avg       0.93      0.87      0.90      6554
weighted avg       0.97      0.97      0.97      6554

cold 33
              precision    recall  f1-score   support

           0       0.98      1.00      0.99      6428
           1       0.75      0.07      0.13       126

    accuracy                           0.98      6554
   macro avg       0.87      0.54      0.56      6554
weighted avg       0.98      0.98      0.97      6554

other_weather 34
              precision    recall  f1-score   support

           0       0.95      1.00      0.97      6211
           1       0.62      0.01      0.03       343

    accuracy                           0.95      6554
   macro avg       0.79      0.51      0.50      6554
weighted avg       0.93      0.95      0.92      