# ML Pipeline Preperation
## *Disaster Response Project*

### Imports

In [1]:
import numpy as np
import pandas as pd 
import re
import nltk
import pickle
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('omw-1.4')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sqlalchemy import create_engine
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline



[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\thr3e\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\thr3e\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\thr3e\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\thr3e\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


### Static Variables and Configurations

In [2]:
url_regex = 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'

### Functions & Methods

In [3]:
def normalizeUrls(text):
  urls = re.findall(url_regex, text)

  for url in urls:
    text = text.replace(url, 'urlph')

  return text 

In [4]:
def tokenize(text):
  # remove the punctuations and special characters
  text = re.sub(r"[^a-zA-Z0-9]", " ", text).lower().strip()

  # tokenize the text
  tokens = word_tokenize(text)

  # remove stopwords
  tokens = [t for t in tokens if t not in stopwords.words("english")]

  # define the lematizer
  lemm = WordNetLemmatizer()

  # lemmatize words to the base form
  tokens = [lemm.lemmatize(t) for t in tokens]

  # lematize verbs also to the base form
  tokens = [lemm.lemmatize(t, pos="v") for t in tokens]

  return tokens

### Pre-Processing

In [5]:
# Read database table and convert to dataframe
engine = create_engine('sqlite:///disaster.db')
df = pd.read_sql("SELECT * from messages", engine)
df = df.drop(['original'], axis=1)
df.shape

(26216, 39)

In [6]:
# there are non-binary values at the dataframe, we got errors at classification report :(
# - we will copy the dataframe to check binary values
df_binary_check = df.copy()

In [7]:
# get unique values for each column in df_binary_check and see which one is not binary
for col in df_binary_check.columns[3:]:
  unq = df_binary_check[col].unique()
  if np.any((unq > 1) | (unq < 0)):
    print(col, "-", unq) # we can see the 'related' values are not binary, it does have the values of 2

related - [1 0 2]


In [8]:
df.shape

(26216, 39)

In [9]:
# for each column, if there are values equal 2, drop the row
for col in df.columns[3:]:
  df = df[df[col] < 2].dropna()
  unq = df[col].unique()
  # if np.any((unq > 1) | (unq < 0)):
    # print(col, " - ", x)


In [10]:
# the rows deleted that had a value of 2
print(df_binary_check.shape)

(26216, 39)


In [11]:
# drop the nulls from the dataframe
df.isna().sum()
# we noted that there are no null values in the dataframe, we have had removed the duplicates earier


id                        0
message                   0
genre                     0
related                   0
request                   0
offer                     0
aid_related               0
medical_help              0
medical_products          0
search_and_rescue         0
security                  0
military                  0
child_alone               0
water                     0
food                      0
shelter                   0
clothing                  0
money                     0
missing_people            0
refugees                  0
death                     0
other_aid                 0
infrastructure_related    0
transport                 0
buildings                 0
electricity               0
tools                     0
hospitals                 0
shops                     0
aid_centers               0
other_infrastructure      0
weather_related           0
floods                    0
storm                     0
fire                      0
earthquake          

In [12]:
# X = df[df.columns[3:]]
# y = df['message']

X = df['message']
y = df[df.columns[3:]]

In [13]:
y.shape

(26028, 36)

### Pipeline Processing

In [14]:
# define the machine learning pipeline
pipeline = Pipeline(
    [
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('classifier', (RandomForestClassifier(n_jobs=-1)))
    ]
)

In [15]:
# split the data in training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=11, train_size=.8)

# y_train.shape

In [16]:
# fit the model in pipeline took 9.46 min
pipeline.fit(X_train, y_train)

Pipeline(steps=[('vect',
                 CountVectorizer(tokenizer=<function tokenize at 0x0000027046246F70>)),
                ('tfidf', TfidfTransformer()),
                ('classifier', RandomForestClassifier(n_jobs=-1))])

### Prediction Testing

In [17]:
# predict the test data took 34 sec
y_pred = pipeline.predict(X_test)


In [18]:
# test shape of the outcome

y_pred.shape, y_test.shape, len(list(df.columns[3:]))

((5206, 36), (5206, 36), 36)

In [19]:
# check accuracy of your model
acc = (y_pred == y_test).mean()
print("Accuracy - ", acc)

Accuracy -  related                   0.830964
request                   0.896658
offer                     0.996350
aid_related               0.771802
medical_help              0.916827
medical_products          0.946408
search_and_rescue         0.972724
security                  0.982520
military                  0.970419
child_alone               1.000000
water                     0.952171
food                      0.930081
shelter                   0.929889
clothing                  0.986362
money                     0.978294
missing_people            0.986554
refugees                  0.966769
death                     0.954668
other_aid                 0.878602
infrastructure_related    0.931233
transport                 0.954476
buildings                 0.947945
electricity               0.980023
tools                     0.994814
hospitals                 0.987130
shops                     0.995198
aid_centers               0.988667
other_infrastructure      0.953131
weather_

In [20]:
# print the classification report
class_rept = classification_report(y_test, y_pred, target_names=df.columns[3:])
print(class_rept)

                        precision    recall  f1-score   support

               related       0.86      0.93      0.89      3922
               request       0.83      0.49      0.62       881
                 offer       0.00      0.00      0.00        19
           aid_related       0.81      0.58      0.67      2135
          medical_help       0.85      0.02      0.05       442
      medical_products       0.80      0.04      0.08       288
     search_and_rescue       0.47      0.05      0.09       141
              security       0.00      0.00      0.00        88
              military       0.45      0.03      0.06       153
           child_alone       0.00      0.00      0.00         0
                 water       0.88      0.28      0.42       328
                  food       0.87      0.45      0.59       589
               shelter       0.85      0.24      0.38       457
              clothing       0.79      0.14      0.24        79
                 money       1.00      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [21]:
# Export the pipeline as a model
pickle.dump(pipeline, open('random_forest_model.pkl', 'wb'))

### 6. Use GridSearch to improve the model
we ll use grid search to find better parameters. 

In [22]:
# pipeline = Pipeline(
#     [
#         ('vect', CountVectorizer(tokenizer=tokenize)),
#         ('tfidf', TfidfTransformer()),
#         ('classifier', MultiOutputClassifier(RandomForestClassifier()))
#     ]
# )

In [23]:
# parameters = {
#     'vect__ngram_range': ((1, 1), (1, 2)),
#     'vect__max_df': (0.5, 0.75, 1.0),
#     'vect__max_features': (None, 5000, 10000),
#     'tfidf__use_idf': (True, False),
#     'tfidf__sublinear_tf': (True, False),
#     'classifier__estimator__n_estimators': (200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000),
#     'classifier__estimator__bootstrap': (True, False),
#     'classifier__estimator__max_depth': (10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None),
#     'classifier__estimator__max_features': ('auto', 'sqrt'),
#     'classifier__estimator__min_samples_leaf': (1, 2, 4),
#     'classifier__estimator__min_samples_split': (2, 5, 10)
# }

parameters = {'clf__estimator__max_features':['sqrt', 0.5],
              'clf__estimator__n_estimators':[50, 100]}

cv = GridSearchCV(estimator=pipeline, param_grid = parameters, cv = 5, n_jobs = -1)


In [24]:
cv.get_params().keys()


dict_keys(['cv', 'error_score', 'estimator__memory', 'estimator__steps', 'estimator__verbose', 'estimator__vect', 'estimator__tfidf', 'estimator__classifier', 'estimator__vect__analyzer', 'estimator__vect__binary', 'estimator__vect__decode_error', 'estimator__vect__dtype', 'estimator__vect__encoding', 'estimator__vect__input', 'estimator__vect__lowercase', 'estimator__vect__max_df', 'estimator__vect__max_features', 'estimator__vect__min_df', 'estimator__vect__ngram_range', 'estimator__vect__preprocessor', 'estimator__vect__stop_words', 'estimator__vect__strip_accents', 'estimator__vect__token_pattern', 'estimator__vect__tokenizer', 'estimator__vect__vocabulary', 'estimator__tfidf__norm', 'estimator__tfidf__smooth_idf', 'estimator__tfidf__sublinear_tf', 'estimator__tfidf__use_idf', 'estimator__classifier__bootstrap', 'estimator__classifier__ccp_alpha', 'estimator__classifier__class_weight', 'estimator__classifier__criterion', 'estimator__classifier__max_depth', 'estimator__classifier__m

In [25]:

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=12)



In [26]:
cv.fit(X_train, y_train)


In [None]:
y_pred = cv.predict(X_test)


In [None]:
print(classification_report(y_test, y_pred, target_names=y.columns))


In [None]:
pickle.dump(pipeline, open('random_forest_model_improved.pkl', 'wb'))

