# ML Pipeline Preparation

### 1. Importing libraries and loading data from database.


In [53]:
import nltk
nltk.download(['punkt', 'wordnet', 'averaged_perceptron_tagger', 'stopwords'])
import re
import numpy as np
import pandas as pd
import pickle
from nltk.tokenize import word_tokenize, RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from sklearn.metrics import confusion_matrix, classification_report, precision_recall_fscore_support as score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.multioutput import MultiOutputClassifier
from nltk.corpus import stopwords
from sqlalchemy import create_engine
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\bisma\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\bisma\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\bisma\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\bisma\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# loading data from database
engine = create_engine('sqlite:///DisasterResponse.db')

df = pd.read_sql_table("disaster_messages", con=engine)

In [4]:
df.head()

Unnamed: 0,id,message,original,genre,related,request,offer,aid_related,medical_help,medical_products,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,2,Weather update - a cold front from Cuba that c...,Un front froid se retrouve sur Cuba ce matin. ...,direct,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,7,Is the Hurricane over or is it not over,Cyclone nan fini osinon li pa fini,direct,1,0,0,1,0,0,...,0,0,1,0,1,0,0,0,0,0
2,8,Looking for someone but no name,"Patnm, di Maryani relem pou li banm nouvel li ...",direct,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,9,UN reports Leogane 80-90 destroyed. Only Hospi...,UN reports Leogane 80-90 destroyed. Only Hospi...,direct,1,1,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
4,12,"says: west side of Haiti, rest of the country ...",facade ouest d Haiti et le reste du pays aujou...,direct,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# Assigning feature and target variables
X = df['message']
y = df.iloc[:,4:]

In [7]:
# Analyzing feature variable
X.head()

0    Weather update - a cold front from Cuba that c...
1              Is the Hurricane over or is it not over
2                      Looking for someone but no name
3    UN reports Leogane 80-90 destroyed. Only Hospi...
4    says: west side of Haiti, rest of the country ...
Name: message, dtype: object

In [8]:
# Analyzing target variables
y.head()

Unnamed: 0,related,request,offer,aid_related,medical_help,medical_products,search_and_rescue,security,military,water,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,1,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,1,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
# Saving category names
category_names = y.columns
category_names

Index(['related', 'request', 'offer', 'aid_related', 'medical_help',
       'medical_products', 'search_and_rescue', 'security', 'military',
       'water', 'food', 'shelter', 'clothing', 'money', 'missing_people',
       'refugees', 'death', 'other_aid', 'infrastructure_related', 'transport',
       'buildings', 'electricity', 'tools', 'hospitals', 'shops',
       'aid_centers', 'other_infrastructure', 'weather_related', 'floods',
       'storm', 'fire', 'earthquake', 'cold', 'other_weather',
       'direct_report'],
      dtype='object')

In [10]:
# Filling NaN values with 0 so tha the model that doesn't throw errors. 
Y = Y.fillna(0)
Y.head()

Unnamed: 0,related,request,offer,aid_related,medical_help,medical_products,search_and_rescue,security,military,water,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,1,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,1,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


NaN means that the message doesn't belong to this category, so we can easily replace it with 0.

### 2. Writing a tokenization function to process our text data

In [11]:
def tokenize(text):
    # Removing urls
    url_regex = 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    detected_urls = re.findall(url_regex, text)
    for url in detected_urls:
        text = text.replace(url, "urlplaceholder")
        
    # Removing punctuation
    text = re.sub(r"[^a-zA-Z0-9]", " ", text)
    
    # Tokenizing the data
    words = word_tokenize(text)
    
    # Remocing stop words
    stop_words = set(stopwords.words('english')) 
    tokens = [w for w in words if not w in stop_words]
    
    # Lemmatizing the data
    lemmatizer = WordNetLemmatizer()
    clean_tokens = []
    for tok in tokens:
        clean_tok = lemmatizer.lemmatize(tok).lower().strip()
        clean_tokens.append(clean_tok)


    return clean_tokens

### 3. Building a machine learning pipeline


### Model 1 - Random Forest

In [12]:
# Random Forest pipeline
pipeline_rf = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', MultiOutputClassifier(RandomForestClassifier()))
    ])

In [13]:
# Splitting the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

In [14]:
# Fitting the traning data to the pipeline
pipeline_rf.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=<function tokenize at...
                                                                        ccp_alpha=0.0,
                                                                        class_weight=None,
                                                                        criterion='gini',
                                                                   

In [16]:
# Predicting the results
y_pred_rf = pipeline_rf.predict(X_test)
y_pred_rf

array([[1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       ...,
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [18]:
# Classification report category-wise and overall
print(classification_report(y_test, y_pred_rf, target_names = category_names))

                        precision    recall  f1-score   support

               related       0.84      0.94      0.89      3993
               request       0.84      0.49      0.62       925
                 offer       0.00      0.00      0.00        28
           aid_related       0.76      0.69      0.73      2146
          medical_help       0.67      0.11      0.20       384
      medical_products       0.74      0.11      0.18       237
     search_and_rescue       0.45      0.03      0.06       147
              security       0.00      0.00      0.00       104
              military       0.75      0.06      0.11       159
                 water       0.88      0.38      0.53       336
                  food       0.85      0.65      0.73       565
               shelter       0.88      0.38      0.53       464
              clothing       0.82      0.12      0.20        78
                 money       1.00      0.04      0.08       124
        missing_people       0.00      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Random Forest took a lot of time to train and predict the data and gave an weighted average F-1 score of 0.58, which is decent enough for a multi-class classification problem. Let's see if we can do better.

### Model 2 - SVC

In [20]:
pipeline_svc = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', MultiOutputClassifier(SVC(kernel='linear')))
    ])

In [21]:
pipeline_svc.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=<function tokenize at...
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 MultiOutputClassifier(estimator=SVC(C=1.0, break_ties=False,
                                                     cache_size=200,
                                

In [22]:
y_pred_svc = pipeline_svc.predict(X_test)

In [23]:
print(classification_report(y_test, y_pred_svc, target_names = category_names))

                        precision    recall  f1-score   support

               related       0.85      0.93      0.89      3993
               request       0.78      0.60      0.68       925
                 offer       0.00      0.00      0.00        28
           aid_related       0.75      0.70      0.72      2146
          medical_help       0.59      0.24      0.34       384
      medical_products       0.70      0.29      0.41       237
     search_and_rescue       0.63      0.13      0.21       147
              security       0.00      0.00      0.00       104
              military       0.64      0.31      0.42       159
                 water       0.75      0.68      0.71       336
                  food       0.81      0.75      0.78       565
               shelter       0.84      0.57      0.68       464
              clothing       0.67      0.44      0.53        78
                 money       0.58      0.12      0.20       124
        missing_people       0.82      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


SVC too way too much time as compared to the Random Forest and may not be an approporate choice for the real-time analysis or/and the dataset is huge. However, it performed better than Random Forest and gave an weighted average F-1 score of 0.64. 

### Model 3 - KNN

In [24]:
pipeline_knn = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', MultiOutputClassifier(KNeighborsClassifier()))
    ])

In [25]:
pipeline_knn.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=<function tokenize at 0x000001F224DC4950>,
                                 vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 MultiOutputClassifier(estimator=KNeighborsClassifier(algorithm='au

In [26]:
y_pred_knn = pipeline_knn.predict(X_test)

In [27]:
print(classification_report(y_test, y_pred_knn, target_names = category_names))

                        precision    recall  f1-score   support

               related       0.78      0.98      0.87      3993
               request       0.80      0.08      0.15       925
                 offer       0.00      0.00      0.00        28
           aid_related       0.79      0.04      0.08      2146
          medical_help       0.00      0.00      0.00       384
      medical_products       0.40      0.01      0.02       237
     search_and_rescue       1.00      0.01      0.03       147
              security       0.00      0.00      0.00       104
              military       0.00      0.00      0.00       159
                 water       0.80      0.02      0.05       336
                  food       0.82      0.05      0.10       565
               shelter       0.77      0.04      0.08       464
              clothing       1.00      0.01      0.03        78
                 money       1.00      0.02      0.03       124
        missing_people       0.00      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


KNN took very less time as it should have done. It is a lazy algorithm and doesn't really learn anything from the data as it just memorizes the data. That's why it performed very poorly and gave an weighted average F-1 score of 0.27. 

### Model 4 - Gradient Boosting

In [28]:
pipeline_gbc = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', MultiOutputClassifier(GradientBoostingClassifier()))
    ])

In [29]:
pipeline_gbc.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=<function tokenize at...
                                                                            max_depth=3,
                                                                            max_features=None,
                                                                            max_leaf_nodes=None,
                                                      

In [30]:
y_pred_gbc = pipeline_gbc.predict(X_test)

In [31]:
print(classification_report(y_test, y_pred_gbc, target_names = category_names))

                        precision    recall  f1-score   support

               related       0.79      0.98      0.87      3993
               request       0.83      0.48      0.61       925
                 offer       0.05      0.04      0.04        28
           aid_related       0.78      0.58      0.67      2146
          medical_help       0.62      0.21      0.32       384
      medical_products       0.67      0.30      0.41       237
     search_and_rescue       0.30      0.17      0.22       147
              security       0.18      0.09      0.12       104
              military       0.62      0.28      0.39       159
                 water       0.75      0.65      0.69       336
                  food       0.78      0.79      0.79       565
               shelter       0.84      0.58      0.69       464
              clothing       0.51      0.46      0.48        78
                 money       0.46      0.26      0.33       124
        missing_people       0.17      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Gradient Boosting performed better than Random Forest but to a lesser extent as compared to SVC. It gave an weighted average F-1 score of 0.62.

### Model 5 - AdaBoost

In [32]:
pipeline_ab = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', MultiOutputClassifier(AdaBoostClassifier()))
    ])

In [33]:
pipeline_ab.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=<function tokenize at 0x000001F224DC4950>,
                                 vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 MultiOutputClassifier(estimator=AdaBoostClassifier(algorithm='SAMM

In [34]:
y_pred_ab = pipeline_ab.predict(X_test)

In [36]:
print(classification_report(y_test, y_pred_ab, target_names = category_names))

                        precision    recall  f1-score   support

               related       0.79      0.98      0.87      3993
               request       0.77      0.52      0.62       925
                 offer       0.00      0.00      0.00        28
           aid_related       0.77      0.62      0.68      2146
          medical_help       0.60      0.29      0.39       384
      medical_products       0.62      0.32      0.42       237
     search_and_rescue       0.62      0.19      0.29       147
              security       0.36      0.05      0.08       104
              military       0.57      0.37      0.45       159
                 water       0.77      0.67      0.72       336
                  food       0.80      0.66      0.72       565
               shelter       0.79      0.53      0.63       464
              clothing       0.67      0.47      0.56        78
                 money       0.54      0.34      0.42       124
        missing_people       0.43      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Adaptive Boosting performed second best after SVC with  n weighted average F-1 score of 0.63.

### Model 6 - XG Boost

In [54]:
pipeline_xgb = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', MultiOutputClassifier(XGBClassifier()))
    ])

In [55]:
pipeline_xgb.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=<function tokenize at...
                                                               booster='gbtree',
                                                               colsample_bylevel=1,
                                                               colsample_bynode=1,
                                                               colsample_bytree=1,
    

In [56]:
y_pred_xgb = pipeline_ab.predict(X_test)

In [57]:
print(classification_report(y_test, y_pred_xgb, target_names = category_names))

                        precision    recall  f1-score   support

               related       0.79      0.98      0.87      3993
               request       0.77      0.52      0.62       925
                 offer       0.00      0.00      0.00        28
           aid_related       0.77      0.62      0.68      2146
          medical_help       0.60      0.29      0.39       384
      medical_products       0.62      0.32      0.42       237
     search_and_rescue       0.62      0.19      0.29       147
              security       0.36      0.05      0.08       104
              military       0.57      0.37      0.45       159
                 water       0.77      0.67      0.72       336
                  food       0.80      0.66      0.72       565
               shelter       0.79      0.53      0.63       464
              clothing       0.67      0.47      0.56        78
                 money       0.54      0.34      0.42       124
        missing_people       0.43      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


XG Boost performed same as AdaBoost with n weighted average F-1 score of 0.63.

### Model 7 - Naive Bayes

In [37]:
pipeline_nb = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', MultiOutputClassifier(MultinomialNB()))
    ])

In [38]:
pipeline_nb.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=<function tokenize at 0x000001F224DC4950>,
                                 vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 MultiOutputClassifier(estimator=MultinomialNB(alpha=1.0,
         

In [39]:
y_pred_nb = pipeline_nb.predict(X_test)

In [40]:
print(classification_report(y_test, y_pred_nb, target_names = category_names))

                        precision    recall  f1-score   support

               related       0.78      0.99      0.87      3993
               request       0.85      0.22      0.35       925
                 offer       0.00      0.00      0.00        28
           aid_related       0.73      0.64      0.68      2146
          medical_help       0.00      0.00      0.00       384
      medical_products       0.00      0.00      0.00       237
     search_and_rescue       0.00      0.00      0.00       147
              security       0.00      0.00      0.00       104
              military       0.00      0.00      0.00       159
                 water       0.00      0.00      0.00       336
                  food       0.76      0.03      0.06       565
               shelter       0.00      0.00      0.00       464
              clothing       0.00      0.00      0.00        78
                 money       0.00      0.00      0.00       124
        missing_people       0.00      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Naive Bayes performed poorly as compared to other algorithms and it only performed better than KNN. However, it is pretty fast and can be used for large datasets and real-time analysis.

### Mdoel 8 - Logistic Regression

In [49]:
pipeline_lr = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', MultiOutputClassifier(LogisticRegression(solver = 'lbfgs')))
    ])

In [50]:
pipeline_lr.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=<function tokenize at...
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 MultiOutputClassifier(estimator=LogisticRegression(C=1.0,
                                                                    class_weight=None,
                 

In [51]:
y_pred_lr = pipeline_lr.predict(X_test)

In [52]:
print(classification_report(y_test, y_pred_lr, target_names = category_names))

                        precision    recall  f1-score   support

               related       0.83      0.96      0.89      3993
               request       0.81      0.53      0.64       925
                 offer       0.00      0.00      0.00        28
           aid_related       0.77      0.67      0.72      2146
          medical_help       0.65      0.17      0.27       384
      medical_products       0.75      0.21      0.33       237
     search_and_rescue       0.70      0.05      0.09       147
              security       0.00      0.00      0.00       104
              military       0.53      0.12      0.19       159
                 water       0.80      0.50      0.62       336
                  food       0.84      0.62      0.72       565
               shelter       0.83      0.46      0.59       464
              clothing       0.86      0.23      0.36        78
                 money       0.56      0.08      0.14       124
        missing_people       1.00      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Although Logistic Regression is a very simple algorithm, it seems to be a powerful one, outperforming many of the other advanced algorithms such as Random Forest. It gave and weighted avg. F-1 score of 0.6.

### 4. Improving our model

We can see from the above analysis that SVC performs the best out of all models tested above based on the weighted avg. F-1 score but it is very slow even with a 'linear' kernel. This might not be a good choice in case the dataset size increases in the future. Similarly XGBoost and AdaBoost perfomed almost similar to SVC, but are computationally expensive. So, let's choose Logistic Regression as our final model as it performs decent enough and is less computation expensive. Let's also tweak some of its parameters using Grid Search.

In [70]:
sorted(pipeline_lr.get_params().keys())

['clf',
 'clf__estimator',
 'clf__estimator__C',
 'clf__estimator__class_weight',
 'clf__estimator__dual',
 'clf__estimator__fit_intercept',
 'clf__estimator__intercept_scaling',
 'clf__estimator__l1_ratio',
 'clf__estimator__max_iter',
 'clf__estimator__multi_class',
 'clf__estimator__n_jobs',
 'clf__estimator__penalty',
 'clf__estimator__random_state',
 'clf__estimator__solver',
 'clf__estimator__tol',
 'clf__estimator__verbose',
 'clf__estimator__warm_start',
 'clf__n_jobs',
 'memory',
 'steps',
 'tfidf',
 'tfidf__norm',
 'tfidf__smooth_idf',
 'tfidf__sublinear_tf',
 'tfidf__use_idf',
 'vect',
 'vect__analyzer',
 'vect__binary',
 'vect__decode_error',
 'vect__dtype',
 'vect__encoding',
 'vect__input',
 'vect__lowercase',
 'vect__max_df',
 'vect__max_features',
 'vect__min_df',
 'vect__ngram_range',
 'vect__preprocessor',
 'vect__stop_words',
 'vect__strip_accents',
 'vect__token_pattern',
 'vect__tokenizer',
 'vect__vocabulary',
 'verbose']

In [71]:
# Grid Search CV
parameters = {
        'clf__estimator__solver': ['newton-cg', 'lbfgs'],
    }

cv_lr = GridSearchCV(pipeline_lr, param_grid=parameters)
cv_lr

GridSearchCV(cv=None, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('vect',
                                        CountVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                        p

### 5. Testing our improved model


In [72]:
# Running Grid Search - taking too much time
cv_lr.fit(X_train, y_train)

GridSearchCV(cv=None, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('vect',
                                        CountVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                        p

In [75]:
y_pred_lr = cv_lr.predict(X_test)

In [76]:
print(classification_report(y_test, y_pred_lr, target_names = category_names))

                        precision    recall  f1-score   support

               related       0.83      0.96      0.89      3993
               request       0.81      0.53      0.64       925
                 offer       0.00      0.00      0.00        28
           aid_related       0.77      0.67      0.72      2146
          medical_help       0.65      0.17      0.27       384
      medical_products       0.75      0.21      0.33       237
     search_and_rescue       0.70      0.05      0.09       147
              security       0.00      0.00      0.00       104
              military       0.53      0.12      0.19       159
                 water       0.80      0.50      0.62       336
                  food       0.84      0.62      0.72       565
               shelter       0.83      0.46      0.59       464
              clothing       0.86      0.23      0.36        78
                 money       0.56      0.08      0.14       124
        missing_people       1.00      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


After running GridSearch, it performs same as running without it. It is partly because we haven't passed a lot of parameters to tune. If we have more computation resources, we can pass in some more parameters or even choose AdaBoost/XGBoost.

### 6. Exporting our model as a pickle file

In [87]:
pickle.dump(cv_lr, open('model.pkl', 'wb'))

## Future Work

1) This seems to be an  imbalanced dataset as the distribution of classes present in the dataset is not unfiform, meaning number of data points available for different classes are differnt. This creates a bias as ML algorithms are built to minimize errors. Algorithms are much more likley to classify new observations to the majority class.

We could do re-sampling in order to mitigate these problems through Over-sampling or Under-sampling. Over-sampling increases the samples in minority class  without losing any data but it is prone to Overfitting. This can be done using Synthetic Minority Over-sampling Technique (SMOTE) or Adaptive Synthetic (ADASYN). 

Under-sampling redcues the number of samples belonging to majority class, thus we lose some of the data that might be useful. This can be done using NearMiss-n, Tomek's link or Edited nearest neighbors.

2) We could check the correlation between different features and drop variables, which have high correlation as some of the models such as Naive Bayes assume that features are independent. 

3) We could further fine-tune the model by experming with different hyper-paramters in the Grid Search.