In [9]:
import pandas as pd
import nltk
from sqlalchemy import create_engine
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
import pickle

In [3]:
nltk.download(['punkt', 'wordnet', 'averaged_perceptron_tagger'])

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Aditya\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Aditya\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Aditya\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [4]:
engine = create_engine('sqlite:///DisasterResponse.db')
df = pd.read_sql_table('DisasterResponse',engine)
df.shape

(26180, 39)

In [5]:
X =  df['message']
y = df.iloc[:,4:]

In [6]:
def tokenize(text):
    url_regex = 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    detected_urls = re.findall(url_regex, text)
    for url in detected_urls:
        text = text.replace(url, "urlplaceholder")

    tokens = word_tokenize(text)
    lemmatizer = WordNetLemmatizer()

    clean_tokens = []
    for tok in tokens:
        clean_tok = lemmatizer.lemmatize(tok).lower().strip()
        clean_tokens.append(clean_tok)

    return clean_tokens


In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

## First Pipeline

In [7]:
#pipeline_v1 = Pipeline([
#    ('tf_vect', TfidfVectorizer(tokenizer = tokenize)),
#    ('clf', MultiOutputClassifier(RandomForestClassifier()))
#    ])

In [None]:
#y_pred = pipeline_v1.predict(X_test)
#print(classification_report(y_test, y_pred, target_names=y.columns.values))

In [None]:
'''
#Gridsearch
    
    parameters = {
            #TFIDF Parameters 
            'tf_vect__max_df': (0.8, 1.0),
    
            #Random Forest Parameters
            'clf__estimator__n_estimators': [50, 100]
            'clf__estimator__min_samples_split': [2, 4]
            }

    cv = GridSearchCV(pipeline_v1, param_grid=parameters, n_jobs=-1)
    cv.fit(X_train, y_train)


y_pred = cv.predict(X_test)
print(classification_report(y_test, y_pred, target_names=y.columns.values))
'''

Results superceded by second pipeline

## Second Pipeline

In [15]:
 pipeline_v2 = Pipeline([
        ('features', FeatureUnion([
            ('text_pipeline', Pipeline([
                ('cvt', CountVectorizer(tokenizer=tokenize)),
                ('tfidf', TfidfTransformer())
            ]))
        ])),
        
        ('clf', MultiOutputClassifier(AdaBoostClassifier()))
    ])

In [16]:
pipeline_v2.fit(X_train, y_train)

Pipeline(steps=[('features',
                 FeatureUnion(transformer_list=[('text_pipeline',
                                                 Pipeline(steps=[('cvt',
                                                                  CountVectorizer(tokenizer=<function tokenize at 0x2EA322B0>)),
                                                                 ('tfidf',
                                                                  TfidfTransformer())]))])),
                ('clf', MultiOutputClassifier(estimator=AdaBoostClassifier()))])

In [17]:
y_pred = pipeline_v2.predict(X_test)
print(classification_report(y_test, y_pred, target_names=y.columns.values))

                        precision    recall  f1-score   support

               related       0.84      0.93      0.88      5042
               request       0.79      0.58      0.67      1135
                 offer       0.20      0.03      0.05        36
           aid_related       0.78      0.59      0.67      2760
          medical_help       0.65      0.25      0.36       569
      medical_products       0.66      0.37      0.47       344
     search_and_rescue       0.52      0.18      0.27       183
              security       0.37      0.06      0.10       116
              military       0.59      0.30      0.40       218
                 water       0.78      0.60      0.68       463
                  food       0.78      0.71      0.74       738
               shelter       0.79      0.56      0.65       595
              clothing       0.67      0.38      0.48        95
                 money       0.53      0.26      0.35       150
        missing_people       0.64      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [21]:
parameters = {
            #Text Pipeline - Countvectorizer Parameters 
            #'features__text_pipeline__cvt__min_df': [1, 5],
            
            #Text Pipeline - Tfidf Parameters
            'features__text_pipeline__tfidf__use_idf': (True, False),
            
            #Adaboost Forest Parameters
            #'clf__estimator__min_samples_split': [2, 4]}
            'clf__estimator__n_estimators': [50, 100]
            }

In [22]:
cv = GridSearchCV(pipeline_v2, param_grid=parameters, n_jobs=-1, verbose =5)
cv.fit(X_train, y_train)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of  20 | elapsed:  2.7min remaining: 10.7min
[Parallel(n_jobs=-1)]: Done   9 out of  20 | elapsed:  3.2min remaining:  3.9min
[Parallel(n_jobs=-1)]: Done  14 out of  20 | elapsed:  5.6min remaining:  2.4min
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:  6.5min finished


GridSearchCV(estimator=Pipeline(steps=[('features',
                                        FeatureUnion(transformer_list=[('text_pipeline',
                                                                        Pipeline(steps=[('cvt',
                                                                                         CountVectorizer(tokenizer=<function tokenize at 0x2EA322B0>)),
                                                                                        ('tfidf',
                                                                                         TfidfTransformer())]))])),
                                       ('clf',
                                        MultiOutputClassifier(estimator=AdaBoostClassifier()))]),
             n_jobs=-1,
             param_grid={'clf__estimator__n_estimators': [50, 100],
                         'features__text_pipeline__tfidf__use_idf': (True,
                                                                     False)},
       

In [23]:
y_pred = cv.predict(X_test)
print(classification_report(y_test, y_pred, target_names=y.columns.values))

                        precision    recall  f1-score   support

               related       0.84      0.93      0.88      5042
               request       0.79      0.58      0.67      1135
                 offer       0.09      0.03      0.04        36
           aid_related       0.77      0.62      0.69      2760
          medical_help       0.62      0.24      0.35       569
      medical_products       0.69      0.39      0.50       344
     search_and_rescue       0.47      0.19      0.27       183
              security       0.16      0.03      0.04       116
              military       0.67      0.34      0.45       218
                 water       0.73      0.60      0.66       463
                  food       0.78      0.70      0.74       738
               shelter       0.78      0.54      0.64       595
              clothing       0.64      0.43      0.52        95
                 money       0.49      0.26      0.34       150
        missing_people       0.54      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
