# ML Pipeline Preparation

### 1. Import libraries and load data from database



In [1]:
import re
import numpy as np
import pandas as pd
import pickle
import matplotlib.pyplot as plt
from sqlalchemy import create_engine
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.ensemble import AdaBoostClassifier, ExtraTreesClassifier, RandomForestClassifier
from sklearn.metrics import  f1_score, classification_report, accuracy_score, recall_score, precision_score
from sklearn.model_selection import train_test_split, GridSearchCV
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

nltk.download(['punkt','wordnet','stopwords'])

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [2]:
# Read the `messages` table from the local SQLite database into a DataFrame
engine = create_engine('sqlite:///messages.db')
df = pd.read_sql_table('messages', engine)
# Model input: the raw message text
X = df['message']
# Targets: every column from position 4 onward, as a 2-D NumPy array.
# NOTE(review): presumably the first four columns are non-target metadata
# (id / message / ...) — confirm against the table schema.
Y = df.iloc[:,4:].values

In [3]:
# Names of the target categories, taken from the same columns (position 4 onward)
# that make up Y, so labels[i] names column i of Y
labels = df.columns[4:]

### 2. Write a tokenization function to process your text data

In [229]:
def tokenize(text):
    """Convert a raw message into a list of lemmatized, stopword-free tokens.

    Steps: lowercase the text, replace every character that is not an ASCII
    letter with a space, split into word tokens with NLTK, drop English
    stopwords, and lemmatize what remains.

    Args:
        text (str): Raw message text.

    Returns:
        list of str: Lemmatized tokens, in their original order.
    """
    # Normalization: digits, punctuation and non-ASCII characters become spaces
    text = text.lower()
    text = re.sub(r"[^a-zA-Z]", " ", text)

    # Tokenize
    words = nltk.word_tokenize(text)

    # Remove stopwords. Build the lookup set once per call: evaluating
    # stopwords.words('english') inside the comprehension repeated that call
    # for every single token and then scanned the resulting list linearly.
    stop_words = set(stopwords.words('english'))
    words = [w for w in words if w not in stop_words]

    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    lemmed = [lemmatizer.lemmatize(w).strip() for w in words]

    return lemmed

### 3. Build a machine learning pipeline
This machine learning pipeline takes the `message` column as input and outputs classification results for the other 36 categories in the dataset. You may find the [MultiOutputClassifier](http://scikit-learn.org/stable/modules/generated/sklearn.multioutput.MultiOutputClassifier.html) helpful for predicting multiple target variables.

In [230]:
# Baseline text-classification pipeline:
#   vect  - bag-of-words counts using the custom `tokenize` function
#   tfidf - re-weights the counts with TF-IDF
#   clf   - one Multinomial Naive Bayes classifier per target column
pipeline = Pipeline(steps=[
    ('vect', CountVectorizer(tokenizer=tokenize)),
    ('tfidf', TfidfTransformer()),
    ('clf', MultiOutputClassifier(estimator=MultinomialNB())),
])

### 4. Train pipeline
- Split data into train and test sets
- Train pipeline

In [231]:
# Hold out 30% of the rows for testing; random_state=0 makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=0)

In [232]:
%%time
# Fit every pipeline step (vectorizer, TF-IDF, per-category classifiers) on the training split
pipeline.fit(X_train, y_train)

CPU times: user 1min 11s, sys: 5.86 s, total: 1min 17s
Wall time: 1min 18s


Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...ssifier(estimator=MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True),
           n_jobs=1))])

### 5. Test your model
Report the f1 score, precision and recall for each output category of the dataset.

In [233]:
def evaluate_model(y_true, y_pred, model_name):
    """Print accuracy and support-weighted recall, precision and F1 on one line.

    Args:
        y_true: True labels for a single target column.
        y_pred: Predicted labels for the same column.
        model_name (str): Tag printed in brackets in front of every metric.

    Returns:
        None. The metrics are printed, not returned.

    Note:
        With average='weighted' on a single label column, the weighted recall
        is mathematically equal to the accuracy, so those two printed numbers
        always match. Rare positive classes are dominated by the majority
        class in all three weighted scores.
    """
    weighted = {'average': 'weighted'}
    scores = (
        accuracy_score(y_true, y_pred),
        recall_score(y_true, y_pred, **weighted),
        precision_score(y_true, y_pred, **weighted),
        f1_score(y_true, y_pred, **weighted),
    )
    template = '[{0}]Accuracy:{1:.2f} [{0}]Recall:{2:.2f} [{0}]Precision:{3:.2f} [{0}]F1_score:{4:.2f}\n'
    print(template.format(model_name, *scores))

In [234]:
# Predict all categories for the held-out test messages
y_preds = pipeline.predict(X_test)

In [235]:
# Report the baseline Naive Bayes metrics on the test split, one category at a time
for col, category in enumerate(labels):
    print(category.upper())
    evaluate_model(y_test[:, col], y_preds[:, col], 'NB')

RELATED
[NB]Accuracy:0.78 [NB]Recall:0.78 [NB]Precision:0.78 [NB]F1_score:0.70

REQUEST
[NB]Accuracy:0.86 [NB]Recall:0.86 [NB]Precision:0.86 [NB]F1_score:0.83

OFFER
[NB]Accuracy:0.99 [NB]Recall:0.99 [NB]Precision:0.99 [NB]F1_score:0.99

AID_RELATED
[NB]Accuracy:0.76 [NB]Recall:0.76 [NB]Precision:0.76 [NB]F1_score:0.75

MEDICAL_HELP
[NB]Accuracy:0.92 [NB]Recall:0.92 [NB]Precision:0.89 [NB]F1_score:0.89

MEDICAL_PRODUCTS
[NB]Accuracy:0.95 [NB]Recall:0.95 [NB]Precision:0.95 [NB]F1_score:0.93

SEARCH_AND_RESCUE
[NB]Accuracy:0.98 [NB]Recall:0.98 [NB]Precision:0.95 [NB]F1_score:0.96

SECURITY
[NB]Accuracy:0.98 [NB]Recall:0.98 [NB]Precision:0.97 [NB]F1_score:0.97

MILITARY
[NB]Accuracy:0.97 [NB]Recall:0.97 [NB]Precision:0.93 [NB]F1_score:0.95

WATER
[NB]Accuracy:0.94 [NB]Recall:0.94 [NB]Precision:0.88 [NB]F1_score:0.91

FOOD
[NB]Accuracy:0.89 [NB]Recall:0.89 [NB]Precision:0.89 [NB]F1_score:0.84

SHELTER
[NB]Accuracy:0.92 [NB]Recall:0.92 [NB]Precision:0.92 [NB]F1_score:0.88

CLOTHING
[NB]Accu


Precision is ill-defined and being set to 0.0 in labels with no predicted samples.


F-score is ill-defined and being set to 0.0 in labels with no predicted samples.



In [236]:
# Score the baseline model on its own training data; a large gap between these
# numbers and the test-split numbers would indicate overfitting
y_preds_train = pipeline.predict(X_train)
for col, category in enumerate(labels):
    print(category.upper())
    evaluate_model(y_train[:, col], y_preds_train[:, col], 'NB-train')

RELATED
[NB-train]Accuracy:0.81 [NB-train]Recall:0.81 [NB-train]Precision:0.83 [NB-train]F1_score:0.76

REQUEST
[NB-train]Accuracy:0.88 [NB-train]Recall:0.88 [NB-train]Precision:0.88 [NB-train]F1_score:0.85

OFFER
[NB-train]Accuracy:1.00 [NB-train]Recall:1.00 [NB-train]Precision:0.99 [NB-train]F1_score:0.99

AID_RELATED
[NB-train]Accuracy:0.84 [NB-train]Recall:0.84 [NB-train]Precision:0.84 [NB-train]F1_score:0.84

MEDICAL_HELP
[NB-train]Accuracy:0.92 [NB-train]Recall:0.92 [NB-train]Precision:0.89 [NB-train]F1_score:0.88

MEDICAL_PRODUCTS
[NB-train]Accuracy:0.95 [NB-train]Recall:0.95 [NB-train]Precision:0.91 [NB-train]F1_score:0.92

SEARCH_AND_RESCUE
[NB-train]Accuracy:0.97 [NB-train]Recall:0.97 [NB-train]Precision:0.94 [NB-train]F1_score:0.96

SECURITY
[NB-train]Accuracy:0.98 [NB-train]Recall:0.98 [NB-train]Precision:0.96 [NB-train]F1_score:0.97

MILITARY
[NB-train]Accuracy:0.97 [NB-train]Recall:0.97 [NB-train]Precision:0.95 [NB-train]F1_score:0.95

WATER
[NB-train]Accuracy:0.94 [NB-tr


Precision is ill-defined and being set to 0.0 in labels with no predicted samples.


F-score is ill-defined and being set to 0.0 in labels with no predicted samples.



[NB-train]Accuracy:0.87 [NB-train]Recall:0.87 [NB-train]Precision:0.78 [NB-train]F1_score:0.81

INFRASTRUCTURE_RELATED
[NB-train]Accuracy:0.93 [NB-train]Recall:0.93 [NB-train]Precision:0.87 [NB-train]F1_score:0.90

TRANSPORT
[NB-train]Accuracy:0.95 [NB-train]Recall:0.95 [NB-train]Precision:0.91 [NB-train]F1_score:0.93

BUILDINGS
[NB-train]Accuracy:0.95 [NB-train]Recall:0.95 [NB-train]Precision:0.90 [NB-train]F1_score:0.92

ELECTRICITY
[NB-train]Accuracy:0.98 [NB-train]Recall:0.98 [NB-train]Precision:0.96 [NB-train]F1_score:0.97

TOOLS
[NB-train]Accuracy:0.99 [NB-train]Recall:0.99 [NB-train]Precision:0.99 [NB-train]F1_score:0.99

HOSPITALS
[NB-train]Accuracy:0.99 [NB-train]Recall:0.99 [NB-train]Precision:0.98 [NB-train]F1_score:0.98

SHOPS
[NB-train]Accuracy:1.00 [NB-train]Recall:1.00 [NB-train]Precision:0.99 [NB-train]F1_score:0.99

AID_CENTERS
[NB-train]Accuracy:0.99 [NB-train]Recall:0.99 [NB-train]Precision:0.98 [NB-train]F1_score:0.98

OTHER_INFRASTRUCTURE
[NB-train]Accuracy:0.96 [N

### 6. Improve your model
Use grid search to find better parameters. 

In [239]:
# List the parameter names that can be used as keys in a GridSearchCV param_grid.
# Query the pipeline directly: `cv` is only created in a later cell, so
# `cv.estimator` raises NameError when the notebook is run top to bottom on a
# fresh kernel. `cv.estimator` is this same pipeline, so the output is unchanged.
pipeline.get_params().keys()

dict_keys(['memory', 'steps', 'vect', 'tfidf', 'clf', 'vect__analyzer', 'vect__binary', 'vect__decode_error', 'vect__dtype', 'vect__encoding', 'vect__input', 'vect__lowercase', 'vect__max_df', 'vect__max_features', 'vect__min_df', 'vect__ngram_range', 'vect__preprocessor', 'vect__stop_words', 'vect__strip_accents', 'vect__token_pattern', 'vect__tokenizer', 'vect__vocabulary', 'tfidf__norm', 'tfidf__smooth_idf', 'tfidf__sublinear_tf', 'tfidf__use_idf', 'clf__estimator__alpha', 'clf__estimator__class_prior', 'clf__estimator__fit_prior', 'clf__estimator', 'clf__n_jobs'])

In [240]:
%%time
# Grid over the Naive Bayes estimator wrapped by MultiOutputClassifier:
# 2 values of fit_prior x 3 values of alpha = 6 candidates
parameters = {
     'clf__estimator__fit_prior': [True, False],
     'clf__estimator__alpha': [0.5, 0.7, 1.0],
}

# Candidates are ranked by weighted F1; the number of folds is left at the
# library default (the run log below reports 3 folds per candidate)
cv = GridSearchCV(pipeline, param_grid=parameters, verbose=3, scoring='f1_weighted')
cv.fit(X_train, y_train)

Fitting 3 folds for each of 6 candidates, totalling 18 fits
[CV] clf__estimator__alpha=0.5, clf__estimator__fit_prior=True .......



F-score is ill-defined and being set to 0.0 in labels with no predicted samples.



[CV]  clf__estimator__alpha=0.5, clf__estimator__fit_prior=True, score=0.42432332937896, total= 1.3min
[CV] clf__estimator__alpha=0.5, clf__estimator__fit_prior=True .......


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  2.2min remaining:    0.0s

F-score is ill-defined and being set to 0.0 in labels with no predicted samples.



[CV]  clf__estimator__alpha=0.5, clf__estimator__fit_prior=True, score=0.427523262446678, total= 1.3min
[CV] clf__estimator__alpha=0.5, clf__estimator__fit_prior=True .......


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  4.4min remaining:    0.0s

F-score is ill-defined and being set to 0.0 in labels with no predicted samples.



[CV]  clf__estimator__alpha=0.5, clf__estimator__fit_prior=True, score=0.42509663424052546, total= 1.3min
[CV] clf__estimator__alpha=0.5, clf__estimator__fit_prior=False ......
[CV]  clf__estimator__alpha=0.5, clf__estimator__fit_prior=False, score=0.5485838111599952, total= 1.3min
[CV] clf__estimator__alpha=0.5, clf__estimator__fit_prior=False ......
[CV]  clf__estimator__alpha=0.5, clf__estimator__fit_prior=False, score=0.5553410806381147, total= 1.3min
[CV] clf__estimator__alpha=0.5, clf__estimator__fit_prior=False ......
[CV]  clf__estimator__alpha=0.5, clf__estimator__fit_prior=False, score=0.5529338520373385, total= 1.3min
[CV] clf__estimator__alpha=0.7, clf__estimator__fit_prior=True .......



F-score is ill-defined and being set to 0.0 in labels with no predicted samples.



[CV]  clf__estimator__alpha=0.7, clf__estimator__fit_prior=True, score=0.40356563203729623, total= 1.3min
[CV] clf__estimator__alpha=0.7, clf__estimator__fit_prior=True .......



F-score is ill-defined and being set to 0.0 in labels with no predicted samples.



[CV]  clf__estimator__alpha=0.7, clf__estimator__fit_prior=True, score=0.4063683993090031, total= 1.3min
[CV] clf__estimator__alpha=0.7, clf__estimator__fit_prior=True .......



F-score is ill-defined and being set to 0.0 in labels with no predicted samples.



[CV]  clf__estimator__alpha=0.7, clf__estimator__fit_prior=True, score=0.4054391866030226, total= 1.3min
[CV] clf__estimator__alpha=0.7, clf__estimator__fit_prior=False ......
[CV]  clf__estimator__alpha=0.7, clf__estimator__fit_prior=False, score=0.5310054543547935, total= 1.3min
[CV] clf__estimator__alpha=0.7, clf__estimator__fit_prior=False ......
[CV]  clf__estimator__alpha=0.7, clf__estimator__fit_prior=False, score=0.5368838831905199, total= 1.3min
[CV] clf__estimator__alpha=0.7, clf__estimator__fit_prior=False ......
[CV]  clf__estimator__alpha=0.7, clf__estimator__fit_prior=False, score=0.5335136254582041, total= 1.3min
[CV] clf__estimator__alpha=1.0, clf__estimator__fit_prior=True .......



F-score is ill-defined and being set to 0.0 in labels with no predicted samples.



[CV]  clf__estimator__alpha=1.0, clf__estimator__fit_prior=True, score=0.38252428561332197, total= 1.3min
[CV] clf__estimator__alpha=1.0, clf__estimator__fit_prior=True .......



F-score is ill-defined and being set to 0.0 in labels with no predicted samples.



[CV]  clf__estimator__alpha=1.0, clf__estimator__fit_prior=True, score=0.3864090132550835, total= 1.3min
[CV] clf__estimator__alpha=1.0, clf__estimator__fit_prior=True .......



F-score is ill-defined and being set to 0.0 in labels with no predicted samples.



[CV]  clf__estimator__alpha=1.0, clf__estimator__fit_prior=True, score=0.38287483666463784, total= 1.3min
[CV] clf__estimator__alpha=1.0, clf__estimator__fit_prior=False ......
[CV]  clf__estimator__alpha=1.0, clf__estimator__fit_prior=False, score=0.5101624981581948, total= 1.3min
[CV] clf__estimator__alpha=1.0, clf__estimator__fit_prior=False ......
[CV]  clf__estimator__alpha=1.0, clf__estimator__fit_prior=False, score=0.5157069249728614, total= 1.3min
[CV] clf__estimator__alpha=1.0, clf__estimator__fit_prior=False ......
[CV]  clf__estimator__alpha=1.0, clf__estimator__fit_prior=False, score=0.5112901356661392, total= 1.3min


[Parallel(n_jobs=1)]: Done  18 out of  18 | elapsed: 39.1min finished


CPU times: user 37min 8s, sys: 2min 56s, total: 40min 4s
Wall time: 40min 21s


### 7. Test your model
Show the accuracy, precision, and recall of the tuned model.  


In [242]:
%%time
# Predict with the grid-search object on both splits; with GridSearchCV's default
# refit behaviour this uses the best parameter combination found by the search
y_preds_best = cv.predict(X_test)
y_preds_best_train = cv.predict(X_train)

CPU times: user 1min 43s, sys: 8.2 s, total: 1min 51s
Wall time: 1min 52s


In [241]:
# Pull the classifier step out of the best pipeline found by the grid search,
# looked up by its step name rather than by position
best_model = cv.best_estimator_.named_steps['clf']
best_model

MultiOutputClassifier(estimator=MultinomialNB(alpha=0.5, class_prior=None, fit_prior=False),
           n_jobs=1)

In [243]:
# Per-category metrics of the tuned Naive Bayes model on the test split
for col, category in enumerate(labels):
    print(category.upper())
    evaluate_model(y_test[:, col], y_preds_best[:, col], 'NB-tuned')

RELATED
[NB-tuned]Accuracy:0.82 [NB-tuned]Recall:0.82 [NB-tuned]Precision:0.82 [NB-tuned]F1_score:0.82

REQUEST
[NB-tuned]Accuracy:0.86 [NB-tuned]Recall:0.86 [NB-tuned]Precision:0.87 [NB-tuned]F1_score:0.87

OFFER
[NB-tuned]Accuracy:0.99 [NB-tuned]Recall:0.99 [NB-tuned]Precision:0.99 [NB-tuned]F1_score:0.99

AID_RELATED
[NB-tuned]Accuracy:0.74 [NB-tuned]Recall:0.74 [NB-tuned]Precision:0.75 [NB-tuned]F1_score:0.74

MEDICAL_HELP
[NB-tuned]Accuracy:0.90 [NB-tuned]Recall:0.90 [NB-tuned]Precision:0.90 [NB-tuned]F1_score:0.90

MEDICAL_PRODUCTS
[NB-tuned]Accuracy:0.93 [NB-tuned]Recall:0.93 [NB-tuned]Precision:0.93 [NB-tuned]F1_score:0.93

SEARCH_AND_RESCUE
[NB-tuned]Accuracy:0.96 [NB-tuned]Recall:0.96 [NB-tuned]Precision:0.95 [NB-tuned]F1_score:0.96

SECURITY
[NB-tuned]Accuracy:0.97 [NB-tuned]Recall:0.97 [NB-tuned]Precision:0.97 [NB-tuned]F1_score:0.97

MILITARY
[NB-tuned]Accuracy:0.96 [NB-tuned]Recall:0.96 [NB-tuned]Precision:0.96 [NB-tuned]F1_score:0.96

WATER
[NB-tuned]Accuracy:0.92 [NB-tu

In [244]:
# Per-category metrics of the tuned Naive Bayes model on the training split
for col, category in enumerate(labels):
    print(category.upper())
    evaluate_model(y_train[:, col], y_preds_best_train[:, col], 'NBtuned-train')

RELATED
[NBtuned-train]Accuracy:0.89 [NBtuned-train]Recall:0.89 [NBtuned-train]Precision:0.89 [NBtuned-train]F1_score:0.89

REQUEST
[NBtuned-train]Accuracy:0.88 [NBtuned-train]Recall:0.88 [NBtuned-train]Precision:0.90 [NBtuned-train]F1_score:0.89

OFFER
[NBtuned-train]Accuracy:0.98 [NBtuned-train]Recall:0.98 [NBtuned-train]Precision:0.99 [NBtuned-train]F1_score:0.99

AID_RELATED
[NBtuned-train]Accuracy:0.86 [NBtuned-train]Recall:0.86 [NBtuned-train]Precision:0.86 [NBtuned-train]F1_score:0.86

MEDICAL_HELP
[NBtuned-train]Accuracy:0.92 [NBtuned-train]Recall:0.92 [NBtuned-train]Precision:0.92 [NBtuned-train]F1_score:0.92

MEDICAL_PRODUCTS
[NBtuned-train]Accuracy:0.93 [NBtuned-train]Recall:0.93 [NBtuned-train]Precision:0.94 [NBtuned-train]F1_score:0.94

SEARCH_AND_RESCUE
[NBtuned-train]Accuracy:0.95 [NBtuned-train]Recall:0.95 [NBtuned-train]Precision:0.96 [NBtuned-train]F1_score:0.96

SECURITY
[NBtuned-train]Accuracy:0.96 [NBtuned-train]Recall:0.96 [NBtuned-train]Precision:0.97 [NBtuned-tr

### 8. Try improving model further:

In [245]:
%%time
# Linear SVM
pipeline_svm = Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer()),
        ('clf', MultiOutputClassifier(SGDClassifier(loss='hinge', penalty='l2'))),
 ])
pipeline_svm.fit(X_train, y_train)

CPU times: user 1min 12s, sys: 5.58 s, total: 1min 17s
Wall time: 1min 18s


In [246]:
# SVM predictions for the held-out test messages
y_preds_svm = pipeline_svm.predict(X_test)

In [247]:
# SVM predictions on the training messages, for comparison against the test-split scores
y_preds_svm_train = pipeline_svm.predict(X_train)

In [248]:
# Per-category metrics of the untuned SVM on the training split
for col, category in enumerate(labels):
    print(category.upper())
    evaluate_model(y_train[:, col], y_preds_svm_train[:, col], 'SVM-TRAIN')

RELATED
[SVM-TRAIN]Accuracy:0.89 [SVM-TRAIN]Recall:0.89 [SVM-TRAIN]Precision:0.89 [SVM-TRAIN]F1_score:0.88

REQUEST
[SVM-TRAIN]Accuracy:0.92 [SVM-TRAIN]Recall:0.92 [SVM-TRAIN]Precision:0.92 [SVM-TRAIN]F1_score:0.92

OFFER
[SVM-TRAIN]Accuracy:1.00 [SVM-TRAIN]Recall:1.00 [SVM-TRAIN]Precision:0.99 [SVM-TRAIN]F1_score:0.99

AID_RELATED
[SVM-TRAIN]Accuracy:0.85 [SVM-TRAIN]Recall:0.85 [SVM-TRAIN]Precision:0.85 [SVM-TRAIN]F1_score:0.85

MEDICAL_HELP
[SVM-TRAIN]Accuracy:0.94 [SVM-TRAIN]Recall:0.94 [SVM-TRAIN]Precision:0.93 [SVM-TRAIN]F1_score:0.92

MEDICAL_PRODUCTS
[SVM-TRAIN]Accuracy:0.96 [SVM-TRAIN]Recall:0.96 [SVM-TRAIN]Precision:0.96 [SVM-TRAIN]F1_score:0.95

SEARCH_AND_RESCUE
[SVM-TRAIN]Accuracy:0.98 [SVM-TRAIN]Recall:0.98 [SVM-TRAIN]Precision:0.97 [SVM-TRAIN]F1_score:0.97

SECURITY
[SVM-TRAIN]Accuracy:0.98 [SVM-TRAIN]Recall:0.98 [SVM-TRAIN]Precision:0.96 [SVM-TRAIN]F1_score:0.97

MILITARY
[SVM-TRAIN]Accuracy:0.98 [SVM-TRAIN]Recall:0.98 [SVM-TRAIN]Precision:0.98 [SVM-TRAIN]F1_score:0.97




Precision is ill-defined and being set to 0.0 in labels with no predicted samples.


F-score is ill-defined and being set to 0.0 in labels with no predicted samples.



[SVM-TRAIN]Accuracy:0.87 [SVM-TRAIN]Recall:0.87 [SVM-TRAIN]Precision:0.87 [SVM-TRAIN]F1_score:0.82

INFRASTRUCTURE_RELATED
[SVM-TRAIN]Accuracy:0.94 [SVM-TRAIN]Recall:0.94 [SVM-TRAIN]Precision:0.94 [SVM-TRAIN]F1_score:0.90

TRANSPORT
[SVM-TRAIN]Accuracy:0.96 [SVM-TRAIN]Recall:0.96 [SVM-TRAIN]Precision:0.96 [SVM-TRAIN]F1_score:0.95

BUILDINGS
[SVM-TRAIN]Accuracy:0.97 [SVM-TRAIN]Recall:0.97 [SVM-TRAIN]Precision:0.96 [SVM-TRAIN]F1_score:0.96

ELECTRICITY
[SVM-TRAIN]Accuracy:0.98 [SVM-TRAIN]Recall:0.98 [SVM-TRAIN]Precision:0.98 [SVM-TRAIN]F1_score:0.97

TOOLS
[SVM-TRAIN]Accuracy:0.99 [SVM-TRAIN]Recall:0.99 [SVM-TRAIN]Precision:0.99 [SVM-TRAIN]F1_score:0.99

HOSPITALS
[SVM-TRAIN]Accuracy:0.99 [SVM-TRAIN]Recall:0.99 [SVM-TRAIN]Precision:0.98 [SVM-TRAIN]F1_score:0.98

SHOPS
[SVM-TRAIN]Accuracy:1.00 [SVM-TRAIN]Recall:1.00 [SVM-TRAIN]Precision:0.99 [SVM-TRAIN]F1_score:0.99

AID_CENTERS
[SVM-TRAIN]Accuracy:0.99 [SVM-TRAIN]Recall:0.99 [SVM-TRAIN]Precision:0.98 [SVM-TRAIN]F1_score:0.98

OTHER_INFRA

In [249]:
# Per-category metrics of the untuned SVM on the test split
for col, category in enumerate(labels):
    print(category.upper())
    evaluate_model(y_test[:, col], y_preds_svm[:, col], 'SVM')

RELATED
[SVM]Accuracy:0.82 [SVM]Recall:0.82 [SVM]Precision:0.81 [SVM]F1_score:0.80

REQUEST
[SVM]Accuracy:0.91 [SVM]Recall:0.91 [SVM]Precision:0.90 [SVM]F1_score:0.90

OFFER
[SVM]Accuracy:0.99 [SVM]Recall:0.99 [SVM]Precision:0.99 [SVM]F1_score:0.99

AID_RELATED
[SVM]Accuracy:0.78 [SVM]Recall:0.78 [SVM]Precision:0.78 [SVM]F1_score:0.78

MEDICAL_HELP
[SVM]Accuracy:0.93 [SVM]Recall:0.93 [SVM]Precision:0.91 [SVM]F1_score:0.91

MEDICAL_PRODUCTS
[SVM]Accuracy:0.96 [SVM]Recall:0.96 [SVM]Precision:0.95 [SVM]F1_score:0.95

SEARCH_AND_RESCUE
[SVM]Accuracy:0.98 [SVM]Recall:0.98 [SVM]Precision:0.97 [SVM]F1_score:0.97

SECURITY
[SVM]Accuracy:0.98 [SVM]Recall:0.98 [SVM]Precision:0.97 [SVM]F1_score:0.97

MILITARY
[SVM]Accuracy:0.97 [SVM]Recall:0.97 [SVM]Precision:0.96 [SVM]F1_score:0.96

WATER
[SVM]Accuracy:0.96 [SVM]Recall:0.96 [SVM]Precision:0.96 [SVM]F1_score:0.96

FOOD
[SVM]Accuracy:0.95 [SVM]Recall:0.95 [SVM]Precision:0.95 [SVM]F1_score:0.95

SHELTER
[SVM]Accuracy:0.95 [SVM]Recall:0.95 [SVM]Prec


Precision is ill-defined and being set to 0.0 in labels with no predicted samples.


F-score is ill-defined and being set to 0.0 in labels with no predicted samples.



In [62]:
# Tune some hyperparameters of the SVM, since it performed better than Naive Bayes

In [250]:
# Display the classifier step to see which SGDClassifier hyperparameters are currently set
pipeline_svm.named_steps['clf']

MultiOutputClassifier(estimator=SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=None, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False),
           n_jobs=1)

In [251]:
%%time
# Search over the SGD regularisation strength only (2 candidates).
# NOTE: this rebinds `parameters` and `cv`, replacing the Naive Bayes
# grid-search objects created earlier in the notebook.
parameters = {
    'clf__estimator__alpha': [0.001, 0.0001],
}
cv = GridSearchCV(pipeline_svm, param_grid=parameters, verbose=3, scoring='f1_weighted')
cv.fit(X_train, y_train)

Fitting 3 folds for each of 2 candidates, totalling 6 fits
[CV] clf__estimator__alpha=0.001 .....................................



F-score is ill-defined and being set to 0.0 in labels with no predicted samples.



[CV]  clf__estimator__alpha=0.001, score=0.4474825083154286, total= 1.3min
[CV] clf__estimator__alpha=0.001 .....................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  2.2min remaining:    0.0s

F-score is ill-defined and being set to 0.0 in labels with no predicted samples.



[CV]  clf__estimator__alpha=0.001, score=0.45180161041942135, total= 1.3min
[CV] clf__estimator__alpha=0.001 .....................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  4.4min remaining:    0.0s

F-score is ill-defined and being set to 0.0 in labels with no predicted samples.



[CV]  clf__estimator__alpha=0.001, score=0.4492263753005728, total= 1.3min
[CV] clf__estimator__alpha=0.0001 ....................................



F-score is ill-defined and being set to 0.0 in labels with no predicted samples.



[CV]  clf__estimator__alpha=0.0001, score=0.6169196250409618, total= 1.3min
[CV] clf__estimator__alpha=0.0001 ....................................



F-score is ill-defined and being set to 0.0 in labels with no predicted samples.



[CV]  clf__estimator__alpha=0.0001, score=0.6176810922056178, total= 1.3min
[CV] clf__estimator__alpha=0.0001 ....................................



F-score is ill-defined and being set to 0.0 in labels with no predicted samples.



[CV]  clf__estimator__alpha=0.0001, score=0.6184696920223226, total= 1.3min


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed: 13.1min finished


CPU times: user 13min 13s, sys: 1min 2s, total: 14min 16s
Wall time: 14min 22s


In [252]:
# Classifier step of the best SVM pipeline; this rebinds `best_model`, which
# previously held the tuned Naive Bayes classifier
best_model = cv.best_estimator_.named_steps['clf']
best_model

MultiOutputClassifier(estimator=SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=None, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False),
           n_jobs=1)

In [253]:
%%time
# Predictions from the SVM grid-search object on the test split
y_preds_svm_opt = cv.predict(X_test)
# NOTE: this overwrites y_preds_svm_train, which earlier held the untuned SVM's
# training-split predictions
y_preds_svm_train = cv.predict(X_train)

CPU times: user 1min 43s, sys: 8.26 s, total: 1min 51s
Wall time: 1min 52s


In [254]:
# Per-category metrics of the tuned SVM on the test split
for col, category in enumerate(labels):
    print(category.upper())
    evaluate_model(y_test[:, col], y_preds_svm_opt[:, col], 'SVM-opt')

RELATED
[SVM-opt]Accuracy:0.82 [SVM-opt]Recall:0.82 [SVM-opt]Precision:0.81 [SVM-opt]F1_score:0.81

REQUEST
[SVM-opt]Accuracy:0.90 [SVM-opt]Recall:0.90 [SVM-opt]Precision:0.90 [SVM-opt]F1_score:0.90

OFFER
[SVM-opt]Accuracy:0.99 [SVM-opt]Recall:0.99 [SVM-opt]Precision:0.99 [SVM-opt]F1_score:0.99

AID_RELATED
[SVM-opt]Accuracy:0.78 [SVM-opt]Recall:0.78 [SVM-opt]Precision:0.78 [SVM-opt]F1_score:0.78

MEDICAL_HELP
[SVM-opt]Accuracy:0.93 [SVM-opt]Recall:0.93 [SVM-opt]Precision:0.91 [SVM-opt]F1_score:0.91

MEDICAL_PRODUCTS
[SVM-opt]Accuracy:0.96 [SVM-opt]Recall:0.96 [SVM-opt]Precision:0.95 [SVM-opt]F1_score:0.95

SEARCH_AND_RESCUE
[SVM-opt]Accuracy:0.98 [SVM-opt]Recall:0.98 [SVM-opt]Precision:0.97 [SVM-opt]F1_score:0.97

SECURITY
[SVM-opt]Accuracy:0.98 [SVM-opt]Recall:0.98 [SVM-opt]Precision:0.97 [SVM-opt]F1_score:0.97

MILITARY
[SVM-opt]Accuracy:0.97 [SVM-opt]Recall:0.97 [SVM-opt]Precision:0.96 [SVM-opt]F1_score:0.96

WATER
[SVM-opt]Accuracy:0.96 [SVM-opt]Recall:0.96 [SVM-opt]Precision:0.9


Precision is ill-defined and being set to 0.0 in labels with no predicted samples.


F-score is ill-defined and being set to 0.0 in labels with no predicted samples.



In [205]:
# Per-category metrics of the tuned SVM on the training split
for col, category in enumerate(labels):
    print(category.upper())
    evaluate_model(y_train[:, col], y_preds_svm_train[:, col], 'SVM-opt-train')

RELATED
[SVM-opt-train]Accuracy:0.88 [SVM-opt-train]Recall:0.88 [SVM-opt-train]Precision:0.88 [SVM-opt-train]F1_score:0.87

REQUEST
[SVM-opt-train]Accuracy:0.92 [SVM-opt-train]Recall:0.92 [SVM-opt-train]Precision:0.92 [SVM-opt-train]F1_score:0.91

OFFER
[SVM-opt-train]Accuracy:1.00 [SVM-opt-train]Recall:1.00 [SVM-opt-train]Precision:0.99 [SVM-opt-train]F1_score:0.99

AID_RELATED
[SVM-opt-train]Accuracy:0.85 [SVM-opt-train]Recall:0.85 [SVM-opt-train]Precision:0.85 [SVM-opt-train]F1_score:0.85

MEDICAL_HELP
[SVM-opt-train]Accuracy:0.94 [SVM-opt-train]Recall:0.94 [SVM-opt-train]Precision:0.93 [SVM-opt-train]F1_score:0.92

MEDICAL_PRODUCTS
[SVM-opt-train]Accuracy:0.96 [SVM-opt-train]Recall:0.96 [SVM-opt-train]Precision:0.96 [SVM-opt-train]F1_score:0.95

SEARCH_AND_RESCUE
[SVM-opt-train]Accuracy:0.98 [SVM-opt-train]Recall:0.98 [SVM-opt-train]Precision:0.97 [SVM-opt-train]F1_score:0.97

SECURITY
[SVM-opt-train]Accuracy:0.98 [SVM-opt-train]Recall:0.98 [SVM-opt-train]Precision:0.96 [SVM-opt-tr

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


[SVM-opt-train]Accuracy:0.98 [SVM-opt-train]Recall:0.98 [SVM-opt-train]Precision:0.97 [SVM-opt-train]F1_score:0.97

OTHER_AID
[SVM-opt-train]Accuracy:0.87 [SVM-opt-train]Recall:0.87 [SVM-opt-train]Precision:0.87 [SVM-opt-train]F1_score:0.82

INFRASTRUCTURE_RELATED
[SVM-opt-train]Accuracy:0.94 [SVM-opt-train]Recall:0.94 [SVM-opt-train]Precision:0.94 [SVM-opt-train]F1_score:0.90

TRANSPORT
[SVM-opt-train]Accuracy:0.96 [SVM-opt-train]Recall:0.96 [SVM-opt-train]Precision:0.96 [SVM-opt-train]F1_score:0.95

BUILDINGS
[SVM-opt-train]Accuracy:0.96 [SVM-opt-train]Recall:0.96 [SVM-opt-train]Precision:0.96 [SVM-opt-train]F1_score:0.96

ELECTRICITY
[SVM-opt-train]Accuracy:0.98 [SVM-opt-train]Recall:0.98 [SVM-opt-train]Precision:0.98 [SVM-opt-train]F1_score:0.98

TOOLS
[SVM-opt-train]Accuracy:0.99 [SVM-opt-train]Recall:0.99 [SVM-opt-train]Precision:0.99 [SVM-opt-train]F1_score:0.99

HOSPITALS
[SVM-opt-train]Accuracy:0.99 [SVM-opt-train]Recall:0.99 [SVM-opt-train]Precision:0.98 [SVM-opt-train]F1_sco

### 9. Export your model as a pickle file

In [255]:
# Persist the complete fitted pipeline (vectorizer + TF-IDF + classifier) rather
# than only its final `clf` step: the classifier on its own cannot score raw
# message text, because the fitted CountVectorizer and TfidfTransformer that
# produce its input features would be missing from the saved file.
# NOTE: pickle stores the custom `tokenize` function by reference, so a function
# with that name must be defined/importable wherever model.pkl is loaded.
with open('model.pkl', 'wb') as file:
    pickle.dump(cv.best_estimator_, file)