# ML Pipeline Preparation
Follow the instructions below to help you create your ML pipeline.
### 1. Import libraries and load data from database.
- Import Python libraries
- Load dataset from database with [`read_sql_table`](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_sql_table.html)
- Define feature and target variables X and Y

In [None]:
! pip install --upgrade setuptools
! pip install --upgrade pip
! pip install xgboost

In [None]:
import nltk
nltk.download('stopwords')
nltk.download(['punkt', 'wordnet'])
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')
from sqlalchemy import create_engine
import pandas as pd
import sqlite3
import numpy as np
import re

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.metrics import confusion_matrix,f1_score, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import  train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.base import BaseEstimator, TransformerMixin

import xgboost as xgb


In [6]:
# load data from database
engine = create_engine(
    'sqlite:///..\data\disaster_records.db')

#read table and separate X and Y features
df = pd.read_sql_table('disaster_table', engine)



In [7]:
df.describe()

Unnamed: 0,id,related,request,offer,aid_related,medical_help,medical_products,search_and_rescue,security,military,...,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
count,26215.0,26215.0,26215.0,26215.0,26215.0,26215.0,26215.0,26215.0,26215.0,26215.0,...,26215.0,26215.0,26215.0,26215.0,26215.0,26215.0,26215.0,26215.0,26215.0,26215.0
mean,15224.871333,0.76647,0.170666,0.004501,0.414267,0.079496,0.050086,0.027618,0.017967,0.032806,...,0.011787,0.043906,0.278352,0.082205,0.093191,0.010757,0.093649,0.020217,0.052489,0.193591
std,8827.053788,0.423085,0.376224,0.066941,0.492604,0.270517,0.218126,0.163878,0.132833,0.178131,...,0.107929,0.20489,0.448196,0.274682,0.290705,0.10316,0.291345,0.140746,0.223015,0.39512
min,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,7446.5,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,15663.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,22924.5,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,30265.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


We want to be aware of edge cases like this. blank tweets, or all space tweet could break a model. We will add some try/except nonetype where needed


In [8]:
df.columns

Index(['id', 'message', 'original', 'genre', 'related', 'request', 'offer',
       'aid_related', 'medical_help', 'medical_products', 'search_and_rescue',
       'security', 'military', 'child_alone', 'water', 'food', 'shelter',
       'clothing', 'money', 'missing_people', 'refugees', 'death', 'other_aid',
       'infrastructure_related', 'transport', 'buildings', 'electricity',
       'tools', 'hospitals', 'shops', 'aid_centers', 'other_infrastructure',
       'weather_related', 'floods', 'storm', 'fire', 'earthquake', 'cold',
       'other_weather', 'direct_report'],
      dtype='object')

In [9]:
# child_alone has only 0 values, so we will remove
df=df.drop(['child_alone'],axis=1)

In [10]:
X = df.iloc[:, 1].values
y = df.iloc[:,4:].values

### 2. Write a tokenization function to process your text data

In [11]:
stop_words = stopwords.words("english")
lemmatizer = WordNetLemmatizer()

#Tokenize message

def tokenize(text):
    url_regex = 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    detected_urls = re.findall(url_regex, text)
    for url in detected_urls:
        text = text.replace(url, "urlplaceholder")
    
    tokens = word_tokenize(text)
    lemmatizer = WordNetLemmatizer()
    
    clean_tokens = []
    for tok in tokens:
        clean_tok = lemmatizer.lemmatize(tok).lower().strip()
        clean_tokens.append(clean_tok)
    
    return clean_tokens

### 3. Build a machine learning pipeline
This machine pipeline should take in the `message` column as input and output classification results on the other 36 categories in the dataset. You may find the [MultiOutputClassifier](http://scikit-learn.org/stable/modules/generated/sklearn.multioutput.MultiOutputClassifier.html) helpful for predicting multiple target variables.

In [12]:
pipeline = Pipeline([
    ('vect', CountVectorizer(tokenizer=tokenize)),
    ('tfidf', TfidfTransformer()),
    ('clf', MultiOutputClassifier(RandomForestClassifier()))
])

### 4. Train pipeline
- Split data into train and test sets
- Train pipeline

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

### 5. Test your model
Report the f1 score, precision and recall for each output category of the dataset. You can do this by iterating through the columns and calling sklearn's `classification_report` on each.

In [14]:
pipeline.fit(X_train, y_train)
y_pred1 = pipeline.predict(X_test)

In [15]:
labels = list(df.iloc[:,4:].columns)

In [16]:
print(classification_report(y_test,y_pred1, target_names = labels))

                        precision    recall  f1-score   support

               related       0.81      0.97      0.88      5020
               request       0.90      0.41      0.56      1148
                 offer       0.00      0.00      0.00        35
           aid_related       0.78      0.61      0.68      2742
          medical_help       0.54      0.04      0.07       513
      medical_products       0.67      0.07      0.12       330
     search_and_rescue       0.62      0.04      0.08       183
              security       0.67      0.02      0.03       118
              military       0.78      0.03      0.06       222
                 water       0.91      0.21      0.34       399
                  food       0.90      0.39      0.55       728
               shelter       0.80      0.23      0.36       587
              clothing       0.73      0.08      0.15        98
                 money       1.00      0.03      0.05       148
        missing_people       0.00      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [17]:
def categoryClassificationReport(labels,y_test,y_pred):
    
    for index, label in enumerate(labels):
        classification = classification_report(y_test[:,index-1], y_pred[:,index-1]);
        print('----------------------------\n')
        print(label,"\n",classification)
    return

In [18]:
categoryClassificationReport(labels,y_test,y_pred1)

----------------------------

related 
               precision    recall  f1-score   support

           0       0.86      0.99      0.92      5280
           1       0.86      0.36      0.51      1274

    accuracy                           0.86      6554
   macro avg       0.86      0.67      0.71      6554
weighted avg       0.86      0.86      0.84      6554

----------------------------

request 
               precision    recall  f1-score   support

           0       0.73      0.25      0.37      1534
           1       0.81      0.97      0.88      5020

    accuracy                           0.80      6554
   macro avg       0.77      0.61      0.63      6554
weighted avg       0.79      0.80      0.76      6554

----------------------------

offer 
               precision    recall  f1-score   support

           0       0.89      0.99      0.94      5406
           1       0.90      0.41      0.56      1148

    accuracy                           0.89      6554
   macro a

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



food 
               precision    recall  f1-score   support

           0       0.95      1.00      0.97      6155
           1       0.91      0.21      0.34       399

    accuracy                           0.95      6554
   macro avg       0.93      0.60      0.66      6554
weighted avg       0.95      0.95      0.94      6554

----------------------------

shelter 
               precision    recall  f1-score   support

           0       0.93      0.99      0.96      5826
           1       0.90      0.39      0.55       728

    accuracy                           0.93      6554
   macro avg       0.91      0.69      0.75      6554
weighted avg       0.93      0.93      0.91      6554

----------------------------

clothing 
               precision    recall  f1-score   support

           0       0.93      0.99      0.96      5967
           1       0.80      0.23      0.36       587

    accuracy                           0.93      6554
   macro avg       0.86      0.61      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


----------------------------

hospitals 
               precision    recall  f1-score   support

           0       0.99      1.00      1.00      6515
           1       0.00      0.00      0.00        39

    accuracy                           0.99      6554
   macro avg       0.50      0.50      0.50      6554
weighted avg       0.99      0.99      0.99      6554

----------------------------

shops 
               precision    recall  f1-score   support

           0       0.99      1.00      1.00      6491
           1       0.00      0.00      0.00        63

    accuracy                           0.99      6554
   macro avg       0.50      0.50      0.50      6554
weighted avg       0.98      0.99      0.99      6554

----------------------------

aid_centers 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      6525
           1       0.00      0.00      0.00        29

    accuracy                           1.00      6554
   m

### 6. Improve your model
Use grid search to find better parameters. 

In [19]:
pipeline.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'vect', 'tfidf', 'clf', 'vect__analyzer', 'vect__binary', 'vect__decode_error', 'vect__dtype', 'vect__encoding', 'vect__input', 'vect__lowercase', 'vect__max_df', 'vect__max_features', 'vect__min_df', 'vect__ngram_range', 'vect__preprocessor', 'vect__stop_words', 'vect__strip_accents', 'vect__token_pattern', 'vect__tokenizer', 'vect__vocabulary', 'tfidf__norm', 'tfidf__smooth_idf', 'tfidf__sublinear_tf', 'tfidf__use_idf', 'clf__estimator__bootstrap', 'clf__estimator__ccp_alpha', 'clf__estimator__class_weight', 'clf__estimator__criterion', 'clf__estimator__max_depth', 'clf__estimator__max_features', 'clf__estimator__max_leaf_nodes', 'clf__estimator__max_samples', 'clf__estimator__min_impurity_decrease', 'clf__estimator__min_samples_leaf', 'clf__estimator__min_samples_split', 'clf__estimator__min_weight_fraction_leaf', 'clf__estimator__n_estimators', 'clf__estimator__n_jobs', 'clf__estimator__oob_score', 'clf__estimator__random_state', 'clf__estim

In [20]:
parameters = {
    'clf__estimator__max_depth': [2, 5, 10, 15, 20],
    'clf__estimator__min_samples_split': [2, 3, 4, 5, 10],
    'clf__estimator__n_estimators': [5, 50, 100, 250]
}

cv = GridSearchCV(pipeline, param_grid = parameters, n_jobs=-1, scoring = "f1_samples", verbose=2, cv=2)

### 7. Test  model
Show the accuracy, precision, and recall of the tuned model.  

Since this project focuses on code quality, process, and  pipelines, there is no minimum performance metric needed to pass. However, make sure to fine tune your models for accuracy, precision and recall to make your project stand out - especially for your portfolio!

In [21]:
cv.fit(X_train, y_train)
y_pred = cv.predict(X_test)

Fitting 2 folds for each of 100 candidates, totalling 200 fits


In [22]:
cv.best_params_

{'clf__estimator__max_depth': 20,
 'clf__estimator__min_samples_split': 5,
 'clf__estimator__n_estimators': 5}

In [23]:
cv.best_score_

0.4455325433419327

In [24]:
print(classification_report(y_test, y_pred, target_names = labels))

                        precision    recall  f1-score   support

               related       0.77      1.00      0.87      5020
               request       0.75      0.09      0.16      1148
                 offer       0.00      0.00      0.00        35
           aid_related       0.77      0.33      0.47      2742
          medical_help       0.75      0.02      0.03       513
      medical_products       0.50      0.00      0.01       330
     search_and_rescue       0.00      0.00      0.00       183
              security       0.00      0.00      0.00       118
              military       0.62      0.02      0.04       222
                 water       0.33      0.00      0.00       399
                  food       0.87      0.05      0.10       728
               shelter       0.71      0.02      0.04       587
              clothing       0.68      0.19      0.30        98
                 money       0.00      0.00      0.00       148
        missing_people       0.00      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [25]:
categoryClassificationReport(labels,y_test,y_pred)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


----------------------------

related 
               precision    recall  f1-score   support

           0       0.81      1.00      0.89      5280
           1       0.78      0.03      0.06      1274

    accuracy                           0.81      6554
   macro avg       0.80      0.52      0.48      6554
weighted avg       0.80      0.81      0.73      6554

----------------------------

request 
               precision    recall  f1-score   support

           0       0.74      0.03      0.05      1534
           1       0.77      1.00      0.87      5020

    accuracy                           0.77      6554
   macro avg       0.75      0.51      0.46      6554
weighted avg       0.76      0.77      0.68      6554

----------------------------

offer 
               precision    recall  f1-score   support

           0       0.84      0.99      0.91      5406
           1       0.75      0.09      0.16      1148

    accuracy                           0.84      6554
   macro a

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


shelter 
               precision    recall  f1-score   support

           0       0.89      1.00      0.94      5826
           1       0.87      0.05      0.10       728

    accuracy                           0.89      6554
   macro avg       0.88      0.53      0.52      6554
weighted avg       0.89      0.89      0.85      6554

----------------------------

clothing 
               precision    recall  f1-score   support

           0       0.91      1.00      0.95      5967
           1       0.71      0.02      0.04       587

    accuracy                           0.91      6554
   macro avg       0.81      0.51      0.50      6554
weighted avg       0.89      0.91      0.87      6554

----------------------------

money 
               precision    recall  f1-score   support

           0       0.99      1.00      0.99      6456
           1       0.68      0.19      0.30        98

    accuracy                           0.99      6554
   macro avg       0.83      0.60      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

----------------------------

hospitals 
               precision    recall  f1-score   support

           0       0.99      1.00      1.00      6515
           1       0.00      0.00      0.00        39

    accuracy                           0.99      6554
   macro avg       0.50      0.50      0.50      6554
weighted avg       0.99      0.99      0.99      6554

----------------------------

shops 
               precision    recall  f1-score   support

           0       0.99      1.00      1.00      6491
           1       0.00      0.00      0.00        63

    accuracy                           0.99      6554
   macro avg       0.50      0.50      0.50      6554
weighted avg       0.98      0.99      0.99      6554

----------------------------

aid_centers 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      6525
           1       0.00      0.00      0.00        29

    accuracy                           1.00      6554
   m

### 7.B  Compare  model to original

In [26]:
pipeline_HP = Pipeline([
    ('vect', CountVectorizer(tokenizer=tokenize)),
    ('tfidf', TfidfTransformer()),
    ('clf', MultiOutputClassifier(RandomForestClassifier(min_samples_split= 5,n_estimators = 5, max_depth = 20)))
])

In [27]:
pipeline_HP.fit(X_train, y_train, )
y_pred_HP = pipeline_HP.predict(X_test)

In [28]:
print(f'Original Score',f1_score(y_test, y_pred1, average = 'samples'))
print(f'New Score',f1_score(y_test, y_pred_HP, average = 'samples'))

Original Score 0.5093121008458176
New Score 0.4429334438955722


  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))


### 8. Try improving your model further. Here are a few ideas:
* try other machine learning algorithms
* add other features besides the TF-IDF

In [29]:
#XGBoost, using mlogloss for multi-label classification
pipeline_xg = Pipeline([
    ('vect', CountVectorizer(tokenizer=tokenize)),
    ('tfidf', TfidfTransformer()),
    ('xg', MultiOutputClassifier(xgb.XGBClassifier(eval_metric='mlogloss',use_label_encoder=False)))
])

In [30]:
pipeline_xg.fit(X_train,y_train)

In [31]:
y_pred_xg = pipeline_xg.predict(X_test)

In [32]:
print(f'New Score',f1_score(y_test, y_pred_xg, average = 'samples'))

New Score 0.5272281324015899


  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))


## New Feature Extraction

In [33]:
class StartingVerbExtractor(BaseEstimator, TransformerMixin):

    def starting_verb(self, text):
        sentence_list = nltk.sent_tokenize(text)
        for sentence in sentence_list:
            pos_tags = nltk.pos_tag(tokenize(sentence))
            first_word, first_tag = pos_tags[0]
            # return true if the first word is an appropriate verb or RT for retweet
            if first_tag in ['VB', 'VBP'] or first_word == 'RT':
                return True
        return False

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X_tagged = pd.Series(X).apply(self.starting_verb)
        return pd.DataFrame(X_tagged)



In [34]:
class StartingPronounExtractor(BaseEstimator, TransformerMixin):

    def starting_pronoun(self, text):
        sentence_list = nltk.sent_tokenize(text)
        for sentence in sentence_list:
            pos_tags = nltk.pos_tag(tokenize(sentence))
            first_word, first_tag = pos_tags[0]
            # return true if the first word is an appropriate verb or RT for retweet
            if first_tag in ['PRP', 'PRP$']:
                return True
        return False

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X_tagged = pd.Series(X).apply(self.starting_pronoun)
        return pd.DataFrame(X_tagged)


In [35]:
pipeline_feature = Pipeline([
    ('features', FeatureUnion([

        ('nlp_pipeline', Pipeline([
            ('vect', CountVectorizer(tokenizer=tokenize)),
            ('tfidf', TfidfTransformer())
        ])),

        ('prnoun', StartingPronounExtractor())
    ])),

    ('xg', MultiOutputClassifier(xgb.XGBClassifier(eval_metric='mlogloss',use_label_encoder=False)))
])

In [36]:
pipeline_feature.fit(X_train,y_train)

In [37]:
y_pred_feat = pipeline_feature.predict(X_test)

In [38]:
print(classification_report(y_test, y_pred_feat, target_names = labels))

                        precision    recall  f1-score   support

               related       0.84      0.94      0.89      5020
               request       0.79      0.58      0.67      1148
                 offer       0.00      0.00      0.00        35
           aid_related       0.77      0.63      0.69      2742
          medical_help       0.58      0.26      0.36       513
      medical_products       0.64      0.28      0.39       330
     search_and_rescue       0.67      0.19      0.29       183
              security       0.43      0.03      0.05       118
              military       0.67      0.30      0.42       222
                 water       0.78      0.64      0.71       399
                  food       0.81      0.76      0.78       728
               shelter       0.76      0.56      0.64       587
              clothing       0.69      0.48      0.57        98
                 money       0.55      0.26      0.35       148
        missing_people       0.59      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [39]:
categoryClassificationReport(labels,y_test,y_pred_feat)

----------------------------

related 
               precision    recall  f1-score   support

           0       0.89      0.96      0.92      5280
           1       0.75      0.52      0.62      1274

    accuracy                           0.87      6554
   macro avg       0.82      0.74      0.77      6554
weighted avg       0.87      0.87      0.86      6554

----------------------------

request 
               precision    recall  f1-score   support

           0       0.69      0.43      0.53      1534
           1       0.84      0.94      0.89      5020

    accuracy                           0.82      6554
   macro avg       0.77      0.68      0.71      6554
weighted avg       0.81      0.82      0.81      6554

----------------------------

offer 
               precision    recall  f1-score   support

           0       0.92      0.97      0.94      5406
           1       0.79      0.58      0.67      1148

    accuracy                           0.90      6554
   macro a

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [40]:
print(f'New Score',f1_score(y_test, y_pred_feat, average = 'samples'))

  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))


New Score 0.5273218935587908


## Who,what,where,why,when

In [41]:
class StartingQuestionExtractor(BaseEstimator, TransformerMixin):

    def starting_w_question(self, text):
        sentence_list = nltk.sent_tokenize(text)
        for sentence in sentence_list:
            pos_tags = nltk.pos_tag(tokenize(sentence))
            first_word, first_tag = pos_tags[0]
            # return true if the first word is an appropriate verb or RT for retweet
            if first_tag in ['WDT', 'WP','WP$','WRB']:
                return True
        return False

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X_tagged = pd.Series(X).apply(self.starting_w_question)
        return pd.DataFrame(X_tagged)

In [42]:
pipeline_feature_2 = Pipeline([
    ('features', FeatureUnion([

        ('nlp_pipeline', Pipeline([
            ('vect', CountVectorizer(tokenizer=tokenize)),
            ('tfidf', TfidfTransformer())
        ])),

        ('W_question', StartingQuestionExtractor()),
                ('prnoun', StartingPronounExtractor()),
        ('verb',StartingVerbExtractor())
    ])),

    ('xg', MultiOutputClassifier(xgb.XGBClassifier(eval_metric='mlogloss',use_label_encoder=False)))
])

pipeline_feature_3 = Pipeline([
    ('features', FeatureUnion([

        ('nlp_pipeline', Pipeline([
            ('vect', CountVectorizer(tokenizer=tokenize)),
            ('tfidf', TfidfTransformer())
        ])),

        ('W_question', StartingQuestionExtractor())
    ])),

    ('xg', MultiOutputClassifier(xgb.XGBClassifier(eval_metric='mlogloss',use_label_encoder=False)))
])

In [43]:
pipeline_feature_2.fit(X_train,y_train)

In [44]:
pipeline_feature_3.fit(X_train,y_train)

In [45]:
y_pred_feat_2 = pipeline_feature_2.predict(X_test)

In [46]:
y_pred_feat_3 = pipeline_feature_3.predict(X_test)

In [47]:
print(classification_report(y_test, y_pred_feat_2, target_names = labels))

                        precision    recall  f1-score   support

               related       0.84      0.94      0.89      5020
               request       0.79      0.58      0.67      1148
                 offer       0.00      0.00      0.00        35
           aid_related       0.76      0.63      0.69      2742
          medical_help       0.58      0.26      0.36       513
      medical_products       0.63      0.30      0.41       330
     search_and_rescue       0.67      0.19      0.29       183
              security       0.43      0.03      0.05       118
              military       0.69      0.30      0.42       222
                 water       0.78      0.64      0.71       399
                  food       0.81      0.76      0.78       728
               shelter       0.76      0.56      0.64       587
              clothing       0.71      0.50      0.59        98
                 money       0.62      0.24      0.34       148
        missing_people       0.59      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [48]:
print(f'New Score',f1_score(y_test, y_pred_feat_2, average = 'samples'))

New Score 0.5271833791489067


  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))


In [49]:
print(f'New Score',f1_score(y_test, y_pred_feat_3, average = 'samples'))

New Score 0.5288215432437016


  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))


## Hyper Parameter Compare

All new features lowered model performance, we will add some hyperparameter tuning.

In [50]:
parameters = {
        'xg__estimator__learning_rate' : [0.1, 0.2],
        'xg__estimator__subsample' : [0.25, 0.5],
        'xg__estimator__max_depth' : [4, 5],
        'xg__estimator__n_estimators': [10, 100]
    }

cv = GridSearchCV(pipeline_feature, param_grid = parameters, n_jobs=-1, scoring = "f1_samples", verbose=2,cv =2)

In [51]:
cv.fit(X_train,y_train)

Fitting 2 folds for each of 16 candidates, totalling 32 fits


In [52]:
cv.best_params_

{'xg__estimator__learning_rate': 0.1,
 'xg__estimator__max_depth': 4,
 'xg__estimator__n_estimators': 100,
 'xg__estimator__subsample': 0.5}

In [53]:
cv.best_score_

0.5333235804061722

In [58]:
#XGBoost_ hyperparam
pipeline_xg_hp = Pipeline([
    ('features', FeatureUnion([

        ('nlp_pipeline', Pipeline([
            ('vect', CountVectorizer(tokenizer=tokenize)),
            ('tfidf', TfidfTransformer())
        ])),

        ('prnoun', StartingPronounExtractor())
    ])),

    ('xg', MultiOutputClassifier(xgb.XGBClassifier(learning_rate=0.1, subsample=0.5, max_depth=4, n_estimators=100, eval_metric='mlogloss',use_label_encoder=False)))])

In [61]:
pipeline_xg_hp.fit(X_train,y_train)

In [62]:
y_pred_xg_hp = pipeline_xg_hp.predict(X_test)

In [63]:
print(f'Original Score',f1_score(y_test, y_pred1, average = 'samples'))
print(f'Original Tuned Score',f1_score(y_test, y_pred, average = 'samples'))
print(f'Original XG Score',f1_score(y_test, y_pred_xg, average = 'samples'))
print(f'Feature XG Score',f1_score(y_test, y_pred_feat, average = 'samples'))
print(f'Second Feature XG Score',f1_score(y_test, y_pred_feat_2, average = 'samples'))
print(f'Third Feature XG Score',f1_score(y_test, y_pred_feat_3, average = 'samples'))
print(f'New Tuned Feature Score',f1_score(y_test, y_pred_xg_hp, average = 'samples'))

Original Score 0.5093121008458176
Original Tuned Score 0.448855690174563
Original XG Score 0.5272281324015899
Feature XG Score 0.5273218935587908
Second Feature XG Score 0.5271833791489067
Third Feature XG Score 0.5288215432437016
New Tuned Feature Score 0.5358246453240082


  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
