In [1]:
import sys
import re
import numpy as np
import pandas as pd
import optuna
import joblib
import nltk
# Fetch every NLTK resource used below. 'stopwords' is required by
# stopwords.words("english") in tokenize(); without it a fresh environment
# raises LookupError on the first tokenization.
nltk.download(['punkt', 'wordnet', 'averaged_perceptron_tagger', 'stopwords'])

from sqlalchemy import create_engine
from sklearn.metrics import accuracy_score, recall_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.multioutput import MultiOutputClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.decomposition import TruncatedSVD
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

[nltk_data] Downloading package punkt to /Users/zacks/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/zacks/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/zacks/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [2]:
def load_data(data_file):
    """Load the messages and their label matrix from a SQLite database.

    Args:
        data_file(str): Base name used both for the SQLite file
            (``{data_file}.db``) and for the table that is read.

    Returns:
        tuple: ``(X, y)`` where ``X`` is the ``message`` column and ``y`` is a
            NumPy array built from every column from ``related`` onward.
    """
    # read in file
    engine = create_engine(f'sqlite:///{data_file}.db')
    try:
        # NOTE: data_file is interpolated into the SQL text, so it must be a
        # trusted identifier (never pass user-supplied input here).
        df = pd.read_sql(f'select * from {data_file}', con=engine)
    finally:
        # release the pooled connection; this function is called repeatedly
        engine.dispose()

    # define features and label arrays
    X = df.message
    y = df.loc[:, 'related':].to_numpy()

    return X, y

In [3]:
def tokenize(message, stem='lemm'):
    """Clean a message and split it into normalized tokens.

    URLs are replaced by the literal ``urlplaceholder``, the text is
    lower-cased, every non-alphanumeric character becomes a space, English
    stop words are dropped and the remaining tokens are stemmed or lemmatized.

    Args:
        message(str): Message content.
        stem(str): ``'stem'`` applies the Porter stemmer; any other value
            (default ``'lemm'``) applies WordNet lemmatization with ``pos='v'``.

    Returns:
        list: Cleaned tokens.
    """
    # Cleaning
    # replace each url in text string with placeholder
    # (raw string: '\(' and '\)' are regex escapes, not Python string escapes)
    url_regex = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    message = re.sub(url_regex, 'urlplaceholder', message)

    # Normalization
    message = re.sub(r"[^a-zA-Z0-9]", " ", message.lower())

    # Tokenization
    tokens = word_tokenize(message)

    # Stop Word Removal & Stemming/Lemmatization
    # a set gives constant-time membership checks instead of scanning a list
    stop_words = set(stopwords.words("english"))
    # because the targets are not roots, we should use Lemmatization
    if stem == 'stem':
        stemmer = PorterStemmer()
        tokens = [stemmer.stem(tok)
                  for tok in tokens if tok not in stop_words]
    else:
        lemmatizer = WordNetLemmatizer()
        tokens = [lemmatizer.lemmatize(tok, pos='v')
                  for tok in tokens if tok not in stop_words]

    return tokens

In [4]:
def ml_pipeline(n_jobs=16):
    """Build the text-classification pipeline.

    The pipeline has two steps: ``'vect'``, a TF-IDF vectorizer that uses
    ``tokenize`` as its tokenizer, and ``'multi_clf'``, a
    ``MultiOutputClassifier`` wrapping a k-nearest-neighbours estimator.

    Args:
        n_jobs(int): Number of parallel jobs passed to ``KNeighborsClassifier``.
            Defaults to 16, the value previously hard-coded here.

    Returns:
        Pipeline: Unfitted scikit-learn pipeline.
    """
    # text processing and model pipeline
    # Alternative estimators (RandomForestClassifier, MLPClassifier) and a
    # TruncatedSVD step were sketched here before but are not part of the
    # pipeline, so they are no longer constructed.
    vect = TfidfVectorizer(tokenizer=tokenize, use_idf=True, smooth_idf=True)
    knn = KNeighborsClassifier(n_jobs=n_jobs)

    multi_clf = MultiOutputClassifier(knn)

    pipeline = Pipeline([
        ('vect', vect),
        ('multi_clf', multi_clf)
    ])

    return pipeline

In [5]:
# Silence Optuna: raise its log level to ERROR so lower-severity messages are not printed
optuna.logging.set_verbosity(optuna.logging.ERROR)

In [7]:
def objective(trial):
    """Optuna objective: fit the TF-IDF + KNN pipeline and score it.

    The trial suggests the vectorizer's ``max_features`` and the KNN
    ``leaf_size`` / ``n_neighbors``. The pipeline is fitted on 80% of the data
    and scored on the remaining 20%.

    Args:
        trial(optuna.trial.Trial): Trial used to sample the hyperparameters.

    Returns:
        float: Mean over all label columns of the weighted recall.
    """
    X, y = load_data('disaster_response')

    # train test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42)

    pipeline = ml_pipeline()

    # `step` is passed by keyword: it is keyword-only in current Optuna releases.
    params = {
        'vect__max_features': trial.suggest_int('vect__max_features', 100, 5000, step=100),
        'multi_clf__estimator__leaf_size': trial.suggest_int('multi_clf__estimator__leaf_size', 30, 50, step=1),
        'multi_clf__estimator__n_neighbors': trial.suggest_int('multi_clf__estimator__n_neighbors', 36, 50, step=1)
    }

    model = pipeline.set_params(**params)

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # NOTE(review): support-weighted recall of a single label column equals
    # that column's accuracy, so this score is dominated by the majority class
    # of each label. Consider a macro-averaged recall/F1 if minority classes
    # matter.
    # NOTE(review): hyperparameters are selected on the same held-out split
    # that is scored here, so the best value is not an unbiased test estimate.
    scores = []
    for i in range(y_pred.shape[1]):
        scores.append(recall_score(y_test[:, i], y_pred[:, i], average='weighted'))

    return np.mean(scores)

In [8]:
# Number of Optuna trials to run in the hyperparameter search
n_trials = 100

In [9]:
%%time
# Seed the sampler (TPE is Optuna's default) so that re-running this cell
# proposes the same sequence of trials; an unseeded study is not reproducible.
sampler = optuna.samplers.TPESampler(seed=42)
study = optuna.create_study(direction='maximize', study_name=f'NLP {n_trials} trails', sampler=sampler)
study.optimize(objective, n_trials=n_trials, show_progress_bar=True)  # run n_trials trials

  self._init_valid()


  0%|          | 0/100 [00:00<?, ?it/s]

CPU times: user 8h 48min 40s, sys: 1h 47min 9s, total: 10h 35min 50s
Wall time: 4h 23min 36s


In [10]:
# Hyperparameter combination from the trial with the best objective value
study.best_params

{'vect__max_features': 100,
 'multi_clf__estimator__leaf_size': 45,
 'multi_clf__estimator__n_neighbors': 41}

In [11]:
# Best objective value found: the mean per-label weighted recall returned by objective()
study.best_value

0.9385700059327063

In [12]:
# store study model
# joblib.dump(study, "study.pkl")