In [1]:
# import libraries
import re
import numpy as np
import pandas as pd
import nltk
nltk.download(['punkt', 'wordnet', 'averaged_perceptron_tagger'])

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.decomposition import TruncatedSVD
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report

from sqlalchemy import create_engine

[nltk_data] Downloading package punkt to /Users/zacks/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/zacks/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/zacks/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [2]:
# load data from database
engine = create_engine('sqlite:///disaster_response.db')
df = pd.read_sql('select * from disaster_response', con=engine)
X = df.message
Y = df.loc[:, 'related':]
target_names = Y.columns
y = Y.to_numpy()

In [3]:
def tokenize(message, stem='lemm'):
    """Text processing.
    
    Args:
        stem(str): stem or lemm.
        
    Returns:
        list: Cleaned tokens.
    """
    # 1. Cleaning
    
    # 2. Normalization
    text = re.sub(r"[^a-zA-Z0-9]", " ", message.lower())
    
    # 3. Tokenization
    tokens = word_tokenize(text)
    
    # 4. Stop Word Removal
    stop_words = stopwords.words("english")
    tokens = list(filter(lambda w: w not in stop_words, tokens))
    
    # 5. Part of Speech Tagging / Named Entity Recognition
    
    # 6. Stemming or Lemmatization
    # Because the targets are not roots, we should use Lemmatization
    
    clean_tokens = []
    if stem == 'stem':
        stemmer = PorterStemmer()
        for tok in tokens:
            clean_tok = stemmer.stem(tok).strip()
            clean_tokens.append(clean_tok)
    else:
        lemmatizer = WordNetLemmatizer()
        for tok in tokens:
            clean_tok = lemmatizer.lemmatize(tok).strip()
            clean_tokens.append(clean_tok)

    return clean_tokens

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [5]:
%%time
vect = TfidfVectorizer(tokenizer=tokenize)
X_train_tfidf = vect.fit_transform(X_train)

CPU times: user 7.9 s, sys: 613 ms, total: 8.51 s
Wall time: 8.54 s


In [6]:
X_train_tfidf.shape

(20972, 28191)

In [7]:
%%time
svd = TruncatedSVD(n_components=1000, random_state=42)
sla = svd.fit_transform(X_train_tfidf)

CPU times: user 1min 8s, sys: 6.61 s, total: 1min 15s
Wall time: 15.6 s


In [8]:
sla.shape

(20972, 1000)

In [9]:
forest = RandomForestClassifier(random_state=42, n_jobs=-1)
multi_label_clf = MultiOutputClassifier(forest)

In [10]:
%%time
multi_label_clf.fit(sla, y_train)

CPU times: user 10.9 s, sys: 3.15 s, total: 14.1 s
Wall time: 11min 49s


MultiOutputClassifier(estimator=RandomForestClassifier(n_jobs=-1,
                                                       random_state=42))

In [11]:
multi_label_clf.score

SyntaxError: 'return' outside function (3438313781.py, line 1)

In [None]:
pipeline = Pipeline([
    ('vect', vect),
    ('svd', svd)
])

In [None]:
X_train

In [None]:
# Neural Network needs much more time for modeling
# mlp = MLPClassifier(random_state=42, max_iter=200, verbose=True, early_stopping=True)
vect = TfidfVectorizer(tokenizer=tokenize)
forest = RandomForestClassifier(random_state=42)
multi_label_clf = MultiOutputClassifier(forest)
svd = TruncatedSVD(n_components=1000)

In [12]:
pipeline = Pipeline([
    ('vect', vect),
    ('svd', svd),
    ('multi_label_clf', multi_label_clf)
])

In [13]:
pipeline.get_params()

{'memory': None,
 'steps': [('vect',
   TfidfVectorizer(tokenizer=<function tokenize at 0x1349ab790>)),
  ('svd', TruncatedSVD(n_components=1000, random_state=42)),
  ('multi_label_clf',
   MultiOutputClassifier(estimator=RandomForestClassifier(n_jobs=-1,
                                                          random_state=42)))],
 'verbose': False,
 'vect': TfidfVectorizer(tokenizer=<function tokenize at 0x1349ab790>),
 'svd': TruncatedSVD(n_components=1000, random_state=42),
 'multi_label_clf': MultiOutputClassifier(estimator=RandomForestClassifier(n_jobs=-1,
                                                        random_state=42)),
 'vect__analyzer': 'word',
 'vect__binary': False,
 'vect__decode_error': 'strict',
 'vect__dtype': numpy.float64,
 'vect__encoding': 'utf-8',
 'vect__input': 'content',
 'vect__lowercase': True,
 'vect__max_df': 1.0,
 'vect__max_features': None,
 'vect__min_df': 1,
 'vect__ngram_range': (1, 1),
 'vect__norm': 'l2',
 'vect__preprocessor': None,
 'vect__

In [None]:
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)