In [1]:
import sys
# import libraries
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
from nltk import word_tokenize
import nltk
import re
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from custom_transformer import StartingVerbExtractor
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
import pickle
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV

# Where the SQLite database is read from and where the pickled model is written
database_filepath = 'data/DisasterResponse.db'
model_filepath = 'models/classifier.pkl'

# Open a SQLAlchemy engine on the SQLite file
database_url = 'sqlite:///{}'.format(database_filepath)
engine = create_engine(database_url)

# Create the dataframe by querying the whole "messages" table
df = pd.read_sql("SELECT * from messages", engine)

# Feature selection: the message column is the only model input
X = df['message']

# Every column except these four is treated as a target for the
# multi-output classification
non_category_columns = ['id', 'message', 'original', 'genre']
category_names = df.drop(columns=non_category_columns).columns

# Target values to predict, one column per category
Y = df[category_names]
    
    

# Lazily-filled cache of the English stop words; see _english_stop_words().
_STOP_WORDS = None


def _english_stop_words():
    """Return the NLTK English stop words as a frozenset, loading them once."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        # stopwords.words() re-reads the corpus on every call, so read it
        # once; a frozenset also gives constant-time membership tests.
        _STOP_WORDS = frozenset(stopwords.words('english'))
    return _STOP_WORDS


def tokenize(text):
    """Turn a raw message into a list of cleaned, lemmatized tokens.

    The text is lower-cased, every character other than an ASCII letter or
    digit is replaced by a space, the result is split with NLTK's
    ``word_tokenize``, English stop words are removed, and each remaining
    token is lemmatized with ``WordNetLemmatizer``.

    Args:
        text: The message string to tokenize.

    Returns:
        A list of lemmatized tokens in their original order. The list is
        empty when the text has no non-stop-word tokens.
    """
    # Normalize text
    normalized = re.sub(r"[^a-zA-Z0-9]", " ", text.lower())

    # Tokenize text
    words = word_tokenize(normalized)

    stop_words = _english_stop_words()
    lemmatizer = WordNetLemmatizer()

    # Tokens are already lower-case and free of surrounding whitespace here,
    # so no further case/whitespace normalization is applied after
    # lemmatizing.
    return [lemmatizer.lemmatize(word) for word in words if word not in stop_words]



# Token counts (using the custom tokenizer) followed by a TF-IDF transform
text_pipeline = Pipeline([
    ('vect', CountVectorizer(tokenizer=tokenize)),
    ('tfidf', TfidfTransformer()),
])

# Text features combined with the output of the custom StartingVerbExtractor
features = FeatureUnion([
    ('text_pipeline', text_pipeline),
    ('verb', StartingVerbExtractor()),
])

# Full model: combined features feeding a random forest classifier
pipeline = Pipeline([
    ('features', features),
    ('clf', RandomForestClassifier()),
])

    # hyperparameters for the grid to search within (classifier-only variant, currently disabled)
#     parameters = [{'clf__bootstrap': [False, True],
#         'clf__bootstrap': [False, True],
#          'clf__n_estimators': [80,90, 100, 110, 130],
#          'clf__max_features': [0.6, 0.65, 0.7, 0.73, 0.7500000000000001, 0.78, 0.8],
#          'clf__min_samples_leaf': [10, 12, 14],
#          'clf__min_samples_split': [3, 5, 7]
#         }
#     ]
# Parameters for the grid search. Keys follow scikit-learn's
# "<step>__<param>" convention, so every step name must match the names used
# when the pipeline was built ('features', 'text_pipeline', 'vect', 'tfidf',
# 'verb', 'clf').
parameters = {
    'features__text_pipeline__vect__ngram_range': ((1, 1), (1, 2)),
    'features__text_pipeline__vect__max_df': (0.5, 0.75, 1.0),
    'features__text_pipeline__vect__max_features': (None, 5000, 10000),
    'features__text_pipeline__tfidf__use_idf': (True, False),
    'clf__n_estimators': [50, 100, 200],
    'clf__min_samples_split': [2, 3, 4],
    # The weight keys must name transformers that exist in the FeatureUnion;
    # the verb extractor is registered there as 'verb'.
    'features__transformer_weights': (
        {'text_pipeline': 1, 'verb': 0.5},
        {'text_pipeline': 0.5, 'verb': 1},
        {'text_pipeline': 0.8, 'verb': 1},
    ),
}


# Hold out part of the data so the report below is computed on messages the
# model has not seen. random_state is fixed to make the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

# Final model ready to be applied on dataset
model = GridSearchCV(pipeline, param_grid=parameters)

# The grid search must be fitted before predict() can be called
model.fit(X_train, Y_train)

# Evaluate on the held-out split
y_pred = model.predict(X_test)
print(classification_report(Y_test, y_pred, target_names=category_names))

# Persist the fitted search object; the context manager closes the file even
# if pickling fails
with open(model_filepath, 'wb') as model_file:
    pickle.dump(model, model_file)
  


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\alire\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\alire\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\alire\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\alire\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\alire\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


OperationalError: (sqlite3.OperationalError) unable to open database file
(Background on this error at: http://sqlalche.me/e/13/e3q8)