In [5]:
import models.train_classifier as md

In [6]:

database_filepath = 'data/DisasterResponse.db'
model_filepath = 'classifier.pkl'
print(f"Loading data...\n    DATABASE: {database_filepath}")
X, Y, category_names = md.load_data(database_filepath)
X_train, X_test, Y_train, Y_test = md.train_test_split(X, Y, test_size=0.2)

print("Building model...")
# model = build_model()


Loading data...
    DATABASE: data/DisasterResponse.db
Building model...


In [8]:
# without pipeline
vect = md.CountVectorizer(tokenizer=md.tokenize)
tfidf = md.TfidfTransformer()
clf = md.MultiOutputClassifier(md.RandomForestClassifier())

In [9]:
X_train_counts = vect.fit_transform(X_train)
X_train_counts

<20822x31114 sparse matrix of type '<class 'numpy.int64'>'
	with 475682 stored elements in Compressed Sparse Row format>

In [11]:
X_train_tfidf = tfidf.fit_transform(X_train_counts)
X_train_tfidf

<20822x31114 sparse matrix of type '<class 'numpy.float64'>'
	with 475682 stored elements in Compressed Sparse Row format>

In [12]:
clf.fit(X_train_tfidf, Y_train)

In [14]:
X_test_counts = vect.transform(X_test)
X_test_counts

<5206x31114 sparse matrix of type '<class 'numpy.int64'>'
	with 113374 stored elements in Compressed Sparse Row format>

In [16]:
X_test_tfidf = tfidf.transform(X_test_counts)
X_test_counts

<5206x31114 sparse matrix of type '<class 'numpy.int64'>'
	with 113374 stored elements in Compressed Sparse Row format>

In [18]:
y_pred = clf.predict(X_test_tfidf)
y_pred

array([[1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       ...,
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [24]:
md.evaluate_model(clf, X_test, Y_test, category_names)

ValueError: could not convert string to float: "But China's ruling Communist Party takes pride in its huge engineering feats, and it has sought to make post-quake reconstruction a showcase of its strength and principles."

In [20]:
# with pipeline
pipeline = md.Pipeline(
    [
        ("vect", md.CountVectorizer(tokenizer=md.tokenize)),
        ("tfidf", md.TfidfTransformer()),
        ("clf", md.MultiOutputClassifier(md.RandomForestClassifier())),
    ]
)

parameters = {
    "clf__estimator__max_depth": [10, 50],
    "clf__estimator__min_samples_leaf": [2, 5, 10],
}
# model = md.GridSearchCV(pipeline, param_grid=parameters)

# train classifier
pipeline.fit(X_train, Y_train)

In [21]:
predicted = pipeline.predict(X_test)
predicted

array([[1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       ...,
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [None]:
print("Training model...")
model.fit(X_train, Y_train)

print("Evaluating model...")
md.evaluate_model(model, X_test, Y_test, category_names)

print(f"Saving model...\n    MODEL: {model_filepath}")
md.save_model(model, model_filepath)

print("Trained model saved!")
