# Build a model with the data

In [6]:
import numpy as np
import pandas as pd
import sqlite3
import pickle

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from nltk.stem.porter import PorterStemmer


from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression


from sklearn.metrics import classification_report, accuracy_score, f1_score

In [29]:
# Location of the SQLite database that holds the labelled messages.
PATH_TO_DB = 'data/disaster_tweets.db'

# Read the whole `tweets` table into memory, then release the connection
# straight away. Closing in `finally` guarantees the handle is freed even if
# the query raises. `conn` stays bound afterwards, and calling `close()` on an
# already-closed sqlite3 connection is a harmless no-op.
conn = sqlite3.connect(PATH_TO_DB)
try:
    df = pd.read_sql('SELECT * FROM tweets', con=conn)
finally:
    conn.close()

In [8]:
# Display the column names of the loaded frame.
df.columns

Index(['id', 'message', 'original', 'genre', 'related', 'request', 'offer',
       'aid_related', 'medical_help', 'medical_products', 'search_and_rescue',
       'security', 'military', 'water', 'food', 'shelter', 'clothing', 'money',
       'missing_people', 'refugees', 'death', 'other_aid',
       'infrastructure_related', 'transport', 'buildings', 'electricity',
       'tools', 'hospitals', 'shops', 'aid_centers', 'other_infrastructure',
       'weather_related', 'floods', 'storm', 'fire', 'earthquake', 'cold',
       'other_weather', 'direct_report'],
      dtype='object')

In [11]:
# Model input: the message text only.
X = df['message']

# Targets: every column that is left once the identifier, the two text
# columns and the genre column are removed.
Y = df.drop(columns=['id', 'message', 'original', 'genre'])

# Keep the label names in column order; they are used later to map
# prediction positions back to names.
category_names = Y.columns.tolist()
print(category_names)

['related', 'request', 'offer', 'aid_related', 'medical_help', 'medical_products', 'search_and_rescue', 'security', 'military', 'water', 'food', 'shelter', 'clothing', 'money', 'missing_people', 'refugees', 'death', 'other_aid', 'infrastructure_related', 'transport', 'buildings', 'electricity', 'tools', 'hospitals', 'shops', 'aid_centers', 'other_infrastructure', 'weather_related', 'floods', 'storm', 'fire', 'earthquake', 'cold', 'other_weather', 'direct_report']


Here are the first ten messages labelled as disaster-related (`related == 1`).

In [12]:
# Build the boolean-mask selection once instead of on every iteration, and let
# `.head(10)` cap the sample so the cell still runs when fewer than ten
# messages carry the label.
for message in X[Y['related'] == 1].head(10):
    print(message + '\n')

Weather update - a cold front from Cuba that could pass over Haiti

Is the Hurricane over or is it not over

says: west side of Haiti, rest of the country today and tonight

Storm at sacred heart of jesus

Please, we need tents and water. We are in Silo, Thank you!

There's nothing to eat and water, we starving and thirsty.

I am in Thomassin number 32, in the area named Pyron. I would like to have some water. Thank God we are fine, but we desperately need water. Thanks

Let's do it together, need food in Delma 75, in didine area

More information on the 4636 number in order for me to participate. ( To see if I can use it )

A Comitee in Delmas 19, Rue ( street ) Janvier, Impasse Charite #2. We have about 500 people in a temporary shelter and we are in dire need of Water, Food, Medications, Tents and Clothes. Please stop by and see us.



Here are the first ten messages labelled as not disaster-related (`related == 0`).

In [13]:
# Build the boolean-mask selection once instead of on every iteration, and let
# `.head(10)` cap the sample so the cell still runs when fewer than ten
# messages carry the label.
for message in X[Y['related'] == 0].head(10):
    print(message + '\n')

Information about the National Palace-

I would like to receive the messages, thank you

I am in Petionville. I need more information regarding 4636

I don't understand how to use this thing 4636.

Can you tell me about this service

Good evening, Radio one please. I would like information on Tiyous.

I'm here, I didn't find the person that I needed to send the pant by phone

I'm listening to you at Miraguan we asking the government to take change because one gallon gas is 80.

i am very happy, i hear god, religious hyme

I would like to know how food is distributed.



In [14]:
# One stemmer instance, shared by every call to `tokenize`.
porter = PorterStemmer()


def tokenize(text):
    """Split `text` on whitespace and return the Porter stem of each piece.

    Splitting is whitespace-only, so punctuation attached to a word stays
    part of the token that is handed to the stemmer.

    Args:
        text: the string to tokenize.

    Returns:
        A list with one stemmed token per whitespace-separated word, in the
        original order.
    """
    stems = []
    for word in text.split():
        stems.append(porter.stem(word))
    return stems

In [15]:
# Text-classification pipeline:
#   raw message -> token counts (custom stemming tokenizer) -> TF-IDF weights
#   -> one class-balanced logistic regression per target column.
base_classifier = LogisticRegression(class_weight='balanced', multi_class='ovr')

pipeline = Pipeline(steps=[
    ('vect', CountVectorizer(tokenizer=tokenize)),
    ('tfidf', TfidfTransformer()),
    ('clf', MultiOutputClassifier(base_classifier)),
])

In [16]:
# Fix the shuffle seed so the train/test partition, and every score computed
# from it, is identical on each Restart & Run All.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=42)

In [17]:
# Fit every pipeline step (vectorizer, TF-IDF, classifiers) on the training split.
pipeline.fit(X_train, Y_train)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=<function tokenize at...
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 MultiOutputClassifier(estimator=LogisticRegression(C=1.0,
                                                                    class_weight='balanced',
           

In [18]:
# Predict labels for the held-out messages; the result is indexed as
# [row, category position] by the evaluation loop.
Y_pred = pipeline.predict(X_test)

In [19]:
# Report the F1 score and the accuracy of every category separately.
for idx, category in enumerate(category_names):
    print('Category: {} '.format(category))
    # Column `idx` of the true labels and of the predictions describe the
    # same category.
    y_true = Y_test.iloc[:, idx].values
    y_hat = Y_pred[:, idx]
    print('f1 Score: {} '.format(f1_score(y_true, y_hat)))
    print('Accuracy {}\n\n'.format(accuracy_score(y_true, y_hat)))

Category: related 
f1 Score: 0.8566875597577818 
Accuracy 0.7929075836659503


Category: request 
f1 Score: 0.6643464810518175 
Accuracy 0.8667485416027019


Category: offer 
f1 Score: 0.115702479338843 
Accuracy 0.9835738409579368


Category: aid_related 
f1 Score: 0.735467715972098 
Accuracy 0.7729505680073687


Category: medical_help 
f1 Score: 0.45385149201943104 
Accuracy 0.8791832975130488


Category: medical_products 
f1 Score: 0.41365853658536583 
Accuracy 0.907737181455327


Category: search_and_rescue 
f1 Score: 0.2679245283018868 
Accuracy 0.9404359840343874


Category: security 
f1 Score: 0.166189111747851 
Accuracy 0.9553269880257906


Category: military 
f1 Score: 0.45977011494252873 
Accuracy 0.9494933988332822


Category: water 
f1 Score: 0.6446886446886446 
Accuracy 0.9404359840343874


Category: food 
f1 Score: 0.7210144927536232 
Accuracy 0.9290758366595027


Category: shelter 
f1 Score: 0.5931758530183727 
Accuracy 0.9048203868590727


Category: clothing 
f1 Score: 

In [22]:
# Destination of the serialized, fitted pipeline.
CLASSIFIER_PATH = 'classifier/trained_classifier.pkl'

# The `with` block closes the file, and therefore flushes every byte to disk,
# as soon as the dump finishes; a bare `open()` left that to the garbage
# collector.
# NOTE(review): pickle stores the custom `tokenize` function by reference, so
# the process that loads this file must be able to resolve that same name.
# Confirm wherever the model is served.
with open(CLASSIFIER_PATH, 'wb') as classifier_file:
    pickle.dump(pipeline, classifier_file)

In [25]:
# Load the model from the pickle file. The `with` block closes the handle
# right after reading instead of leaving it open.
# NOTE(security): unpickling can execute arbitrary code, so only load files
# produced by this notebook or another trusted source.
with open(CLASSIFIER_PATH, 'rb') as classifier_file:
    model = pickle.load(classifier_file)

In [26]:
def get_predicted_category_names(category_predicted, names=None):
    """Translate one row of 0/1 predictions into the matching category names.

    Args:
        category_predicted: sequence of per-category predictions for a single
            message; position i corresponds to names[i].
        names: labels to pick from, in the same order as the predictions.
            Defaults to the notebook-level `category_names`, so existing
            one-argument calls behave exactly as before.

    Returns:
        The names whose prediction equals 1, in their original order.

    Raises:
        IndexError: if there are more predictions than names.
    """
    if names is None:
        # Resolved at call time, so the function picks up the label list
        # defined earlier in the notebook.
        names = category_names
    return [names[i] for i, flag in enumerate(category_predicted) if flag == 1]

In [27]:
# Sanity check on a message that mentions a storm and a need for water.
# `predict` receives a one-element list, so `[0]` picks its single row.
get_predicted_category_names(model.predict(["I felt a big storm in Eugene this morning. I need water."])[0])

['related', 'water', 'weather_related', 'storm', 'direct_report']

In [28]:
# Sanity check on a message that has nothing to do with a disaster.
# `predict` receives a one-element list, so `[0]` picks its single row.
get_predicted_category_names(model.predict(["How do I download zoom for this remote meeting?"])[0])

[]

In [30]:
# Release the SQLite connection. The cells shown here only read through it
# (a single SELECT), so there is nothing to commit. Note that close() never
# commits on its own: uncommitted writes would be discarded.
conn.close()