In [4]:
import pandas as pd
import re
import nltk
import joblib

from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

from operator import itemgetter

# Fetch the NLTK resources used further down: the punkt tokenizer models, the
# WordNet corpus and the stopword lists. Packages that are already installed
# are reported as up-to-date.
nltk.download(['punkt', 'wordnet', 'stopwords'])

# Raise pandas' display limits so long messages and wide frames are rendered
# without truncation.
pd.set_option('display.max_rows', 4000)
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_seq_items', 2000)
pd.set_option('display.max_colwidth', 2000)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\abitf\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\abitf\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\abitf\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
# Read the raw message texts and their category labels from the data folder.
path = 'data/'
messages, categories = (
    pd.read_csv(path + filename)
    for filename in ('disaster_messages.csv', 'disaster_categories.csv')
)

In [6]:
# Column dtypes, non-null counts and memory footprint of the messages frame.
messages.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26248 entries, 0 to 26247
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        26248 non-null  int64 
 1   message   26248 non-null  object
 2   original  10184 non-null  object
 3   genre     26248 non-null  object
dtypes: int64(1), object(3)
memory usage: 820.4+ KB


In [7]:
# Show the messages whose text starts with "RT" followed by whitespace.
messages.loc[messages['message'].str.contains(r'^RT\s'), 'message']

9904                                                                                                                                                                                                                                                                                                                                                                                                                   RT selenagomez UNICEF has just announced an emergency alert for the people of Haiti who were hit by a 7.0 earthquake and a tsunami.. ..
9906                                                                                                                                                                                                                                                                                                                                                                                                                    RT TheNewsBlotter RT caribnews On Call Internatio

In [8]:
# Show the messages containing a parenthesised note that starts with "exten"
# (e.g. "( extension )", "(extended)").
messages.loc[messages['message'].str.contains(r'\(\s*exten.*\)'), 'message']
# Alternative pattern, presumably left over from experimentation: r'\(\s*exten\s*\)'

16      We need food and water in Klecin 12. We are dying of hunger. Impasse Chretien Klecin 12 extended ( extension ) We are hungry and sick.
5098                                                                                           #5 Lambert street (extended)and Merilus street 
6699                                   We would like for the water company to intervene at Fontamara 27 and rue Jannot prolongee (extension). 
Name: message, dtype: object

In [9]:
# Preview the first 50 rows of the messages frame.
messages.head(50)

Unnamed: 0,id,message,original,genre
0,2,Weather update - a cold front from Cuba that could pass over Haiti,Un front froid se retrouve sur Cuba ce matin. Il pourrait traverser Haiti demain. Des averses de pluie isolee sont encore prevues sur notre region ce soi,direct
1,7,Is the Hurricane over or is it not over,Cyclone nan fini osinon li pa fini,direct
2,8,Looking for someone but no name,"Patnm, di Maryani relem pou li banm nouvel li ak timoun yo. Mesi se john jean depi Monben kwochi.",direct
3,9,UN reports Leogane 80-90 destroyed. Only Hospital St. Croix functioning. Needs supplies desperately.,UN reports Leogane 80-90 destroyed. Only Hospital St. Croix functioning. Needs supplies desperately.,direct
4,12,"says: west side of Haiti, rest of the country today and tonight",facade ouest d Haiti et le reste du pays aujourd hui et ce soir,direct
5,14,Information about the National Palace-,Informtion au nivaux palais nationl,direct
6,15,Storm at sacred heart of jesus,Cyclone Coeur sacr de jesus,direct
7,16,"Please, we need tents and water. We are in Silo, Thank you!",Tanpri nou bezwen tant avek dlo nou zon silo mesi.,direct
8,17,"I would like to receive the messages, thank you",Mwen ta renmen jouin messag yo. Merci,direct
9,18,I am in Croix-des-Bouquets. We have health issues. They ( workers ) are in Santo 15. ( an area in Croix-des-Bouquets ),"Nou kwadebouke, nou gen pwoblem sant m yo nan santo 15",direct


In [10]:
# Distinct values of the genre column.
messages['genre'].unique()

array(['direct', 'social', 'news'], dtype=object)

In [11]:
# Number of messages per genre.
messages['genre'].value_counts()

news      13068
direct    10782
social     2398
Name: genre, dtype: int64

In [12]:
# Column dtypes, non-null counts and memory footprint of the categories frame.
categories.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26248 entries, 0 to 26247
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   id          26248 non-null  int64 
 1   categories  26248 non-null  object
dtypes: int64(1), object(1)
memory usage: 410.2+ KB


In [13]:
# Largest id in messages; used to judge which integer dtype can hold the ids.
messages['id'].max()

30265

In [14]:
# Largest id in categories, for comparison with the messages frame.
categories['id'].max()

30265

In [15]:
# Inspect the id column of messages.
messages['id']

0            2
1            7
2            8
3            9
4           12
         ...  
26243    30261
26244    30262
26245    30263
26246    30264
26247    30265
Name: id, Length: 26248, dtype: int64

In [16]:
# Shrink the id columns to the smallest integer dtype that can hold every
# value. A hard-coded astype('int16') can overflow without raising as soon as
# an id exceeds 32767; pd.to_numeric(downcast='integer') picks a dtype that is
# always wide enough (int16 for the ids currently in the data).
messages['id'] = pd.to_numeric(messages['id'], downcast='integer')
categories['id'] = pd.to_numeric(categories['id'], downcast='integer')

In [17]:
# Re-check dtypes and memory usage of messages after the id downcast.
messages.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26248 entries, 0 to 26247
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        26248 non-null  int16 
 1   message   26248 non-null  object
 2   original  10184 non-null  object
 3   genre     26248 non-null  object
dtypes: int16(1), object(3)
memory usage: 666.6+ KB


In [18]:
# Re-check dtypes and memory usage of categories after the id downcast.
categories.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26248 entries, 0 to 26247
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   id          26248 non-null  int16 
 1   categories  26248 non-null  object
dtypes: int16(1), object(1)
memory usage: 256.5+ KB


In [19]:
# Preview the raw category strings: ';'-separated "name-flag" entries.
categories.head(20)

Unnamed: 0,id,categories
0,2,related-1;request-0;offer-0;aid_related-0;medical_help-0;medical_products-0;search_and_rescue-0;security-0;military-0;child_alone-0;water-0;food-0;shelter-0;clothing-0;money-0;missing_people-0;refugees-0;death-0;other_aid-0;infrastructure_related-0;transport-0;buildings-0;electricity-0;tools-0;hospitals-0;shops-0;aid_centers-0;other_infrastructure-0;weather_related-0;floods-0;storm-0;fire-0;earthquake-0;cold-0;other_weather-0;direct_report-0
1,7,related-1;request-0;offer-0;aid_related-1;medical_help-0;medical_products-0;search_and_rescue-0;security-0;military-0;child_alone-0;water-0;food-0;shelter-0;clothing-0;money-0;missing_people-0;refugees-0;death-0;other_aid-1;infrastructure_related-0;transport-0;buildings-0;electricity-0;tools-0;hospitals-0;shops-0;aid_centers-0;other_infrastructure-0;weather_related-1;floods-0;storm-1;fire-0;earthquake-0;cold-0;other_weather-0;direct_report-0
2,8,related-1;request-0;offer-0;aid_related-0;medical_help-0;medical_products-0;search_and_rescue-0;security-0;military-0;child_alone-0;water-0;food-0;shelter-0;clothing-0;money-0;missing_people-0;refugees-0;death-0;other_aid-0;infrastructure_related-0;transport-0;buildings-0;electricity-0;tools-0;hospitals-0;shops-0;aid_centers-0;other_infrastructure-0;weather_related-0;floods-0;storm-0;fire-0;earthquake-0;cold-0;other_weather-0;direct_report-0
3,9,related-1;request-1;offer-0;aid_related-1;medical_help-0;medical_products-1;search_and_rescue-0;security-0;military-0;child_alone-0;water-0;food-0;shelter-0;clothing-0;money-0;missing_people-0;refugees-0;death-0;other_aid-1;infrastructure_related-1;transport-0;buildings-1;electricity-0;tools-0;hospitals-1;shops-0;aid_centers-0;other_infrastructure-0;weather_related-0;floods-0;storm-0;fire-0;earthquake-0;cold-0;other_weather-0;direct_report-0
4,12,related-1;request-0;offer-0;aid_related-0;medical_help-0;medical_products-0;search_and_rescue-0;security-0;military-0;child_alone-0;water-0;food-0;shelter-0;clothing-0;money-0;missing_people-0;refugees-0;death-0;other_aid-0;infrastructure_related-0;transport-0;buildings-0;electricity-0;tools-0;hospitals-0;shops-0;aid_centers-0;other_infrastructure-0;weather_related-0;floods-0;storm-0;fire-0;earthquake-0;cold-0;other_weather-0;direct_report-0
5,14,related-0;request-0;offer-0;aid_related-0;medical_help-0;medical_products-0;search_and_rescue-0;security-0;military-0;child_alone-0;water-0;food-0;shelter-0;clothing-0;money-0;missing_people-0;refugees-0;death-0;other_aid-0;infrastructure_related-0;transport-0;buildings-0;electricity-0;tools-0;hospitals-0;shops-0;aid_centers-0;other_infrastructure-0;weather_related-0;floods-0;storm-0;fire-0;earthquake-0;cold-0;other_weather-0;direct_report-0
6,15,related-1;request-0;offer-0;aid_related-0;medical_help-0;medical_products-0;search_and_rescue-0;security-0;military-0;child_alone-0;water-0;food-0;shelter-0;clothing-0;money-0;missing_people-0;refugees-0;death-0;other_aid-0;infrastructure_related-0;transport-0;buildings-0;electricity-0;tools-0;hospitals-0;shops-0;aid_centers-0;other_infrastructure-0;weather_related-1;floods-0;storm-1;fire-0;earthquake-0;cold-0;other_weather-0;direct_report-0
7,16,related-1;request-1;offer-0;aid_related-1;medical_help-0;medical_products-0;search_and_rescue-0;security-0;military-0;child_alone-0;water-1;food-0;shelter-1;clothing-0;money-0;missing_people-0;refugees-0;death-0;other_aid-0;infrastructure_related-0;transport-0;buildings-0;electricity-0;tools-0;hospitals-0;shops-0;aid_centers-0;other_infrastructure-0;weather_related-0;floods-0;storm-0;fire-0;earthquake-0;cold-0;other_weather-0;direct_report-1
8,17,related-0;request-0;offer-0;aid_related-0;medical_help-0;medical_products-0;search_and_rescue-0;security-0;military-0;child_alone-0;water-0;food-0;shelter-0;clothing-0;money-0;missing_people-0;refugees-0;death-0;other_aid-0;infrastructure_related-0;transport-0;buildings-0;electricity-0;tools-0;hospitals-0;shops-0;aid_centers-0;other_infrastructure-0;weather_related-0;floods-0;storm-0;fire-0;earthquake-0;cold-0;other_weather-0;direct_report-0
9,18,related-1;request-1;offer-0;aid_related-1;medical_help-1;medical_products-1;search_and_rescue-0;security-0;military-0;child_alone-0;water-0;food-0;shelter-0;clothing-0;money-0;missing_people-0;refugees-0;death-0;other_aid-0;infrastructure_related-0;transport-0;buildings-0;electricity-0;tools-0;hospitals-0;shops-0;aid_centers-0;other_infrastructure-0;weather_related-0;floods-0;storm-0;fire-0;earthquake-0;cold-0;other_weather-0;direct_report-1


In [20]:
# Recover the category names from the first row: splitting on runs of '-',
# digits and ';' leaves only the names, plus a trailing empty string.
row0 = categories.loc[0, 'categories']
pattern = re.compile(r'[\-0-9;]+')
header = pattern.split(row0)
print(header)

['related', 'request', 'offer', 'aid_related', 'medical_help', 'medical_products', 'search_and_rescue', 'security', 'military', 'child_alone', 'water', 'food', 'shelter', 'clothing', 'money', 'missing_people', 'refugees', 'death', 'other_aid', 'infrastructure_related', 'transport', 'buildings', 'electricity', 'tools', 'hospitals', 'shops', 'aid_centers', 'other_infrastructure', 'weather_related', 'floods', 'storm', 'fire', 'earthquake', 'cold', 'other_weather', 'direct_report', '']


In [21]:
# Discard the empty string produced by the split, if there is one.
try:
    header.remove('')
except ValueError:
    # No empty entry present; nothing to clean up.
    pass
print(header)

['related', 'request', 'offer', 'aid_related', 'medical_help', 'medical_products', 'search_and_rescue', 'security', 'military', 'child_alone', 'water', 'food', 'shelter', 'clothing', 'money', 'missing_people', 'refugees', 'death', 'other_aid', 'infrastructure_related', 'transport', 'buildings', 'electricity', 'tools', 'hospitals', 'shops', 'aid_centers', 'other_infrastructure', 'weather_related', 'floods', 'storm', 'fire', 'earthquake', 'cold', 'other_weather', 'direct_report']


In [22]:
# Alternative parsing: give every "name-flag" entry its own column.
test_cat = categories['categories'].str.split(pat=';', expand=True)
print(test_cat.head())

# Column names are the first row's entries with the trailing "-<digit>" cut off.
labels = [entry[:-2] for entry in test_cat.loc[0, :]]
print(labels)
test_cat.columns = labels
print(test_cat.head())

# Keep only the flag: a final character of '0' becomes 0, anything else becomes 1.
for column in test_cat.columns:
    test_cat[column] = test_cat[column].map(lambda entry: int(entry[-1] != '0')).astype('int8')
print(test_cat.head())

          0          1        2              3               4   \
0  related-1  request-0  offer-0  aid_related-0  medical_help-0   
1  related-1  request-0  offer-0  aid_related-1  medical_help-0   
2  related-1  request-0  offer-0  aid_related-0  medical_help-0   
3  related-1  request-1  offer-0  aid_related-1  medical_help-0   
4  related-1  request-0  offer-0  aid_related-0  medical_help-0   

                   5                    6           7           8   \
0  medical_products-0  search_and_rescue-0  security-0  military-0   
1  medical_products-0  search_and_rescue-0  security-0  military-0   
2  medical_products-0  search_and_rescue-0  security-0  military-0   
3  medical_products-1  search_and_rescue-0  security-0  military-0   
4  medical_products-0  search_and_rescue-0  security-0  military-0   

              9        10      11         12          13       14  \
0  child_alone-0  water-0  food-0  shelter-0  clothing-0  money-0   
1  child_alone-0  water-0  food-0  she

In [25]:
# Print the category names with a 1-based position for easy reference.
for idx, label in enumerate(test_cat.columns, start=1):
    print(idx, label)

1 related
2 request
3 offer
4 aid_related
5 medical_help
6 medical_products
7 search_and_rescue
8 security
9 military
10 child_alone
11 water
12 food
13 shelter
14 clothing
15 money
16 missing_people
17 refugees
18 death
19 other_aid
20 infrastructure_related
21 transport
22 buildings
23 electricity
24 tools
25 hospitals
26 shops
27 aid_centers
28 other_infrastructure
29 weather_related
30 floods
31 storm
32 fire
33 earthquake
34 cold
35 other_weather
36 direct_report


In [81]:
# Show the messages whose 'related' flag is greater than 1. The selected index
# values are labels, so look them up with .loc; .iloc expects positions and is
# only correct while both frames carry the default RangeIndex.
# NOTE(review): the parsing cell above maps every non-'0' flag to 1, so on a
# fresh top-to-bottom run this filter selects nothing; the displayed rows
# appear to come from an earlier kernel state.
messages.loc[test_cat.index[test_cat['related'] > 1]]

Unnamed: 0,id,message,original,genre
117,146,"Dans la zone de Saint Etienne la route de Jacmel est bloqu, il est trsdifficile de se rendre Jacmel",Nan zon st. etine rout jakmel la bloke se mize pr nou al jakmel,direct
219,263,. .. i with limited means. Certain patients come from the capital.,t avec des moyens limites. Certains patients viennent de la Capital,direct
305,373,The internet caf Net@le that's by the Dal road by the Maranata church ( incomplete ),Cyber cafe net@le ki chita rout de dal tou pr legliz maranata.,direct
460,565,"Bonsoir, on est a bon repos aprs la compagnie teleko sur la route a droite de l'impasse Roger colas aprs la 9e maison sur la main droite de la rue, on est environ 30 personnes sur un. ..",Bonswa nou nan bon repo apri teleko nan wout ti sou la men dwat imp. Roger colas apri nevim kay sou la men dwat nou anviwon 30 moun sou yon taren nou pa,direct
576,700,URGENT CRECHE ORPHANAGE KAY TOUT TIMOUN CROIX DES MISSIONS IMPASSE BALEV BUTTE BOYER MANQUE EAU ET NOURRITURE N ONT VU AUCUN SECOURS DEPUIS 8 JOURS HELP HELP,r et Salon Furterer. mwen se yon Cosmtologue. Biochimiste. Pathologis,direct
655,804,elle est vraiment malade et a besoin d'aide. utilisez mon numero de tlphone pour obtenir plus de renseignements. Nous attendons une reponse. Aucun numero fourni par contre.,she is really sick she need your help. please use my phone number to get more informations about her. We waiting for your answers.,direct
656,804,elle est vraiment malade et a besoin d'aide. utilisez mon numero de tlphone pour obtenir plus de renseignements. Nous attendons une reponse. Aucun numero fourni par contre.,she is really sick she need your help. please use my phone number to get more informations about her. We waiting for your answers.,direct
883,1063,no authority has passed by to see us. We don't have a place t sleep ( incomplete ),EN OKENN OTORITE POKO PASE WE NOU NOU PAGEN KOTE POU NOU DOMI NOU P,direct
897,1080,It's Over in Gressier. The population in the area - Incomplete,Se over toujou nan gresye 24e seksyon komin. Popilasyon komin nan n,direct
925,1113,we sleep with the baby. Thanks in advance for the help you will bring us. ( incomplete ),sa nou demi avek ti bebe. Mesi davans pou d nou pral pote,direct


In [18]:
# Build an empty frame whose columns are the category names and attach it
# column-wise, so each category gets its own (still empty) column next to the
# raw category string.
df_header = pd.DataFrame(columns=header)
categories_dummies = pd.concat([categories, df_header], axis=1)
categories_dummies

Unnamed: 0,id,categories,related,request,offer,aid_related,medical_help,medical_products,search_and_rescue,security,military,child_alone,water,food,shelter,clothing,money,missing_people,refugees,death,other_aid,infrastructure_related,transport,buildings,electricity,tools,hospitals,shops,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,2,related-1;request-0;offer-0;aid_related-0;medical_help-0;medical_products-0;search_and_rescue-0;security-0;military-0;child_alone-0;water-0;food-0;shelter-0;clothing-0;money-0;missing_people-0;refugees-0;death-0;other_aid-0;infrastructure_related-0;transport-0;buildings-0;electricity-0;tools-0;hospitals-0;shops-0;aid_centers-0;other_infrastructure-0;weather_related-0;floods-0;storm-0;fire-0;earthquake-0;cold-0;other_weather-0;direct_report-0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,7,related-1;request-0;offer-0;aid_related-1;medical_help-0;medical_products-0;search_and_rescue-0;security-0;military-0;child_alone-0;water-0;food-0;shelter-0;clothing-0;money-0;missing_people-0;refugees-0;death-0;other_aid-1;infrastructure_related-0;transport-0;buildings-0;electricity-0;tools-0;hospitals-0;shops-0;aid_centers-0;other_infrastructure-0;weather_related-1;floods-0;storm-1;fire-0;earthquake-0;cold-0;other_weather-0;direct_report-0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,8,related-1;request-0;offer-0;aid_related-0;medical_help-0;medical_products-0;search_and_rescue-0;security-0;military-0;child_alone-0;water-0;food-0;shelter-0;clothing-0;money-0;missing_people-0;refugees-0;death-0;other_aid-0;infrastructure_related-0;transport-0;buildings-0;electricity-0;tools-0;hospitals-0;shops-0;aid_centers-0;other_infrastructure-0;weather_related-0;floods-0;storm-0;fire-0;earthquake-0;cold-0;other_weather-0;direct_report-0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,9,related-1;request-1;offer-0;aid_related-1;medical_help-0;medical_products-1;search_and_rescue-0;security-0;military-0;child_alone-0;water-0;food-0;shelter-0;clothing-0;money-0;missing_people-0;refugees-0;death-0;other_aid-1;infrastructure_related-1;transport-0;buildings-1;electricity-0;tools-0;hospitals-1;shops-0;aid_centers-0;other_infrastructure-0;weather_related-0;floods-0;storm-0;fire-0;earthquake-0;cold-0;other_weather-0;direct_report-0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,12,related-1;request-0;offer-0;aid_related-0;medical_help-0;medical_products-0;search_and_rescue-0;security-0;military-0;child_alone-0;water-0;food-0;shelter-0;clothing-0;money-0;missing_people-0;refugees-0;death-0;other_aid-0;infrastructure_related-0;transport-0;buildings-0;electricity-0;tools-0;hospitals-0;shops-0;aid_centers-0;other_infrastructure-0;weather_related-0;floods-0;storm-0;fire-0;earthquake-0;cold-0;other_weather-0;direct_report-0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26243,30261,related-0;request-0;offer-0;aid_related-0;medical_help-0;medical_products-0;search_and_rescue-0;security-0;military-0;child_alone-0;water-0;food-0;shelter-0;clothing-0;money-0;missing_people-0;refugees-0;death-0;other_aid-0;infrastructure_related-0;transport-0;buildings-0;electricity-0;tools-0;hospitals-0;shops-0;aid_centers-0;other_infrastructure-0;weather_related-0;floods-0;storm-0;fire-0;earthquake-0;cold-0;other_weather-0;direct_report-0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
26244,30262,related-0;request-0;offer-0;aid_related-0;medical_help-0;medical_products-0;search_and_rescue-0;security-0;military-0;child_alone-0;water-0;food-0;shelter-0;clothing-0;money-0;missing_people-0;refugees-0;death-0;other_aid-0;infrastructure_related-0;transport-0;buildings-0;electricity-0;tools-0;hospitals-0;shops-0;aid_centers-0;other_infrastructure-0;weather_related-0;floods-0;storm-0;fire-0;earthquake-0;cold-0;other_weather-0;direct_report-0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
26245,30263,related-1;request-0;offer-0;aid_related-0;medical_help-0;medical_products-0;search_and_rescue-0;security-0;military-0;child_alone-0;water-0;food-0;shelter-0;clothing-0;money-0;missing_people-0;refugees-0;death-0;other_aid-0;infrastructure_related-0;transport-0;buildings-0;electricity-0;tools-0;hospitals-0;shops-0;aid_centers-0;other_infrastructure-0;weather_related-0;floods-0;storm-0;fire-0;earthquake-0;cold-0;other_weather-0;direct_report-0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
26246,30264,related-1;request-0;offer-0;aid_related-1;medical_help-0;medical_products-0;search_and_rescue-0;security-0;military-1;child_alone-0;water-0;food-0;shelter-0;clothing-0;money-0;missing_people-0;refugees-0;death-0;other_aid-0;infrastructure_related-0;transport-0;buildings-0;electricity-0;tools-0;hospitals-0;shops-0;aid_centers-0;other_infrastructure-0;weather_related-0;floods-0;storm-0;fire-0;earthquake-0;cold-0;other_weather-0;direct_report-0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [19]:
# Initialise every category column to 0 before the real flags are parsed in.
categories_dummies[header] = 0
categories_dummies.head()

Unnamed: 0,id,categories,related,request,offer,aid_related,medical_help,medical_products,search_and_rescue,security,military,child_alone,water,food,shelter,clothing,money,missing_people,refugees,death,other_aid,infrastructure_related,transport,buildings,electricity,tools,hospitals,shops,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,2,related-1;request-0;offer-0;aid_related-0;medical_help-0;medical_products-0;search_and_rescue-0;security-0;military-0;child_alone-0;water-0;food-0;shelter-0;clothing-0;money-0;missing_people-0;refugees-0;death-0;other_aid-0;infrastructure_related-0;transport-0;buildings-0;electricity-0;tools-0;hospitals-0;shops-0;aid_centers-0;other_infrastructure-0;weather_related-0;floods-0;storm-0;fire-0;earthquake-0;cold-0;other_weather-0;direct_report-0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,7,related-1;request-0;offer-0;aid_related-1;medical_help-0;medical_products-0;search_and_rescue-0;security-0;military-0;child_alone-0;water-0;food-0;shelter-0;clothing-0;money-0;missing_people-0;refugees-0;death-0;other_aid-1;infrastructure_related-0;transport-0;buildings-0;electricity-0;tools-0;hospitals-0;shops-0;aid_centers-0;other_infrastructure-0;weather_related-1;floods-0;storm-1;fire-0;earthquake-0;cold-0;other_weather-0;direct_report-0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,8,related-1;request-0;offer-0;aid_related-0;medical_help-0;medical_products-0;search_and_rescue-0;security-0;military-0;child_alone-0;water-0;food-0;shelter-0;clothing-0;money-0;missing_people-0;refugees-0;death-0;other_aid-0;infrastructure_related-0;transport-0;buildings-0;electricity-0;tools-0;hospitals-0;shops-0;aid_centers-0;other_infrastructure-0;weather_related-0;floods-0;storm-0;fire-0;earthquake-0;cold-0;other_weather-0;direct_report-0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,9,related-1;request-1;offer-0;aid_related-1;medical_help-0;medical_products-1;search_and_rescue-0;security-0;military-0;child_alone-0;water-0;food-0;shelter-0;clothing-0;money-0;missing_people-0;refugees-0;death-0;other_aid-1;infrastructure_related-1;transport-0;buildings-1;electricity-0;tools-0;hospitals-1;shops-0;aid_centers-0;other_infrastructure-0;weather_related-0;floods-0;storm-0;fire-0;earthquake-0;cold-0;other_weather-0;direct_report-0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,12,related-1;request-0;offer-0;aid_related-0;medical_help-0;medical_products-0;search_and_rescue-0;security-0;military-0;child_alone-0;water-0;food-0;shelter-0;clothing-0;money-0;missing_people-0;refugees-0;death-0;other_aid-0;infrastructure_related-0;transport-0;buildings-0;electricity-0;tools-0;hospitals-0;shops-0;aid_centers-0;other_infrastructure-0;weather_related-0;floods-0;storm-0;fire-0;earthquake-0;cold-0;other_weather-0;direct_report-0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [20]:
# Apply compact integer dtypes in one pass: int16 for the id, int8 for each
# category flag.
categories_dummies = categories_dummies.astype(
    {'id': 'int16', **dict.fromkeys(header, 'int8')}
)
categories_dummies.info()

<class 'pandas.core.frame.DataFrame'>
Index: 26248 entries, 0 to 26247
Data columns (total 38 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   id                      26248 non-null  int16 
 1   categories              26248 non-null  object
 2   related                 26248 non-null  int8  
 3   request                 26248 non-null  int8  
 4   offer                   26248 non-null  int8  
 5   aid_related             26248 non-null  int8  
 6   medical_help            26248 non-null  int8  
 7   medical_products        26248 non-null  int8  
 8   search_and_rescue       26248 non-null  int8  
 9   security                26248 non-null  int8  
 10  military                26248 non-null  int8  
 11  child_alone             26248 non-null  int8  
 12  water                   26248 non-null  int8  
 13  food                    26248 non-null  int8  
 14  shelter                 26248 non-null  int8  
 15  clothin

In [21]:
# Sanity check: extract the numeric flag of the first category from row 0.
dummy_pattern = re.compile(rf'{header[0]}-([0-9]+)')
result = dummy_pattern.search(categories_dummies.loc[0, 'categories'])
int(result.group(1))

1

In [22]:
def update_categories(row, category_names=None):
    """Fill the per-category fields of ``row`` from its raw category string.

    ``row['categories']`` is expected to hold ';'-separated ``name-value``
    entries (e.g. ``'related-1;request-0'``). Each entry is matched against the
    category name exactly, so a short name such as ``related`` can never pick
    up the value of a longer one such as ``aid_related``, regardless of the
    order of the entries.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series or a dict)
        Must provide the key ``'categories'``; it is updated in place.
    category_names : iterable of str, optional
        Categories to extract. Defaults to the notebook-level ``header`` list.

    Returns
    -------
    The same ``row`` object, with ``row[name]`` set to the integer value of
    every requested category.

    Raises
    ------
    ValueError
        If a requested category is absent from the string or its value is not
        an integer.
    """
    names = header if category_names is None else category_names

    # Parse the string once into {name: value}. Splitting on the LAST '-' keeps
    # names intact and avoids one regex search per category.
    flags = {}
    for entry in row['categories'].split(';'):
        name, separator, value = entry.rpartition('-')
        if separator:
            flags[name.strip()] = value

    for category in names:
        if category not in flags:
            raise ValueError(
                f"category {category!r} not found in {row['categories']!r}"
            )
        row[category] = int(flags[category])
    return row

In [23]:
# Parse the raw category string of every row into the per-category columns.
categories_dummies = categories_dummies.apply(update_categories, axis='columns')

In [24]:
# Verify that the parsed flags agree with the raw category strings.
categories_dummies.head()

Unnamed: 0,id,categories,related,request,offer,aid_related,medical_help,medical_products,search_and_rescue,security,military,child_alone,water,food,shelter,clothing,money,missing_people,refugees,death,other_aid,infrastructure_related,transport,buildings,electricity,tools,hospitals,shops,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,2,related-1;request-0;offer-0;aid_related-0;medical_help-0;medical_products-0;search_and_rescue-0;security-0;military-0;child_alone-0;water-0;food-0;shelter-0;clothing-0;money-0;missing_people-0;refugees-0;death-0;other_aid-0;infrastructure_related-0;transport-0;buildings-0;electricity-0;tools-0;hospitals-0;shops-0;aid_centers-0;other_infrastructure-0;weather_related-0;floods-0;storm-0;fire-0;earthquake-0;cold-0;other_weather-0;direct_report-0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,7,related-1;request-0;offer-0;aid_related-1;medical_help-0;medical_products-0;search_and_rescue-0;security-0;military-0;child_alone-0;water-0;food-0;shelter-0;clothing-0;money-0;missing_people-0;refugees-0;death-0;other_aid-1;infrastructure_related-0;transport-0;buildings-0;electricity-0;tools-0;hospitals-0;shops-0;aid_centers-0;other_infrastructure-0;weather_related-1;floods-0;storm-1;fire-0;earthquake-0;cold-0;other_weather-0;direct_report-0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0
2,8,related-1;request-0;offer-0;aid_related-0;medical_help-0;medical_products-0;search_and_rescue-0;security-0;military-0;child_alone-0;water-0;food-0;shelter-0;clothing-0;money-0;missing_people-0;refugees-0;death-0;other_aid-0;infrastructure_related-0;transport-0;buildings-0;electricity-0;tools-0;hospitals-0;shops-0;aid_centers-0;other_infrastructure-0;weather_related-0;floods-0;storm-0;fire-0;earthquake-0;cold-0;other_weather-0;direct_report-0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,9,related-1;request-1;offer-0;aid_related-1;medical_help-0;medical_products-1;search_and_rescue-0;security-0;military-0;child_alone-0;water-0;food-0;shelter-0;clothing-0;money-0;missing_people-0;refugees-0;death-0;other_aid-1;infrastructure_related-1;transport-0;buildings-1;electricity-0;tools-0;hospitals-1;shops-0;aid_centers-0;other_infrastructure-0;weather_related-0;floods-0;storm-0;fire-0;earthquake-0;cold-0;other_weather-0;direct_report-0,1,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0
4,12,related-1;request-0;offer-0;aid_related-0;medical_help-0;medical_products-0;search_and_rescue-0;security-0;military-0;child_alone-0;water-0;food-0;shelter-0;clothing-0;money-0;missing_people-0;refugees-0;death-0;other_aid-0;infrastructure_related-0;transport-0;buildings-0;electricity-0;tools-0;hospitals-0;shops-0;aid_centers-0;other_infrastructure-0;weather_related-0;floods-0;storm-0;fire-0;earthquake-0;cold-0;other_weather-0;direct_report-0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [25]:
# Spot check: total of all category flags for the first row.
categories_dummies[header].loc[0, :].sum()

1

In [26]:
# The raw string is redundant once every flag has its own column.
categories_clean = categories_dummies.drop('categories', axis='columns')
categories_clean.head()

Unnamed: 0,id,related,request,offer,aid_related,medical_help,medical_products,search_and_rescue,security,military,child_alone,water,food,shelter,clothing,money,missing_people,refugees,death,other_aid,infrastructure_related,transport,buildings,electricity,tools,hospitals,shops,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,7,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0
2,8,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,9,1,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0
4,12,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [27]:
# Count rows of messages that exactly repeat an earlier row.
messages.duplicated().sum()

68

In [28]:
# Count rows of categories_clean that exactly repeat an earlier row.
categories_clean.duplicated().sum()

32

TODO:
1. Combine the categories of duplicated messages, since `messages` has more duplicated rows (68) than `categories_clean` (32).
2. Merge messages and categories on `id`.
3. Drop the remaining duplicates after merging.

# Cleaned CATEGORIES

In [29]:
# English stopword list from NLTK; tokenize() drops every token found in it.
eng_stopwords = stopwords.words('english')
print(eng_stopwords)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [30]:
# Shared normaliser instances, created once and reused by tokenize().
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

In [31]:
# Look at the text of the rows labelled 0 through 20 (a .loc slice includes its end label).
messages.loc[:20, 'message']

0                                                                                                                                                         Weather update - a cold front from Cuba that could pass over Haiti
1                                                                                                                                                                                    Is the Hurricane over or is it not over
2                                                                                                                                                                                            Looking for someone but no name
3                                                                                                                       UN reports Leogane 80-90 destroyed. Only Hospital St. Croix functioning. Needs supplies desperately.
4                                                                                                                   

In [32]:
def tokenize(text):
    """Turn a raw message into a list of normalized word stems.

    The text is lower-cased, URLs and every non-letter character are
    replaced by spaces, and the result is split with ``word_tokenize``.
    Tokens found in ``eng_stopwords`` are dropped; each remaining token is
    lemmatized with ``lemmatizer`` and then stemmed with ``stemmer``.

    Args:
        text: The message string to process.

    Returns:
        list: Stemmed tokens, in their original order.
    """
    url_matcher = re.compile(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    )
    non_letter_matcher = re.compile(r'[^A-Za-z]')

    # URLs are blanked out first, before the generic non-letter cleanup runs.
    without_urls = url_matcher.sub(' ', text.lower())
    letters_only = non_letter_matcher.sub(' ', without_urls)

    return [
        stemmer.stem(lemmatizer.lemmatize(token))
        for token in word_tokenize(letters_only)
        if token not in eng_stopwords
    ]

In [33]:
# Spot-check the tokenizer on a single message (index label 16).
tokenize(messages.loc[16, 'message'])

['need',
 'food',
 'water',
 'klecin',
 'die',
 'hunger',
 'impass',
 'chretien',
 'klecin',
 'hungri',
 'sick']

# Baseline model: token counts (using the custom tokenize function) -> TF-IDF
# weighting -> a RandomForestClassifier wrapped in MultiOutputClassifier so that
# all target columns can be predicted at once.
pipeline = Pipeline([
    ('vect', CountVectorizer(tokenizer=tokenize)),
    ('tfidf', TfidfTransformer()),
    ('clf', MultiOutputClassifier(RandomForestClassifier()))
])

In [34]:
# Attach the category columns listed in `header` to each message by joining on `id`.
# NOTE(review): the recorded output has more rows (26386) than `messages` (26248),
# so some ids presumably occur more than once in the inputs — confirm the join is
# not fanning out unexpectedly.
data = messages[['id', 'message']].merge(categories_clean[['id']+header], on='id')
data.shape

(26386, 38)

In [35]:
data.head()

Unnamed: 0,id,message,related,request,offer,aid_related,medical_help,medical_products,search_and_rescue,security,military,child_alone,water,food,shelter,clothing,money,missing_people,refugees,death,other_aid,infrastructure_related,transport,buildings,electricity,tools,hospitals,shops,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,2,Weather update - a cold front from Cuba that could pass over Haiti,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,7,Is the Hurricane over or is it not over,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0
2,8,Looking for someone but no name,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,9,UN reports Leogane 80-90 destroyed. Only Hospital St. Croix functioning. Needs supplies desperately.,1,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0
4,12,"says: west side of Haiti, rest of the country today and tonight",1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [36]:
# Remove rows that are exact duplicates across all columns (id, message and every label).
data = data.drop_duplicates()
data.shape

(26216, 38)

# Split messages (features) and category columns (targets) into train and test sets.
# A fixed seed makes the split, and every score computed from it, reproducible
# across kernel restarts; without it each run evaluates on different rows.
X_train, X_test, y_train, y_test = train_test_split(data['message'], data[header], random_state=42)

# Candidate classifiers and their hyper-parameter grids, one dict per model family.
# 'clf' holds the single estimator to try; the 'clf__estimator__*' keys are the
# values to search for that estimator.
# NOTE(review): this list is defined again, with identical contents, a few lines below.
parameters = [
    {
        'clf': [MultinomialNB()],
        'clf__estimator__alpha': [0.001, 0.1, 1, 10, 100]
    },
    {
        'clf': [SVC()],
        # 10e5 == 1,000,000 (not 100,000)
        'clf__estimator__C': [0.001, 0.1, 1, 10, 100, 10e5],
        'clf__estimator__kernel': ['linear', 'rbf'],
        'clf__estimator__class_weight': ['balanced'],
        'clf__estimator__probability': [True]
    },
    {
        'clf': [DecisionTreeClassifier()],
        'clf__estimator__criterion': ['gini','entropy'],
        'clf__estimator__splitter': ['best','random'],
        'clf__estimator__class_weight':['balanced', None]
    },
    {
        'clf': [RandomForestClassifier()],
        'clf__estimator__n_estimators': [50, 100, 200],
        'clf__estimator__min_samples_split': [2, 3, 4]
    }
]

# Pipeline with the step names 'vect', 'tf-idf' and 'clf'.
pipe = Pipeline([('vect', CountVectorizer(tokenizer=tokenize)), ('tf-idf', TfidfTransformer()), ('clf', MultiOutputClassifier(RandomForestClassifier()))])

# List every parameter name exposed by the pipeline and its nested steps;
# these are the names usable as keys in a GridSearchCV param_grid.
pipe.get_params(deep=True).keys()

# Pipeline parameters: one dict per classifier family.
# 'clf' holds the single estimator to try; the 'clf__estimator__*' keys are the
# values to search for that estimator.
parameters = [
    {
        'clf': [MultinomialNB()],
        'clf__estimator__alpha': [0.001, 0.1, 1, 10, 100]
    },
    {
        'clf': [SVC()],
        # 10e5 == 1,000,000 (not 100,000)
        'clf__estimator__C': [0.001, 0.1, 1, 10, 100, 10e5],
        'clf__estimator__kernel': ['linear', 'rbf'],
        'clf__estimator__class_weight': ['balanced'],
        'clf__estimator__probability': [True]
    },
    {
        'clf': [DecisionTreeClassifier()],
        'clf__estimator__criterion': ['gini','entropy'],
        'clf__estimator__splitter': ['best','random'],
        'clf__estimator__class_weight':['balanced', None]
    },
    {
        'clf': [RandomForestClassifier()],
        'clf__estimator__n_estimators': [50, 100, 200],
        'clf__estimator__min_samples_split': [2, 3, 4]
    }
]

#evaluating multiple classifiers
#based on pipeline parameters
#-------------------------------
#For every entry in `parameters`, run a 3-fold grid search over that
#classifier's settings, collect the outcome, then keep the best-scoring search.
result = []

for params in parameters:

    #classifier: each entry carries exactly one estimator under 'clf'
    clf = params['clf'][0]

    #grid for this classifier: every entry except 'clf'.
    #Built as a new dict (instead of popping 'clf') so the shared `parameters`
    #list is left untouched and this cell can be re-run without a KeyError.
    grid_params = {name: values for name, values in params.items() if name != 'clf'}

    #pipeline
    steps = [('vect', CountVectorizer(tokenizer=tokenize)), ('tf-idf', TfidfTransformer()), ('clf', MultiOutputClassifier(clf))]

    #cross validation using
    #Grid Search
    grid = GridSearchCV(Pipeline(steps), param_grid=grid_params, cv=3)
    #NOTE(review): fitted on all of `data`, so no rows are held out from this
    #search — confirm that is intended before reporting scores on a test split.
    grid.fit(data['message'], data[header])

    #storing result
    result.append(
        {
            'grid': grid,
            'classifier': grid.best_estimator_,
            'best score': grid.best_score_,
            'best params': grid.best_params_,
            'cv': grid.cv
        }
    )

#sorting result by best score (highest first)
result = sorted(result, key=itemgetter('best score'), reverse=True)

#saving best classifier (the whole fitted GridSearchCV object, not just the estimator)
#NOTE(review): the model-loading cell reads path + '../models/classifier.pkl',
#which differs from this location and extension — confirm which one is intended.
grid = result[0]['grid']
joblib.dump(grid, 'Disaster Response Pipelines/models/classifier.pickle')

# Fit the `pipeline` object on the training split.
pipeline.fit(X_train, y_train)

# Hyper-parameter grid for `pipeline`; keys follow the '<step>__<param>' naming.
# 2 * 3 * 3 * 2 * 3 * 3 = 324 combinations in total, so this search is expensive.
parameters = {
        'vect__ngram_range': ((1, 1), (1, 2)),
        'vect__max_df': (0.5, 0.75, 1.0),
        'vect__max_features': (None, 5000, 10000),
        'tfidf__use_idf': (True, False),
        'clf__estimator__n_estimators': [50, 100, 200],
        'clf__estimator__min_samples_split': [2, 3, 4]
    }

# Exhaustive search over the grid on the training split, using GridSearchCV's
# default cross-validation and scoring (none are passed explicitly).
cv = GridSearchCV(pipeline, param_grid=parameters)
cv.fit(X_train, y_train)

In [None]:
# Load a previously saved model from disk.
# joblib.load unpickles the file, so only load model files from a trusted source.
# NOTE(review): the grid-search cell saves to
# 'Disaster Response Pipelines/models/classifier.pickle', which does not match
# this path or extension — confirm which file this is meant to read.
model = joblib.load(path + '../models/classifier.pkl')

In [81]:
# Sanity check: predict the categories for one hand-written message and
# label the prediction columns with the names in `header`.
pd.DataFrame(model.predict(['Help! We need food and water! We have injured people! We are burning!']), columns=header)

Unnamed: 0,related,request,offer,aid_related,medical_help,medical_products,search_and_rescue,security,military,child_alone,water,food,shelter,clothing,money,missing_people,refugees,death,other_aid,infrastructure_related,transport,buildings,electricity,tools,hospitals,shops,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,1,1,0,1,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [72]:
# Score the loaded model on the training split.
# NOTE(review): if the model was fitted on these rows, this figure is optimistic;
# score X_test / y_test to estimate performance on unseen messages.
model.score(X_train, y_train)

0.9912577694678862

In [None]:
#joblib.dump(pipeline, 'Disaster Response Pipelines/models/pipeline.pkl')