In [1]:
import pandas as pd
import re
import nltk
import joblib

from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

from sklearn.metrics import classification_report

from operator import itemgetter

nltk.download(['punkt', 'wordnet', 'stopwords'])

pd.options.display.max_rows = 4000
pd.options.display.max_columns = 100
pd.options.display.max_seq_items = 2000
pd.options.display.max_colwidth = 2000

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\abitf\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\abitf\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\abitf\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
path = 'data/'
messages = pd.read_csv(path + 'disaster_messages.csv')
categories = pd.read_csv(path + 'disaster_categories.csv')

In [3]:
messages['id'] = messages['id'].astype('int16')
categories['id'] = categories['id'].astype('int16')

In [4]:
categories_expanded = categories['categories'].str.split(pat=';', expand=True)
labels = categories_expanded.loc[0, :].apply(lambda x: x[:-2]).to_list()
categories_expanded.columns = labels
for column in categories_expanded.columns:
    categories_expanded[column] = categories_expanded[column].apply(lambda x: 0 if x[-1] == '0' else 1).astype('int8')

In [5]:
categories_clean = pd.concat([categories['id'], categories_expanded], axis=1)

In [6]:
df_data = messages.merge(categories_clean, on='id')
df_data.head()

Unnamed: 0,id,message,original,genre,related,request,offer,aid_related,medical_help,medical_products,search_and_rescue,security,military,child_alone,water,food,shelter,clothing,money,missing_people,refugees,death,other_aid,infrastructure_related,transport,buildings,electricity,tools,hospitals,shops,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,2,Weather update - a cold front from Cuba that could pass over Haiti,Un front froid se retrouve sur Cuba ce matin. Il pourrait traverser Haiti demain. Des averses de pluie isolee sont encore prevues sur notre region ce soi,direct,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,7,Is the Hurricane over or is it not over,Cyclone nan fini osinon li pa fini,direct,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0
2,8,Looking for someone but no name,"Patnm, di Maryani relem pou li banm nouvel li ak timoun yo. Mesi se john jean depi Monben kwochi.",direct,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,9,UN reports Leogane 80-90 destroyed. Only Hospital St. Croix functioning. Needs supplies desperately.,UN reports Leogane 80-90 destroyed. Only Hospital St. Croix functioning. Needs supplies desperately.,direct,1,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0
4,12,"says: west side of Haiti, rest of the country today and tonight",facade ouest d Haiti et le reste du pays aujourd hui et ce soir,direct,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [7]:
print(df_data.duplicated().sum())
df_data_clean = df_data.drop_duplicates()
print(df_data_clean.duplicated().sum())

171
0


In [8]:
df_data_clean.head()

Unnamed: 0,id,message,original,genre,related,request,offer,aid_related,medical_help,medical_products,search_and_rescue,security,military,child_alone,water,food,shelter,clothing,money,missing_people,refugees,death,other_aid,infrastructure_related,transport,buildings,electricity,tools,hospitals,shops,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
0,2,Weather update - a cold front from Cuba that could pass over Haiti,Un front froid se retrouve sur Cuba ce matin. Il pourrait traverser Haiti demain. Des averses de pluie isolee sont encore prevues sur notre region ce soi,direct,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,7,Is the Hurricane over or is it not over,Cyclone nan fini osinon li pa fini,direct,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0
2,8,Looking for someone but no name,"Patnm, di Maryani relem pou li banm nouvel li ak timoun yo. Mesi se john jean depi Monben kwochi.",direct,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,9,UN reports Leogane 80-90 destroyed. Only Hospital St. Croix functioning. Needs supplies desperately.,UN reports Leogane 80-90 destroyed. Only Hospital St. Croix functioning. Needs supplies desperately.,direct,1,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0
4,12,"says: west side of Haiti, rest of the country today and tonight",facade ouest d Haiti et le reste du pays aujourd hui et ce soir,direct,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [9]:
def tokenize(text):
    stopwords_eng = stopwords.words('english')
    lemmatizer = WordNetLemmatizer()
    stemmer = PorterStemmer()
    url_regex = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    url_pattern = re.compile(url_regex)
    urls = re.findall(url_pattern, text.lower())
    for url in urls:
        text = text.replace(url, ' ')
    pattern = re.compile(r'[^A-Za-z]+')
    text = re.sub(pattern, ' ', text)
    tokens = word_tokenize(text)
    words = []
    for word in tokens:
        if word not in stopwords_eng:
            # Lemmatization
            tok = lemmatizer.lemmatize(word)
            # Stemming
            tok = stemmer.stem(tok)
            words.append(tok)
    return words

In [10]:
X = df_data['message']
y = df_data.iloc[:, -36:]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.2)

In [11]:
pipeline = Pipeline([
    ('vect', CountVectorizer(tokenizer=tokenize)),
    ('tfidf', TfidfTransformer()),
    ('clf', MultiOutputClassifier(RandomForestClassifier(random_state=0, n_jobs=-1)))
    ])

In [57]:
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

In [69]:
print(classification_report(y_test.loc[:,:], y_pred[:,:], zero_division=1, target_names=labels))

                        precision    recall  f1-score   support

               related       0.84      0.95      0.89      4019
               request       0.85      0.47      0.60       894
                 offer       1.00      0.04      0.07        26
           aid_related       0.75      0.67      0.71      2190
          medical_help       0.72      0.08      0.14       435
      medical_products       0.84      0.12      0.20       277
     search_and_rescue       0.65      0.08      0.13       146
              security       0.50      0.02      0.04        99
              military       0.62      0.07      0.13       179
           child_alone       1.00      1.00      1.00         0
                 water       0.85      0.37      0.52       326
                  food       0.84      0.61      0.71       580
               shelter       0.83      0.41      0.55       476
              clothing       0.82      0.09      0.16       103
                 money       1.00      

# Optimization of classifier's parameters

In [20]:
pipeline = Pipeline([
    ('vect', CountVectorizer(tokenizer=tokenize, ngram_range=(1, 2))),
    ('tfidf', TfidfTransformer()),
    ('clf', MultiOutputClassifier(DecisionTreeClassifier(random_state=0, splitter='random', min_samples_leaf=8, min_samples_split=44)))
    ])

In [21]:
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

In [23]:
test_df = pd.concat([X_test, y_test], axis=1)
test_df.to_csv('models/test.csv')
test_df

Unnamed: 0,message,related,request,offer,aid_related,medical_help,medical_products,search_and_rescue,security,military,child_alone,water,food,shelter,clothing,money,missing_people,refugees,death,other_aid,infrastructure_related,transport,buildings,electricity,tools,hospitals,shops,aid_centers,other_infrastructure,weather_related,floods,storm,fire,earthquake,cold,other_weather,direct_report
24803,"They expressed concern over reported grave abuses of human rights throughout the country, in particular abuses against soldiers, their families, and journalists, as well as the recruitment of child soldiers.",1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
8966,Good morning i need some information i never get it?,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
8855,NOTES: It's not necessary to translate this message because the author is cursing.,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
14135,"Baltimore (IOCC) -- A global Orthodox Christian response, bolstered by a groundswell of charitable giving, is bringing life-sustaining assistance to some of the hardest hit victims of the South Asia tsunami.",1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,1,0,0,1,1,0
10439,Tragic earthquake in Haiti leaves so many dead injured and missing including UN peacekeepers and among them Jordanians on duty. Awful.,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17663,"Karakoram Highway remained blocked for the second day due to heavy landsliding in Gonar Farm, Geeni and Chilas areas.",1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0
9844,What information do you have for this moment?,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
20328,"The state has barred independent reporters from entering the war zone, and threatened to detain and prosecute anyone who publishes ""sensitive information"" that could incite mutiny under the current state of emergency.",1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
19387,"Most were cases with respiratory infections (42 %), followed by malaria (21.7 %), intestinal worms (7.7 %) and diarrhoea (6.3 %).",1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [43]:
print(classification_report(y_test.loc[:,:], y_pred[:,:], zero_division=1, target_names=labels))

related       precision    recall  f1-score   support

           0       0.58      0.52      0.55      1259
           1       0.85      0.88      0.87      4019

    accuracy                           0.80      5278
   macro avg       0.72      0.70      0.71      5278
weighted avg       0.79      0.80      0.79      5278

request       precision    recall  f1-score   support

           0       0.91      0.95      0.93      4384
           1       0.69      0.53      0.60       894

    accuracy                           0.88      5278
   macro avg       0.80      0.74      0.77      5278
weighted avg       0.87      0.88      0.87      5278

offer         precision    recall  f1-score   support

           0       1.00      1.00      1.00      5252
           1       0.00      0.00      0.00        26

    accuracy                           1.00      5278
   macro avg       0.50      0.50      0.50      5278
weighted avg       0.99      1.00      0.99      5278

aid_related   preci

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


other_weather precision    recall  f1-score   support

           0       0.96      0.99      0.97      5007
           1       0.48      0.19      0.27       271

    accuracy                           0.95      5278
   macro avg       0.72      0.59      0.62      5278
weighted avg       0.93      0.95      0.94      5278

direct_report precision    recall  f1-score   support

           0       0.88      0.93      0.90      4285
           1       0.59      0.44      0.51       993

    accuracy                           0.84      5278
   macro avg       0.74      0.69      0.70      5278
weighted avg       0.82      0.84      0.83      5278



In [17]:
pipeline = Pipeline([
    ('vect', CountVectorizer(tokenizer=tokenize, ngram_range=(1, 2))),
    ('tfidf', TfidfTransformer()),
    ('clf', MultiOutputClassifier(AdaBoostClassifier(DecisionTreeClassifier(random_state=0, splitter='random', min_samples_leaf=8, min_samples_split=44))))
    ])
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.2)
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
print(classification_report(y_test.iloc[:,1], y_pred[:,1]))

              precision    recall  f1-score   support

           0       0.91      0.93      0.92      4374
           1       0.62      0.58      0.60       904

    accuracy                           0.87      5278
   macro avg       0.77      0.75      0.76      5278
weighted avg       0.86      0.87      0.86      5278



In [13]:
pipeline.get_params()

{'memory': None,
 'steps': [('vect', CountVectorizer(ngram_range=(1, 2),
                   tokenizer=<function tokenize at 0x0000020CE210E3A0>)),
  ('tfidf', TfidfTransformer()),
  ('clf',
   MultiOutputClassifier(estimator=DecisionTreeClassifier(min_samples_leaf=8,
                                                          min_samples_split=44,
                                                          random_state=0,
                                                          splitter='random')))],
 'verbose': False,
 'vect': CountVectorizer(ngram_range=(1, 2),
                 tokenizer=<function tokenize at 0x0000020CE210E3A0>),
 'tfidf': TfidfTransformer(),
 'clf': MultiOutputClassifier(estimator=DecisionTreeClassifier(min_samples_leaf=8,
                                                        min_samples_split=44,
                                                        random_state=0,
                                                        splitter='random')),
 'vect__analyzer': 'wo

In [53]:
for idx, label in enumerate(labels, start=1):
    print(idx, label)

1 related
2 request
3 offer
4 aid_related
5 medical_help
6 medical_products
7 search_and_rescue
8 security
9 military
10 child_alone
11 water
12 food
13 shelter
14 clothing
15 money
16 missing_people
17 refugees
18 death
19 other_aid
20 infrastructure_related
21 transport
22 buildings
23 electricity
24 tools
25 hospitals
26 shops
27 aid_centers
28 other_infrastructure
29 weather_related
30 floods
31 storm
32 fire
33 earthquake
34 cold
35 other_weather
36 direct_report
