In [45]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix, classification_report
import re
import nltk
from nltk.tokenize import TweetTokenizer
import string
import unidecode

import warnings
warnings.filterwarnings('ignore')

stopwords = nltk.corpus.stopwords.words('english')

In [2]:
train = pd.read_csv("../data/train.csv")
test = pd.read_csv("../data/test.csv")

In [3]:
def concatenate(x,char):
    words = ""
    for word in x:
        if word.startswith(char):
            words = words + word + " "
    return words

def count_vowels(x):
    return (x.count('a') + x.count('e') + x.count('i') + x.count('o') + x.count('u'))

def count_short_words(x):
    count = 0
    words = x.split(' ')
    for word in words:
        if 1 <= len(word) <= 3:
            count += 1
    return count

def count_stopwords(x):
    count = 0
    words = x.split(' ')
    for word in words:
        if word in stopwords:
            count += 1
    return count

In [4]:
def remove_punctuation(word):        
    clean_word = ''.join([char for char in word if char not in string.punctuation])
    return clean_word

def cleaning_text(text):
    tokenizer = TweetTokenizer(strip_handles=True, reduce_len=True)
    text_tokenize = tokenizer.tokenize(text)
    wordlist = []
    for word in text_tokenize:
        word = word.lower()
        word = re.sub('(?P<url>https?://[^\s]+)', ' ', word)
        word = remove_punctuation(word)
        word = re.sub(r'[^\w]', ' ', word)
        word = unidecode.unidecode(word)
        word = re.sub(r'[0-9]','', word)
        if((word != '')&(word != ' ')&(word not in stopwords)):
            wordlist.append(word)
    clean_text = ' '.join(wordlist)
    return clean_text

In [5]:
train["special_chars_count"] =  train["text"]
train["special_chars_count"] =  train["special_chars_count"].str.lower()
train["special_chars_count"] = train["special_chars_count"].apply(lambda x: re.sub(r'[a-z]','',x))
train["special_chars_count"] = train["special_chars_count"].str.strip()
train["special_chars_count"] = train["special_chars_count"].apply(lambda x: re.sub(' +','', x))
train["special_chars_count"] = train["special_chars_count"].apply(lambda x: re.sub(r'[0-9]','', x))
train["special_chars_count"] = train["special_chars_count"].str.len()

train["hashtags"] = train["text"].str.lower().str.split(' ').apply(lambda x: concatenate(x,'#'))
train["labels"] = train["text"].str.lower().str.split(' ').apply(lambda x: concatenate(x,'@'))
train["hashtags_count"] = train["hashtags"].str.split(' ').apply(lambda x: len(x))-1
train["labels_count"] = train["labels"].str.split(' ').apply(lambda x: len(x))-1

train["num_chars_count"] = train["text"]
train["num_chars_count"] =  train["num_chars_count"].str.lower()
train["num_chars_count"] = train["num_chars_count"].apply(lambda x: re.sub(r'[a-z]','',x))
train["num_chars_count"] = train["num_chars_count"].apply(lambda x: re.sub(r'[^\w]','',x))
train["num_chars_count"] = train["num_chars_count"].str.strip()
train["num_chars_count"] = train["num_chars_count"].str.len()

train["links_count"] = train['text'].apply(lambda x: len([w for w in str(x).lower().split()
                                                           if 'http' in w or 'https' in w]))

train["clean_text"] = train["text"].apply(lambda x: cleaning_text(x))

train["text"] = train["text"].str.lower()
train["text"] = train["text"].apply(lambda x: re.sub('(?P<url>https?://[^\s]+)', ' ', x))
train["text"] = train["text"].apply(lambda x: re.sub(r'[^\w]', ' ', x))
train["text"] = train["text"].apply(lambda x: re.sub(r'_', ' ', x))
train["text"] = train["text"].apply(lambda x: re.sub(r'[0-9]',' ', x))
train["text"] = train["text"].apply(lambda x: re.sub(' +',' ', x))
train["text"] = train["text"].apply(lambda x: unidecode.unidecode(x))
train["text"] = train["text"].str.strip()
train["text_length"] = train["text"].str.len()

train["mean_word_length"] = train['text'].apply(lambda x: np.mean([len(w) for w in str(x).split()]))
train["vowels_count"] = train["text"].apply(lambda x: count_vowels(x))
train["short_words_count"] = train["text"].apply(lambda x: count_short_words(x))
train["stopwords_count"] = train["text"].apply(lambda x: count_stopwords(x))
train["text"] = train["text"].apply(lambda x: re.sub(r'\b\w{1}\b', '', x))
train["words_count"] = train["text"].str.split(' ').apply(lambda x: len(x))

train["keyword"] = train["keyword"].str.replace('%20',' ')
train["keyword"] = train["keyword"].astype('category')

train.rename(columns={"target":"target_label"}, inplace=True)
train.rename(columns={"location":"location_original"}, inplace=True)
train.rename(columns={"id":"id_original"}, inplace=True)
train.rename(columns={"text":"text_original"}, inplace=True)
train.rename(columns={"keyword":"keyword_original"}, inplace=True)

train.head()

Unnamed: 0,id_original,keyword_original,location_original,text_original,target_label,special_chars_count,hashtags,labels,hashtags_count,labels_count,num_chars_count,links_count,clean_text,text_length,mean_word_length,vowels_count,short_words_count,stopwords_count,words_count
0,1,,,our deeds are the reason of this earthquake ma...,1,1,#earthquake,,1,0,0,0,deeds reason earthquake may allah forgive us,68,4.307692,25,7,6,13
1,4,,,forest fire near la ronge sask canada,1,1,,,0,0,0,0,forest fire near la ronge sask canada,37,4.428571,13,1,0,7
2,5,,,all residents asked to shelter in place are be...,1,3,,,0,0,0,0,residents asked shelter place notified officer...,130,4.954545,45,9,11,22
3,6,,,people receive wildfires evacuation orders in ...,1,2,#wildfires,,1,0,5,0,people receive wildfires evacuation orders cal...,56,7.142857,24,1,1,7
4,7,,,just got sent this photo from ruby alaska as s...,1,2,#alaska #wildfires,,2,0,0,0,got sent photo ruby alaska smoke wildfires pou...,85,4.375,25,3,7,16


In [6]:
test["special_chars_count"] =  test["text"]
test["special_chars_count"] =  test["special_chars_count"].str.lower()
test["special_chars_count"] = test["special_chars_count"].apply(lambda x: re.sub(r'[a-z]','',x))
test["special_chars_count"] = test["special_chars_count"].str.strip()
test["special_chars_count"] = test["special_chars_count"].apply(lambda x: re.sub(' +','', x))
test["special_chars_count"] = test["special_chars_count"].apply(lambda x: re.sub(r'[0-9]','', x))
test["special_chars_count"] = test["special_chars_count"].str.len()

test["hashtags"] = test["text"].str.lower().str.split(' ').apply(lambda x: concatenate(x,'#'))
test["labels"] = test["text"].str.lower().str.split(' ').apply(lambda x: concatenate(x,'@'))
test["hashtags_count"] = test["hashtags"].str.split(' ').apply(lambda x: len(x))-1
test["labels_count"] = test["labels"].str.split(' ').apply(lambda x: len(x))-1

test["num_chars_count"] = test["text"]
test["num_chars_count"] =  test["num_chars_count"].str.lower()
test["num_chars_count"] = test["num_chars_count"].apply(lambda x: re.sub(r'[a-z]','',x))
test["num_chars_count"] = test["num_chars_count"].apply(lambda x: re.sub(r'[^\w]','',x))
test["num_chars_count"] = test["num_chars_count"].str.strip()
test["num_chars_count"] = test["num_chars_count"].str.len()

test["links_count"] = test['text'].apply(lambda x: len([w for w in str(x).lower().split()
                                                           if 'http' in w or 'https' in w]))

test["clean_text"] = test["text"].apply(lambda x: cleaning_text(x))

test["text"] = test["text"].str.lower()
test["text"] = test["text"].apply(lambda x: re.sub('(?P<url>https?://[^\s]+)', ' ', x))
test["text"] = test["text"].apply(lambda x: re.sub(r'[^\w]', ' ', x))
test["text"] = test["text"].apply(lambda x: re.sub(r'_', ' ', x))
test["text"] = test["text"].apply(lambda x: re.sub(r'[0-9]',' ', x))
test["text"] = test["text"].apply(lambda x: re.sub(' +',' ', x))
test["text"] = test["text"].apply(lambda x: unidecode.unidecode(x))
test["text"] = test["text"].str.strip()
test["text_length"] = test["text"].str.len()

test["mean_word_length"] = test['text'].apply(lambda x: np.mean([len(w) for w in str(x).split()]))
test["vowels_count"] = test["text"].apply(lambda x: count_vowels(x))
test["short_words_count"] = test["text"].apply(lambda x: count_short_words(x))
test["stopwords_count"] = test["text"].apply(lambda x: count_stopwords(x))
test["text"] = test["text"].apply(lambda x: re.sub(r'\b\w{1}\b', '', x))
test["words_count"] = test["text"].str.split(' ').apply(lambda x: len(x))

test["keyword"] = test["keyword"].str.replace('%20',' ')
test["keyword"] = test["keyword"].astype('category')

test.rename(columns={"location":"location_original"}, inplace=True)
test.rename(columns={"id":"id_original"}, inplace=True)
test.rename(columns={"text":"text_original"}, inplace=True)
test.rename(columns={"keyword":"keyword_original"}, inplace=True)

test.head()

Unnamed: 0,id_original,keyword_original,location_original,text_original,special_chars_count,hashtags,labels,hashtags_count,labels_count,num_chars_count,links_count,clean_text,text_length,mean_word_length,vowels_count,short_words_count,stopwords_count,words_count
0,0,,,just happened terrible car crash,0,,,0,0,0,0,happened terrible car crash,34,4.833333,10,2,2,6
1,2,,,heard about earthquake is different cities sta...,3,#earthquake,,1,0,0,0,heard earthquake different cities stay safe ev...,61,5.888889,24,1,2,9
2,3,,,there is forest fire at spot pond geese are f...,2,,,0,0,0,0,forest fire spot pond geese fleeing across str...,94,4.0,31,7,9,19
3,9,,,apocalypse lighting spokane wildfires,3,#spokane #wildfires,,2,0,0,0,apocalypse lighting spokane wildfires,37,8.5,12,0,0,4
4,11,,,typhoon soudelor kills in china and taiwan,0,,,0,0,2,0,typhoon soudelor kills china taiwan,42,5.142857,14,2,2,7


### One Hot Encoding

In [7]:
dummies_train = pd.get_dummies(train["keyword_original"], prefix="keyword")
dummies_test = pd.get_dummies(test["keyword_original"], prefix="keyword")

print(dummies_train.shape)
print(dummies_test.shape)

(7613, 221)
(3263, 221)


In [8]:
train_ohe = pd.concat([train,dummies_train], axis="columns")
train_ohe.head()

Unnamed: 0,id_original,keyword_original,location_original,text_original,target_label,special_chars_count,hashtags,labels,hashtags_count,labels_count,...,keyword_weapons,keyword_whirlwind,keyword_wild fires,keyword_wildfire,keyword_windstorm,keyword_wounded,keyword_wounds,keyword_wreck,keyword_wreckage,keyword_wrecked
0,1,,,our deeds are the reason of this earthquake ma...,1,1,#earthquake,,1,0,...,0,0,0,0,0,0,0,0,0,0
1,4,,,forest fire near la ronge sask canada,1,1,,,0,0,...,0,0,0,0,0,0,0,0,0,0
2,5,,,all residents asked to shelter in place are be...,1,3,,,0,0,...,0,0,0,0,0,0,0,0,0,0
3,6,,,people receive wildfires evacuation orders in ...,1,2,#wildfires,,1,0,...,0,0,0,0,0,0,0,0,0,0
4,7,,,just got sent this photo from ruby alaska as s...,1,2,#alaska #wildfires,,2,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
test_ohe = pd.concat([test,dummies_test], axis="columns")
test_ohe.head()

Unnamed: 0,id_original,keyword_original,location_original,text_original,special_chars_count,hashtags,labels,hashtags_count,labels_count,num_chars_count,...,keyword_weapons,keyword_whirlwind,keyword_wild fires,keyword_wildfire,keyword_windstorm,keyword_wounded,keyword_wounds,keyword_wreck,keyword_wreckage,keyword_wrecked
0,0,,,just happened terrible car crash,0,,,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,,,heard about earthquake is different cities sta...,3,#earthquake,,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,,,there is forest fire at spot pond geese are f...,2,,,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,9,,,apocalypse lighting spokane wildfires,3,#spokane #wildfires,,2,0,0,...,0,0,0,0,0,0,0,0,0,0
4,11,,,typhoon soudelor kills in china and taiwan,0,,,0,0,2,...,0,0,0,0,0,0,0,0,0,0


### Bag of words

In [12]:
vectorizer = CountVectorizer(stop_words='english', binary=True)
vectorizer.fit(train["clean_text"])

vec_train = vectorizer.fit_transform(train["clean_text"])
feature_words = vectorizer.get_feature_names()
print(len(feature_words))
df_bow_train = pd.DataFrame(vec_train.toarray(), columns=feature_words)
df_bow_train.head()

14190


Unnamed: 0,aa,aaa,aaaand,aaalll,aaarrrgghhh,aaemiddleaged,aal,aan,aannnd,aar,...,zones,zonewolf,zoom,zotar,zouma,zrnf,zss,zumiez,zurich,zzz
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
vec_test = vectorizer.transform(test["clean_text"])
df_bow_test = pd.DataFrame(vec_test.toarray(), columns=feature_words)
df_bow_test.head()

Unnamed: 0,aa,aaa,aaaand,aaalll,aaarrrgghhh,aaemiddleaged,aal,aan,aannnd,aar,...,zones,zonewolf,zoom,zotar,zouma,zrnf,zss,zumiez,zurich,zzz
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
df_filter_train = df_bow_train.loc[:,(df_bow_train.sum()>1)]
print(df_filter_train.shape)
df_filter_train.columns

(7613, 5925)


Index(['aa', 'aba', 'abandon', 'abandoned', 'abbott', 'abbswinston', 'abc',
       'abcnews', 'abe', 'abia',
       ...
       'zero', 'zimbabwe', 'zionism', 'zionist', 'zippednews', 'zombie',
       'zone', 'zones', 'zouma', 'zss'],
      dtype='object', length=5925)

In [17]:
filter_list = df_filter_train.columns.tolist()
df_filter_test = df_bow_test.filter(items=filter_list)
print(df_filter_test.shape)
df_filter_test.columns

(3263, 5925)


Index(['aa', 'aba', 'abandon', 'abandoned', 'abbott', 'abbswinston', 'abc',
       'abcnews', 'abe', 'abia',
       ...
       'zero', 'zimbabwe', 'zionism', 'zionist', 'zippednews', 'zombie',
       'zone', 'zones', 'zouma', 'zss'],
      dtype='object', length=5925)

In [18]:
train_final = pd.concat([train_ohe, df_filter_train], axis="columns")
train_final.shape

(7613, 6165)

In [19]:
test_final = pd.concat([test_ohe, df_filter_test], axis="columns")
test_final.shape

(3263, 6164)

In [52]:
X = train_final.drop(["id_original","keyword_original","location_original","text_original","target_label","hashtags","labels","clean_text"], axis=1)
y = train_final["target_label"]

In [53]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=100)

In [41]:
# realizamos el proceso de normalizacion
# scaler = StandardScaler()
# scaler.fit(X_train)
# X_train = scaler.transform(X_train)
# X_test = scaler.transform(X_test)

In [54]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(5709, 6157)
(5709,)
(1904, 6157)
(1904,)


In [55]:
model = MLPClassifier(hidden_layer_sizes=(11,8,8), max_iter=200, verbose=False)
model.fit(X_train, y_train)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(11, 8, 8), learning_rate='constant',
              learning_rate_init=0.001, max_fun=15000, max_iter=200,
              momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
              power_t=0.5, random_state=None, shuffle=True, solver='adam',
              tol=0.0001, validation_fraction=0.1, verbose=False,
              warm_start=False)

In [56]:
y_test_hat = model.predict(X_test)
print("Accuracy score: %f" % (accuracy_score(y_test, y_test_hat)))

Accuracy score: 0.750000


In [57]:
print(confusion_matrix(y_test, y_test_hat))
print(classification_report(y_test, y_test_hat))

[[837 272]
 [204 591]]
              precision    recall  f1-score   support

           0       0.80      0.75      0.78      1109
           1       0.68      0.74      0.71       795

    accuracy                           0.75      1904
   macro avg       0.74      0.75      0.75      1904
weighted avg       0.75      0.75      0.75      1904



In [58]:
X_train = train_final.drop(["id_original","keyword_original","location_original","text_original","target_label","hashtags","labels","clean_text"], axis=1)
X_test = test_final.drop(["id_original","keyword_original","location_original","text_original","hashtags","labels","clean_text"], axis=1)
y_train = train_final["target_label"]

In [59]:
model = MLPClassifier(hidden_layer_sizes=(8,8,8), max_iter=200, verbose=False)
model.fit(X_train, y_train)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(8, 8, 8), learning_rate='constant',
              learning_rate_init=0.001, max_fun=15000, max_iter=200,
              momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
              power_t=0.5, random_state=None, shuffle=True, solver='adam',
              tol=0.0001, validation_fraction=0.1, verbose=False,
              warm_start=False)

In [60]:
y_pred = model.predict(X_test)
y_pred

array([1, 1, 1, ..., 1, 1, 1], dtype=int64)

In [61]:
test["target"] = y_pred
test[["id_original","target"]]

Unnamed: 0,id_original,target
0,0,1
1,2,1
2,3,1
3,9,1
4,11,1
...,...,...
3258,10861,1
3259,10865,1
3260,10868,1
3261,10874,1


In [62]:
test["target"].value_counts()

0    1874
1    1389
Name: target, dtype: int64

In [63]:
test[["id_original","target"]].rename(columns={"id_original":"id"}).to_csv("../data/pred8_mlp", index=False)

In [34]:
#Usando cross-validation
kfold = KFold(n_splits=4, random_state=100)
resultados = cross_val_score(model, X_train, y_train, cv=kfold)
print("Accuracy: %f" % (resultados.mean()))

Iteration 1, loss = 0.68017938
Iteration 2, loss = 0.64869732
Iteration 3, loss = 0.63111697
Iteration 4, loss = 0.61367568
Iteration 5, loss = 0.59800798
Iteration 6, loss = 0.58336898
Iteration 7, loss = 0.56480688
Iteration 8, loss = 0.54662986
Iteration 9, loss = 0.52144948
Iteration 10, loss = 0.48517467
Iteration 11, loss = 0.44201914
Iteration 12, loss = 0.40609249
Iteration 13, loss = 0.37397019
Iteration 14, loss = 0.36082686
Iteration 15, loss = 0.33897823
Iteration 16, loss = 0.33132499
Iteration 17, loss = 0.31530159
Iteration 18, loss = 0.30377969
Iteration 19, loss = 0.29187009
Iteration 20, loss = 0.28589358
Iteration 21, loss = 0.27793663
Iteration 22, loss = 0.26650746
Iteration 23, loss = 0.26355810
Iteration 24, loss = 0.25343405
Iteration 25, loss = 0.24321688
Iteration 26, loss = 0.24710123
Iteration 27, loss = 0.23783281
Iteration 28, loss = 0.22898358
Iteration 29, loss = 0.22265719
Iteration 30, loss = 0.22093797
Iteration 31, loss = 0.21377636
Iteration 32, los

Iteration 68, loss = 0.16451142
Iteration 69, loss = 0.15364535
Iteration 70, loss = 0.15640925
Iteration 71, loss = 0.15140858
Iteration 72, loss = 0.15024452
Iteration 73, loss = 0.15137783
Iteration 74, loss = 0.14872146
Iteration 75, loss = 0.14927273
Iteration 76, loss = 0.14491130
Iteration 77, loss = 0.14639026
Iteration 78, loss = 0.14532588
Iteration 79, loss = 0.14034647
Iteration 80, loss = 0.13964111
Iteration 81, loss = 0.14738484
Iteration 82, loss = 0.14758227
Iteration 83, loss = 0.14001116
Iteration 84, loss = 0.13560352
Iteration 85, loss = 0.13784447
Iteration 86, loss = 0.13406364
Iteration 87, loss = 0.13290647
Iteration 88, loss = 0.13595089
Iteration 89, loss = 0.13172410
Iteration 90, loss = 0.13084263
Iteration 91, loss = 0.13086295
Iteration 92, loss = 0.12934963
Iteration 93, loss = 0.12657691
Iteration 94, loss = 0.13246082
Iteration 95, loss = 0.12623902
Iteration 96, loss = 0.12924415
Iteration 97, loss = 0.12415958
Iteration 98, loss = 0.12298567
Iteratio

Iteration 158, loss = 0.10671596
Iteration 159, loss = 0.10691577
Iteration 160, loss = 0.10913147
Iteration 161, loss = 0.10934316
Iteration 162, loss = 0.10855309
Iteration 163, loss = 0.11049674
Iteration 164, loss = 0.10632218
Iteration 165, loss = 0.10743631
Iteration 166, loss = 0.10277196
Iteration 167, loss = 0.10387427
Iteration 168, loss = 0.10999685
Iteration 169, loss = 0.10431276
Iteration 170, loss = 0.10909146
Iteration 171, loss = 0.10344405
Iteration 172, loss = 0.10343926
Iteration 173, loss = 0.10098122
Iteration 174, loss = 0.10423280
Iteration 175, loss = 0.10835080
Iteration 176, loss = 0.10125913
Iteration 177, loss = 0.10161629
Iteration 178, loss = 0.10700743
Iteration 179, loss = 0.10285424
Iteration 180, loss = 0.10557615
Iteration 181, loss = 0.11276372
Iteration 182, loss = 0.10916499
Iteration 183, loss = 0.10272746
Iteration 184, loss = 0.10278697
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Iteration 1, loss =