In [45]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import re
import unidecode
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

import warnings
warnings.filterwarnings('ignore')

In [46]:
tweets = pd.read_csv("../data/train.csv")
tweets_test = pd.read_csv("../data/test.csv")

In [47]:
tweets["text"] = tweets["text"].str.lower()
tweets["text"] = tweets["text"].apply(lambda x: re.sub('(?P<url>https?://[^\s]+)', ' ', x))
tweets["text"] = tweets["text"].apply(lambda x: re.sub(r'[^\w]', ' ', x))
tweets["text"] = tweets["text"].apply(lambda x: re.sub(r'_', ' ', x))
tweets["text"] = tweets["text"].apply(lambda x: re.sub(r'[0-9]',' ', x))
tweets["text"] = tweets["text"].apply(lambda x: re.sub(' +',' ', x))
tweets["text"] = tweets["text"].apply(lambda x: unidecode.unidecode(x))
tweets["text"] = tweets["text"].str.strip()
tweets["text_length"] = tweets["text"].str.len()
tweets["text"] = tweets["text"].apply(lambda x: re.sub(r'\b\w{1}\b', '', x))
tweets["words_count"] = tweets["text"].str.split(' ').apply(lambda x: len(x))

tweets["keyword"] = tweets["keyword"].str.replace('%20',' ')
tweets["keyword"] = tweets["keyword"].astype('category')
tweets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   id           7613 non-null   int64   
 1   keyword      7552 non-null   category
 2   location     5080 non-null   object  
 3   text         7613 non-null   object  
 4   target       7613 non-null   int64   
 5   text_length  7613 non-null   int64   
 6   words_count  7613 non-null   int64   
dtypes: category(1), int64(4), object(2)
memory usage: 383.6+ KB


In [48]:
tweets.head()

Unnamed: 0,id,keyword,location,text,target,text_length,words_count
0,1,,,our deeds are the reason of this earthquake ma...,1,68,13
1,4,,,forest fire near la ronge sask canada,1,37,7
2,5,,,all residents asked to shelter in place are be...,1,130,22
3,6,,,people receive wildfires evacuation orders in ...,1,56,7
4,7,,,just got sent this photo from ruby alaska as s...,1,85,16


In [49]:
tweets.shape

(7613, 7)

In [50]:
tweets_test["text"] = tweets_test["text"].str.lower()
tweets_test["text"] = tweets_test["text"].apply(lambda x: re.sub('(?P<url>https?://[^\s]+)', ' ', x))
tweets_test["text"] = tweets_test["text"].apply(lambda x: re.sub(r'[^\w]', ' ', x))
tweets_test["text"] = tweets_test["text"].apply(lambda x: re.sub(r'_', ' ', x))
tweets_test["text"] = tweets_test["text"].apply(lambda x: re.sub(r'[0-9]',' ', x))
tweets_test["text"] = tweets_test["text"].apply(lambda x: re.sub(' +',' ', x))
tweets_test["text"] = tweets_test["text"].apply(lambda x: unidecode.unidecode(x))
tweets_test["text"] = tweets_test["text"].str.strip()
tweets_test["text_length"] = tweets_test["text"].str.len()
tweets_test["text"] = tweets_test["text"].apply(lambda x: re.sub(r'\b\w{1}\b', '', x))
tweets_test["words_count"] = tweets_test["text"].str.split(' ').apply(lambda x: len(x))

tweets_test["keyword"] = tweets_test["keyword"].str.replace('%20',' ')
tweets_test["keyword"] = tweets_test["keyword"].astype('category')
tweets_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3263 entries, 0 to 3262
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   id           3263 non-null   int64   
 1   keyword      3237 non-null   category
 2   location     2158 non-null   object  
 3   text         3263 non-null   object  
 4   text_length  3263 non-null   int64   
 5   words_count  3263 non-null   int64   
dtypes: category(1), int64(3), object(2)
memory usage: 145.7+ KB


In [51]:
tweets_test.head()

Unnamed: 0,id,keyword,location,text,text_length,words_count
0,0,,,just happened terrible car crash,34,6
1,2,,,heard about earthquake is different cities sta...,61,9
2,3,,,there is forest fire at spot pond geese are f...,94,19
3,9,,,apocalypse lighting spokane wildfires,37,4
4,11,,,typhoon soudelor kills in china and taiwan,42,7


In [52]:
tweets_test.shape

(3263, 6)

In [53]:
#One Hot Encoding
dummies = pd.get_dummies(tweets["keyword"], prefix="keyword")
dummies_test = pd.get_dummies(tweets_test["keyword"], prefix="keyword")

In [54]:
tweets_ohe = pd.concat([tweets,dummies], axis="columns")
tweets_ohe.columns

Index(['id', 'keyword', 'location', 'text', 'target', 'text_length',
       'words_count', 'keyword_ablaze', 'keyword_accident',
       'keyword_aftershock',
       ...
       'keyword_weapons', 'keyword_whirlwind', 'keyword_wild fires',
       'keyword_wildfire', 'keyword_windstorm', 'keyword_wounded',
       'keyword_wounds', 'keyword_wreck', 'keyword_wreckage',
       'keyword_wrecked'],
      dtype='object', length=228)

In [55]:
tweets_ohe.shape

(7613, 228)

In [58]:
tweets_test_ohe = pd.concat([tweets_test,dummies_test], axis="columns")
tweets_test_ohe.columns

Index(['id', 'keyword', 'location', 'text', 'text_length', 'words_count',
       'keyword_ablaze', 'keyword_accident', 'keyword_aftershock',
       'keyword_airplane accident',
       ...
       'keyword_weapons', 'keyword_whirlwind', 'keyword_wild fires',
       'keyword_wildfire', 'keyword_windstorm', 'keyword_wounded',
       'keyword_wounds', 'keyword_wreck', 'keyword_wreckage',
       'keyword_wrecked'],
      dtype='object', length=227)

In [59]:
tweets_test_ohe.shape

(3263, 227)

In [60]:
#Split train-test usando solo tweets
X = tweets_ohe.drop(["id","keyword","location","text","target"], axis=1)
y = tweets_ohe["target"]

In [61]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=100)

In [62]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(5709, 223)
(1904, 223)
(5709,)
(1904,)


In [63]:
model_mlp = MLPClassifier(hidden_layer_sizes=(8,8,8), max_iter=300, verbose=True)
model_mlp.fit(X_train, y_train)

Iteration 1, loss = 0.75775187
Iteration 2, loss = 0.69271912
Iteration 3, loss = 0.68979041
Iteration 4, loss = 0.68782011
Iteration 5, loss = 0.68639433
Iteration 6, loss = 0.68450943
Iteration 7, loss = 0.68195148
Iteration 8, loss = 0.68018310
Iteration 9, loss = 0.67861739
Iteration 10, loss = 0.67640668
Iteration 11, loss = 0.67395115
Iteration 12, loss = 0.67062071
Iteration 13, loss = 0.66534952
Iteration 14, loss = 0.65445171
Iteration 15, loss = 0.63414684
Iteration 16, loss = 0.61513558
Iteration 17, loss = 0.59972799
Iteration 18, loss = 0.58514022
Iteration 19, loss = 0.56449329
Iteration 20, loss = 0.55157997
Iteration 21, loss = 0.54418458
Iteration 22, loss = 0.53373113
Iteration 23, loss = 0.53111442
Iteration 24, loss = 0.52603809
Iteration 25, loss = 0.51951276
Iteration 26, loss = 0.51654164
Iteration 27, loss = 0.51730904
Iteration 28, loss = 0.52151894
Iteration 29, loss = 0.51563074
Iteration 30, loss = 0.51266131
Iteration 31, loss = 0.51558433
Iteration 32, los

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(8, 8, 8), learning_rate='constant',
              learning_rate_init=0.001, max_fun=15000, max_iter=300,
              momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
              power_t=0.5, random_state=None, shuffle=True, solver='adam',
              tol=0.0001, validation_fraction=0.1, verbose=True,
              warm_start=False)

In [64]:
y_test_hat = model_mlp.predict(X_test)
print("Accuracy score: %f" % (accuracy_score(y_test, y_test_hat)))

Accuracy score: 0.738971


In [65]:
print(confusion_matrix(y_test, y_test_hat))
print(classification_report(y_test, y_test_hat))

[[824 285]
 [212 583]]
              precision    recall  f1-score   support

           0       0.80      0.74      0.77      1109
           1       0.67      0.73      0.70       795

    accuracy                           0.74      1904
   macro avg       0.73      0.74      0.73      1904
weighted avg       0.74      0.74      0.74      1904



In [66]:
#Usando standard scaler
# scaler = StandardScaler()
# scaler.fit(X_train)

In [67]:
# X_train = scaler.transform(X_train)
# X_test = scaler.transform(X_test)

In [68]:
X_train = tweets_ohe.drop(["id","keyword","location","text","target"], axis=1)
y_train = tweets_ohe["target"]

In [69]:
X_test = tweets_test_ohe.drop(["id","keyword","location","text"], axis=1)

In [70]:
model_mlp = MLPClassifier(hidden_layer_sizes=(8,8,8), max_iter=300, verbose=False)

In [71]:
#Usando cross-validation
kfold = KFold(n_splits=4, random_state=100)
resultados = cross_val_score(model_mlp, X_train, y_train, cv=kfold)
print("Accuracy: %f" % (resultados.mean()*100))

Accuracy: 58.859401


In [72]:
sample_submission = pd.read_csv("../data/sample_submission.csv")

In [73]:
model_mlp.fit(X_train, y_train)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(8, 8, 8), learning_rate='constant',
              learning_rate_init=0.001, max_fun=15000, max_iter=300,
              momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
              power_t=0.5, random_state=None, shuffle=True, solver='adam',
              tol=0.0001, validation_fraction=0.1, verbose=False,
              warm_start=False)

In [74]:
sample_submission["target"] = model_mlp.predict(X_test)

In [75]:
sample_submission["target"].mean()

0.43303708243947286