In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import xgboost as xgb
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import re
import unidecode

plt.style.use('default')
plt.rcParams['figure.figsize'] = (8,5)

sns.set(style="whitegrid")

pd.options.display.float_format = '{:20,.2f}'.format

import warnings
warnings.filterwarnings('ignore')

In [2]:
tweets = pd.read_csv("../data/train.csv")

In [3]:
tweets["text"] = tweets["text"].str.lower()
tweets["text"] = tweets["text"].apply(lambda x: re.sub('(?P<url>https?://[^\s]+)', ' ', x))
tweets["text"] = tweets["text"].apply(lambda x: re.sub(r'[^\w]', ' ', x))
tweets["text"] = tweets["text"].apply(lambda x: re.sub(r'_', ' ', x))
tweets["text"] = tweets["text"].apply(lambda x: re.sub(r'[0-9]',' ', x))
tweets["text"] = tweets["text"].apply(lambda x: re.sub(' +',' ', x))
tweets["text"] = tweets["text"].apply(lambda x: unidecode.unidecode(x))
tweets["text"] = tweets["text"].str.strip()
tweets["text_length"] = tweets["text"].str.len()
tweets["text"] = tweets["text"].apply(lambda x: re.sub(r'\b\w{1}\b', '', x))
tweets["words_count"] = tweets["text"].str.split(' ').apply(lambda x: len(x))

In [4]:
tweets["keyword"] = tweets["keyword"].str.replace('%20',' ')
tweets["keyword"] = tweets["keyword"].astype('category')
tweets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 7 columns):
id             7613 non-null int64
keyword        7552 non-null category
location       5080 non-null object
text           7613 non-null object
target         7613 non-null int64
text_length    7613 non-null int64
words_count    7613 non-null int64
dtypes: category(1), int64(4), object(2)
memory usage: 383.5+ KB


### Mean Encoding

In [12]:
by_keyword = tweets.groupby("keyword").agg({"target":"mean"}).reset_index()
by_keyword.rename(columns={"target":"keyword_mean"}, inplace=True)
by_keyword.head(10)

Unnamed: 0,keyword,keyword_mean
0,ablaze,0.36
1,accident,0.69
2,aftershock,0.0
3,airplane accident,0.86
4,ambulance,0.53
5,annihilated,0.32
6,annihilation,0.34
7,apocalypse,0.28
8,armageddon,0.12
9,army,0.15


In [13]:
tweets_me = pd.merge(tweets, by_keyword, on="keyword", how="left")
tweets_me.head(10)

Unnamed: 0,id,keyword,location,text,target,text_length,words_count,keyword_mean
0,1,,,our deeds are the reason of this earthquake ma...,1,68,13,
1,4,,,forest fire near la ronge sask canada,1,37,7,
2,5,,,all residents asked to shelter in place are be...,1,130,22,
3,6,,,people receive wildfires evacuation orders in ...,1,56,7,
4,7,,,just got sent this photo from ruby alaska as s...,1,85,16,
5,8,,,rockyfire update california hwy closed in both...,1,98,15,
6,10,,,flood disaster heavy rain causes flash floodin...,1,92,14,
7,13,,,on top of the hill and can see fire in the...,1,56,16,
8,14,,,there an emergency evacuation happening now i...,1,79,13,
9,15,,,afraid that the tornado is coming to our area,1,49,11,


In [16]:
tweets_me["keyword_mean"].value_counts().nlargest()

0.15    170
0.67    165
0.14    147
0.15    132
0.12    120
Name: keyword_mean, dtype: int64

In [23]:
#Construimos los datos de entrenamiento y de test
X = tweets_me.loc[:,["text_length", "words_count", "keyword_mean"]]
y = tweets_me["target"]

In [24]:
X.head()

Unnamed: 0,text_length,words_count,keyword_mean
0,68,13,
1,37,7,
2,130,22,
3,56,7,
4,85,16,


In [141]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

In [137]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(6090, 3)
(1523, 3)
(6090,)
(1523,)


#### Ajustando hiper-parametros (usando 20% de set de test):
    - n_estimators=10, max_depth=6, learning_rate=0.05, subsample=1, colsample_bytree=0.5
    SCORE 0.738674
    - n_estimators=10, max_depth=7, learning_rate=0.1, subsample=1, colsample_bytree=0.5
    SCORE 0.742613
    - n_estimators=20, max_depth=7, learning_rate=0.1, subsample=1, colsample_bytree=0.5
    SCORE 0.751806
    - n_estimators=50, max_depth=7, learning_rate=0.1, subsample=1, colsample_bytree=0.5
    SCORE 0.751149
    - n_estimators=100, max_depth=7, learning_rate=0.1, subsample=1, colsample_bytree=0.7
    SCORE 0.755089
    - n_estimators=100, max_depth=9, learning_rate=0.1, subsample=1, colsample_bytree=0.7
    SCORE 0.754432
    - n_estimators=150, max_depth=6, learning_rate=0.1, subsample=1, colsample_bytree=0.7
    SCORE 0.762311
    - n_estimators=500, max_depth=6, learning_rate=0.1, subsample=1, colsample_bytree=0.7
    SCORE 0.759685
#### Ajustando hiper-parametros (usando 33% de set de test):    
    - n_estimators=10, max_depth=6, learning_rate=0.1, subsample=1, colsample_bytree=0.5
    SCORE 0.738559
    - n_estimators=50, max_depth=7, learning_rate=0.1, subsample=1, colsample_bytree=0.5
    SCORE 0.742141
    - n_estimators=100, max_depth=6, learning_rate=0.1, subsample=1, colsample_bytree=0.8
    SCORE 0.748110
    - n_estimators=100, max_depth=7, learning_rate=0.1, subsample=1, colsample_bytree=0.7
    SCORE 0.752487
    - n_estimators=120, max_depth=4, learning_rate=0.1, subsample=0.8, colsample_bytree=0.8
    SCORE 0.754477

In [142]:
model_xgb = xgb.XGBClassifier(n_estimators=150, objective='binary:logistic', max_depth=6, learning_rate=0.1,
                          subsample=1, colsample_bytree=0.7, n_jobs=1)

In [143]:
model_xgb.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=0.7, gamma=0, gpu_id=-1,
       importance_type='gain', interaction_constraints='',
       learning_rate=0.1, max_delta_step=0, max_depth=6,
       min_child_weight=1, missing=nan, monotone_constraints='()',
       n_estimators=150, n_jobs=1, num_parallel_tree=1,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
       validate_parameters=1, verbosity=None)

In [144]:
y_test_hat = model_xgb.predict(X_test)
print("Accuracy score: %f" % (accuracy_score(y_test, y_test_hat)))

Accuracy score: 0.762311


In [145]:
model_xgb.score(X_test, y_test)*100

76.23112278397899

#### XGBoost - mejor resultado: 0.762311