In [12]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from catboost import CatBoostClassifier
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import re
import unidecode

plt.style.use('default')
plt.rcParams['figure.figsize'] = (8,5)

sns.set(style="whitegrid")

pd.options.display.float_format = '{:20,.2f}'.format

import warnings
warnings.filterwarnings('ignore')

In [2]:
tweets = pd.read_csv("../data/train.csv")

In [3]:
tweets["text"] = tweets["text"].str.lower()
tweets["text"] = tweets["text"].apply(lambda x: re.sub('(?P<url>https?://[^\s]+)', ' ', x))
tweets["text"] = tweets["text"].apply(lambda x: re.sub(r'[^\w]', ' ', x))
tweets["text"] = tweets["text"].apply(lambda x: re.sub(r'_', ' ', x))
tweets["text"] = tweets["text"].apply(lambda x: re.sub(r'[0-9]',' ', x))
tweets["text"] = tweets["text"].apply(lambda x: re.sub(' +',' ', x))
tweets["text"] = tweets["text"].apply(lambda x: unidecode.unidecode(x))
tweets["text"] = tweets["text"].str.strip()
tweets["text_length"] = tweets["text"].str.len()
tweets["text"] = tweets["text"].apply(lambda x: re.sub(r'\b\w{1}\b', '', x))
tweets["words_count"] = tweets["text"].str.split(' ').apply(lambda x: len(x))

In [4]:
tweets["keyword"] = tweets["keyword"].str.replace('%20',' ')
tweets["keyword"] = tweets["keyword"].astype('category')

In [5]:
by_keyword = tweets.groupby("keyword").agg({"target":"mean"}).reset_index()
by_keyword.rename(columns={"target":"keyword_mean"}, inplace=True)
by_keyword.head()

Unnamed: 0,keyword,keyword_mean
0,ablaze,0.36
1,accident,0.69
2,aftershock,0.0
3,airplane accident,0.86
4,ambulance,0.53


In [6]:
tweets_me = pd.merge(tweets, by_keyword, on="keyword", how="left")
#tweets_me["keyword_mean"] = tweets_me["keyword_mean"].fillna(value=0) #LGBM no se lleva bien con los NaNs
tweets_me.head()

Unnamed: 0,id,keyword,location,text,target,text_length,words_count,keyword_mean
0,1,,,our deeds are the reason of this earthquake ma...,1,68,13,
1,4,,,forest fire near la ronge sask canada,1,37,7,
2,5,,,all residents asked to shelter in place are be...,1,130,22,
3,6,,,people receive wildfires evacuation orders in ...,1,56,7,
4,7,,,just got sent this photo from ruby alaska as s...,1,85,16,


In [7]:
#Construimos los datos de entrenamiento y de test
X = tweets_me.loc[:,["text_length", "words_count", "keyword_mean"]]
y = tweets_me["target"]

In [8]:
X.head(20).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
text_length,68.0,37.0,130.0,56.0,85.0,98.0,92.0,56.0,79.0,49.0,43.0,127.0,63.0,37.0,51.0,13.0,13.0,16.0,17.0,22.0
words_count,13.0,7.0,22.0,7.0,16.0,15.0,14.0,16.0,13.0,11.0,9.0,27.0,11.0,7.0,10.0,4.0,3.0,3.0,5.0,3.0
keyword_mean,,,,,,,,,,,,,,,,,,,,


In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=123)

In [10]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(5709, 3)
(1904, 3)
(5709,)
(1904,)


In [16]:
model_catb = CatBoostClassifier()
model_catb.fit(X_train, y_train, verbose=100)

Learning rate set to 0.021676
0:	learn: 0.6852148	total: 19.4ms	remaining: 19.4s
100:	learn: 0.5078403	total: 1.61s	remaining: 14.4s
200:	learn: 0.4927889	total: 3.04s	remaining: 12.1s
300:	learn: 0.4852644	total: 4.5s	remaining: 10.4s
400:	learn: 0.4794489	total: 5.9s	remaining: 8.82s
500:	learn: 0.4729250	total: 7.29s	remaining: 7.26s
600:	learn: 0.4664646	total: 8.7s	remaining: 5.78s
700:	learn: 0.4602172	total: 10.1s	remaining: 4.31s
800:	learn: 0.4538584	total: 11.5s	remaining: 2.86s
900:	learn: 0.4479436	total: 13s	remaining: 1.43s
999:	learn: 0.4426440	total: 14.6s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x1de39eb8668>

In [17]:
y_test_hat = model_catb.predict(X_test)
print("Accuracy score: %f" % (accuracy_score(y_test, y_test_hat)))

Accuracy score: 0.759454


#### Ajustando hiper-parametros:
    - iterations=10
    SCORE: 0.753151
    - iterations=50
    SCORE: 0.750000
    - iterations=100
    SCORE: 0.752101
    - iterations=200
    SCORE: 0.761555
    - iterations=300
    SCORE: 0.763130
    - iterations=300, learning_rate=0.1
    SCORE: 0.763130
    - iterations=500
    SCORE: 0.765231
    - iterations=500, learning_rate=0.05
    SCORE: 0.755777
    - iterations=10, learning_rate=0.03
    SCORE: 0.758929

In [56]:
model_catb = CatBoostClassifier(iterations=500, depth=6, learning_rate=0.005)
model_catb.fit(X_train, y_train, verbose=False)

<catboost.core.CatBoostClassifier at 0x1de39847b00>

In [55]:
y_test_hat = model_catb.predict(X_test)
print("Accuracy score: %f" % (accuracy_score(y_test, y_test_hat)))

Accuracy score: 0.741071


In [58]:
model_catb = CatBoostClassifier(iterations=200)
model_catb.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=True)

Learning rate set to 0.098137
0:	learn: 0.6589682	test: 0.6595500	best: 0.6595500 (0)	total: 15.7ms	remaining: 3.13s
1:	learn: 0.6320289	test: 0.6325003	best: 0.6325003 (1)	total: 29.6ms	remaining: 2.93s
2:	learn: 0.6090061	test: 0.6094365	best: 0.6094365 (2)	total: 45ms	remaining: 2.95s
3:	learn: 0.5903014	test: 0.5910822	best: 0.5910822 (3)	total: 59.5ms	remaining: 2.92s
4:	learn: 0.5769059	test: 0.5780213	best: 0.5780213 (4)	total: 74.2ms	remaining: 2.9s
5:	learn: 0.5657341	test: 0.5672767	best: 0.5672767 (5)	total: 89.7ms	remaining: 2.9s
6:	learn: 0.5577224	test: 0.5596684	best: 0.5596684 (6)	total: 103ms	remaining: 2.84s
7:	learn: 0.5504350	test: 0.5529814	best: 0.5529814 (7)	total: 120ms	remaining: 2.87s
8:	learn: 0.5436873	test: 0.5469819	best: 0.5469819 (8)	total: 135ms	remaining: 2.86s
9:	learn: 0.5385901	test: 0.5421576	best: 0.5421576 (9)	total: 153ms	remaining: 2.9s
10:	learn: 0.5323715	test: 0.5361639	best: 0.5361639 (10)	total: 169ms	remaining: 2.9s
11:	learn: 0.5291494	t

100:	learn: 0.4750735	test: 0.5080080	best: 0.5069210 (76)	total: 1.77s	remaining: 1.73s
101:	learn: 0.4745305	test: 0.5075429	best: 0.5069210 (76)	total: 1.8s	remaining: 1.73s
102:	learn: 0.4738239	test: 0.5076948	best: 0.5069210 (76)	total: 1.82s	remaining: 1.72s
103:	learn: 0.4734643	test: 0.5075866	best: 0.5069210 (76)	total: 1.84s	remaining: 1.7s
104:	learn: 0.4728937	test: 0.5074023	best: 0.5069210 (76)	total: 1.86s	remaining: 1.68s
105:	learn: 0.4724139	test: 0.5074652	best: 0.5069210 (76)	total: 1.88s	remaining: 1.66s
106:	learn: 0.4722257	test: 0.5075349	best: 0.5069210 (76)	total: 1.89s	remaining: 1.65s
107:	learn: 0.4715421	test: 0.5073257	best: 0.5069210 (76)	total: 1.91s	remaining: 1.63s
108:	learn: 0.4714102	test: 0.5072336	best: 0.5069210 (76)	total: 1.94s	remaining: 1.62s
109:	learn: 0.4709917	test: 0.5073046	best: 0.5069210 (76)	total: 1.96s	remaining: 1.6s
110:	learn: 0.4708073	test: 0.5073586	best: 0.5069210 (76)	total: 1.97s	remaining: 1.58s
111:	learn: 0.4705646	te

195:	learn: 0.4465808	test: 0.5105014	best: 0.5069210 (76)	total: 3.49s	remaining: 71.2ms
196:	learn: 0.4463583	test: 0.5105414	best: 0.5069210 (76)	total: 3.5s	remaining: 53.4ms
197:	learn: 0.4462267	test: 0.5105175	best: 0.5069210 (76)	total: 3.52s	remaining: 35.6ms
198:	learn: 0.4458977	test: 0.5106236	best: 0.5069210 (76)	total: 3.54s	remaining: 17.8ms
199:	learn: 0.4457256	test: 0.5105892	best: 0.5069210 (76)	total: 3.55s	remaining: 0us

bestTest = 0.5069210259
bestIteration = 76

Shrink model to first 77 iterations.


<catboost.core.CatBoostClassifier at 0x1de39847c18>

In [59]:
model_catb.score(X_test, y_test)

0.7557773109243697

In [60]:
model_catb = CatBoostClassifier(iterations=500)
model_catb.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=True)

Learning rate set to 0.065876
0:	learn: 0.6697133	test: 0.6700981	best: 0.6700981 (0)	total: 17.5ms	remaining: 8.74s
1:	learn: 0.6492558	test: 0.6495295	best: 0.6495295 (1)	total: 33.1ms	remaining: 8.25s
2:	learn: 0.6323661	test: 0.6327239	best: 0.6327239 (2)	total: 48.4ms	remaining: 8.02s
3:	learn: 0.6164224	test: 0.6170344	best: 0.6170344 (3)	total: 63.1ms	remaining: 7.82s
4:	learn: 0.6039281	test: 0.6047850	best: 0.6047850 (4)	total: 77.8ms	remaining: 7.7s
5:	learn: 0.5930702	test: 0.5942181	best: 0.5942181 (5)	total: 91.8ms	remaining: 7.56s
6:	learn: 0.5846724	test: 0.5860962	best: 0.5860962 (6)	total: 105ms	remaining: 7.42s
7:	learn: 0.5767279	test: 0.5783145	best: 0.5783145 (7)	total: 120ms	remaining: 7.35s
8:	learn: 0.5684800	test: 0.5703298	best: 0.5703298 (8)	total: 134ms	remaining: 7.31s
9:	learn: 0.5623342	test: 0.5641760	best: 0.5641760 (9)	total: 151ms	remaining: 7.41s
10:	learn: 0.5573499	test: 0.5593317	best: 0.5593317 (10)	total: 167ms	remaining: 7.45s
11:	learn: 0.5524

103:	learn: 0.4854500	test: 0.5073646	best: 0.5071973 (93)	total: 1.73s	remaining: 6.59s
104:	learn: 0.4852754	test: 0.5074182	best: 0.5071973 (93)	total: 1.75s	remaining: 6.58s
105:	learn: 0.4851565	test: 0.5074832	best: 0.5071973 (93)	total: 1.77s	remaining: 6.58s
106:	learn: 0.4849279	test: 0.5072364	best: 0.5071973 (93)	total: 1.79s	remaining: 6.57s
107:	learn: 0.4847653	test: 0.5072908	best: 0.5071973 (93)	total: 1.8s	remaining: 6.54s
108:	learn: 0.4846493	test: 0.5072841	best: 0.5071973 (93)	total: 1.82s	remaining: 6.53s
109:	learn: 0.4843533	test: 0.5071864	best: 0.5071864 (109)	total: 1.83s	remaining: 6.5s
110:	learn: 0.4841318	test: 0.5071612	best: 0.5071612 (110)	total: 1.85s	remaining: 6.48s
111:	learn: 0.4839225	test: 0.5070789	best: 0.5070789 (111)	total: 1.86s	remaining: 6.46s
112:	learn: 0.4837849	test: 0.5070351	best: 0.5070351 (112)	total: 1.88s	remaining: 6.44s
113:	learn: 0.4835875	test: 0.5070161	best: 0.5070161 (113)	total: 1.9s	remaining: 6.42s
114:	learn: 0.48349

195:	learn: 0.4663892	test: 0.5076898	best: 0.5066905 (153)	total: 3.31s	remaining: 5.13s
196:	learn: 0.4660376	test: 0.5076464	best: 0.5066905 (153)	total: 3.32s	remaining: 5.11s
197:	learn: 0.4658772	test: 0.5076880	best: 0.5066905 (153)	total: 3.34s	remaining: 5.09s
198:	learn: 0.4657730	test: 0.5077939	best: 0.5066905 (153)	total: 3.35s	remaining: 5.08s
199:	learn: 0.4656892	test: 0.5078684	best: 0.5066905 (153)	total: 3.37s	remaining: 5.06s
200:	learn: 0.4656233	test: 0.5079056	best: 0.5066905 (153)	total: 3.39s	remaining: 5.05s
201:	learn: 0.4656008	test: 0.5079173	best: 0.5066905 (153)	total: 3.41s	remaining: 5.04s
202:	learn: 0.4652265	test: 0.5078481	best: 0.5066905 (153)	total: 3.44s	remaining: 5.03s
203:	learn: 0.4648827	test: 0.5077755	best: 0.5066905 (153)	total: 3.46s	remaining: 5.01s
204:	learn: 0.4645421	test: 0.5076066	best: 0.5066905 (153)	total: 3.48s	remaining: 5.01s
205:	learn: 0.4645157	test: 0.5076244	best: 0.5066905 (153)	total: 3.5s	remaining: 5s
206:	learn: 0.

287:	learn: 0.4491052	test: 0.5101982	best: 0.5066905 (153)	total: 4.89s	remaining: 3.6s
288:	learn: 0.4490200	test: 0.5102385	best: 0.5066905 (153)	total: 4.91s	remaining: 3.58s
289:	learn: 0.4488846	test: 0.5103159	best: 0.5066905 (153)	total: 4.92s	remaining: 3.56s
290:	learn: 0.4487097	test: 0.5103403	best: 0.5066905 (153)	total: 4.94s	remaining: 3.55s
291:	learn: 0.4485892	test: 0.5103899	best: 0.5066905 (153)	total: 4.96s	remaining: 3.53s
292:	learn: 0.4484607	test: 0.5104072	best: 0.5066905 (153)	total: 4.97s	remaining: 3.51s
293:	learn: 0.4483339	test: 0.5104876	best: 0.5066905 (153)	total: 4.99s	remaining: 3.5s
294:	learn: 0.4480542	test: 0.5104138	best: 0.5066905 (153)	total: 5s	remaining: 3.48s
295:	learn: 0.4479021	test: 0.5103334	best: 0.5066905 (153)	total: 5.02s	remaining: 3.46s
296:	learn: 0.4478070	test: 0.5104297	best: 0.5066905 (153)	total: 5.04s	remaining: 3.44s
297:	learn: 0.4475444	test: 0.5103119	best: 0.5066905 (153)	total: 5.05s	remaining: 3.42s
298:	learn: 0.4

381:	learn: 0.4333661	test: 0.5132200	best: 0.5066905 (153)	total: 6.63s	remaining: 2.05s
382:	learn: 0.4331953	test: 0.5132454	best: 0.5066905 (153)	total: 6.65s	remaining: 2.03s
383:	learn: 0.4330316	test: 0.5132392	best: 0.5066905 (153)	total: 6.67s	remaining: 2.01s
384:	learn: 0.4330236	test: 0.5132309	best: 0.5066905 (153)	total: 6.69s	remaining: 2s
385:	learn: 0.4326488	test: 0.5133414	best: 0.5066905 (153)	total: 6.72s	remaining: 1.98s
386:	learn: 0.4325606	test: 0.5133339	best: 0.5066905 (153)	total: 6.74s	remaining: 1.97s
387:	learn: 0.4322959	test: 0.5132558	best: 0.5066905 (153)	total: 6.76s	remaining: 1.95s
388:	learn: 0.4321132	test: 0.5134593	best: 0.5066905 (153)	total: 6.78s	remaining: 1.93s
389:	learn: 0.4319421	test: 0.5133795	best: 0.5066905 (153)	total: 6.79s	remaining: 1.92s
390:	learn: 0.4318236	test: 0.5134175	best: 0.5066905 (153)	total: 6.81s	remaining: 1.9s
391:	learn: 0.4316279	test: 0.5133424	best: 0.5066905 (153)	total: 6.82s	remaining: 1.88s
392:	learn: 0.

479:	learn: 0.4203372	test: 0.5163742	best: 0.5066905 (153)	total: 8.37s	remaining: 349ms
480:	learn: 0.4201322	test: 0.5163187	best: 0.5066905 (153)	total: 8.39s	remaining: 331ms
481:	learn: 0.4200962	test: 0.5163098	best: 0.5066905 (153)	total: 8.4s	remaining: 314ms
482:	learn: 0.4200419	test: 0.5163596	best: 0.5066905 (153)	total: 8.42s	remaining: 297ms
483:	learn: 0.4198575	test: 0.5163378	best: 0.5066905 (153)	total: 8.44s	remaining: 279ms
484:	learn: 0.4197849	test: 0.5163353	best: 0.5066905 (153)	total: 8.46s	remaining: 262ms
485:	learn: 0.4196031	test: 0.5164155	best: 0.5066905 (153)	total: 8.48s	remaining: 244ms
486:	learn: 0.4195515	test: 0.5164861	best: 0.5066905 (153)	total: 8.49s	remaining: 227ms
487:	learn: 0.4195493	test: 0.5164991	best: 0.5066905 (153)	total: 8.51s	remaining: 209ms
488:	learn: 0.4195395	test: 0.5165208	best: 0.5066905 (153)	total: 8.54s	remaining: 192ms
489:	learn: 0.4194520	test: 0.5165669	best: 0.5066905 (153)	total: 8.56s	remaining: 175ms
490:	learn:

<catboost.core.CatBoostClassifier at 0x1de39862128>

In [61]:
model_catb.score(X_test, y_test)

0.7610294117647058