In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
import catboost as cb
import re
import unidecode
import nltk
from nltk.tokenize import TweetTokenizer
import string
stopwords = nltk.corpus.stopwords.words('english')

import warnings
warnings.filterwarnings('ignore')

In [2]:
tweets = pd.read_csv("../data/train.csv")

In [3]:
def concatenate(x,char):
    words = ""
    for word in x:
        if word.startswith(char):
            words = words + word + " "
    return words

def count_vowels(x):
    return (x.count('a') + x.count('e') + x.count('i') + x.count('o') + x.count('u'))

def count_short_words(x):
    count = 0
    words = x.split(' ')
    for word in words:
        if 1 <= len(word) <= 3:
            count += 1
    return count

def count_stopwords(x):
    count = 0
    words = x.split(' ')
    for word in words:
        if word in stopwords:
            count += 1
    return count

In [4]:
def remove_punctuation(word):        
    clean_word = ''.join([char for char in word if char not in string.punctuation])
    return clean_word

def cleaning_text(text):
    tokenizer = TweetTokenizer(strip_handles=True, reduce_len=True)
    text_tokenize = tokenizer.tokenize(text)
    wordlist = []
    for word in text_tokenize:
        word = word.lower()
        word = re.sub('(?P<url>https?://[^\s]+)', ' ', word)
        word = remove_punctuation(word)
        word = re.sub(r'[^\w]', ' ', word)
        word = unidecode.unidecode(word)
        word = re.sub(r'[0-9]','', word)
        if((word != '')&(word != ' ')&(word not in stopwords)):
            wordlist.append(word)
    clean_text = ' '.join(wordlist)
    return clean_text

In [5]:
tweets["special_chars_count"] =  tweets["text"]
tweets["special_chars_count"] =  tweets["special_chars_count"].str.lower()
tweets["special_chars_count"] = tweets["special_chars_count"].apply(lambda x: re.sub(r'[a-z]','',x))
tweets["special_chars_count"] = tweets["special_chars_count"].str.strip()
tweets["special_chars_count"] = tweets["special_chars_count"].apply(lambda x: re.sub(' +','', x))
tweets["special_chars_count"] = tweets["special_chars_count"].apply(lambda x: re.sub(r'[0-9]','', x))
tweets["special_chars_count"] = tweets["special_chars_count"].str.len()

tweets["hashtags"] = tweets["text"].str.lower().str.split(' ').apply(lambda x: concatenate(x,'#'))
tweets["labels"] = tweets["text"].str.lower().str.split(' ').apply(lambda x: concatenate(x,'@'))
tweets["hashtags_count"] = tweets["hashtags"].str.split(' ').apply(lambda x: len(x))-1
tweets["labels_count"] = tweets["labels"].str.split(' ').apply(lambda x: len(x))-1

tweets["num_chars_count"] = tweets["text"]
tweets["num_chars_count"] =  tweets["num_chars_count"].str.lower()
tweets["num_chars_count"] = tweets["num_chars_count"].apply(lambda x: re.sub(r'[a-z]','',x))
tweets["num_chars_count"] = tweets["num_chars_count"].apply(lambda x: re.sub(r'[^\w]','',x))
tweets["num_chars_count"] = tweets["num_chars_count"].str.strip()
tweets["num_chars_count"] = tweets["num_chars_count"].str.len()

tweets["clean_text"] = tweets["text"].apply(lambda x: cleaning_text(x))

tweets["text"] = tweets["text"].str.lower()
tweets["text"] = tweets["text"].apply(lambda x: re.sub('(?P<url>https?://[^\s]+)', ' ', x))
tweets["text"] = tweets["text"].apply(lambda x: re.sub(r'[^\w]', ' ', x))
tweets["text"] = tweets["text"].apply(lambda x: re.sub(r'_', ' ', x))
tweets["text"] = tweets["text"].apply(lambda x: re.sub(r'[0-9]',' ', x))
tweets["text"] = tweets["text"].apply(lambda x: re.sub(' +',' ', x))
tweets["text"] = tweets["text"].apply(lambda x: unidecode.unidecode(x))
tweets["text"] = tweets["text"].str.strip()
tweets["text_length"] = tweets["text"].str.len()

tweets["vowels_count"] = tweets["text"].apply(lambda x: count_vowels(x))
tweets["short_words_count"] = tweets["text"].apply(lambda x: count_short_words(x))
tweets["stopwords_count"] = tweets["text"].apply(lambda x: count_stopwords(x))
tweets["text"] = tweets["text"].apply(lambda x: re.sub(r'\b\w{1}\b', '', x))
tweets["words_count"] = tweets["text"].str.split(' ').apply(lambda x: len(x))

tweets.rename(columns={"target":"target_label"}, inplace=True)
tweets.head()

Unnamed: 0,id,keyword,location,text,target_label,special_chars_count,hashtags,labels,hashtags_count,labels_count,num_chars_count,clean_text,text_length,vowels_count,short_words_count,stopwords_count,words_count
0,1,,,our deeds are the reason of this earthquake ma...,1,1,#earthquake,,1,0,0,deeds reason earthquake may allah forgive us,68,25,7,6,13
1,4,,,forest fire near la ronge sask canada,1,1,,,0,0,0,forest fire near la ronge sask canada,37,13,1,0,7
2,5,,,all residents asked to shelter in place are be...,1,3,,,0,0,0,residents asked shelter place notified officer...,130,45,9,11,22
3,6,,,people receive wildfires evacuation orders in ...,1,2,#wildfires,,1,0,5,people receive wildfires evacuation orders cal...,56,24,1,1,7
4,7,,,just got sent this photo from ruby alaska as s...,1,2,#alaska #wildfires,,2,0,0,got sent photo ruby alaska smoke wildfires pou...,85,25,3,7,16


In [6]:
tweets["keyword"] = tweets["keyword"].str.replace('%20',' ')
tweets["keyword"] = tweets["keyword"].astype('category')

In [7]:
#one hot encoding
dummies = pd.get_dummies(tweets["keyword"], prefix="keyword")
dummies.columns

Index(['keyword_ablaze', 'keyword_accident', 'keyword_aftershock',
       'keyword_airplane accident', 'keyword_ambulance', 'keyword_annihilated',
       'keyword_annihilation', 'keyword_apocalypse', 'keyword_armageddon',
       'keyword_army',
       ...
       'keyword_weapons', 'keyword_whirlwind', 'keyword_wild fires',
       'keyword_wildfire', 'keyword_windstorm', 'keyword_wounded',
       'keyword_wounds', 'keyword_wreck', 'keyword_wreckage',
       'keyword_wrecked'],
      dtype='object', length=221)

In [8]:
tweets_ohe = pd.concat([tweets,dummies], axis="columns")
tweets_ohe.shape

(7613, 238)

In [9]:
#BOW
vectorizer = CountVectorizer(stop_words='english')
X = vectorizer.fit_transform(tweets["clean_text"])
X.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [10]:
feature_words = vectorizer.get_feature_names()
df_words = pd.DataFrame(X.toarray(), columns=feature_words)
df_words.head()

Unnamed: 0,aa,aaa,aaaand,aaalll,aaarrrgghhh,aaemiddleaged,aal,aan,aannnd,aar,...,zones,zonewolf,zoom,zotar,zouma,zrnf,zss,zumiez,zurich,zzz
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
df_filter = df_words.loc[:,(df_words.sum()>5)]
df_filter.shape

(7613, 2080)

In [12]:
tweets_final = pd.concat([tweets_ohe,df_filter], axis="columns")
tweets_final.shape

(7613, 2318)

In [13]:
X = tweets_final.drop(["id","keyword","location","text","target_label","hashtags","labels","clean_text"], axis=1)
y = tweets_final["target_label"]

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=100)

In [17]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(5709, 2308)
(5709,)
(1904, 2308)
(1904,)


In [18]:
#usando las 2308 features
model_cb = cb.CatBoostClassifier()
model_cb.fit(X_train, y_train, verbose=False)

<catboost.core.CatBoostClassifier at 0x20c61a2c548>

In [19]:
y_test_hat = model_cb.predict(X_test)
print("Accuracy score: %f" % (accuracy_score(y_test, y_test_hat)))

Accuracy score: 0.789916


In [25]:
model_cb = cb.CatBoostClassifier(iterations=500, depth=6, learning_rate=0.1)
model_cb.fit(X_train, y_train, verbose=False)

<catboost.core.CatBoostClassifier at 0x20ca5824088>

In [26]:
y_test_hat = model_cb.predict(X_test)
print("Accuracy score: %f" % (accuracy_score(y_test, y_test_hat))) #al fin llegue a los 80% de accuracy!!!!

Accuracy score: 0.801471


In [27]:
#vamos a reducir las 2308 features a las 700 mas importantes
df_feat_importances = pd.DataFrame(model_cb.feature_importances_, index=X_train.columns, columns=["importancia"]).\
        sort_values(by="importancia",ascending=False)

In [28]:
df_feat_importances.head(20)

Unnamed: 0,importancia
special_chars_count,4.688143
text_length,3.280866
vowels_count,3.272293
num_chars_count,3.196521
stopwords_count,3.032422
short_words_count,2.562712
hiroshima,2.502308
labels_count,2.118243
suicide,2.038305
mh,1.388906


In [29]:
list_fi = df_feat_importances.index[:700].tolist()
X = X.filter(items=list_fi)
X.shape

(7613, 700)

In [30]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=100)

In [31]:
#usando las 700 features
model_cb = cb.CatBoostClassifier()
model_cb.fit(X_train, y_train)

Learning rate set to 0.021676
0:	learn: 0.6907455	total: 29.2ms	remaining: 29.2s
1:	learn: 0.6879474	total: 51.6ms	remaining: 25.8s
2:	learn: 0.6856062	total: 72.7ms	remaining: 24.2s
3:	learn: 0.6832936	total: 94.6ms	remaining: 23.6s
4:	learn: 0.6811160	total: 117ms	remaining: 23.3s
5:	learn: 0.6789029	total: 141ms	remaining: 23.3s
6:	learn: 0.6764053	total: 163ms	remaining: 23.1s
7:	learn: 0.6742749	total: 188ms	remaining: 23.4s
8:	learn: 0.6721338	total: 229ms	remaining: 25.2s
9:	learn: 0.6703967	total: 253ms	remaining: 25.1s
10:	learn: 0.6688481	total: 275ms	remaining: 24.8s
11:	learn: 0.6668057	total: 297ms	remaining: 24.5s
12:	learn: 0.6649954	total: 322ms	remaining: 24.5s
13:	learn: 0.6634660	total: 344ms	remaining: 24.2s
14:	learn: 0.6619595	total: 372ms	remaining: 24.5s
15:	learn: 0.6598568	total: 411ms	remaining: 25.3s
16:	learn: 0.6582692	total: 435ms	remaining: 25.2s
17:	learn: 0.6563841	total: 460ms	remaining: 25.1s
18:	learn: 0.6548242	total: 482ms	remaining: 24.9s
19:	lea

164:	learn: 0.5588693	total: 4.58s	remaining: 23.2s
165:	learn: 0.5585209	total: 4.62s	remaining: 23.2s
166:	learn: 0.5580485	total: 4.66s	remaining: 23.3s
167:	learn: 0.5577482	total: 4.69s	remaining: 23.2s
168:	learn: 0.5574882	total: 4.72s	remaining: 23.2s
169:	learn: 0.5572043	total: 4.76s	remaining: 23.2s
170:	learn: 0.5568673	total: 4.79s	remaining: 23.2s
171:	learn: 0.5565040	total: 4.82s	remaining: 23.2s
172:	learn: 0.5561443	total: 4.85s	remaining: 23.2s
173:	learn: 0.5557808	total: 4.88s	remaining: 23.2s
174:	learn: 0.5554069	total: 4.91s	remaining: 23.1s
175:	learn: 0.5550279	total: 4.93s	remaining: 23.1s
176:	learn: 0.5547200	total: 4.96s	remaining: 23s
177:	learn: 0.5543891	total: 4.98s	remaining: 23s
178:	learn: 0.5539419	total: 5s	remaining: 22.9s
179:	learn: 0.5536787	total: 5.02s	remaining: 22.9s
180:	learn: 0.5534111	total: 5.04s	remaining: 22.8s
181:	learn: 0.5530949	total: 5.07s	remaining: 22.8s
182:	learn: 0.5528595	total: 5.09s	remaining: 22.7s
183:	learn: 0.55261

324:	learn: 0.5156271	total: 8.68s	remaining: 18s
325:	learn: 0.5153296	total: 8.71s	remaining: 18s
326:	learn: 0.5150762	total: 8.75s	remaining: 18s
327:	learn: 0.5148075	total: 8.77s	remaining: 18s
328:	learn: 0.5144658	total: 8.8s	remaining: 17.9s
329:	learn: 0.5141614	total: 8.82s	remaining: 17.9s
330:	learn: 0.5139713	total: 8.85s	remaining: 17.9s
331:	learn: 0.5137690	total: 8.87s	remaining: 17.9s
332:	learn: 0.5135702	total: 8.9s	remaining: 17.8s
333:	learn: 0.5134020	total: 8.92s	remaining: 17.8s
334:	learn: 0.5131915	total: 8.94s	remaining: 17.8s
335:	learn: 0.5130422	total: 8.96s	remaining: 17.7s
336:	learn: 0.5128653	total: 8.98s	remaining: 17.7s
337:	learn: 0.5127140	total: 9.01s	remaining: 17.6s
338:	learn: 0.5124239	total: 9.03s	remaining: 17.6s
339:	learn: 0.5122217	total: 9.05s	remaining: 17.6s
340:	learn: 0.5119988	total: 9.08s	remaining: 17.5s
341:	learn: 0.5118470	total: 9.1s	remaining: 17.5s
342:	learn: 0.5116315	total: 9.12s	remaining: 17.5s
343:	learn: 0.5113502	t

486:	learn: 0.4797254	total: 12.7s	remaining: 13.4s
487:	learn: 0.4795240	total: 12.8s	remaining: 13.4s
488:	learn: 0.4793223	total: 12.8s	remaining: 13.4s
489:	learn: 0.4791493	total: 12.8s	remaining: 13.3s
490:	learn: 0.4789396	total: 12.8s	remaining: 13.3s
491:	learn: 0.4786889	total: 12.9s	remaining: 13.3s
492:	learn: 0.4785012	total: 12.9s	remaining: 13.3s
493:	learn: 0.4783539	total: 12.9s	remaining: 13.2s
494:	learn: 0.4781144	total: 12.9s	remaining: 13.2s
495:	learn: 0.4779508	total: 13s	remaining: 13.2s
496:	learn: 0.4777473	total: 13s	remaining: 13.2s
497:	learn: 0.4775434	total: 13s	remaining: 13.1s
498:	learn: 0.4773418	total: 13.1s	remaining: 13.1s
499:	learn: 0.4771148	total: 13.1s	remaining: 13.1s
500:	learn: 0.4769055	total: 13.1s	remaining: 13.1s
501:	learn: 0.4767225	total: 13.2s	remaining: 13.1s
502:	learn: 0.4764653	total: 13.2s	remaining: 13s
503:	learn: 0.4762634	total: 13.2s	remaining: 13s
504:	learn: 0.4760474	total: 13.2s	remaining: 13s
505:	learn: 0.4758225	to

647:	learn: 0.4488847	total: 16.4s	remaining: 8.91s
648:	learn: 0.4487028	total: 16.4s	remaining: 8.88s
649:	learn: 0.4485249	total: 16.4s	remaining: 8.86s
650:	learn: 0.4483720	total: 16.5s	remaining: 8.83s
651:	learn: 0.4482142	total: 16.5s	remaining: 8.8s
652:	learn: 0.4480662	total: 16.5s	remaining: 8.77s
653:	learn: 0.4479751	total: 16.5s	remaining: 8.75s
654:	learn: 0.4477642	total: 16.6s	remaining: 8.72s
655:	learn: 0.4475680	total: 16.6s	remaining: 8.69s
656:	learn: 0.4474712	total: 16.6s	remaining: 8.67s
657:	learn: 0.4473311	total: 16.6s	remaining: 8.64s
658:	learn: 0.4472226	total: 16.6s	remaining: 8.61s
659:	learn: 0.4470587	total: 16.7s	remaining: 8.59s
660:	learn: 0.4468686	total: 16.7s	remaining: 8.56s
661:	learn: 0.4466910	total: 16.7s	remaining: 8.53s
662:	learn: 0.4465333	total: 16.7s	remaining: 8.51s
663:	learn: 0.4463501	total: 16.8s	remaining: 8.48s
664:	learn: 0.4461874	total: 16.8s	remaining: 8.45s
665:	learn: 0.4459793	total: 16.8s	remaining: 8.43s
666:	learn: 0

806:	learn: 0.4244999	total: 19.9s	remaining: 4.75s
807:	learn: 0.4243903	total: 19.9s	remaining: 4.73s
808:	learn: 0.4242685	total: 19.9s	remaining: 4.7s
809:	learn: 0.4241543	total: 19.9s	remaining: 4.68s
810:	learn: 0.4240396	total: 20s	remaining: 4.65s
811:	learn: 0.4239604	total: 20s	remaining: 4.62s
812:	learn: 0.4238293	total: 20s	remaining: 4.6s
813:	learn: 0.4236326	total: 20s	remaining: 4.57s
814:	learn: 0.4235165	total: 20s	remaining: 4.55s
815:	learn: 0.4233219	total: 20.1s	remaining: 4.52s
816:	learn: 0.4231930	total: 20.1s	remaining: 4.5s
817:	learn: 0.4230531	total: 20.1s	remaining: 4.47s
818:	learn: 0.4228795	total: 20.1s	remaining: 4.45s
819:	learn: 0.4227462	total: 20.2s	remaining: 4.42s
820:	learn: 0.4225871	total: 20.2s	remaining: 4.4s
821:	learn: 0.4224746	total: 20.2s	remaining: 4.37s
822:	learn: 0.4223308	total: 20.2s	remaining: 4.35s
823:	learn: 0.4222158	total: 20.2s	remaining: 4.32s
824:	learn: 0.4220752	total: 20.3s	remaining: 4.3s
825:	learn: 0.4219073	total

965:	learn: 0.4038202	total: 23.4s	remaining: 822ms
966:	learn: 0.4036832	total: 23.4s	remaining: 798ms
967:	learn: 0.4035627	total: 23.4s	remaining: 774ms
968:	learn: 0.4034250	total: 23.4s	remaining: 750ms
969:	learn: 0.4033116	total: 23.5s	remaining: 725ms
970:	learn: 0.4031709	total: 23.5s	remaining: 701ms
971:	learn: 0.4030187	total: 23.5s	remaining: 677ms
972:	learn: 0.4029024	total: 23.5s	remaining: 652ms
973:	learn: 0.4027767	total: 23.5s	remaining: 628ms
974:	learn: 0.4026700	total: 23.6s	remaining: 604ms
975:	learn: 0.4025949	total: 23.6s	remaining: 580ms
976:	learn: 0.4024419	total: 23.6s	remaining: 556ms
977:	learn: 0.4023166	total: 23.6s	remaining: 531ms
978:	learn: 0.4021559	total: 23.6s	remaining: 507ms
979:	learn: 0.4020486	total: 23.7s	remaining: 483ms
980:	learn: 0.4019741	total: 23.7s	remaining: 459ms
981:	learn: 0.4018744	total: 23.7s	remaining: 434ms
982:	learn: 0.4017486	total: 23.7s	remaining: 410ms
983:	learn: 0.4016733	total: 23.7s	remaining: 386ms
984:	learn: 

<catboost.core.CatBoostClassifier at 0x20ca5783188>

In [32]:
y_test_hat = model_cb.predict(X_test)
print("Accuracy score: %f" % (accuracy_score(y_test, y_test_hat)))

Accuracy score: 0.793067


In [45]:
# model_cb = cb.CatBoostClassifier(iterations=500, depth=6, learning_rate=0.1)
model_cb = cb.CatBoostClassifier(iterations=500, depth=6, learning_rate=0.1)
model_cb.fit(X_train, y_train, verbose=False)

<catboost.core.CatBoostClassifier at 0x20c61e1ddc8>

In [46]:
y_test_hat = model_cb.predict(X_test)
print("Accuracy score: %f" % (accuracy_score(y_test, y_test_hat)))

Accuracy score: 0.804097


In [None]:
#hacer kfold y stratifiedkfold con esto

In [47]:
#ahora con tf-idf
vectorizer = TfidfVectorizer(stop_words="english")
X = vectorizer.fit_transform(tweets["clean_text"])
X.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [48]:
feature_words = vectorizer.get_feature_names()
df_words = pd.DataFrame(X.toarray(), columns=feature_words)
df_filter = df_words.loc[:,(df_words.sum()>2)]
df_filter.shape

(7613, 2072)

In [49]:
tweets_final = pd.concat([tweets_ohe,df_filter], axis="columns")
tweets_final.shape

(7613, 2310)

In [50]:
X = tweets_final.drop(["id","keyword","location","text","target_label","hashtags","labels","clean_text"], axis=1)
y = tweets_final["target_label"]

In [51]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=100)

In [52]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(5709, 2300)
(5709,)
(1904, 2300)
(1904,)


In [53]:
#usando 2300 features
model_cb = cb.CatBoostClassifier()
model_cb.fit(X_train, y_train, verbose=False)

<catboost.core.CatBoostClassifier at 0x20ca57cd1c8>

In [54]:
y_test_hat = model_cb.predict(X_test)
print("Accuracy score: %f" % (accuracy_score(y_test, y_test_hat)))

Accuracy score: 0.795168


In [55]:
model_cb = cb.CatBoostClassifier(iterations=500, depth=6, learning_rate=0.1)
model_cb.fit(X_train, y_train, verbose=False)

<catboost.core.CatBoostClassifier at 0x20c9bff9b48>

In [56]:
y_test_hat = model_cb.predict(X_test)
print("Accuracy score: %f" % (accuracy_score(y_test, y_test_hat)))

Accuracy score: 0.796744


In [57]:
df_feat_importances = pd.DataFrame(model_cb.feature_importances_, index=X_train.columns, columns=["importancia"]).\
        sort_values(by="importancia",ascending=False)
df_feat_importances.importancia.value_counts()

0.000000    1435
0.010956       1
0.365630       1
0.007435       1
0.002677       1
            ... 
0.003512       1
0.179656       1
0.013365       1
0.022956       1
0.084894       1
Name: importancia, Length: 866, dtype: int64

In [59]:
list_fi = df_feat_importances.index[:865].tolist()
X = X.filter(items=list_fi)

In [60]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=100)

In [61]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(5709, 865)
(5709,)
(1904, 865)
(1904,)


In [76]:
#ahora usamos 865 features
model_cb = cb.CatBoostClassifier(iterations=800, depth=4, learning_rate=0.1)
model_cb.fit(X_train, y_train, verbose=True)

0:	learn: 0.6822298	total: 32.9ms	remaining: 26.2s
1:	learn: 0.6747758	total: 75.2ms	remaining: 30s
2:	learn: 0.6673190	total: 115ms	remaining: 30.5s
3:	learn: 0.6603658	total: 153ms	remaining: 30.5s
4:	learn: 0.6546292	total: 218ms	remaining: 34.7s
5:	learn: 0.6488310	total: 279ms	remaining: 36.9s
6:	learn: 0.6430662	total: 347ms	remaining: 39.3s
7:	learn: 0.6385619	total: 406ms	remaining: 40.2s
8:	learn: 0.6351030	total: 461ms	remaining: 40.6s
9:	learn: 0.6325583	total: 495ms	remaining: 39.1s
10:	learn: 0.6292539	total: 558ms	remaining: 40s
11:	learn: 0.6264676	total: 593ms	remaining: 39s
12:	learn: 0.6245421	total: 656ms	remaining: 39.7s
13:	learn: 0.6206137	total: 697ms	remaining: 39.1s
14:	learn: 0.6175113	total: 745ms	remaining: 39s
15:	learn: 0.6158293	total: 805ms	remaining: 39.5s
16:	learn: 0.6139893	total: 863ms	remaining: 39.7s
17:	learn: 0.6109340	total: 909ms	remaining: 39.5s
18:	learn: 0.6085737	total: 970ms	remaining: 39.9s
19:	learn: 0.6065204	total: 1.01s	remaining: 39

163:	learn: 0.4740604	total: 8.8s	remaining: 34.1s
164:	learn: 0.4733846	total: 8.84s	remaining: 34s
165:	learn: 0.4728488	total: 8.88s	remaining: 33.9s
166:	learn: 0.4723541	total: 8.91s	remaining: 33.8s
167:	learn: 0.4716626	total: 8.98s	remaining: 33.8s
168:	learn: 0.4711307	total: 9.04s	remaining: 33.8s
169:	learn: 0.4708204	total: 9.09s	remaining: 33.7s
170:	learn: 0.4699201	total: 9.14s	remaining: 33.6s
171:	learn: 0.4691024	total: 9.18s	remaining: 33.5s
172:	learn: 0.4685586	total: 9.25s	remaining: 33.5s
173:	learn: 0.4680169	total: 9.3s	remaining: 33.5s
174:	learn: 0.4673359	total: 9.34s	remaining: 33.4s
175:	learn: 0.4670807	total: 9.38s	remaining: 33.3s
176:	learn: 0.4665454	total: 9.43s	remaining: 33.2s
177:	learn: 0.4658573	total: 9.47s	remaining: 33.1s
178:	learn: 0.4652505	total: 9.52s	remaining: 33s
179:	learn: 0.4647917	total: 9.71s	remaining: 33.4s
180:	learn: 0.4643182	total: 9.77s	remaining: 33.4s
181:	learn: 0.4639420	total: 9.81s	remaining: 33.3s
182:	learn: 0.4633

325:	learn: 0.4057228	total: 15.8s	remaining: 22.9s
326:	learn: 0.4053346	total: 15.8s	remaining: 22.9s
327:	learn: 0.4049796	total: 15.8s	remaining: 22.8s
328:	learn: 0.4048540	total: 15.9s	remaining: 22.7s
329:	learn: 0.4045795	total: 15.9s	remaining: 22.7s
330:	learn: 0.4043272	total: 16s	remaining: 22.6s
331:	learn: 0.4042724	total: 16s	remaining: 22.5s
332:	learn: 0.4038294	total: 16.1s	remaining: 22.5s
333:	learn: 0.4034699	total: 16.1s	remaining: 22.5s
334:	learn: 0.4031869	total: 16.1s	remaining: 22.4s
335:	learn: 0.4029010	total: 16.2s	remaining: 22.3s
336:	learn: 0.4026419	total: 16.2s	remaining: 22.3s
337:	learn: 0.4022354	total: 16.3s	remaining: 22.2s
338:	learn: 0.4016796	total: 16.3s	remaining: 22.1s
339:	learn: 0.4013055	total: 16.4s	remaining: 22.1s
340:	learn: 0.4008513	total: 16.4s	remaining: 22.1s
341:	learn: 0.4006913	total: 16.4s	remaining: 22s
342:	learn: 0.4003572	total: 16.5s	remaining: 21.9s
343:	learn: 0.4001787	total: 16.5s	remaining: 21.9s
344:	learn: 0.3997

487:	learn: 0.3608297	total: 23.1s	remaining: 14.7s
488:	learn: 0.3607115	total: 23.1s	remaining: 14.7s
489:	learn: 0.3606183	total: 23.2s	remaining: 14.7s
490:	learn: 0.3603065	total: 23.2s	remaining: 14.6s
491:	learn: 0.3600989	total: 23.3s	remaining: 14.6s
492:	learn: 0.3600410	total: 23.3s	remaining: 14.5s
493:	learn: 0.3596979	total: 23.4s	remaining: 14.5s
494:	learn: 0.3593380	total: 23.5s	remaining: 14.5s
495:	learn: 0.3591960	total: 23.6s	remaining: 14.5s
496:	learn: 0.3590948	total: 23.7s	remaining: 14.4s
497:	learn: 0.3590388	total: 23.7s	remaining: 14.4s
498:	learn: 0.3589029	total: 23.8s	remaining: 14.3s
499:	learn: 0.3586356	total: 23.8s	remaining: 14.3s
500:	learn: 0.3582545	total: 23.9s	remaining: 14.2s
501:	learn: 0.3578984	total: 23.9s	remaining: 14.2s
502:	learn: 0.3578248	total: 24s	remaining: 14.1s
503:	learn: 0.3576927	total: 24s	remaining: 14.1s
504:	learn: 0.3576332	total: 24s	remaining: 14s
505:	learn: 0.3576119	total: 24.1s	remaining: 14s
506:	learn: 0.3573829	

648:	learn: 0.3296370	total: 31.4s	remaining: 7.31s
649:	learn: 0.3295390	total: 31.5s	remaining: 7.27s
650:	learn: 0.3293529	total: 31.5s	remaining: 7.22s
651:	learn: 0.3292870	total: 31.6s	remaining: 7.17s
652:	learn: 0.3290919	total: 31.6s	remaining: 7.12s
653:	learn: 0.3287297	total: 31.7s	remaining: 7.07s
654:	learn: 0.3284796	total: 31.7s	remaining: 7.02s
655:	learn: 0.3283902	total: 31.8s	remaining: 6.97s
656:	learn: 0.3281311	total: 31.8s	remaining: 6.92s
657:	learn: 0.3278429	total: 31.8s	remaining: 6.87s
658:	learn: 0.3276882	total: 31.9s	remaining: 6.83s
659:	learn: 0.3275001	total: 31.9s	remaining: 6.78s
660:	learn: 0.3274283	total: 32s	remaining: 6.73s
661:	learn: 0.3273702	total: 32.1s	remaining: 6.68s
662:	learn: 0.3272078	total: 32.1s	remaining: 6.63s
663:	learn: 0.3268987	total: 32.2s	remaining: 6.58s
664:	learn: 0.3268102	total: 32.2s	remaining: 6.54s
665:	learn: 0.3264448	total: 32.3s	remaining: 6.49s
666:	learn: 0.3263462	total: 32.3s	remaining: 6.44s
667:	learn: 0.

<catboost.core.CatBoostClassifier at 0x20c61e0f408>

In [77]:
y_test_hat = model_cb.predict(X_test)
print("Accuracy score: %f" % (accuracy_score(y_test, y_test_hat)))

Accuracy score: 0.798845
