In [83]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
import xgboost as xgb
import re
import nltk
from nltk.tokenize import TweetTokenizer
import string
import unidecode

import warnings
warnings.filterwarnings('ignore')

stopwords = nltk.corpus.stopwords.words('english')

In [3]:
tweets = pd.read_csv("../data/train.csv")

In [5]:
def concatenate(x,char):
    words = ""
    for word in x:
        if word.startswith(char):
            words = words + word + " "
    return words

def count_vowels(x):
    return (x.count('a') + x.count('e') + x.count('i') + x.count('o') + x.count('u'))

def count_short_words(x):
    count = 0
    words = x.split(' ')
    for word in words:
        if 1 <= len(word) <= 3:
            count += 1
    return count

def count_stopwords(x):
    count = 0
    words = x.split(' ')
    for word in words:
        if word in stopwords:
            count += 1
    return count

In [6]:
tweets["special_chars_count"] =  tweets["text"]
tweets["special_chars_count"] =  tweets["special_chars_count"].str.lower()
tweets["special_chars_count"] = tweets["special_chars_count"].apply(lambda x: re.sub(r'[a-z]','',x))
tweets["special_chars_count"] = tweets["special_chars_count"].str.strip()
tweets["special_chars_count"] = tweets["special_chars_count"].apply(lambda x: re.sub(' +','', x))
tweets["special_chars_count"] = tweets["special_chars_count"].apply(lambda x: re.sub(r'[0-9]','', x))
tweets["special_chars_count"] = tweets["special_chars_count"].str.len()

tweets["hashtags"] = tweets["text"].str.lower().str.split(' ').apply(lambda x: concatenate(x,'#'))
tweets["labels"] = tweets["text"].str.lower().str.split(' ').apply(lambda x: concatenate(x,'@'))
tweets["hashtags_count"] = tweets["hashtags"].str.split(' ').apply(lambda x: len(x))-1
tweets["labels_count"] = tweets["labels"].str.split(' ').apply(lambda x: len(x))-1

tweets["num_chars_count"] = tweets["text"]
tweets["num_chars_count"] =  tweets["num_chars_count"].str.lower()
tweets["num_chars_count"] = tweets["num_chars_count"].apply(lambda x: re.sub(r'[a-z]','',x))
tweets["num_chars_count"] = tweets["num_chars_count"].apply(lambda x: re.sub(r'[^\w]','',x))
tweets["num_chars_count"] = tweets["num_chars_count"].str.strip()
tweets["num_chars_count"] = tweets["num_chars_count"].str.len()

In [7]:
#primero lo haremos con BOW y luego con TFIDF
def remove_punctuation(word):        
    clean_word = ''.join([char for char in word if char not in string.punctuation])
    return clean_word

def cleaning_text(text):
    tokenizer = TweetTokenizer(strip_handles=True, reduce_len=True)
    text_tokenize = tokenizer.tokenize(text)
    wordlist = []
    for word in text_tokenize:
        word = word.lower()
        word = re.sub('(?P<url>https?://[^\s]+)', ' ', word)
        word = remove_punctuation(word)
        word = re.sub(r'[^\w]', ' ', word)
        word = unidecode.unidecode(word)
        word = re.sub(r'[0-9]','', word)
        if((word != '')&(word != ' ')&(word not in stopwords)):
            wordlist.append(word)
    clean_text = ' '.join(wordlist)
    return clean_text

In [10]:
tweets["clean_text"] = tweets["text"].apply(lambda x: cleaning_text(x))
tweets.head()

Unnamed: 0,id,keyword,location,text,target,special_chars_count,hashtags,labels,hashtags_count,labels_count,num_chars_count,clean_text
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,1,#earthquake,,1,0,0,deeds reason earthquake may allah forgive us
1,4,,,Forest fire near La Ronge Sask. Canada,1,1,,,0,0,0,forest fire near la ronge sask canada
2,5,,,All residents asked to 'shelter in place' are ...,1,3,,,0,0,0,residents asked shelter place notified officer...
3,6,,,"13,000 people receive #wildfires evacuation or...",1,2,#wildfires,,1,0,5,people receive wildfires evacuation orders cal...
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,2,#alaska #wildfires,,2,0,0,got sent photo ruby alaska smoke wildfires pou...


In [11]:
tweets["text"] = tweets["text"].str.lower()
tweets["text"] = tweets["text"].apply(lambda x: re.sub('(?P<url>https?://[^\s]+)', ' ', x))
tweets["text"] = tweets["text"].apply(lambda x: re.sub(r'[^\w]', ' ', x))
tweets["text"] = tweets["text"].apply(lambda x: re.sub(r'_', ' ', x))
tweets["text"] = tweets["text"].apply(lambda x: re.sub(r'[0-9]',' ', x))
tweets["text"] = tweets["text"].apply(lambda x: re.sub(' +',' ', x))
tweets["text"] = tweets["text"].apply(lambda x: unidecode.unidecode(x))
tweets["text"] = tweets["text"].str.strip()
tweets["text_length"] = tweets["text"].str.len()

tweets["vowels_count"] = tweets["text"].apply(lambda x: count_vowels(x))
tweets["short_words_count"] = tweets["text"].apply(lambda x: count_short_words(x))
tweets["stopwords_count"] = tweets["text"].apply(lambda x: count_stopwords(x))
tweets["text"] = tweets["text"].apply(lambda x: re.sub(r'\b\w{1}\b', '', x))
tweets["words_count"] = tweets["text"].str.split(' ').apply(lambda x: len(x))

In [24]:
tweets.rename(columns={"target":"target_label"}, inplace=True)
tweets.head()

Unnamed: 0,id,keyword,location,text,target_label,special_chars_count,hashtags,labels,hashtags_count,labels_count,num_chars_count,clean_text,text_length,vowels_count,short_words_count,stopwords_count,words_count
0,1,,,our deeds are the reason of this earthquake ma...,1,1,#earthquake,,1,0,0,deeds reason earthquake may allah forgive us,68,25,7,6,13
1,4,,,forest fire near la ronge sask canada,1,1,,,0,0,0,forest fire near la ronge sask canada,37,13,1,0,7
2,5,,,all residents asked to shelter in place are be...,1,3,,,0,0,0,residents asked shelter place notified officer...,130,45,9,11,22
3,6,,,people receive wildfires evacuation orders in ...,1,2,#wildfires,,1,0,5,people receive wildfires evacuation orders cal...,56,24,1,1,7
4,7,,,just got sent this photo from ruby alaska as s...,1,2,#alaska #wildfires,,2,0,0,got sent photo ruby alaska smoke wildfires pou...,85,25,3,7,16


#### Bag of Words

In [14]:
vectorizer = CountVectorizer(stop_words='english')
df_text = tweets["clean_text"]
X = vectorizer.fit_transform(df_text)
X.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [15]:
feature_words = vectorizer.get_feature_names()
len(feature_words)

14190

In [16]:
df_words = pd.DataFrame(X.toarray(), columns=feature_words)
df_words.head()

Unnamed: 0,aa,aaa,aaaand,aaalll,aaarrrgghhh,aaemiddleaged,aal,aan,aannnd,aar,...,zones,zonewolf,zoom,zotar,zouma,zrnf,zss,zumiez,zurich,zzz
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [19]:
df_filter = df_words.loc[:,(df_words.sum()>5)]
df_filter.shape

(7613, 2080)

In [25]:
tweets_final = pd.concat([tweets,df_filter], axis="columns")
tweets_final.shape

(7613, 2097)

In [26]:
tweets_final.columns

Index(['id', 'keyword', 'location', 'text', 'target_label',
       'special_chars_count', 'hashtags', 'labels', 'hashtags_count',
       'labels_count',
       ...
       'young', 'youre', 'youth', 'youtube', 'youve', 'yr', 'yrs', 'yyc',
       'zombie', 'zone'],
      dtype='object', length=2097)

In [75]:
X = tweets_final.drop(["id","keyword","location","text","target_label","hashtags","labels","clean_text"], axis=1)
y = tweets_final["target_label"]

In [76]:
print(X.shape)
print(y.shape)

(7613, 2087)
(7613,)


In [29]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=100)

In [30]:
#usando las 2087 features
model_xgb = xgb.XGBClassifier()
model_xgb.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
       importance_type='gain', interaction_constraints='',
       learning_rate=0.300000012, max_delta_step=0, max_depth=6,
       min_child_weight=1, missing=nan, monotone_constraints='()',
       n_estimators=100, n_jobs=0, num_parallel_tree=1,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
       validate_parameters=1, verbosity=None)

In [31]:
y_test_hat = model_xgb.predict(X_test)
print("Accuracy score: %f" % (accuracy_score(y_test, y_test_hat)))

Accuracy score: 0.790966


In [32]:
#vamos a reducir las 2087 features a las 500 mas importantes
model_xgb.feature_importances_

array([0.00209026, 0.00159787, 0.00567669, ..., 0.0025696 , 0.        ,
       0.00187095], dtype=float32)

In [33]:
df_feat_importances = pd.DataFrame(model_xgb.feature_importances_, index=X_train.columns, columns=["importancia"]).\
        sort_values(by="importancia",ascending=False)

In [34]:
df_feat_importances.head(20)

Unnamed: 0,importancia
mh,0.014214
body,0.013192
suicide,0.013121
pm,0.012876
killed,0.012422
hiroshima,0.011302
families,0.010191
fatal,0.010156
train,0.009233
wildfire,0.009022


In [41]:
list_fi = df_feat_importances.index[:500].tolist()
X = X.filter(items=list_fi)
X.shape

(7613, 500)

In [43]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=100)

In [44]:
#usando las 500 features
model_xgb = xgb.XGBClassifier()
model_xgb.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
       importance_type='gain', interaction_constraints='',
       learning_rate=0.300000012, max_delta_step=0, max_depth=6,
       min_child_weight=1, missing=nan, monotone_constraints='()',
       n_estimators=100, n_jobs=0, num_parallel_tree=1,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
       validate_parameters=1, verbosity=None)

In [45]:
y_test_hat = model_xgb.predict(X_test)
print("Accuracy score: %f" % (accuracy_score(y_test, y_test_hat)))

Accuracy score: 0.796218


#### Ajustando hiper-parametros:
    - n_estimators=150, colsample_bytree=0.5, learning_rate=0.1, max_depth=7, subsample=1
    SCORE: 0.775210
    - n_estimators=150, colsample_bytree=0.5, learning_rate=0.1, max_depth=11, subsample=1
    SCORE: 0.788340
    - n_estimators=200, colsample_bytree=0.5, learnig_rate=0.1, max_depth=11, subsample=1
    SCORE: 0.794643
    - n_estimators=200, colsample_bytree=0.5, learnig_rate=0.1, max_depth=13, subsample=1
    SCORE: 0.788866
    - n_estimators=200, colsample_bytree=0.7, learnig_rate=0.1, max_depth=11, subsample=1
    SCORE: 0.790441
    - n_estimators=250, colsample_bytree=0.5, learnig_rate=0.1, max_depth=11, subsample=1
    SCORE: 0.796744
    - n_estimators=300, colsample_bytree=0.5, learnig_rate=0.1, max_depth=11, subsample=1
    SCORE: 0.795168

In [70]:
model_xgb = xgb.XGBClassifier(n_estimators=300, colsample_bytree=0.5, learning_rate=0.1, max_depth=11)
model_xgb.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=0.5, gamma=0, gpu_id=-1,
       importance_type='gain', interaction_constraints='',
       learning_rate=0.1, max_delta_step=0, max_depth=11,
       min_child_weight=1, missing=nan, monotone_constraints='()',
       n_estimators=300, n_jobs=0, num_parallel_tree=1,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
       validate_parameters=1, verbosity=None)

In [71]:
y_test_hat = model_xgb.predict(X_test)
print("Accuracy score: %f" % (accuracy_score(y_test, y_test_hat)))

Accuracy score: 0.795168


In [72]:
hiper_parametros = {"n_estimators":[200,250],
                   "max_depth":[9,10,11,12],
                   "colsample_bytree":[0.5],
                   "subsample":[1],
                    "n_jobs": [1]}

In [73]:
clasif = GridSearchCV(model_xgb, hiper_parametros, cv=4, scoring='accuracy')
clasif.fit(X_train, y_train)

GridSearchCV(cv=4, error_score='raise-deprecating',
       estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=0.5, gamma=0, gpu_id=-1,
       importance_type='gain', interaction_constraints='',
       learning_rate=0.1, max_delta_step=0, max_depth=11,
       min_child_weight=1, missing=... scale_pos_weight=1, subsample=1, tree_method='exact',
       validate_parameters=1, verbosity=None),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_estimators': [200, 250], 'max_depth': [9, 10, 11, 12], 'colsample_bytree': [0.5], 'subsample': [1], 'n_jobs': [1]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [74]:
print(clasif.best_estimator_)
print(clasif.best_score_)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=0.5, gamma=0, gpu_id=-1,
       importance_type='gain', interaction_constraints='',
       learning_rate=0.1, max_delta_step=0, max_depth=11,
       min_child_weight=1, missing=nan, monotone_constraints='()',
       n_estimators=250, n_jobs=1, num_parallel_tree=1,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
       validate_parameters=1, verbosity=None)
0.7863023296549309


In [77]:
#usando 700 features
list_fi = df_feat_importances.index[:700].tolist()
X = X.filter(items=list_fi)
X.shape

(7613, 700)

In [78]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=100)

In [79]:
model_xgb = xgb.XGBClassifier()
model_xgb.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
       importance_type='gain', interaction_constraints='',
       learning_rate=0.300000012, max_delta_step=0, max_depth=6,
       min_child_weight=1, missing=nan, monotone_constraints='()',
       n_estimators=100, n_jobs=0, num_parallel_tree=1,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
       validate_parameters=1, verbosity=None)

In [80]:
y_test_hat = model_xgb.predict(X_test)
print("Accuracy score: %f" % (accuracy_score(y_test, y_test_hat)))

Accuracy score: 0.792017


In [81]:
model_xgb = xgb.XGBClassifier(n_estimators=250, colsample_bytree=0.5, learning_rate=0.1, max_depth=11)
model_xgb.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=0.5, gamma=0, gpu_id=-1,
       importance_type='gain', interaction_constraints='',
       learning_rate=0.1, max_delta_step=0, max_depth=11,
       min_child_weight=1, missing=nan, monotone_constraints='()',
       n_estimators=250, n_jobs=0, num_parallel_tree=1,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
       validate_parameters=1, verbosity=None)

In [82]:
y_test_hat = model_xgb.predict(X_test)
print("Accuracy score: %f" % (accuracy_score(y_test, y_test_hat)))

Accuracy score: 0.796218


#### TF-IDF

In [85]:
vectorizer = TfidfVectorizer(stop_words="english")
X = vectorizer.fit_transform(tweets["clean_text"])
X.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [86]:
feature_words = vectorizer.get_feature_names()
len(feature_words)

14190

In [87]:
df_words = pd.DataFrame(X.toarray(), columns=feature_words)
df_words.head()

Unnamed: 0,aa,aaa,aaaand,aaalll,aaarrrgghhh,aaemiddleaged,aal,aan,aannnd,aar,...,zones,zonewolf,zoom,zotar,zouma,zrnf,zss,zumiez,zurich,zzz
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [90]:
df_filter = df_words.loc[:,(df_words.sum()>2)]
df_filter.shape

(7613, 2072)

In [91]:
tweets_final = pd.concat([tweets,df_filter], axis="columns")
tweets_final.shape

(7613, 2089)

In [92]:
X = tweets_final.drop(["id","keyword","location","text","target_label","hashtags","labels","clean_text"], axis=1)
y = tweets_final["target_label"]

In [93]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=100)

In [94]:
#usando las 2081 features
model_xgb = xgb.XGBClassifier()
model_xgb.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
       importance_type='gain', interaction_constraints='',
       learning_rate=0.300000012, max_delta_step=0, max_depth=6,
       min_child_weight=1, missing=nan, monotone_constraints='()',
       n_estimators=100, n_jobs=0, num_parallel_tree=1,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
       validate_parameters=1, verbosity=None)

In [95]:
y_test_hat = model_xgb.predict(X_test)
print("Accuracy score: %f" % (accuracy_score(y_test, y_test_hat)))

Accuracy score: 0.780462


In [96]:
df_feat_importances = pd.DataFrame(model_xgb.feature_importances_, index=X_train.columns, columns=["importancia"]).\
        sort_values(by="importancia",ascending=False)

In [97]:
df_feat_importances.head(20)

Unnamed: 0,importancia
california,0.014099
suicide,0.013728
fatal,0.012141
killed,0.011885
warning,0.010431
searching,0.010284
mh,0.010246
hiroshima,0.010021
pm,0.009229
police,0.007856


In [98]:
list_fi = df_feat_importances.index[:500].tolist()
X = X.filter(items=list_fi)
X.shape

(7613, 500)

In [99]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=100)

In [100]:
#usando las 500 features
model_xgb = xgb.XGBClassifier()
model_xgb.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
       importance_type='gain', interaction_constraints='',
       learning_rate=0.300000012, max_delta_step=0, max_depth=6,
       min_child_weight=1, missing=nan, monotone_constraints='()',
       n_estimators=100, n_jobs=0, num_parallel_tree=1,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
       validate_parameters=1, verbosity=None)

In [101]:
y_test_hat = model_xgb.predict(X_test)
print("Accuracy score: %f" % (accuracy_score(y_test, y_test_hat)))

Accuracy score: 0.779937


In [102]:
model_xgb = xgb.XGBClassifier(n_estimators=200, colsample_bytree=0.5, learning_rate=0.1, max_depth=11)
model_xgb.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=0.5, gamma=0, gpu_id=-1,
       importance_type='gain', interaction_constraints='',
       learning_rate=0.1, max_delta_step=0, max_depth=11,
       min_child_weight=1, missing=nan, monotone_constraints='()',
       n_estimators=200, n_jobs=0, num_parallel_tree=1,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
       validate_parameters=1, verbosity=None)

In [103]:
y_test_hat = model_xgb.predict(X_test)
print("Accuracy score: %f" % (accuracy_score(y_test, y_test_hat)))

Accuracy score: 0.792542


In [104]:
model_xgb = xgb.XGBClassifier(n_estimators=250, colsample_bytree=0.5, learning_rate=0.1, max_depth=11)
model_xgb.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=0.5, gamma=0, gpu_id=-1,
       importance_type='gain', interaction_constraints='',
       learning_rate=0.1, max_delta_step=0, max_depth=11,
       min_child_weight=1, missing=nan, monotone_constraints='()',
       n_estimators=250, n_jobs=0, num_parallel_tree=1,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
       validate_parameters=1, verbosity=None)

In [105]:
y_test_hat = model_xgb.predict(X_test)
print("Accuracy score: %f" % (accuracy_score(y_test, y_test_hat)))

Accuracy score: 0.795168


In [106]:
hiper_parametros = {"n_estimators":[250],
                   "max_depth":[9,11,13],
                   "colsample_bytree":[0.5,0.7],
                   "learning_rate":[0.1],
                    "n_jobs": [1]}

In [107]:
clasif = GridSearchCV(model_xgb, hiper_parametros, cv=4, scoring='accuracy')
clasif.fit(X_train, y_train)

GridSearchCV(cv=4, error_score='raise-deprecating',
       estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=0.5, gamma=0, gpu_id=-1,
       importance_type='gain', interaction_constraints='',
       learning_rate=0.1, max_delta_step=0, max_depth=11,
       min_child_weight=1, missing=... scale_pos_weight=1, subsample=1, tree_method='exact',
       validate_parameters=1, verbosity=None),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_estimators': [250], 'max_depth': [9, 11, 13], 'colsample_bytree': [0.5, 0.7], 'learning_rate': [0.1], 'n_jobs': [1]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [108]:
print(clasif.best_estimator_)
print(clasif.best_score_)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=0.7, gamma=0, gpu_id=-1,
       importance_type='gain', interaction_constraints='',
       learning_rate=0.1, max_delta_step=0, max_depth=13,
       min_child_weight=1, missing=nan, monotone_constraints='()',
       n_estimators=250, n_jobs=1, num_parallel_tree=1,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
       validate_parameters=1, verbosity=None)
0.7834997372569626
