In [20]:
import pandas as pd
import xgboost as xgb
from xgboost import XGBClassifier

from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report


In [2]:
# TD/IDF
tf_idf = pd.read_csv("/home/andrea/Desktop/neotec/data/sentiment_analysis/tf_idf/labelled_tf_idf.csv")
tf_idf.drop('Unnamed: 0',inplace=True,axis = 1)
tf_idf.head(2)

Unnamed: 0,conversation_id,abajo,abatido,abel,abelio,abierto,aborrecido,abrazo,abrir,absoluto,...,yeah,yi,yo,yogurt,youtubir,zapatilla,zapato,zona,zono,primary_label
0,0H9OEQs318oQ55Rn0SvK103578,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.089276,0.0,0.0,0.0,0.0,0.0,0.0,negative
1,0H9OEQs318oQ55Rn0SvK913137,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,neutral


In [3]:
# Create sets
X = tf_idf.drop(columns=["conversation_id","primary_label"], axis = 1)
y = tf_idf["primary_label"]

In [14]:
# Label encoding -> negative (0), neutral(1), positive (2)
le = preprocessing.LabelEncoder()
y_encoded = le.fit_transform(y)
print(list(le.classes_)) 
display(y_encoded)

['negative', 'neutral', 'positive']


array([0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 2, 0, 0, 2, 1, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 2, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0,
       2, 1, 1, 2, 2, 1, 1, 1, 1, 0, 1, 0, 2, 2, 1, 1, 1, 0, 1, 1, 1, 0,
       0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
       1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 2, 2, 0, 1, 1, 1, 1, 1, 1,
       1, 0, 0, 0, 1, 0, 2, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0,
       1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0,
       0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 1, 1, 2, 2, 2, 1, 1, 2, 1, 2, 1, 1, 2, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 2,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1,
       0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,

In [16]:
# Divide data in training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.1)

In [17]:
# Convert dataset into DMatrix
#data_dmatrix = xgb.DMatrix(data=X,label=y_encoded)
xg_train = xgb.DMatrix(X_train, label=y_train)
xg_test = xgb.DMatrix(X_test, label=y_test)

In [25]:
model = XGBClassifier(
    objective= 'multi:softprob',
    nthread=4
)

parameters = {
    'max_depth': range (2, 10, 1),
    'n_estimators': range(60, 220, 40),
    'learning_rate': [0.1, 0.01, 0.05]
}


clf = GridSearchCV(model, parameters, n_jobs=-1, scoring='balanced_accuracy', verbose=2, refit=True)

clf.fit(X_train, y_train)

# Best parameters
best_parameter = clf.best_params_ 
best_score = clf.best_score_ 
best_estimator = clf.best_estimator_

print(best_score)

Fitting 5 folds for each of 96 candidates, totalling 480 fits
[CV] END ....learning_rate=0.1, max_depth=2, n_estimators=60; total time= 1.5min
[CV] END ....learning_rate=0.1, max_depth=2, n_estimators=60; total time= 1.5min
[CV] END ....learning_rate=0.1, max_depth=2, n_estimators=60; total time= 1.6min
[CV] END ....learning_rate=0.1, max_depth=2, n_estimators=60; total time= 1.6min
[CV] END ....learning_rate=0.1, max_depth=2, n_estimators=60; total time= 2.0min
[CV] END ...learning_rate=0.1, max_depth=2, n_estimators=100; total time= 2.3min
[CV] END ...learning_rate=0.1, max_depth=2, n_estimators=100; total time= 2.5min
[CV] END ...learning_rate=0.1, max_depth=2, n_estimators=100; total time= 3.0min
[CV] END ...learning_rate=0.1, max_depth=2, n_estimators=100; total time= 3.0min
[CV] END ...learning_rate=0.1, max_depth=2, n_estimators=100; total time= 3.2min
[CV] END ...learning_rate=0.1, max_depth=2, n_estimators=140; total time= 2.9min
[CV] END ...learning_rate=0.1, max_depth=2, n_e

In [26]:
print(best_parameter)

{'learning_rate': 0.1, 'max_depth': 8, 'n_estimators': 140}


In [27]:
print(best_estimator)

XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, feature_types=None, gamma=0, gpu_id=-1,
              grow_policy='depthwise', importance_type=None,
              interaction_constraints='', learning_rate=0.1, max_bin=256,
              max_cat_threshold=64, max_cat_to_onehot=4, max_delta_step=0,
              max_depth=8, max_leaves=0, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=140, n_jobs=4, nthread=4,
              num_parallel_tree=1, objective='multi:softprob', ...)
