In [1]:
# Utilities
import os
import pickle
from time import time
from datetime import datetime

# Data handling
import pandas as pd


# Machine learning
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
# Metrics
from sklearn.metrics import classification_report

# Fixed random seed; passed as random_state to the classifier defined below
seed = 42

In [9]:
#Variables
# Folder holding the train/test CSV splits. The default is the original local
# path; set the ATENDIMENTO_DATA_DIR environment variable to run the notebook
# on another machine without editing this cell.
base_path = os.environ.get(
    'ATENDIMENTO_DATA_DIR',
    'C:/Users/arthu/Desktop/22032020 - Experimentos/05. Organizado/03. Datasets/Atendimento-Balanced-Multiclass',
)
save_path = 'output'

# Name of the target column inside the y_* files
label = 'Atendimento'

x_train_file = 'X_train.csv'
y_train_file = 'y_train.csv'
x_test_file = 'X_test.csv'
y_test_file = 'y_test.csv'


def _read_split(file_name):
    """Read one ';'-separated, UTF-8 encoded CSV file from ``base_path``.

    Parameters
    ----------
    file_name : str
        File name relative to ``base_path``.

    Returns
    -------
    pandas.DataFrame
        The parsed file contents.
    """
    return pd.read_csv(os.path.join(base_path, file_name), sep=';', encoding='utf-8')


#Load data
X_train = _read_split(x_train_file)
y_train = _read_split(y_train_file)
X_test = _read_split(x_test_file)
y_test = _read_split(y_test_file)


#Checking on data
print(X_train.columns)
print(X_train.shape)

# Class distribution of the target in each split
print(y_train[label].value_counts())
print(y_test[label].value_counts())

Index(['pid', 'req-text', 'resp-text', '1funct-request', '2pronoun-request',
       '3ppron-request', '4i-request', '5we-request', '6you-request',
       '7shehe-request',
       ...
       '58home-response', '59money-response', '60relig-response',
       '61death-response', '62assent-response', '63nonfl-response',
       '64filler-response', 'Clareza', 'Atendimento', 'tempo_resposta'],
      dtype='object', length=134)
(6982, 134)
0    2367
1    2319
2    2296
Name: Atendimento, dtype: int64
2    1029
1    1006
0     958
Name: Atendimento, dtype: int64


In [10]:
# Define pipeline and GridSearch CV
# The pipeline has a single step: a class-weight-balanced logistic regression.
# NOTE(review): n_jobs=6 is set on both the estimator and the grid search below;
# confirm that this nested parallelism is intended.
pipelineClf = Pipeline(steps=[
    ('clf', LogisticRegression(class_weight='balanced', n_jobs=6, random_state=seed)),
])

# Candidate values for the classifier's C parameter (powers of ten, 1e-5 .. 10)
parameters = {'clf__C': (1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10)}

# Define grid search: 10-fold cross-validation scored by macro-averaged F1
grid_search_tempo = GridSearchCV(
    estimator=pipelineClf,
    param_grid=parameters,
    scoring='f1_macro',
    cv=10,
    n_jobs=6,
    verbose=10,
)

In [11]:
print("Executando Gridsearch para baseline tempo - Classe Atendimento")

# Run timestamp formatted as YYYY_MM_DD_HH_MM_SS (seconds resolution)
now = datetime.now().strftime('%Y_%m_%d_%H_%M_%S')
print(now)

# Fit the search using only the 'tempo_resposta' column as predictor and time it
t0 = time()
grid_search_tempo.fit(X_train[['tempo_resposta']], y_train[label])
elapsed = time() - t0
print("done in %0.3fs" % elapsed)

# Report the best cross-validated score and the winning value of every tuned parameter
print("Best score: %0.3f" % grid_search_tempo.best_score_)
print("Best parameters set:")
best_parameters = grid_search_tempo.best_estimator_.get_params()
for tuned_name in sorted(parameters):
    print("\t%s: %r" % (tuned_name, best_parameters[tuned_name]))

Executando Gridsearch para baseline tempo - Classe Atendimento
2020_05_19_23_56_04
Fitting 10 folds for each of 7 candidates, totalling 70 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done   1 tasks      | elapsed:    0.1s
[Parallel(n_jobs=6)]: Batch computation too fast (0.1890s.) Setting batch_size=2.
[Parallel(n_jobs=6)]: Done   6 tasks      | elapsed:    0.2s
[Parallel(n_jobs=6)]: Done  14 tasks      | elapsed:    1.0s
[Parallel(n_jobs=6)]: Done  28 tasks      | elapsed:    1.5s
[Parallel(n_jobs=6)]: Done  46 tasks      | elapsed:    2.3s
[Parallel(n_jobs=6)]: Done  59 out of  70 | elapsed:    2.8s remaining:    0.4s


done in 3.279s
Best score: 0.301
Best parameters set:
	clf__C: 0.0001


[Parallel(n_jobs=6)]: Done  67 out of  70 | elapsed:    3.0s remaining:    0.0s
[Parallel(n_jobs=6)]: Done  70 out of  70 | elapsed:    3.1s finished


In [12]:
# Saving Model
# Create the output folder on first use so that open() below cannot fail
# with FileNotFoundError on a fresh checkout.
os.makedirs(save_path, exist_ok=True)

f_save = now + '_Atendimento_Baseline_Tempo_Multiclass_Balanced.sav'
# Use a context manager so the pickle file is flushed and closed deterministically
with open(os.path.join(save_path, f_save), 'wb') as model_file:
    pickle.dump(grid_search_tempo, model_file)

# Saving Parameters
# Append the model file name followed by one "name: value" line per tuned parameter.
with open(os.path.join(save_path, 'params.txt'), 'a') as f:
    f.write(f_save + '\n\n')
    # Join with newlines so that several tuned parameters do not run together
    # on one line (output is unchanged when only one parameter is tuned).
    f.write('\n'.join(
        "%s: %r" % (param_name, best_parameters[param_name])
        for param_name in sorted(parameters.keys())
    ))

# Validation

In [13]:
# Predictions
# Predict on the train and test splits (in that order) from the same single
# 'tempo_resposta' column that the grid search was fitted on.
y_pred_train, y_pred_test = (
    grid_search_tempo.predict(split[['tempo_resposta']])
    for split in (X_train, X_test)
)

In [14]:
# Print a banner-framed classification report for each split, TRAIN first
banner = '#'*50
for heading, y_true, y_hat in (
    ('Report for TRAIN', y_train[label], y_pred_train),
    ('Report for TEST', y_test[label], y_pred_test),
):
    print(banner)
    print(heading)
    print(banner)
    print(classification_report(y_true, y_hat))

##################################################
Report for TRAIN
##################################################
              precision    recall  f1-score   support

           0       0.31      0.08      0.13      2367
           1       0.34      0.34      0.34      2319
           2       0.35      0.61      0.44      2296

    accuracy                           0.34      6982
   macro avg       0.33      0.34      0.30      6982
weighted avg       0.33      0.34      0.30      6982

##################################################
Report for TEST
##################################################
              precision    recall  f1-score   support

           0       0.28      0.08      0.13       958
           1       0.36      0.35      0.35      1006
           2       0.38      0.64      0.47      1029

    accuracy                           0.36      2993
   macro avg       0.34      0.35      0.32      2993
weighted avg       0.34      0.36      0.32      2993



In [8]:
# Append the TRAIN and TEST classification reports to the shared params log.
# The target column is taken from `label` (as in the cells that fit and print
# the model) instead of being hard-coded as an attribute name.
report_sections = (
    ('\nReport for TRAIN', y_train[label], y_pred_train),
    ('\nReport for TEST', y_test[label], y_pred_test),
)

# Make sure the output folder exists before appending to the log
os.makedirs(save_path, exist_ok=True)
with open(os.path.join(save_path, 'params.txt'),'a') as f:
    for heading, y_true, y_hat in report_sections:
        # Heading framed by two rules of 60 '#', followed by the report text
        f.write('\n\n' + ('#'*60))
        f.write(heading)
        f.write('\n' + ('#'*60))
        f.write('\n' + classification_report(y_true, y_hat))