In [1]:
# Helpers: filesystem paths, model serialization, timing, timestamps, JSON output
import os
import pickle
from time import time
from datetime import datetime
from json import dumps

# Data handling
import pandas as pd

# Preprocessing and transformation
from sklearn.preprocessing import MaxAbsScaler

# Machine learning: text vectorization, pipeline assembly, model selection, classifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

# Metrics
from sklearn.metrics import classification_report

# Seed handed to the classifier as `random_state` when the pipeline is built.
seed = 42

In [5]:
# Variables
# Dataset folder. It can be overridden with the ATENDIMENTO_DATA_DIR environment
# variable so the notebook runs on other machines; when the variable is not set
# the original author's local folder is used, so existing setups are unaffected.
base_path = os.environ.get(
    'ATENDIMENTO_DATA_DIR',
    'C:/Users/arthu/Desktop/22032020 - Experimentos/05. Organizado/03. Datasets/Atendimento-Unbalanced-Binary'
)
save_path = 'output'

x_train_file = 'X_train.csv'
y_train_file = 'y_train.csv'
x_test_file = 'X_test.csv'
y_test_file = 'y_test.csv'

label = 'Atendimento'  # name of the target column read from the y_* frames
data = 'req-text'      # name of the text column used as model input


def load_split(file_name):
    """Read one dataset split from ``base_path``.

    Parameters
    ----------
    file_name : str
        Name of a ';'-separated, UTF-8 encoded CSV file inside ``base_path``.

    Returns
    -------
    pandas.DataFrame
        The parsed file.
    """
    return pd.read_csv(os.path.join(base_path, file_name), sep=';', encoding='utf-8')


# Load data
X_train = load_split(x_train_file)
y_train = load_split(y_train_file)
X_test = load_split(x_test_file)
y_test = load_split(y_test_file)

# Checking on data: available columns, training shape and class balance per split
print(X_train.columns)
print(X_train.shape)
print(y_train[label].value_counts())
print(y_test[label].value_counts())

Index(['pid', 'req-text', 'resp-text', '1funct-request', '2pronoun-request',
       '3ppron-request', '4i-request', '5we-request', '6you-request',
       '7shehe-request',
       ...
       '58home-response', '59money-response', '60relig-response',
       '61death-response', '62assent-response', '63nonfl-response',
       '64filler-response', 'Clareza', 'Atendimento', 'tempo_resposta'],
      dtype='object', length=134)
(31586, 134)
2    21682
0     9904
Name: Atendimento, dtype: int64
2    9338
0    4200
Name: Atendimento, dtype: int64


In [12]:
# Build the word-level text classification pipeline and wrap it in a grid search.

# Fixed settings for this run, kept in a single dict so they can be logged as-is.
params = dict(
    vect_analyzer='word',
    vect_max_features=None,
    vect_min_df=1,
    vect_max_df=0.75,
    vect_ngram_range=(1, 2),
    tfidf_use_idf=True,
    tf_idf_norm='l2',
    clf_solver='lbfgs',
    clf_max_iter=10000,
    gs_cv=10,
    gs_scoring='f1_macro',
)

# Step 1: token counts over the configured n-gram range.
word_vectorizer = CountVectorizer(
    analyzer=params['vect_analyzer'],
    max_features=params['vect_max_features'],
    min_df=params['vect_min_df'],
    max_df=params['vect_max_df'],
    ngram_range=params['vect_ngram_range'],
)

# Step 2: TF-IDF re-weighting of the counts.
tfidf_weighting = TfidfTransformer(
    use_idf=params['tfidf_use_idf'],
    norm=params['tf_idf_norm'],
)

# Step 3: class-weighted logistic regression seeded with the notebook-wide seed.
classifier = LogisticRegression(
    C=1,
    class_weight='balanced',
    solver=params['clf_solver'],
    max_iter=params['clf_max_iter'],
    random_state=seed,
    n_jobs=6,
)

pipelineWord = Pipeline(steps=[
    ('vect', word_vectorizer),
    ('tfidf', tfidf_weighting),
    # ('scaler', MaxAbsScaler()),  # scaling step, currently disabled
    ('clf', classifier),
])

# Search space for the grid search. Every entry is commented out, so the grid is
# empty. NOTE(review): with an empty grid the search presumably just
# cross-validates the fixed pipeline above - confirm against the scikit-learn docs.
gs_parameters = {
    # 'vect__min_df': (1, 0.1, 0.25),
    # 'vect__max_df': (1.0, 0.75),
    # 'clf__C': (0.1, 1, 10, 100)
}

# Define grid search
grid_search_word = GridSearchCV(
    estimator=pipelineWord,
    param_grid=gs_parameters,
    scoring=params['gs_scoring'],
    cv=params['gs_cv'],
    n_jobs=6,
    verbose=10,
)

In [13]:
# Fit the grid search on the training text, then report timing, best CV score
# and the selected value of every searched parameter.
print("Executando Gridsearch para Features Word - Classe Atendimento")

# Run timestamp in the form YYYY_MM_DD_HH_MM_SS (seconds resolution); the
# model-saving cell prefixes the pickle file name with it.
now = datetime.now().strftime('%Y_%m_%d_%H_%M_%S')

t0 = time()
grid_search_word.fit(X_train[data], y_train[label])
elapsed_seconds = time() - t0

print("done in %0.3fs" % elapsed_seconds)
print("Best score: %0.3f" % grid_search_word.best_score_)
print("Best parameters set:")

# Full parameter mapping of the winning pipeline; only the searched keys are shown.
best_parameters = grid_search_word.best_estimator_.get_params()
for param_name in sorted(gs_parameters):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Executando Gridsearch para Features Word - Classe Atendimento
Fitting 10 folds for each of 24 candidates, totalling 240 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done   1 tasks      | elapsed:   23.1s
[Parallel(n_jobs=6)]: Done   6 tasks      | elapsed:   24.0s
[Parallel(n_jobs=6)]: Done  13 tasks      | elapsed:   49.0s
[Parallel(n_jobs=6)]: Done  20 tasks      | elapsed:  1.0min
[Parallel(n_jobs=6)]: Done  29 tasks      | elapsed:  1.3min
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:  1.8min
[Parallel(n_jobs=6)]: Done  49 tasks      | elapsed:  2.1min
[Parallel(n_jobs=6)]: Done  60 tasks      | elapsed:  2.5min
[Parallel(n_jobs=6)]: Done  73 tasks      | elapsed:  3.5min
[Parallel(n_jobs=6)]: Done  86 tasks      | elapsed:  3.9min
[Parallel(n_jobs=6)]: Done 101 tasks      | elapsed:  5.1min
[Parallel(n_jobs=6)]: Done 116 tasks      | elapsed:  5.5min
[Parallel(n_jobs=6)]: Done 133 tasks      | elapsed:  8.0min
[Parallel(n_jobs=6)]: Done 150 tasks      | elapsed:  8.6min
[Parallel(n_jobs=6)]: Done 169 tasks      | elapsed: 11.0min
[Parallel(

done in 1312.159s
Best score: 0.644
Best parameters set:
	clf__C: 1
	vect__max_df: 0.75
	vect__min_df: 1


In [10]:
# Persist the fitted grid search and append this run's configuration to the log.

# Make sure the output folder exists: open() does not create missing
# directories, so a fresh checkout would otherwise fail here.
os.makedirs(save_path, exist_ok=True)

# Saving Model
# The file name is prefixed with the run timestamp computed in the fitting cell.
f_save = now + '_Atendimento_Solicitacao_Binary_Unbalanced.sav'
# Use a context manager so the file handle is flushed and closed even if
# pickling raises. NOTE: only unpickle .sav files that come from a trusted source.
with open(os.path.join(save_path, f_save), 'wb') as model_file:
    pickle.dump(grid_search_word, model_file)

# Saving Parameters
# Append mode keeps the history of previous runs in the same params.txt file.
with open(os.path.join(save_path, 'params.txt'),'a') as f:
    f.write('\n\n' + ('#'*60))
    f.write('\n'+f_save + '\n\n')
    f.write('Parameters:\n')
    f.write(dumps(params) + '\n')
    f.write('\nGridSearch Best Parameters:\n')
    # One "name: value" line per searched parameter, in sorted order.
    for param_name in sorted(gs_parameters.keys()):
        f.write("%s: %r" % (param_name, best_parameters[param_name]) + '\n')

# Validation

In [14]:
# Predictions
# Run the fitted grid search over the text column of each split, train first.
y_pred_train, y_pred_test = (
    grid_search_word.predict(split[data]) for split in (X_train, X_test)
)

In [15]:
# Print a classification report for each split under a banner-framed title.
banner = '#'*50
for title, y_true, y_pred in (
    ('Report for TRAIN', y_train[label], y_pred_train),
    ('Report for TEST', y_test[label], y_pred_test),
):
    print(banner)
    print(title)
    print(banner)
    print(classification_report(y_true, y_pred))

##################################################
Report for TRAIN
##################################################
              precision    recall  f1-score   support

           0       0.73      0.91      0.81      9904
           2       0.95      0.85      0.90     21682

    accuracy                           0.86     31586
   macro avg       0.84      0.88      0.85     31586
weighted avg       0.88      0.86      0.87     31586

##################################################
Report for TEST
##################################################
              precision    recall  f1-score   support

           0       0.48      0.63      0.55      4200
           2       0.81      0.69      0.75      9338

    accuracy                           0.67     13538
   macro avg       0.64      0.66      0.65     13538
weighted avg       0.71      0.67      0.68     13538



In [11]:
# Append the train and test classification reports to the run log (params.txt).
divider = '#'*60
with open(os.path.join(save_path, 'params.txt'),'a') as f:
    for heading, targets, predictions in (
        ('\nReport for TRAIN', y_train, y_pred_train),
        ('\nReport for TEST', y_test, y_pred_test),
    ):
        # Section header framed by divider lines, followed by the report text.
        f.write('\n\n' + divider)
        f.write(heading)
        f.write('\n' + divider)
        f.write('\n' + classification_report(targets[label], predictions))