In [1]:
# Utilities
import os
import pickle
from time import time
from datetime import datetime
from json import dumps

# Data
import pandas as pd

#preprocessing and transformation
from sklearn.preprocessing import StandardScaler,MinMaxScaler,MaxAbsScaler

#Machine learning
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

#Metrics
from sklearn.metrics import classification_report

# Fixed random seed; passed as `random_state` to the LogisticRegression classifier below.
seed = 42

In [5]:
# Variables
# Directory holding the four CSV splits. It defaults to the original local folder;
# set the COH_METRIX_DATA_DIR environment variable to run the notebook on another
# machine without editing this cell.
base_path = os.environ.get(
    'COH_METRIX_DATA_DIR',
    'C:/Users/arthu/Desktop/22032020 - Experimentos/05. Organizado/03. Datasets/Clareza-Balanced-Multiclass-COH-METRIX',
)
# Directory that receives the pickled model and the params.txt run log.
save_path = 'output'

x_train_file = 'X_train.csv'
y_train_file = 'y_train.csv'
x_test_file = 'X_test.csv'
y_test_file = 'y_test.csv'

# Name of the target column read from the y_* frames.
label = 'Clareza'


def _read_split(file_name):
    """Read one CSV split from ``base_path``.

    All splits share the same format: ';' as field separator, UTF-8 encoding
    and '.' as decimal mark.

    Parameters
    ----------
    file_name : str
        File name relative to ``base_path``.

    Returns
    -------
    pandas.DataFrame
        The parsed split.
    """
    return pd.read_csv(os.path.join(base_path, file_name), sep=';', encoding='utf-8', decimal='.')


# Load data
X_train = _read_split(x_train_file)
y_train = _read_split(y_train_file)
X_test = _read_split(x_test_file)
y_test = _read_split(y_test_file)

# Fail fast with a clear message instead of a late error during fit/report.
for _split_name, _features, _targets in (('train', X_train, y_train), ('test', X_test, y_test)):
    if label not in _targets.columns:
        raise KeyError("label column %r not found in the %s targets" % (label, _split_name))
    if len(_features) != len(_targets):
        raise ValueError(
            "%s split is misaligned: %d feature rows vs %d target rows"
            % (_split_name, len(_features), len(_targets))
        )

# Checking on data
print(X_train.columns)
print(X_train.shape)
print(y_train[label].value_counts())
print(y_test[label].value_counts())

Index(['pid', 'req-text', 'resp-text', '1funct-request', '2pronoun-request',
       '3ppron-request', '4i-request', '5we-request', '6you-request',
       '7shehe-request',
       ...
       'sentence_length_min', 'sentence_length_standard_deviation',
       'short_sentence_ratio', 'std_noun_phrase', 'verb_diversity',
       'verbs_max', 'verbs_min', 'verbs_standard_deviation',
       'long_sentence_ratio', 'tempo_resposta'],
      dtype='object', length=310)
(4832, 310)
0    1628
1    1623
2    1581
Name: Clareza, dtype: int64
2    720
1    678
0    673
Name: Clareza, dtype: int64


In [4]:
#type(X_train.adjective_ratio.iloc[0])
#type(X_train.flesch.iloc[0])
#X_train.to_clipboard(excel=True)

In [6]:
# Define pipeline and GridSearch CV

# Experiment settings collected in one mapping so the whole configuration can be
# inspected (and logged) in a single place.
params = dict(
    scaler=MaxAbsScaler(),
    clf_solver='lbfgs',
    clf_max_iter=10000,
    gs_cv=10,
    gs_scoring='f1_macro',
)

# Classifier configured from the settings above.
# NOTE(review): n_jobs=6 is set both here and on the grid search below; confirm
# that the nested parallelism is intended.
log_reg = LogisticRegression(
    random_state=seed,
    n_jobs=6,
    solver=params['clf_solver'],
    max_iter=params['clf_max_iter'],
)

# Two-step pipeline: feature scaling followed by the classifier.
pipelineWord = Pipeline(steps=[
    ('scaler', params['scaler']),
    ('clf', log_reg),
])

# Only the regularisation parameter C of the 'clf' step is searched.
gs_parameters = {
    'clf__C': (1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100),
}

# Define grid search
grid_search_word = GridSearchCV(
    estimator=pipelineWord,
    param_grid=gs_parameters,
    cv=params['gs_cv'],
    scoring=params['gs_scoring'],
    n_jobs=6,
    verbose=10,
)

In [14]:
# Column window applied identically to the train and test feature frames:
# skip the first 3 + 128 columns and drop the last one.
# NOTE(review): the 3 presumably covers 'pid', 'req-text' and 'resp-text', and the
# 128 the '*-request' features; the dropped last column looks like 'tempo_resposta'.
# Confirm against the dataset schema.
feature_window = slice(3 + 128, -1)

# Upper-case names hold the training split, lower-case names the test split.
X, Y = X_train.iloc[:, feature_window], y_train[label]
x, y = X_test.iloc[:, feature_window], y_test[label]

In [23]:
print("Executando Gridsearch para Respostas - Multiclass COH-METRIX - Classe Clareza")

# Run timestamp formatted as YYYY_MM_DD_HH_MM_SS; later reused as the prefix of
# the saved model's file name.
now = datetime.now().strftime('%Y_%m_%d_%H_%M_%S')
print(now)

# Time the full cross-validated search on the training split.
t0 = time()
grid_search_word.fit(X, Y)
elapsed_seconds = time() - t0

print("done in %0.3fs" % elapsed_seconds)
print("Best score: %0.3f" % grid_search_word.best_score_)
print("Best parameters set:")
# Full parameter dict of the winning pipeline; only the searched keys are shown.
best_parameters = grid_search_word.best_estimator_.get_params()
for param_name in sorted(gs_parameters):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Executando Gridsearch para Respostas - Multiclass COH-METRIX - Classe Clareza
2020_05_20_00_25_51
Fitting 10 folds for each of 9 candidates, totalling 90 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done   1 tasks      | elapsed:    0.1s
[Parallel(n_jobs=6)]: Batch computation too fast (0.1990s.) Setting batch_size=2.
[Parallel(n_jobs=6)]: Done   6 tasks      | elapsed:    0.1s
[Parallel(n_jobs=6)]: Done  14 tasks      | elapsed:    0.6s
[Parallel(n_jobs=6)]: Done  28 tasks      | elapsed:    1.0s
[Parallel(n_jobs=6)]: Done  46 tasks      | elapsed:    2.0s
[Parallel(n_jobs=6)]: Batch computation too slow (2.4073s.) Setting batch_size=1.
[Parallel(n_jobs=6)]: Done  64 tasks      | elapsed:    6.5s
[Parallel(n_jobs=6)]: Done  90 out of  90 | elapsed:   19.3s finished


done in 19.758s
Best score: 0.387
Best parameters set:
	clf__C: 0.001


In [19]:
# Saving Model
# Create the output directory up front so neither write below fails when the
# notebook is run in a fresh working directory.
os.makedirs(save_path, exist_ok=True)

# Model file name is prefixed with the run timestamp computed in the fit cell.
f_save = now + '_Clareza_Resposta_Multiclass_Balanced_coh_metrix.sav'
# The context manager guarantees the pickle file is flushed and closed even if
# serialisation raises (the bare open() used previously leaked the handle).
with open(os.path.join(save_path, f_save), 'wb') as model_file:
    pickle.dump(grid_search_word, model_file)

# Saving Parameters
# params.txt is an append-only log: each run adds a delimited section.
with open(os.path.join(save_path, 'params.txt'),'a') as f:
    f.write('\n\n' + ('#'*60))
    f.write('\n'+f_save + '\n\n')
    f.write('Parameters:\n')
    # params contains a scaler object, so it is stringified before being passed
    # to dumps; the result is written as a single quoted JSON string.
    f.write(dumps(str(params)) + '\n')
    f.write('\nGridSearch Best Parameters:\n')
    for param_name in sorted(gs_parameters.keys()):
        f.write("%s: %r" % (param_name, best_parameters[param_name]) + '\n')

# Validation

In [20]:
# Predictions
# Score both splits with the fitted grid search: training features first, then
# the held-out test features.
y_pred_train, y_pred_test = grid_search_word.predict(X), grid_search_word.predict(x)

In [21]:
# Print one banner-framed classification report per split (train, then test).
banner = '#'*50
for report_title, true_labels, predicted_labels in (
    ('Report for TRAIN', Y, y_pred_train),
    ('Report for TEST', y, y_pred_test),
):
    print(banner)
    print(report_title)
    print(banner)
    print(classification_report(true_labels, predicted_labels))

##################################################
Report for TRAIN
##################################################
              precision    recall  f1-score   support

           0       0.45      0.48      0.47      1628
           1       0.43      0.43      0.43      1623
           2       0.45      0.42      0.44      1581

    accuracy                           0.45      4832
   macro avg       0.45      0.45      0.44      4832
weighted avg       0.45      0.45      0.44      4832

##################################################
Report for TEST
##################################################
              precision    recall  f1-score   support

           0       0.40      0.47      0.43       673
           1       0.35      0.33      0.34       678
           2       0.39      0.35      0.37       720

    accuracy                           0.38      2071
   macro avg       0.38      0.38      0.38      2071
weighted avg       0.38      0.38      0.38      2071



In [22]:
# Append the train and test classification reports to the run log, each framed
# by a 60-character banner.
log_banner = '#'*60
with open(os.path.join(save_path, 'params.txt'),'a') as log_file:
    for heading, true_labels, predicted_labels in (
        ('\nReport for TRAIN', Y, y_pred_train),
        ('\nReport for TEST', y, y_pred_test),
    ):
        log_file.write('\n\n' + log_banner)
        log_file.write(heading)
        log_file.write('\n' + log_banner)
        log_file.write('\n' + classification_report(true_labels, predicted_labels))