### SCRIPT DE TREINO

In [42]:
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# -*- coding: utf-8 -*-
"""
@author: dib_n
"""
#################################################################
#Imports
#################################################################
import pandas as pd
import numpy as np

# Preprocessing
from sklearn.preprocessing import LabelEncoder, StandardScaler

#Learning
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier

#SearchGrid
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

#Saving 
from sklearn.externals import joblib 

#################################################################
#Prep train/validation set from external file
#################################################################
exec(open('../scripts/dataprep_treino_validacao.py').read())
#################################################################
#Loads
#################################################################
#Importando base de treino
df_train = pd.read_csv('../data/data.csv',index_col=False)
df_valid = pd.read_csv('../data/data_validacao.csv',index_col=False)
# Dropping unneccesary columns
df_train = df_train.drop(['filename'],axis=1)
df_valid = df_valid.drop('filename',axis=1)
#################################################################
#Encoding
#################################################################
class_list = df_train.iloc[:, -1]
encoder = LabelEncoder()
encoder.fit(class_list)
y_train = encoder.transform(class_list)
y_valid = encoder.transform(df_valid.iloc[:,-1])
joblib.dump(encoder,'../models/encoder.pkl')
#################################################################
#Scailing
#################################################################
scaler = StandardScaler()
df = df_train.append(df_valid,ignore_index=True)
scaler.fit(np.array(df.iloc[:, :-1], dtype = float))
X_train = scaler.transform(np.array(df_train.iloc[:, :-1], dtype = float))
X_valid = scaler.transform(np.array(df_valid.iloc[:, :-1], dtype = float))
joblib.dump(scaler,'../models/scaler.pkl')
#################################################################
#Training svm with two different kernels
#################################################################
#Linear
SVClassifier = svm.SVC(kernel='linear')
SVClassifier.fit(X_train,y_train)
print('Accuracy of linear SVM:',SVClassifier.score(X_valid,y_valid))
joblib.dump(SVClassifier,'../models/svclinear.pkl')
#Rbf
##Param grid
Cs = np.arange(0.5,100,0.5)
gammas = [0.001, 0.01, 0.1, 1]

param_grid = {'C':Cs,'gamma':gammas}
#Grid Search
print('Tuning RBF Kernel parameters')
grid_search = GridSearchCV(svm.SVC(kernel='rbf'),param_grid)
grid_search.fit(X_train,y_train)
print('Search grid for RBF returned parameters:')
print(grid_search.best_params_)
#Get model  
SVCrbf = grid_search.best_estimator_
print('Accuracy score of RBF Kernel:',SVCrbf.score(X_valid,y_valid))
joblib.dump(SVCrbf, '../models/svcrbf.pkl') 
##################################################################
#Re-treino com validação
##################################################################
print('Re-training with validation data')
X = np.concatenate((X_train,X_valid))
y = np.concatenate((y_train,y_valid))

SVCrbf = joblib.load('../models/svcrbf.pkl')
SVCrbf.fit(X,y)
#Scoring
scores = cross_val_score(SVCrbf, X, y, cv = 3)
print('Training + Validation score(cross-val)',scores.mean())
##################################
#Paradigma One vs All
##################################
print('Implementing OneVsAll')
#Separando uma coluna para cada target
for target in df['label'].unique():
    df[target] = (df['label']==target).astype(int)
    df_train[target] = (df_train['label']==target).astype(int)
    df_valid[target] = (df_valid['label']==target).astype(int)
#Treinando um modelo para cada classe
models={}
    
models['geral'] = joblib.load('../models/SVCrbf.pkl')
for target in df['label'].unique():
    SVCrbf = svm.SVC(
        kernel='rbf',
        C=3.0,
        gamma=1.0,
        probability=False
    )
    SVCrbf.fit(X_train,df_train[target])
    print("Acuracia para a classe "+target+":",SVCrbf.score(X_valid,df_valid[target]))
    X = np.concatenate((X_train,X_valid))
    y = np.concatenate((df_train[target],df_valid[target]))
    SVCrbf.fit(X,y)
    #Scoring
    scores = cross_val_score(SVCrbf, X, y, cv = 3)
    print('Training + Validation score(cross-val) para classe '+target+':',scores.mean())
    
    SVCrbf = joblib.load('../models/svcrbf.pkl')
    SVCrbf.fit(X,y)
    joblib.dump(SVCrbf,'../models/SVCrbf_'+target+'.pkl')
    models[target]=joblib.load('../models/SVCrbf_'+target+'.pkl')
#Ok, agora vamos criar uma nova base com a predição de todos os modelos
stacking_df_train = pd.DataFrame(columns=list(models.keys()))
stacking_df_valid = pd.DataFrame(columns=list(models.keys()))
for item in models.items():
    label=item[0]
    model = item[1]
    values_train = model.predict(X_train)
    stacking_df_train[label]=values_train
    values_valid = model.predict(X_valid)
    stacking_df_valid[label]=values_valid
stacking_df_train['label']=y_train
stacking_df_valid['label']=y_valid
X_train = stacking_df_train.iloc[:,:-1]
X_valid = stacking_df_valid.iloc[:,:-1]
y_train = stacking_df_train.iloc[:,-1]
y_valid = stacking_df_valid.iloc[:,-1]
X = np.concatenate((X_train,X_valid))
y = np.concatenate((y_train,y_valid))
#Param Grid
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]

# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)

# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]

# Method of selecting samples for training each tree
bootstrap = [True, False]# Create the random grid

random_grid = {'n_estimators': n_estimators,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'bootstrap': bootstrap}
#RandomSearch
# Use the random grid to search for best hyperparameters
rf = RandomForestClassifier()

# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(
    estimator = rf,
    param_distributions = random_grid,
    n_iter = 100,
    random_state=42,
    n_jobs = -1)# Fit the random search model

rf_random.fit(X_train, y_train)
RFC = rf_random.best_estimator_
print('Acuracia para a base de validação:',RFC.Score(X_valid,y_valid))
print('Retreinando com validação...')
RFC.fit(X,y)
#Scoring
scores = cross_val_score(RFC, X, y, cv = 10)
print('Training + Validation score(cross-val):',scores.mean())

Creating training files...
Extract training features...
class a extracted
class b extracted
class c extracted


In [36]:
##################################
#Paradigma One vs All
##################################
#DataFrame com as features extraidas

#Importando base de treino
df_train = pd.read_csv('../data/data.csv',index_col=False)
df_valid = pd.read_csv('../data/data_validacao.csv',index_col=False)
# Dropping unneccesary columns
df_train = df_train.drop(['filename'],axis=1)
df_valid = df_valid.drop('filename',axis=1)
#################################################################
#Encoding
#################################################################
class_list = df_train.iloc[:, -1]
encoder = LabelEncoder()
encoder.fit(class_list)
joblib.dump(encoder,'../models/encoder.pkl')
#################################################################
#Scailing
#################################################################
scaler = StandardScaler()
df = df_train.append(df_valid,ignore_index=True)
scaler.fit(np.array(df.iloc[:, :-1], dtype = float))
X_train = scaler.transform(np.array(df_train.iloc[:, :-1], dtype = float))
X_valid = scaler.transform(np.array(df_valid.iloc[:, :-1], dtype = float))
joblib.dump(scaler,'../models/scaler.pkl')

['../models/scaler.pkl']

In [37]:
#Separando uma coluna para cada target
for target in df['label'].unique():
    df[target] = (df['label']==target).astype(int)
    df_train[target] = (df_train['label']==target).astype(int)
    df_valid[target] = (df_valid['label']==target).astype(int)

In [54]:
#Treinando um modelo para cada classe
models={}
    
models['geral'] = joblib.load('../models/SVCrbf.pkl')
for target in df['label'].unique():
    SVCrbf = svm.SVC(
        kernel='rbf',
        C=3.0,
        gamma=1.0,
        probability=False
    )
    SVCrbf.fit(X_train,df_train[target])
    print("Acuracia para a classe "+target+":",SVCrbf.score(X_valid,df_valid[target]))
    X = np.concatenate((X_train,X_valid))
    y = np.concatenate((df_train[target],df_valid[target]))
    SVCrbf.fit(X,y)
    #Scoring
    scores = cross_val_score(SVCrbf, X, y, cv = 3)
    print('Training + Validation score(cross-val) para classe '+target+':',scores.mean())
    
    SVCrbf = joblib.load('../models/svcrbf.pkl')
    SVCrbf.fit(X,y)
    joblib.dump(SVCrbf,'../models/SVCrbf_'+target+'.pkl')
    models[target]=joblib.load('../models/SVCrbf_'+target+'.pkl')

Acuracia para a classe a: 0.9101123595505618
Training + Validation score(cross-val) para classe a: 0.9101121694962764
Acuracia para a classe b: 0.900749063670412
Training + Validation score(cross-val) para classe b: 0.8944643960457214
Acuracia para a classe c: 0.900749063670412
Training + Validation score(cross-val) para classe c: 0.9028890532043939
Acuracia para a classe d: 0.897003745318352
Training + Validation score(cross-val) para classe d: 0.9008843064565956
Acuracia para a classe h: 0.8867041198501873
Training + Validation score(cross-val) para classe h: 0.89646953197405
Acuracia para a classe m: 0.9026217228464419
Training + Validation score(cross-val) para classe m: 0.8992774950584529
Acuracia para a classe n: 0.8941947565543071
Training + Validation score(cross-val) para classe n: 0.9004813477737664
Acuracia para a classe x: 0.9110486891385767
Training + Validation score(cross-val) para classe x: 0.9065010946312325
Acuracia para a classe 6: 0.9063670411985019
Training + Valid

In [55]:
#Ok, agora vamos criar uma nova base com a predição de todos os modelos
stacking_df_train = pd.DataFrame(columns=list(models.keys()))
stacking_df_valid = pd.DataFrame(columns=list(models.keys()))
for item in models.items():
    label=item[0]
    model = item[1]
    values_train = model.predict(X_train)
    stacking_df_train[label]=values_train
    values_valid = model.predict(X_valid)
    stacking_df_valid[label]=values_valid
stacking_df_train['label']=y_train
stacking_df_valid['label']=y_valid

In [60]:
X_train = stacking_df_train.iloc[:,:-1]
X_valid = stacking_df_valid.iloc[:,:-1]
y_train = stacking_df_train.iloc[:,-1]
y_valid = stacking_df_valid.iloc[:,-1]
X = np.concatenate((X_train,X_valid))
y = np.concatenate((y_train,y_valid))

### Param Grid

In [61]:
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]

# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)

# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]

# Method of selecting samples for training each tree
bootstrap = [True, False]# Create the random grid

random_grid = {'n_estimators': n_estimators,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'bootstrap': bootstrap}

## Training Grid

In [62]:
# Use the random grid to search for best hyperparameters
rf = RandomForestClassifier()

# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(
    estimator = rf,
    param_distributions = random_grid,
    n_iter = 100,
    random_state=42,
    n_jobs = -1)# Fit the random search model

rf_random.fit(X_train, y_train)

RandomizedSearchCV(cv='warn', error_score='raise-deprecating',
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
                                                    n_estimators='warn',
                                                    n_jobs

In [66]:
RFC = rf_random.best_estimator_

In [67]:
RFC.get_params()

{'bootstrap': True,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': 40,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 5,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 400,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [69]:
RFC.score(X_valid,y_valid)

0.9054307116104869

In [4]:
#Importando base de treino
df_train = pd.read_csv('../data/data.csv',index_col=False)
df_valid = pd.read_csv('../data/data_validacao.csv',index_col=False)
# Dropping unneccesary columns
df_train = df_train.drop(['filename'],axis=1)
df_valid = df_valid.drop('filename',axis=1)
#################################################################
#Encoding
#################################################################
class_list = df_train.iloc[:, -1]
encoder = LabelEncoder()
encoder.fit(class_list)
y_train = encoder.transform(class_list)
y_valid = encoder.transform(df_valid.iloc[:,-1])
joblib.dump(encoder,'../models/encoder.pkl')
#################################################################
#Scailing
#################################################################
scaler = StandardScaler()
df = df_train.append(df_valid,ignore_index=True)
scaler.fit(np.array(df.iloc[:, :-1], dtype = float))
X_train = scaler.transform(np.array(df_train.iloc[:, :-1], dtype = float))
X_valid = scaler.transform(np.array(df_valid.iloc[:, :-1], dtype = float))
joblib.dump(scaler,'../models/scaler.pkl')
#################################################################
#Training svm with two different kernels
#################################################################
#Linear
#SVClassifier = svm.SVC(kernel='linear')
#SVClassifier.fit(X_train,y_train)
#print('Accuracy of linear SVM:',SVClassifier.score(X_valid,y_valid))
#joblib.dump(SVClassifier,'../models/svclinear.pkl')
#################################################################
#Divisor
divisor = StratifiedKFold(n_splits = 5, random_state = 1)

In [5]:
X = np.concatenate((X_train,X_valid))
y = np.concatenate((y_train,y_valid))

SVCrbf = joblib.load('../models/svcrbf.pkl')



In [15]:
predictions=SVCrbf.predict(X_valid)

In [16]:
encoder.classes_.astype(str)

array(['6', '7', 'a', 'b', 'c', 'd', 'h', 'm', 'n', 'x'], dtype='<U1')

In [20]:
predictions

array([2, 2, 2, ..., 1, 1, 1])

In [17]:
print(classification_report(y_valid,predictions,labels=encoder.classes_.astype(str)))

              precision    recall  f1-score   support

           6       0.92      0.96      0.94       122
           7       0.51      0.52      0.52       104
           a       0.00      0.00      0.00         0
           b       0.00      0.00      0.00         0
           c       0.00      0.00      0.00         0
           d       0.00      0.00      0.00         0
           h       0.00      0.00      0.00         0
           m       0.00      0.00      0.00         0
           n       0.00      0.00      0.00         0
           x       0.00      0.00      0.00         0

   micro avg       0.74      0.76      0.75       226
   macro avg       0.14      0.15      0.15       226
weighted avg       0.73      0.76      0.75       226



  mask &= (ar1 != a)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [28]:
##################################################################
#Re-treino com validação
##################################################################
print('Re-training with validation data')
SVCrbf.fit(X,y)
#Scoring
scores = cross_val_score(SVCrbf, X, y, cv = 3)
print('Training + Validation score(cross-val)',scores.mean())

Re-training with validation data
Score for validation + training set: 0.9530497592295345


In [12]:
from sklearn.metrics import multilabel_confusion_matrix, classification_report