### SCRIPT DE TREINO

In [None]:
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score

In [1]:
# -*- coding: utf-8 -*-
"""
@author: dib_n
"""
#################################################################
#Imports
#################################################################
import pandas as pd
import numpy as np

# Preprocessing
from sklearn.preprocessing import LabelEncoder, StandardScaler

#Learning
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier

#SearchGrid
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

#Metrics
from sklearn.metrics import roc_auc_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score

#Split
from sklearn.model_selection import train_test_split

#Resample
from sklearn.utils import resample

#Saving 
from sklearn.externals import joblib 

#################################################################
#Prep train/validation set from external file
#################################################################
exec(open('../scripts/dataprep_treino_validacao.py').read())
#################################################################
#Loads
#################################################################
#Importando base de treino
df_train = pd.read_csv('../data/data.csv',index_col=False)
df_valid = pd.read_csv('../data/data_validacao.csv',index_col=False)
# Dropping unneccesary columns
df_train = df_train.drop(['filename'],axis=1)
df_valid = df_valid.drop('filename',axis=1)
#################################################################
#Encoding
#################################################################
class_list = df_train.iloc[:, -1]
encoder = LabelEncoder()
encoder.fit(class_list)
y_train = encoder.transform(class_list)
y_valid = encoder.transform(df_valid.iloc[:,-1])
joblib.dump(encoder,'../models/encoder.pkl')
#################################################################
#Scailing
#################################################################
scaler = StandardScaler()
df = df_train.append(df_valid,ignore_index=True)
scaler.fit(np.array(df.iloc[:, :-1], dtype = float))
X_train = scaler.transform(np.array(df_train.iloc[:, :-1], dtype = float))
X_valid = scaler.transform(np.array(df_valid.iloc[:, :-1], dtype = float))
joblib.dump(scaler,'../models/scaler.pkl')
#################################################################
#Training svm with two different kernels
#################################################################
#Linear
SVClassifier = svm.SVC(kernel='linear')
SVClassifier.fit(X_train,y_train)
print('Accuracy of linear SVM:',SVClassifier.score(X_valid,y_valid))
joblib.dump(SVClassifier,'../models/svclinear.pkl')
#Rbf
##################################################################
#Param grid
Cs = np.arange(0.5,100,0.5)
gammas = [0.001, 0.01, 0.1, 1]

param_grid = {'C':Cs,'gamma':gammas}
#################################################################
#Grid Search
print('Tuning RBF Kernel parameters')
grid_search = GridSearchCV(svm.SVC(kernel='rbf'),param_grid)
grid_search.fit(X_train,y_train)
print('Search grid for RBF returned parameters:')
print(grid_search.best_params_)
#Get model  
SVCrbf = grid_search.best_estimator_
print('Params:')
print(SVCrbf.get_params())
#SVCrbf.fit(X_train,y_train)
print('Accuracy score of RBF Kernel:',SVCrbf.score(X_valid,y_valid))
joblib.dump(SVCrbf, '../models/svcrbf.pkl') 
##################################################################
#Re-treino com validação
##################################################################
#print('Re-training with validation data')
#X = np.concatenate((X_train,X_valid))
#y = np.concatenate((y_train,y_valid))

#SVCrbf = joblib.load('../models/svcrbf.pkl')
#SVCrbf.fit(X,y)
#Scoring
#scores = cross_val_score(SVCrbf, X, y, cv = 3)
#print('Training + Validation score(cross-val)',scores.mean())



Creating training files...
Extract training features...
class a extracted
class b extracted
class c extracted
class d extracted
class h extracted
class m extracted
class n extracted
class x extracted
class 6 extracted
class 7 extracted
Creating validation files...
Extracting validation features
class a extracted
class b extracted
class c extracted
class d extracted
class h extracted
class m extracted
class n extracted
class x extracted
class 6 extracted
class 7 extracted
Training set saved at data.csv
Validation set save at data_validacao.csv
Accuracy of linear SVM: 0.653558052434457
Tuning RBF Kernel parameters
Search grid for RBF returned parameters:
{'C': 3.0, 'gamma': 0.1}
Params:
{'C': 3.0, 'cache_size': 200, 'class_weight': None, 'coef0': 0.0, 'decision_function_shape': 'ovr', 'degree': 3, 'gamma': 0.1, 'kernel': 'rbf', 'max_iter': -1, 'probability': False, 'random_state': None, 'shrinking': True, 'tol': 0.001, 'verbose': False}
Accuracy score of RBF Kernel: 0.7303370786516854


['../models/svcrbf.pkl']

In [2]:
#Importando base de treino
df_train = pd.read_csv('../data/data.csv',index_col=False)
df_valid = pd.read_csv('../data/data_validacao.csv',index_col=False)
# Dropping unneccesary columns
df_train = df_train.drop(['filename'],axis=1)
df_valid = df_valid.drop('filename',axis=1)
#################################################################
#Encoding
#################################################################
encoder = joblib.load('../models/encoder.pkl')
y_train = df_train.iloc[:,-1]
y_train = encoder.transform(y_train)
y_valid = df_valid.iloc[:,-1]
y_valid = encoder.transform(y_valid)
#################################################################
#Scailing
#################################################################
scaler = joblib.load('../models/scaler.pkl')
df = df_train.append(df_valid,ignore_index=True)
#scaler.fit(np.array(df.iloc[:, :-1], dtype = float))
X_train = scaler.transform(np.array(df_train.iloc[:, :-1], dtype = float))
X_valid = scaler.transform(np.array(df_valid.iloc[:, :-1], dtype = float))

In [3]:
##################################################################
##################################################################
##################################################################
#Paradigma One vs All
##################################
print('Implementing OneVsAll')
#Separando uma coluna para cada target
for target in df['label'].unique():
    df[target] = (df['label']==target).astype(int)
    df_train[target] = (df_train['label']==target).astype(int)
    df_valid[target] = (df_valid['label']==target).astype(int)
#Treinando um modelo para cada classe
models={}
    
models['geral'] = joblib.load('../models/SVCrbf.pkl')

Implementing OneVsAll


In [None]:
for target in df['label'].unique():
    
    train_temp = pd.DataFrame(X_train)
    train_temp[str(target)]=df[target].copy()
    
    train_temp_nao_target = train_temp.loc[train_temp[target]==0]
    train_temp_target = train_temp.loc[train_temp[target]==1]
    
    train_temp_upsampled = resample(train_temp_target,
                          replace=True, # sample with replacement
                          n_samples=len(train_temp_nao_target), # match number in majority class
                          random_state=27) # reproducible results
    
    # combine majority and upsampled minority
    train_temp = pd.concat([train_temp_nao_target, train_temp_upsampled])
    
    X_train = train_temp.iloc[:,:-1]
    y_train = train_temp.iloc[:,-1]
    
    #del SVCrbf
    SVCrbf = svm.SVC(
        kernel='rbf',
        gamma=0.1,
        C=3
    )
    #print(X_train.shape)
    #print(y_train.shape)
    
    SVCrbf.fit(X_train,y_train)
    print("**********************************************************************")
    print("Acuracia para a classe "+target+":",SVCrbf.score(X_valid,df_valid[target]))
    predictions = SVCrbf.predict(X_valid)
    print("Roc AUC score para a classe "+target+":",roc_auc_score(df_valid[target],predictions))
    print("Precision:",precision_score(df_valid[target],predictions))
    print("Recall:",recall_score(df_valid[target],predictions))
    #X = np.concatenate((X_train,X_valid))
    #y = np.concatenate((df_train[target],df_valid[target]))
    #SVCrbf.fit(X,y)
    #Scoring
    #scores = cross_val_score(SVCrbf, X, y, cv = 3)
    #print('Training + Validation score(cross-val) para classe '+target+':',scores.mean())
    joblib.dump(SVCrbf,'../models/SVCrbf_'+target+'.pkl')
    models[target]=joblib.load('../models/SVCrbf_'+target+'.pkl')
    
print("**********************************************************************")

**********************************************************************
Acuracia para a classe a: 0.9822097378277154
Roc AUC score para a classe a: 0.9206206801363246
Precision: 0.9534883720930233
Recall: 0.845360824742268
**********************************************************************
Acuracia para a classe b: 0.9101123595505618
Roc AUC score para a classe b: 0.7654258031616523
Precision: 0.543859649122807
Recall: 0.5849056603773585
**********************************************************************
Acuracia para a classe c: 0.9588014981273408
Roc AUC score para a classe c: 0.8234607641961741
Precision: 0.9090909090909091
Recall: 0.6542056074766355
**********************************************************************
Acuracia para a classe d: 0.9044943820224719
Roc AUC score para a classe d: 0.6812108559498956
Precision: 0.55
Recall: 0.4
**********************************************************************
Acuracia para a classe h: 0.9850187265917603
Roc AUC score para a cl

In [None]:
#Creating a new dataframe for stacking
#The training and validation datasets will come from the original validation dataset
stacking_df_train = pd.DataFrame(columns=list(models.keys()))
stacking_df_valid = pd.DataFrame(columns=list(models.keys()))
X_train, X_test, y_train, y_test = train_test_split(X_valid, y_valid, test_size=0.5, random_state=42)

In [None]:
for item in models.items():
    label=item[0]
    model = item[1]
    values_train = model.predict(X_train)
    stacking_df_train[label]=values_train
    values_valid = model.predict(X_test)
    stacking_df_valid[label]=values_valid

for t in stacking_df_train['geral'].unique():
    stacking_df_train['geral_'+str(t)] = (stacking_df_train['geral']==t).astype(int)
    stacking_df_valid['geral_'+str(t)] = (stacking_df_valid['geral']==t).astype(int)
    
stacking_df_train['label']=y_train
stacking_df_valid['label']=y_test

stacking_df_train=stacking_df_train.drop('geral',axis=1)
stacking_df_valid=stacking_df_valid.drop('geral',axis=1)

In [None]:
X_train = stacking_df_train.iloc[:,:-1]
X_valid = stacking_df_valid.iloc[:,:-1]
y_train = stacking_df_train.iloc[:,-1]
y_valid = stacking_df_valid.iloc[:,-1]
X = np.concatenate((X_train,X_valid))
y = np.concatenate((y_train,y_valid))
#Stacking into random forest
print('Stacking all models into a Decision Tree...')
"""
#Param Grid
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]

# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)

# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]

# Method of selecting samples for training each tree
bootstrap = [True, False]# Create the random grid

random_grid = {'n_estimators': n_estimators,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'bootstrap': bootstrap}
#RandomSearch
# Use the random grid to search for best hyperparameters
rf = RandomForestClassifier()

# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(
    estimator = rf,
    param_distributions = random_grid,
    n_iter = 100,
    random_state=42,
    n_jobs = -1)# Fit the random search model

rf_random.fit(X_train, y_train)
RFC = rf_random.best_estimator_
"""
RFC = DecisionTreeClassifier(
    min_samples_split=2
    
)
RFC.fit(X_train,y_train)

print('Acuracia para a base de validação:',RFC.score(X_valid,y_valid))
#print('Retreinando com validação...')
#RFC.fit(X,y)
#Scoring
#scores = cross_val_score(RFC, X, y, cv = 10)
#print('Training + Validation score(cross-val):',scores.mean())
#joblib.dump(RFC,'../models/RFC.pkl')

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
##################################
#Paradigma One vs All
##################################
#DataFrame com as features extraidas

#Importando base de treino
df_train = pd.read_csv('../data/data.csv',index_col=False)
df_valid = pd.read_csv('../data/data_validacao.csv',index_col=False)
# Dropping unneccesary columns
df_train = df_train.drop(['filename'],axis=1)
df_valid = df_valid.drop('filename',axis=1)
#################################################################
#Encoding
#################################################################
class_list = df_train.iloc[:, -1]
encoder = LabelEncoder()
encoder.fit(class_list)
joblib.dump(encoder,'../models/encoder.pkl')
#################################################################
#Scailing
#################################################################
scaler = joblib.load('../models/scaler.pkl')
df = df_train.append(df_valid,ignore_index=True)
#scaler.fit(np.array(df.iloc[:, :-1], dtype = float))
X_train = scaler.transform(np.array(df_train.iloc[:, :-1], dtype = float))
X_valid = scaler.transform(np.array(df_valid.iloc[:, :-1], dtype = float))
#joblib.dump(scaler,'../models/scaler.pkl')

In [None]:
#Separando uma coluna para cada target
for target in df['label'].unique():
    df[target] = (df['label']==target).astype(int)
    df_train[target] = (df_train['label']==target).astype(int)
    df_valid[target] = (df_valid['label']==target).astype(int)

In [None]:
#Treinando um modelo para cada classe
models={}
    
models['geral'] = joblib.load('../models/SVCrbf.pkl')
for target in df['label'].unique():
    SVCrbf = svm.SVC(
        kernel='rbf',
        C=3.0,
        gamma=1.0,
        probability=False
    )
    SVCrbf.fit(X_train,df_train[target])
    print("Acuracia para a classe "+target+":",SVCrbf.score(X_valid,df_valid[target]))
    X = np.concatenate((X_train,X_valid))
    y = np.concatenate((df_train[target],df_valid[target]))
    #SVCrbf.fit(X,y)
    #Scoring
    #scores = cross_val_score(SVCrbf, X, y, cv = 3)
    #print('Training + Validation score(cross-val) para classe '+target+':',scores.mean())
    
    SVCrbf = joblib.load('../models/svcrbf.pkl')
    SVCrbf.fit(X,y)
    joblib.dump(SVCrbf,'../models/SVCrbf_'+target+'.pkl')
    models[target]=joblib.load('../models/SVCrbf_'+target+'.pkl')

In [None]:
#Ok, agora vamos criar uma nova base com a predição de todos os modelos
stacking_df_train = pd.DataFrame(columns=list(models.keys()))
stacking_df_valid = pd.DataFrame(columns=list(models.keys()))
for item in models.items():
    label=item[0]
    model = item[1]
    values_train = model.predict(X_train)
    stacking_df_train[label]=values_train
    values_valid = model.predict(X_valid)
    stacking_df_valid[label]=values_valid
stacking_df_train['label']=y_train
stacking_df_valid['label']=y_valid

In [None]:
X_train = stacking_df_train.iloc[:,:-1]
X_valid = stacking_df_valid.iloc[:,:-1]
y_train = stacking_df_train.iloc[:,-1]
y_valid = stacking_df_valid.iloc[:,-1]
X = np.concatenate((X_train,X_valid))
y = np.concatenate((y_train,y_valid))

### Param Grid

In [None]:
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]

# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)

# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]

# Method of selecting samples for training each tree
bootstrap = [True, False]# Create the random grid

random_grid = {'n_estimators': n_estimators,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'bootstrap': bootstrap}

## Training Grid

In [None]:
# Use the random grid to search for best hyperparameters
rf = RandomForestClassifier()

# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(
    estimator = rf,
    param_distributions = random_grid,
    n_iter = 100,
    random_state=42,
    n_jobs = -1)# Fit the random search model

rf_random.fit(X_train, y_train)

In [None]:
RFC = rf_random.best_estimator_

In [None]:
RFC.get_params()

In [None]:
RFC.score(X_valid,y_valid)

In [None]:
#Importando base de treino
df_train = pd.read_csv('../data/data.csv',index_col=False)
df_valid = pd.read_csv('../data/data_validacao.csv',index_col=False)
# Dropping unneccesary columns
df_train = df_train.drop(['filename'],axis=1)
df_valid = df_valid.drop('filename',axis=1)
#################################################################
#Encoding
#################################################################
class_list = df_train.iloc[:, -1]
encoder = LabelEncoder()
encoder.fit(class_list)
y_train = encoder.transform(class_list)
y_valid = encoder.transform(df_valid.iloc[:,-1])
joblib.dump(encoder,'../models/encoder.pkl')
#################################################################
#Scailing
#################################################################
scaler = StandardScaler()
df = df_train.append(df_valid,ignore_index=True)
scaler.fit(np.array(df.iloc[:, :-1], dtype = float))
X_train = scaler.transform(np.array(df_train.iloc[:, :-1], dtype = float))
X_valid = scaler.transform(np.array(df_valid.iloc[:, :-1], dtype = float))
joblib.dump(scaler,'../models/scaler.pkl')
#################################################################
#Training svm with two different kernels
#################################################################
#Linear
#SVClassifier = svm.SVC(kernel='linear')
#SVClassifier.fit(X_train,y_train)
#print('Accuracy of linear SVM:',SVClassifier.score(X_valid,y_valid))
#joblib.dump(SVClassifier,'../models/svclinear.pkl')
#################################################################
#Divisor
divisor = StratifiedKFold(n_splits = 5, random_state = 1)

In [None]:
X = np.concatenate((X_train,X_valid))
y = np.concatenate((y_train,y_valid))

SVCrbf = joblib.load('../models/svcrbf.pkl')

In [None]:
predictions=SVCrbf.predict(X_valid)

In [None]:
encoder.classes_.astype(str)

In [None]:
predictions

In [None]:
print(classification_report(y_valid,predictions,labels=encoder.classes_.astype(str)))

In [None]:
##################################################################
#Re-treino com validação
##################################################################
print('Re-training with validation data')
SVCrbf.fit(X,y)
#Scoring
scores = cross_val_score(SVCrbf, X, y, cv = 3)
print('Training + Validation score(cross-val)',scores.mean())

In [None]:
from sklearn.metrics import multilabel_confusion_matrix, classification_report

In [None]:
type(X_train)