In [24]:
import pandas as pd
import numpy as np
import pickle

from sklearn.preprocessing import LabelEncoder, MinMaxScaler

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score
from warnings import simplefilter

from sklearn.svm import SVC

# Silence every FutureWarning for the rest of the session (this also hides
# library deprecation notices, so re-enable it when upgrading dependencies).
simplefilter(action='ignore', category=FutureWarning)

In [25]:
# Read the semicolon-separated, UTF-16 encoded dataset and show its dimensions.
df = pd.read_csv(
    'datasets/dataset_cler.csv',
    sep=";",
    encoding='utf-16',
)
df.shape

(1642, 115)

In [26]:
# Feature matrix: every column except `take`, `status` and `oper`.
X = df.drop(columns=['take', 'status', 'oper'])
# Target vector: the `status` column.
y = df['status']

# Keep the feature names; X is later replaced by a plain array.
x_columns = X.columns

In [27]:
# One shared encoder, fitted on a fixed vocabulary, is applied to every
# object-typed feature column so the same label always maps to the same integer.
# NOTE(review): a column holding any value outside this vocabulary would make
# `transform` fail - confirm all object columns use only these three labels.
encoder = LabelEncoder().fit(['inside', 'up', 'down'])
columns_objects = X.select_dtypes(include=['object']).columns
for col_obj in columns_objects:
    X[col_obj] = encoder.transform(X[col_obj])

In [28]:
# Rescale every feature with a MinMaxScaler; X becomes a NumPy array here.
# NOTE(review): the scaler is fitted on all rows, before the train/test split
# performed further down, so test rows influence the scaling parameters.
scaler = MinMaxScaler()
X = scaler.fit_transform(X)

In [29]:
# Sanity check: display the first scaled sample.
X[0]

array([0.17910448, 0.14596273, 0.16949153, 0.7143    , 0.16      ,
       0.31576039, 0.        , 0.        , 0.04032258, 0.05785124,
       0.        , 0.06818182, 0.09807074, 0.05405405, 0.5455    ,
       0.04347826, 0.0978893 , 0.16      , 0.39155718, 0.09774436,
       0.16129032, 0.        , 0.08235294, 0.07398274, 0.10666667,
       0.8889    , 0.        , 0.        , 0.03225806, 0.12220878,
       0.02531646, 0.05970149, 1.        , 0.04347826, 0.05471125,
       0.05084746, 0.5       , 0.08571429, 0.53333333, 0.        ,
       0.        , 0.03      , 0.06451613, 0.        , 0.25714286,
       0.41666667, 0.03636364, 0.1       , 0.53125   , 0.97142857,
       0.02564103, 0.0538445 , 0.31707317, 0.01851852, 1.        ,
       0.06711409, 0.03074434, 0.0390625 , 0.4167    , 0.21428571,
       0.53844497, 0.03703704, 0.09162908, 0.03351955, 0.00970874,
       1.        , 0.08888889, 0.12784588, 0.08333333, 0.6364    ,
       0.        , 0.        , 0.16666667, 0.42968565, 0.00740

In [30]:
# Hold out 30% of the samples for testing.
# The seed is written as the integer 1: the previous `random_state=True` only
# worked because a bool is accepted as the integer 1, so the split is unchanged.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

In [13]:
# Baseline: RBF-kernel SVC with gamma='auto', scored by 10-fold
# cross-validation over the full (scaled) dataset; the cell shows the mean score.
svc = SVC(kernel='rbf', gamma='auto')
scores = cross_val_score(estimator=svc, X=X, y=y, cv=10)
scores.mean()

0.6693101900344157

In [62]:
# Accumulators for the outcome of each search step:
# the estimators, the merged best parameters, and the best scores.
models, scores = np.array([]), np.array([])
opt_params = {}

## Optimizing the SVC algorithm

In [63]:
def optimize(C, kernel, degree, gamma, coef0, shrinking, probability, tol,
             cache_size, class_weight, max_iter, decision_function_shape, params, cv = 5):
  """Grid-search a group of SVC hyper-parameters on the training split.

  An SVC is built from the explicit keyword values (the baseline for this
  step) and `GridSearchCV` then varies only the entries listed in `params`,
  scoring each candidate with cross-validated ROC AUC on the module-level
  `X_train` / `y_train`.

  Parameters
  ----------
  C, kernel, degree, gamma, coef0, shrinking, probability, tol, cache_size,
  class_weight, max_iter, decision_function_shape :
      Forwarded unchanged to the `SVC` constructor.
  params : dict
      Parameter grid handed to `GridSearchCV` (name -> list of candidates).
  cv : int, default 5
      Cross-validation setting handed to `GridSearchCV`.

  Returns
  -------
  tuple
      `(estimator, best_params, best_score)` where `estimator` is the SVC
      refitted by the search with the best parameters, `best_params` is the
      winning subset of `params`, and `best_score` its mean CV ROC AUC.
  """
  import inspect  # local import: only needed for the version probe below

  np.random.seed(0)

  svc = SVC(C = C,
          kernel = kernel,
          degree = degree,
          gamma = gamma,
          coef0 = coef0,
          shrinking = shrinking,
          probability = probability,
          tol = tol,
          cache_size = cache_size,
          class_weight = class_weight,
          max_iter = max_iter,
          decision_function_shape = decision_function_shape)

  search_kwargs = dict(estimator = svc, param_grid = params, scoring = 'roc_auc',
                       n_jobs = -1, cv = cv)
  # `iid` was removed from GridSearchCV in scikit-learn 0.24 and passing it
  # there raises TypeError. Keep `iid=False` only where the installed version
  # still accepts it, so older environments produce the same scores as before.
  if 'iid' in inspect.signature(GridSearchCV).parameters:
    search_kwargs['iid'] = False
  grid_search = GridSearchCV(**search_kwargs)

  grid_search.fit(X_train, y_train)
  best_params = grid_search.best_params_
  best_score = grid_search.best_score_
  print(best_params, best_score)

  # Return the estimator refitted with the winning parameters; the base `svc`
  # above is never fitted and does not carry the searched value.
  return grid_search.best_estimator_, best_params, best_score

#### MODELO 0

In [64]:
# Baseline SVC settings for search step 0; only `C` is varied by the grid.
C=1.0
kernel='rbf'
degree=3
# 'auto' (1 / n_features) is the value the private 'auto_deprecated' sentinel
# resolved to; the sentinel was removed in scikit-learn 0.22, so spell it out.
gamma='auto'
coef0=0.0
shrinking=True
probability=False
tol=0.001
cache_size=200
class_weight='balanced'
max_iter=-1
decision_function_shape='ovr'
random_state=None  # NOTE(review): not passed to optimize(); kept so the name stays defined
params = {'C': [0.0001, 0.001, 0.01, 0.1, 1, 10]}

In [65]:
# Run the grid search for this step using the settings defined in the cell above.
svc, opt_param, score = optimize(
    C=C, kernel=kernel, degree=degree, gamma=gamma, coef0=coef0,
    shrinking=shrinking, probability=probability, tol=tol,
    cache_size=cache_size, class_weight=class_weight, max_iter=max_iter,
    decision_function_shape=decision_function_shape,
    params=params,
)

{'C': 0.0001} 0.4832170065916196


In [66]:
# Record this step's outcome in the accumulators.
models = np.append(models, svc)
scores = np.append(scores, score)
# Build a new dict; entries of `opt_param` override earlier entries with the same key.
opt_params = dict(opt_params)
opt_params.update(opt_param)

#### MODELO 1

In [69]:
# Search step 1: reuse the best `C` found so far and vary the kernel.
C= opt_params['C']
kernel='rbf'
degree=3
# 'auto' (1 / n_features) is the value the private 'auto_deprecated' sentinel
# resolved to; the sentinel was removed in scikit-learn 0.22, so spell it out.
gamma='auto'
coef0=0.0
shrinking=True
probability=False
tol=0.001
cache_size=200
class_weight='balanced'
max_iter=-1
decision_function_shape='ovr'
random_state=None  # NOTE(review): not passed to optimize(); kept so the name stays defined
params = {'kernel': ['poly', 'rbf', 'sigmoid']}

In [70]:
# Run the grid search for this step using the settings defined in the cell above.
svc, opt_param, score = optimize(
    C=C, kernel=kernel, degree=degree, gamma=gamma, coef0=coef0,
    shrinking=shrinking, probability=probability, tol=tol,
    cache_size=cache_size, class_weight=class_weight, max_iter=max_iter,
    decision_function_shape=decision_function_shape,
    params=params,
)

{'kernel': 'rbf'} 0.4832170065916196


In [71]:
# Record this step's outcome in the accumulators.
models = np.append(models, svc)
scores = np.append(scores, score)
# Build a new dict; entries of `opt_param` override earlier entries with the same key.
opt_params = dict(opt_params)
opt_params.update(opt_param)

#### MODELO 2

In [77]:
# Search step 2: reuse the best `C` and kernel found so far and vary the degree.
C= opt_params['C']
kernel= opt_params['kernel']
degree=3
# 'auto' (1 / n_features) is the value the private 'auto_deprecated' sentinel
# resolved to; the sentinel was removed in scikit-learn 0.22, so spell it out.
gamma='auto'
coef0=0.0
shrinking=True
probability=False
tol=0.001
cache_size=200
class_weight='balanced'
max_iter=-1
decision_function_shape='ovr'
random_state=None  # NOTE(review): not passed to optimize(); kept so the name stays defined

# Degree of the polynomial kernel function ('poly'). Ignored by all other kernels.
params = {'degree': [1, 2, 5]}

In [78]:
# Run the grid search for this step using the settings defined in the cell above.
svc, opt_param, score = optimize(
    C=C, kernel=kernel, degree=degree, gamma=gamma, coef0=coef0,
    shrinking=shrinking, probability=probability, tol=tol,
    cache_size=cache_size, class_weight=class_weight, max_iter=max_iter,
    decision_function_shape=decision_function_shape,
    params=params,
)

{'degree': 1} 0.4832170065916196


#### MODELO 3

In [79]:
# Search step 3: reuse the best `C` and kernel found so far and vary gamma.
C= opt_params['C']
kernel= opt_params['kernel']
degree=3
# Baseline gamma (overridden by the grid below). 'auto' is the value the private
# 'auto_deprecated' sentinel resolved to; the sentinel was removed in
# scikit-learn 0.22, so spell it out.
gamma='auto'
coef0=0.0
shrinking=True
probability=False
tol=0.001
cache_size=200
class_weight='balanced'
max_iter=-1
decision_function_shape='ovr'
random_state=None  # NOTE(review): not passed to optimize(); kept so the name stays defined
params = {'gamma': [0.0001, 0.001, 0.01, 0.1]}

In [80]:
# Run the grid search for this step using the settings defined in the cell above.
svc, opt_param, score = optimize(
    C=C, kernel=kernel, degree=degree, gamma=gamma, coef0=coef0,
    shrinking=shrinking, probability=probability, tol=tol,
    cache_size=cache_size, class_weight=class_weight, max_iter=max_iter,
    decision_function_shape=decision_function_shape,
    params=params,
)

{'gamma': 0.1} 0.4844571980020896


In [83]:
# Record this step's outcome in the accumulators.
models = np.append(models, svc)
scores = np.append(scores, score)
# Build a new dict; entries of `opt_param` override earlier entries with the same key.
opt_params = dict(opt_params)
opt_params.update(opt_param)

#### MODELO 4

In [86]:
# Search step 4: reuse the best C, kernel and gamma found so far and vary coef0.
C = opt_params['C']
kernel = opt_params['kernel']
gamma = opt_params['gamma']

# Remaining settings stay at the same fixed values used in the earlier steps.
degree, coef0 = 3, 0.0
shrinking, probability = True, False
tol, cache_size, max_iter = 0.001, 200, -1
class_weight = 'balanced'
decision_function_shape = 'ovr'
random_state = None

params = {'coef0': [0, 1, 2]}

In [87]:
# Run the grid search for this step using the settings defined in the cell above.
svc, opt_param, score = optimize(
    C=C, kernel=kernel, degree=degree, gamma=gamma, coef0=coef0,
    shrinking=shrinking, probability=probability, tol=tol,
    cache_size=cache_size, class_weight=class_weight, max_iter=max_iter,
    decision_function_shape=decision_function_shape,
    params=params,
)

{'coef0': 0} 0.4844571980020896


In [90]:
# Fit the estimator returned by the last optimize() call on the training split.
svc.fit(X_train, y_train)

SVC(C=0.0001, cache_size=200, class_weight='balanced', coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.1, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [92]:
# Predict the labels of the held-out test split with the fitted estimator.
y_predict = svc.predict(X_test)

## Best Model

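Aparentemente, o modelo com os parâmetros padrão se mostrou melhor do que os 4 modelos otimizados. Atenção: a comparação não é direta, pois os modelos otimizados foram avaliados com `roc_auc` (validação cruzada de 5 folds no conjunto de treino), enquanto o modelo padrão é avaliado com acurácia (validação cruzada de 10 folds em todo o conjunto).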

In [31]:
# Keep the library defaults: score an SVC built with no arguments using
# 10-fold cross-validation over the full dataset and show the mean score.
svc = SVC()
scores = cross_val_score(estimator=svc, X=X, y=y, cv=10)
scores.mean()

0.6693101900344157

In [32]:
# Fit the model on the training split, then predict the test split.
# `fit` returns the estimator itself, so the two calls can be chained.
y_pred = svc.fit(X_train, y_train).predict(X_test)

In [38]:
# Persist the fitted objects to disk so they can be reloaded later.
# Each file is opened in a `with` block so the handle is always flushed and
# closed, even if pickling raises; the classifier dump previously left its
# file object unclosed.

# trained classifier
with open('SVC.ml', 'wb') as output:
    pickle.dump(svc, output)

# label encoder applied to the object-typed feature columns
with open('LabelEncoder.pkl', 'wb') as output:
    pickle.dump(encoder, output)

# min-max scaler fitted on the feature matrix
with open('MinMaxScaler.pkl', 'wb') as output:
    pickle.dump(scaler, output)