In [85]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score
from warnings import simplefilter

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier

simplefilter(action='ignore', category=FutureWarning)

%matplotlib inline

plt.rcParams['figure.figsize'] = (25, 8)
#sns.set_style('whitegrid')
sns.set_style('white')
sns.set_context('poster')

In [86]:
#df = pd.read_csv('datasets/dataset_cler.csv', sep=";", encoding='utf-16',error_bad_lines=False)
df = pd.read_csv('datasets/dataset_cler.csv', sep=";", encoding='utf-16')
df.drop(df.loc[(df['status']!='Loss') & (df['status']!='Gain')].index.values, inplace=True)
df['status'].value_counts()

Gain    2269
Loss    1189
Name: status, dtype: int64

In [87]:
df.shape

(3458, 412)

In [88]:
df.head(3)

Unnamed: 0,ampl_20,desv_amp_20,body_20,body_per_20,pavio_sup_20,pavio_sup_per_20,pavio_inf_20,pavio_inf_per_20,high_dist_20,low_dist_20,...,pavio_inf_h2_0,pavio_inf_per_h2_0,high_dist_h2_0,low_dist_h2_0,type_h2_0,atr,qt_bars,oper,take,status
0,65.0,2.17,40.0,61.54,0.0,0.0,25.0,38.46,10.0,45.0,...,0.0,0.0,125.0,325.0,up,58.21,5,Sell,20.0,Gain
1,45.0,0.82,35.0,77.78,10.0,22.22,0.0,0.0,35.0,45.0,...,0.0,0.0,180.0,325.0,up,59.64,4,Sell,-40.0,Loss
2,55.0,0.65,45.0,81.82,10.0,18.18,0.0,0.0,40.0,70.0,...,0.0,0.0,380.0,325.0,up,59.64,5,Sell,-40.0,Loss


In [91]:
X = df.drop(['take','status','oper'], axis=1) #df[train_features]
y = df['status']

x_columns = X.columns

In [92]:
encoder = pickle.load(open('LabelEncoder.pkl', 'rb'))
columns_categorical = X.select_dtypes(include=['object']).columns
for col_cat in columns_categorical:
    X[col_cat] = encoder.transform(X[col_cat])

scaler = encoder = pickle.load(open('MinMaxScaler.pkl', 'rb'))
scaler.fit(X)
X = scaler.transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=True)

In [98]:
#https://machinelearningmastery.com/save-load-machine-learning-models-python-scikit-learn/
# load the model from disk
best_model = pickle.load(open('model_cler.sav', 'rb'))

In [99]:
best_model.fit(X_train, y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
                           learning_rate=0.005, loss='deviance', max_depth=9,
                           max_features=27, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=25, min_samples_split=150,
                           min_weight_fraction_leaf=0.0, n_estimators=2000,
                           n_iter_no_change=None, presort='auto',
                           random_state=0, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

## Teste

In [100]:
y_pred_probs = best_model.predict_proba(X_test)

In [101]:
y_pred = best_model.predict(X_test)

In [102]:
roc_auc_score(y_test, y_pred_probs[:, 1])

0.478447509356713

In [103]:
accuracy_score(y_test, y_pred)

0.6348747591522158

In [20]:
#https://kiwidamien.github.io/are-you-sure-thats-a-probability.html

In [19]:
matrix = confusion_matrix(y_test, y_pred)
matrix, matrix.sum()

(array([[1085,   41],
        [ 216,  399]], dtype=int64), 1741)

In [54]:
dict(zip(best_model.classes_, y_pred_probs[:,1]))

{'Gain': 0.0559467629483045, 'Loss': 0.06179955239448644}

In [48]:
y_pred_probs

array([[0.94405324, 0.05594676],
       [0.93820045, 0.06179955],
       [0.17244544, 0.82755456],
       ...,
       [0.8102207 , 0.1897793 ],
       [0.73481661, 0.26518339],
       [0.95847974, 0.04152026]])