In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score
from warnings import simplefilter

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier

simplefilter(action='ignore', category=FutureWarning)

%matplotlib inline

plt.rcParams['figure.figsize'] = (25, 8)
#sns.set_style('whitegrid')
sns.set_style('white')
sns.set_context('poster')

In [2]:
df = pd.read_csv('datasets/dataset_cler.csv', sep=";", encoding='utf-16')
df.drop(df.loc[(df['status']!='Loss') & (df['status']!='Gain')].index.values, inplace=True)
df['status'].value_counts()

Gain    3787
Loss    2015
Name: status, dtype: int64

In [11]:
df_test = pd.read_csv('to_csv.csv', sep=",")

In [12]:
df.head(3)

Unnamed: 0,ampl_20,desv_amp_20,body_20,body_per_20,pavio_sup_20,pavio_sup_per_20,pavio_inf_20,pavio_inf_per_20,high_dist_20,low_dist_20,...,pavio_inf_h2_0,pavio_inf_per_h2_0,high_dist_h2_0,low_dist_h2_0,type_h2_0,atr,qt_bars,oper,take,status
0,55.0,1.38,25.0,45.45,15.0,27.27,15.0,27.27,0.0,15.0,...,215.0,47.78,615.0,620.0,up,41.43,3,Buy,20.0,Gain
1,70.0,4.67,10.0,14.29,30.0,42.86,30.0,42.86,35.0,20.0,...,0.0,0.0,85.0,325.0,up,58.57,3,Buy,20.0,Gain
2,65.0,2.17,40.0,61.54,0.0,0.0,25.0,38.46,10.0,45.0,...,0.0,0.0,180.0,325.0,up,58.21,5,Sell,20.0,Gain


In [13]:
df['ampl_20'][0]

55.0

In [16]:
df_test.head()

Unnamed: 0,ampl_20,desv_amp_20,body_20,body_per_20,pavio_sup_20,pavio_sup_per_20,pavio_inf_20,pavio_inf_per_20,high_dist_20,low_dist_20,...,body_per_h2_0,pavio_sup_h2_0,pavio_sup_per_h2_0,pavio_inf_h2_0,pavio_inf_per_h2_0,high_dist_h2_0,low_dist_h2_0,type_h2_0,atr,qt_bars
0,45.0,0.69,5.0,11.11,10.0,22.22,30.0,66.67,30.0,10.0,...,61.9,50.0,23.81,30.0,14.29,800.0,565.0,down,121.43,6


In [17]:
df.shape

(5802, 412)

In [18]:
X = df.drop(['take','status','oper'], axis=1) #df[train_features]
y = df['status']

x_columns = X.columns

In [19]:
encoder = LabelEncoder()
columns_categorical = X.select_dtypes(include=['object']).columns
for col_cat in columns_categorical:
    X[col_cat] = encoder.fit_transform(X[col_cat])

scaler = MinMaxScaler()
scaler.fit(X)
X = scaler.transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=True)

ValueError: bad input shape (5802, 409)

In [6]:
X

array([[0.06040268, 0.14390244, 0.06097561, ..., 1.        , 0.05901789,
        0.        ],
       [0.08053691, 0.54512195, 0.02439024, ..., 1.        , 0.10783867,
        0.        ],
       [0.0738255 , 0.2402439 , 0.09756098, ..., 1.        , 0.10681326,
        0.25      ],
       ...,
       [0.01342282, 0.07317073, 0.02439024, ..., 1.        , 0.03053435,
        0.        ],
       [0.05369128, 0.2195122 , 0.1097561 , ..., 0.        , 0.03053435,
        0.25      ],
       [0.00671141, 0.03658537, 0.01219512, ..., 0.        , 0.03155976,
        0.25      ]])

In [7]:
#https://machinelearningmastery.com/save-load-machine-learning-models-python-scikit-learn/
import pickle
filename = 'model_cler.sav'

# load the model from disk
best_model = pickle.load(open(filename, 'rb'))

## Teste

In [25]:
y_pred_probs = best_model.predict_proba(X_test)

In [26]:
y_pred = best_model.predict(X_test)

In [27]:
roc_auc_score(y_test, y_pred_probs[:, 1])

0.9227295701020953

In [28]:
accuracy_score(y_test, y_pred)

0.8523836875358989

In [20]:
#https://kiwidamien.github.io/are-you-sure-thats-a-probability.html

In [19]:
matrix = confusion_matrix(y_test, y_pred)
matrix, matrix.sum()

(array([[1085,   41],
        [ 216,  399]], dtype=int64), 1741)

In [54]:
dict(zip(best_model.classes_, y_pred_probs[:,1]))

{'Gain': 0.0559467629483045, 'Loss': 0.06179955239448644}

In [48]:
y_pred_probs

array([[0.94405324, 0.05594676],
       [0.93820045, 0.06179955],
       [0.17244544, 0.82755456],
       ...,
       [0.8102207 , 0.1897793 ],
       [0.73481661, 0.26518339],
       [0.95847974, 0.04152026]])