# Classificação

**TAREFA**

Modelos: KNN, SVM, regressão logística 
Classificação: funcionamento, avaliação e seleção de features

Os dados consistem em uma base que contém features construídas a partir da cotação do ativo em questão, tentando prever seu cenário binário futuro (alta/baixa) como target

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import yfinance as yf
from ta import add_all_ta_features
from ta.utils import dropna

In [4]:
data = yf.download("ITUB4.SA",
                   start = '2013-01-01',
                   period = "10y", # 1d,5d,1mo,3mo,6mo,1y,2y,5y,10y,ytd,max
                   interval = "1d") # 1m,2m,5m,15m,30m,60m,90m,1h,1d,5d,1wk,1mo,3mo

data.head(2)

[*********************100%***********************]  1 of 1 completed


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2013-01-02,15.481638,15.727523,15.440657,15.700202,10.290003,21347017
2013-01-03,15.622794,16.278486,15.531725,16.169205,10.59739,29351983


In [13]:
df = dropna(data)
df = add_all_ta_features(df,
                        open="Open",
                        high="High",
                        low="Low",
                        close="Adj Close",
                        volume="Volume",
                        fillna=True)

df.dropna(axis=1, inplace=True)
df["Target"] = df["Adj Close"].pct_change().shift(-1)
df["Target"] = np.where(df["Target"] > 0, 1, 0)
df.dropna(axis=0, inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 2508 entries, 2013-01-02 to 2023-02-24
Data columns (total 93 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Open                       2508 non-null   float64
 1   High                       2508 non-null   float64
 2   Low                        2508 non-null   float64
 3   Close                      2508 non-null   float64
 4   Adj Close                  2508 non-null   float64
 5   Volume                     2508 non-null   float64
 6   volume_adi                 2508 non-null   float64
 7   volume_obv                 2508 non-null   float64
 8   volume_cmf                 2508 non-null   float64
 9   volume_fi                  2508 non-null   float64
 10  volume_em                  2508 non-null   float64
 11  volume_sma_em              2508 non-null   float64
 12  volume_vpt                 2508 non-null   float64
 13  volume_vwap                250

# Feature Engineering 

In [None]:
X = df.drop("Target", axis = 1)
y = df["Target"]

### Supervisionados

In [54]:
from sklearn.feature_selection import f_classif, SelectKBest, chi2, SelectFromModel
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_curve, auc, RocCurveDisplay



In [51]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, shuffle = False)

selector = SelectKBest(score_func=f_classif, k=15)
X_train = selector.fit_transform(X_train, y_train,)
X_test = selector.transform(X_test)

modelo = RandomForestClassifier()
modelo.fit(X_train, y_train)
y_pred = modelo.predict(X_test)

print(accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

fpr, tpr, thresholds = roc_curve(y_test, y_pred)
roc_auc = auc(fpr, tpr)
print("AUC - Teste :", roc_auc)

0.5179282868525896
[[202  53]
 [189  58]]
              precision    recall  f1-score   support

           0       0.52      0.79      0.63       255
           1       0.52      0.23      0.32       247

    accuracy                           0.52       502
   macro avg       0.52      0.51      0.47       502
weighted avg       0.52      0.52      0.48       502

AUC - Teste : 0.5134873382551401


In [56]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, shuffle = False)

selector = SelectFromModel(RandomForestClassifier())
X_train = selector.fit_transform(X_train, y_train,)
X_test = selector.transform(X_test)

modelo = RandomForestClassifier()
modelo.fit(X_train, y_train)
y_pred = modelo.predict(X_test)

print(accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

fpr, tpr, thresholds = roc_curve(y_test, y_pred)
roc_auc = auc(fpr, tpr)
print("AUC - Teste :", roc_auc)

0.5179282868525896
[[192  63]
 [179  68]]
              precision    recall  f1-score   support

           0       0.52      0.75      0.61       255
           1       0.52      0.28      0.36       247

    accuracy                           0.52       502
   macro avg       0.52      0.51      0.49       502
weighted avg       0.52      0.52      0.49       502

AUC - Teste : 0.5141224100976423


### Não-Supervisionados

In [60]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_curve, auc, RocCurveDisplay
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA, KernelPCA, FactorAnalysis 

In [63]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, shuffle = False)

pca = PCA(n_components=0.90)
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)

modelo = RandomForestClassifier()
modelo.fit(X_train, y_train)
y_pred = modelo.predict(X_test)

print(accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

fpr, tpr, thresholds = roc_curve(y_test, y_pred)
roc_auc = auc(fpr, tpr)
print("AUC - Teste :", roc_auc)

0.5059760956175299
[[183  72]
 [176  71]]
              precision    recall  f1-score   support

           0       0.51      0.72      0.60       255
           1       0.50      0.29      0.36       247

    accuracy                           0.51       502
   macro avg       0.50      0.50      0.48       502
weighted avg       0.50      0.51      0.48       502

AUC - Teste : 0.5025482257680399


In [67]:
pca = KernelPCA()
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)

modelo = RandomForestClassifier()
modelo.fit(X_train, y_train)
y_pred = modelo.predict(X_test)

print(accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

fpr, tpr, thresholds = roc_curve(y_test, y_pred)
roc_auc = auc(fpr, tpr)
print("AUC - Teste :", roc_auc)

0.5059760956175299
[[182  73]
 [175  72]]
              precision    recall  f1-score   support

           0       0.51      0.71      0.59       255
           1       0.50      0.29      0.37       247

    accuracy                           0.51       502
   macro avg       0.50      0.50      0.48       502
weighted avg       0.50      0.51      0.48       502

AUC - Teste : 0.5026117329522902


In [71]:
pca = FactorAnalysis()
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)

modelo = RandomForestClassifier()
modelo.fit(X_train, y_train)
y_pred = modelo.predict(X_test)

print(accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

fpr, tpr, thresholds = roc_curve(y_test, y_pred)
roc_auc = auc(fpr, tpr)
print("AUC - Teste :", roc_auc)

0.5179282868525896
[[187  68]
 [174  73]]
              precision    recall  f1-score   support

           0       0.52      0.73      0.61       255
           1       0.52      0.30      0.38       247

    accuracy                           0.52       502
   macro avg       0.52      0.51      0.49       502
weighted avg       0.52      0.52      0.49       502

AUC - Teste : 0.5144399460188934
