# Bibliotecas

In [1]:
import numpy as np
import pandas as pd

from ml_core.normalization import z_score_normalization
from ml_core.data_process import *
from ml_core.regression.gradient_descent import GradientDescent
from sklearn.model_selection import train_test_split
from ml_core.metrics.error_metrics import *
from ml_core.classification import LogisticBinaryModel
from ml_core.classification import StatisticalClassifiers, GaussianDiscriminant, NaiveBayesGaussian
from ml_core.cross_validation import Kfold

# Leitura dos Dados

In [2]:
# Load the dataset. header=None makes pandas treat the first line as data
# and label the columns with integers instead of reading names from it.
df = pd.read_csv('data/breastcancer.csv', header=None)

In [3]:
# Preview the first rows to check that the file parsed as expected.
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,21,22,23,24,25,26,27,28,29,30
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0.0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0.0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0.0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0.0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0.0


# Preparar os dados para experimentos


In [4]:
# Convert the DataFrame into a plain NumPy array for the slicing and
# ml_core calls in the following cells.
arr = df.to_numpy()

In [5]:
# Split the raw matrix into features and target: the label is the last
# column and every column before it is a feature. Slicing with -1 instead
# of a hard-coded 30 gives the same result for the 31-column file and stays
# correct if the number of feature columns ever changes.
X = arr[:, :-1]
Y = arr[:, -1]

In [6]:
# Standardize the features one column at a time: each column of X is passed
# through z_score_normalization and stored at the same position in X_norm.
# NOTE(review): this runs on all rows before the train/test split further
# down, so held-out rows presumably contribute to the column statistics —
# confirm this is intended.
X_norm = np.empty_like(X)
for idx, col in enumerate(X.T):
    X_norm[:, idx] = z_score_normalization(col)

In [7]:
# Turn the 1-D label vector into an (n, 1) column. Despite the name, the
# labels are only reshaped here, not normalized.
y_norm = Y.reshape(-1, 1)

In [8]:
# add_ones_column is provided by one of the ml_core star imports.
# presumably it prepends a constant column of ones (bias term) to the
# normalized features — TODO confirm the column position.
X_ones = add_ones_column(X_norm)

In [9]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split identical between runs.
X_trn, X_tst, y_trn, y_tst = train_test_split(
    X_ones,
    y_norm,
    test_size=0.2,
    random_state=42,
)

# Classificação Binária

## Regressão Logística

In [10]:
# Starting LogisticBinaryModel for gradient descent, sized by the number of
# columns in X_trn (X_trn is a split of X_ones).
# presumably every initial coefficient is set to fill_value — TODO confirm.
first_model = LogisticBinaryModel.first_model(
    lenght=X_trn.shape[1],
    fill_value=1
)

In [11]:
# Configure gradient descent for 1000 epochs. A copy of first_model is
# passed, presumably so training does not modify the original object —
# TODO confirm.
gd = GradientDescent(ephocs=1000, first_model=first_model.__copy__())

In [12]:
# Fit on the training split; the object returned by fit is the trained
# model used for prediction in the next cell.
gd_model = gd.fit(X_trn, y_trn)

  0%|          | 0/1000 [00:00<?, ?it/s]

In [13]:
# Predict on the held-out split.
# presumably predict_ajust returns class labels rather than raw scores,
# since the result goes straight into the classification metrics —
# TODO confirm.
preds = gd_model.predict_ajust(X_tst)

In [14]:
# Accuracy, F1 and precision of the logistic model on the held-out split.
# NOTE(review): the recorded output pairs accuracy ~0.956 with F1 ~0.778 and
# precision ~0.642. Those three values look mutually inconsistent for a
# single binary confusion matrix — verify the ml_core `precision` /
# `f1_score` implementations and the shapes of y_tst (n, 1) versus preds.
acuracy(y_tst, preds), f1_score(y_tst, preds), precision(y_tst, preds)

(0.956140350877193, 0.7777777777777777, 0.6422018348623854)

## Análise Discriminante Gaussiana

In [15]:
# Inputs for the statistical classifiers: X_trn without its first column
# (presumably the ones column added for the logistic model — verify), and
# the training labels flattened to 1-D and passed through get_dummies.
X_trn_ = X_trn[:, 1:]
y_trn_dummies = get_dummies(y_trn.reshape(-1))

In [16]:
# Wrap GaussianDiscriminant in the generic StatisticalClassifiers estimator;
# the name is passed as-is (not called), so the wrapper receives it uninstantiated.
gaussian_discriminant = StatisticalClassifiers(model=GaussianDiscriminant)

In [17]:
# Fit on the training features (first column removed) and the get_dummies
# encoding of the training labels.
model_gaussian = gaussian_discriminant.fit(X_trn_, y_trn_dummies)

In [18]:
# Predict on the held-out split, dropping the first column exactly as was
# done for the training features. Rebinds `preds` from the logistic model.
preds = model_gaussian.predict(X_tst[:, 1:])

In [19]:
# Accuracy, F1 and precision of the Gaussian discriminant on the held-out split.
# NOTE(review): the recorded precision (~0.62) looks inconsistent with the
# recorded accuracy (~0.956) — verify the ml_core metric implementations.
acuracy(y_tst, preds), f1_score(y_tst, preds), precision(y_tst, preds)

(0.956140350877193, 0.7555555555555555, 0.6238532110091743)

## Naive Bayes

In [20]:
# Same StatisticalClassifiers wrapper, this time around NaiveBayesGaussian.
naive_bayes = StatisticalClassifiers(model=NaiveBayesGaussian)

In [21]:
# Fit on the same training inputs used for the Gaussian discriminant.
model_naive = naive_bayes.fit(X_trn_, y_trn_dummies)

In [22]:
# Predict on the held-out split without its first column. Rebinds `preds`
# from the previous model.
preds = model_naive.predict(X_tst[:, 1:])

In [23]:
# Accuracy, F1 and precision of the naive Bayes model on the held-out split.
# NOTE(review): the recorded precision (~0.64) looks inconsistent with the
# recorded accuracy (~0.965) — verify the ml_core metric implementations.
acuracy(y_tst, preds), f1_score(y_tst, preds), precision(y_tst, preds)

(0.9649122807017544, 0.7734806629834254, 0.6363636363636364)

# Experimentos com o K-Fold

## Regressão Logística

In [24]:
# Metrics reported by the k-fold run of the logistic model. Each entry takes
# two arguments and rounds the second one with np.around before delegating
# to the underlying metric.
# presumably x is the true labels and y the raw model outputs — TODO confirm
# the argument order Kfold uses when calling these.
lr_metrics = {
    'Acurácia': lambda x, y: acuracy(x, np.around(y)),
    'F1-Score': lambda x, y: f1_score(x, np.around(y)),
    'Precisão': lambda x, y: precision(x, np.around(y))
}

In [25]:
# Cross-validation helper with k=10, configured with the rounded metrics
# defined in lr_metrics.
k_fold = Kfold(k=10, metrics=lr_metrics, verbose=True)

In [26]:
# Fresh starting model and optimizer for the cross-validation run, built the
# same way as in the single-split logistic regression experiment above.
first_model = LogisticBinaryModel.first_model(
    lenght=X_trn.shape[1],
    fill_value=1
)

gd = GradientDescent(ephocs=1000, first_model=first_model.__copy__())

In [27]:
# Shape check of the training labels from the earlier split. Note that the
# k-fold call below is given X_ones / y_norm, not this split.
y_trn.shape

(455, 1)

In [28]:
# Cross-validate the gradient-descent estimator on the full dataset
# (features with the ones column, labels as an (n, 1) column). The return
# value is discarded; the summary below is presumably printed because the
# Kfold was created with verbose=True.
_ = k_fold(gd, X_ones, y_norm)

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

--------------REPORTANDO OS RESULTADOS OBTIDOS--------------
--------------ACURÁCIA--------------
Média: 0.9437969924812029
Desvio Padrão: 0.035746204353523804
--------------F1-SCORE--------------
Média: 0.7643965624756128
Desvio Padrão: 0.153530884398655
--------------PRECISÃO--------------
Média: 0.6485269100735249
Desvio Padrão: 0.17228154557955555


## Discriminante Gaussiano

In [29]:
# Encode the labels of the full dataset with get_dummies; the (n, 1) column
# is flattened to 1-D first.
y_dummies = get_dummies(y_norm.reshape(-1))

In [30]:
# Metrics for the statistical classifiers. Unlike lr_metrics, the metric
# functions are passed directly, with no rounding of the predictions.
gaussian_metrics = {
    'Acurácia': acuracy,
    'F1-Score': f1_score,
    'Precisão': precision
}

In [31]:
# Rebind k_fold to a new Kfold (k=10) that uses gaussian_metrics; this
# replaces the instance created for the logistic model.
k_fold = Kfold(k=10, metrics=gaussian_metrics, verbose=True)

In [32]:
# New, unfitted StatisticalClassifiers wrapper around GaussianDiscriminant
# for the cross-validation run.
gaussian_discriminant = StatisticalClassifiers(model=GaussianDiscriminant)

In [33]:
# Cross-validate on the normalized features (X_norm, without the ones
# column) and the get_dummies-encoded labels.
# NOTE(review): y_1d is given y_norm, which was reshaped to (n, 1) rather
# than 1-D — confirm this is the shape Kfold expects for that argument.
_ = k_fold(gaussian_discriminant, X_norm, y_dummies, y_1d=y_norm)

--------------REPORTANDO OS RESULTADOS OBTIDOS--------------
--------------ACURÁCIA--------------
Média: 0.9578320802005011
Desvio Padrão: 0.011600957988127433
--------------F1-SCORE--------------
Média: 0.7485236821761121
Desvio Padrão: 0.15384972531104124
--------------PRECISÃO--------------
Média: 0.6338142430595262
Desvio Padrão: 0.17426430886305966


## Naive Bayes

In [34]:
# New, unfitted StatisticalClassifiers wrapper around NaiveBayesGaussian for
# the cross-validation run.
naive_bayes = StatisticalClassifiers(model=NaiveBayesGaussian)

In [35]:
# Same cross-validation setup as for the Gaussian discriminant (k_fold still
# holds the Kfold configured with gaussian_metrics), applied to naive Bayes.
# NOTE(review): y_1d is given y_norm, which has shape (n, 1) — confirm this
# is what Kfold expects.
_ = k_fold(naive_bayes, X_norm, y_dummies, y_1d=y_norm)

--------------REPORTANDO OS RESULTADOS OBTIDOS--------------
--------------ACURÁCIA--------------
Média: 0.9262218045112782
Desvio Padrão: 0.03741164269199176
--------------F1-SCORE--------------
Média: 0.7469782521666168
Desvio Padrão: 0.1503770375766516
--------------PRECISÃO--------------
Média: 0.6396399157733786
Desvio Padrão: 0.1722279442097974
