In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import PowerTransformer, QuantileTransformer

from naive_bayes import train, eval

# Read the diabetes dataset from the working directory and preview its first
# five rows as the cell output.
data = pd.read_csv(filepath_or_buffer="diabetes.csv")
data.head(n=5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [14]:
# Reproducible 70/30 hold-out split: draw 70% of the rows for training with a
# fixed seed, then keep every row whose index label was not drawn as the
# validation set.
train_data = data.sample(frac=0.7, random_state=42)
val_data = data.loc[~data.index.isin(train_data.index)]

In [15]:
# First determine the best scaling method. Because the classifier is
# distribution-based, only non-linear transformers are compared, alongside an
# unscaled baseline.
scalers = {
    'No Scaling': None,
    'PowerTransformer': PowerTransformer(),
    'QuantileTransformerUniform': QuantileTransformer(n_quantiles=5),
    'QuantileTransformerNormal3': QuantileTransformer(n_quantiles=3, output_distribution='normal'),
    'QuantileTransformerNormal5': QuantileTransformer(n_quantiles=5, output_distribution='normal')
}

# Every column except the last one is transformed; the last column is passed
# through unchanged.
feature_cols = list(train_data.columns[:-1])

# Maps scaler name -> value returned by eval on the validation split.
results = {}
for scaler_name, scaler in scalers.items():
    # Work on copies so train_data / val_data stay pristine for later cells.
    scaled_train_data = train_data.copy()
    scaled_val_data = val_data.copy()

    if scaler is not None:
        # Fit on the training split only and reuse the fitted transformer on
        # the validation split, so no validation statistics leak into the fit.
        # Assigning by column label replaces the columns outright; writing the
        # float output through .iloc would set it in place into columns that
        # may be integer-typed, which pandas deprecates (incompatible-dtype
        # setitem) and will eventually reject.
        scaled_train_data[feature_cols] = scaler.fit_transform(train_data[feature_cols])
        scaled_val_data[feature_cols] = scaler.transform(val_data[feature_cols])

    # Same train/evaluate path for every entry; with no scaler the untouched
    # copies are used as-is.
    class_stats = train(scaled_train_data)
    acc = eval(class_stats, scaled_val_data)
    results[scaler_name] = acc

for scaler_name, acc in results.items():
    print(f"Accuracy with {scaler_name}: {acc}")

Accuracy with No Scaling: 0.7478260869565218
Accuracy with PowerTransformer: 0.7217391304347827
Accuracy with QuantileTransformerUniform: 0.7086956521739131
Accuracy with QuantileTransformerNormal3: 0.6304347826086957
Accuracy with QuantileTransformerNormal5: 0.6956521739130435


In [25]:
# Sweep the var_smoothing argument of eval on the unscaled data.
# The range covers ten orders of magnitude, so the grid is spaced
# geometrically: a linear grid between these endpoints would place nine of the
# ten points between roughly 1e-6 and 1e-5 and leave the rest unexplored.
param_results = {}
var_smoothing_grid = np.geomspace(1e-5, 1e-15, 10)
n = len(var_smoothing_grid)
# train is called without var_smoothing, so fit once and reuse the statistics
# for every grid point.
class_stats = train(train_data)
for i, var_smoothing in enumerate(var_smoothing_grid):
    acc = eval(class_stats, val_data, var_smoothing=var_smoothing)
    # Keys are grid positions; var_smoothing_grid[i] gives the value tested.
    param_results[i] = acc

In [26]:
# Show the value eval returned for each grid position (keys are indices into
# var_smoothing_grid).
param_results

{0: 0.7478260869565218,
 1: 0.7478260869565218,
 2: 0.7478260869565218,
 3: 0.7478260869565218,
 4: 0.7478260869565218,
 5: 0.7478260869565218,
 6: 0.7478260869565218,
 7: 0.7478260869565218,
 8: 0.7478260869565218,
 9: 0.7478260869565218}