# Data Exploration

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# Target dtype of every numeric column, listed in the order used for analysis.
numeric_dtypes = {
    'Sedentary_hours_daily': float,
    'Regular_fiber_diet': float,
    'Age': float,
    'Est_avg_calorie_intake': int,
    'Main_meals_daily': int,
    'Height': float,
    'Water_daily': int,
    'Weight': float,
    'Physical_activity_level': int,
    'Technology_time_use': int,
}
categorical_columns = [
    'Transportation', 'Diagnostic_in_family_history', 'High_calorie_diet',
    'Alcohol', 'Snacks', 'Smoker', 'Calorie_monitoring', 'Gender',
]

# decimal=',' makes pandas parse numbers written with a comma decimal mark.
data = pd.read_csv('data.csv', decimal=',').astype(numeric_dtypes)

# Column subsets reused by the exploration and preprocessing cells.
numeric_attributes = data[list(numeric_dtypes)]
categorical_attributes = data[categorical_columns]

In [None]:
# Class balance: number of rows per value of the 'Diagnostic' column
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(x='Diagnostic', data=data, ax=ax)
ax.set_title('Class Frequency in the Dataset')
ax.set_xlabel('Diagnosis')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Numerical Attributes
def mad(x):
    """Mean absolute deviation: mean distance of the values in ``x`` from their mean."""
    return np.mean(np.abs(x - np.mean(x)))


def mead(x):
    """Median absolute deviation: median distance of the values in ``x`` from their median."""
    return np.median(np.abs(x - np.median(x)))

for column in numeric_attributes.columns:
    values = numeric_attributes[column]
    lowest, highest = values.min(), values.max()
    q1, q3 = values.quantile(0.25), values.quantile(0.75)

    # (label, value) pairs, printed one per line in this order.
    summary = [
        ("Mean:", values.mean()),
        ("Std Deviation:", values.std()),
        ("Mean Abs Deviation:", mad(values)),
        ("Min Val:", lowest),
        ("Max Val:", highest),
        ("Amplitude:", highest - lowest),
        ("Median:", values.median()),
        ("Median Abs Deviation:", mead(values)),
        ("Interquantil Interval:", q3 - q1),
    ]

    print("Statistics for column", column)
    for label, value in summary:
        print(label, value)
    print()

# DataFrame.hist creates its own figure, so no plt.figure() call is made
# beforehand: it would only leave an extra, empty figure in the cell output.
hist_axes = numeric_attributes.hist(figsize=(12, 10))
# Put the title on the figure that actually holds the histograms.
hist_figure = np.ravel(hist_axes)[0].get_figure()
hist_figure.suptitle('Numerical Attributes Histogram', fontsize=16)
plt.show()

# List the distinct values found in each categorical column.
for column, values in categorical_attributes.items():
    print("Statistics for column", column)
    print("Unique values:", values.unique())
    print()

# One count plot per categorical column, laid out on a 3x3 grid.
fig = plt.figure(figsize=(12, 10))
for position, column in enumerate(categorical_attributes.columns, start=1):
    ax = fig.add_subplot(3, 3, position)
    sns.countplot(x=column, data=data, ax=ax)
    # Tilt the category labels so longer names do not overlap.
    plt.setp(ax.get_xticklabels(), rotation=45)
    ax.set_title(column)
    ax.set_xlabel("")
    ax.set_ylabel("Count")

fig.tight_layout()
plt.show()

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode categorical attributes (and the target) as integer codes.
# Each column gets its own fitted LabelEncoder, stored in `label_encoders`,
# so the integer codes can later be mapped back to the original labels with
# label_encoders[column].inverse_transform(...). Re-fitting a single shared
# encoder would discard every mapping except the last one.
label_encoders = {}
for column in list(categorical_attributes.columns) + ['Diagnostic']:
    encoder = LabelEncoder()
    data[column] = encoder.fit_transform(data[column])
    label_encoders[column] = encoder
# After the loop `encoder` is the encoder fitted on 'Diagnostic'.

In [None]:
# Correlation matrix
# NOTE: DataFrame.corr() returns Pearson correlation coefficients (range
# [-1, 1]), not covariances. The variable names `covariance_matrix` and
# `class_cov` are kept so that any other cell using them keeps working; the
# plot labels now say what is actually shown.
covariance_matrix = data.corr()
# Hide the upper triangle and the diagonal: the matrix is symmetric, so only
# the lower triangle carries information.
mask = np.triu(np.ones_like(covariance_matrix, dtype=bool))
covariance_matrix = covariance_matrix.mask(mask)
plt.figure(figsize=(10, 8))
# Fixed symmetric colour limits keep the neutral colour at zero correlation.
sns.heatmap(covariance_matrix, annot=True, cmap='coolwarm', fmt='.2f', vmin=-1, vmax=1)
plt.title('Correlation Matrix')
plt.show()

# Correlation of every attribute with the target. The target's correlation
# with itself is always 1 and is dropped so it does not dominate the chart.
class_cov = data.corrwith(data['Diagnostic']).drop('Diagnostic')
plt.figure(figsize=(10, 8))
class_cov.plot(kind='bar')
plt.title('Correlation with Diagnostic')
plt.xlabel('Attribute')
plt.ylabel('Correlation')
plt.show()

# Learning

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
import xgboost as xgb
from sklearn.svm import SVC
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.feature_selection import SelectPercentile
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from scipy.stats import zscore
from sklearn.base import clone

In [None]:
# remove outliers
# Values more than 3 standard deviations away from their column mean are
# replaced, in `data` itself, by the missing-value marker so that the
# imputation step that follows fills them in. Assigning into
# `numeric_attributes` (a copy taken when the data was loaded) would leave
# `data` untouched.
# NOTE(review): -1 is taken as the missing-value marker because the imputation
# cell uses missing_values=-1 - confirm no numeric column can legitimately be -1.
MISSING_MARKER = -1
numeric_columns = list(numeric_attributes.columns)

# Ignore already-missing entries when estimating each column's mean and spread.
observed = data[numeric_columns].where(data[numeric_columns] != MISSING_MARKER)
# Population standard deviation (ddof=0), the same convention as scipy.stats.zscore.
z_scores = (observed - observed.mean()) / observed.std(ddof=0)

data[numeric_columns] = data[numeric_columns].mask(z_scores.abs() > 3, MISSING_MARKER)
# Keep the numeric snapshot in step with `data`.
numeric_attributes = data[numeric_columns].copy()

In [None]:
# treat missing values
# IterativeImputer models each feature from the others, so it has to see all
# numeric columns at once; fitted on one column at a time it can only return
# the initial (mean) imputation.
numeric_columns = list(numeric_attributes.columns)

# -1 is the missing-value marker used for these columns. Convert it to NaN so
# that marker entries and real NaNs are imputed alike.
numeric_values = data[numeric_columns].astype(float).replace(-1, np.nan)

imputer = IterativeImputer(
    max_iter=10,
    random_state=42,
    # Keep imputed values inside the range observed for each column.
    min_value=numeric_values.min().to_numpy(),
    max_value=numeric_values.max().to_numpy(),
)
data[numeric_columns] = imputer.fit_transform(numeric_values)

In [None]:
# feature selection

print("Initial features:")
print(data.columns)

# Candidate features: every column except the target.
features = data.drop(columns=['Diagnostic'])

# Keep the 80% of features with the highest univariate score.
selector = SelectPercentile(percentile=80)
selector.fit(features, data['Diagnostic'])

# The support indices are positions within `features` (target removed), so
# they must index that frame. Indexing `data` with them would shift columns
# whenever 'Diagnostic' is not the last column, and could select the target.
selected_indices = selector.get_support(indices=True)
X_selected = features.iloc[:, selected_indices]

print("Features after selection:")
selector.get_feature_names_out()

In [None]:
# standardize data

# Fit the scaler on the numeric columns, then write the scaled values back.
numeric_columns = numeric_attributes.columns
scaler = StandardScaler()
scaler.fit(data[numeric_columns])
data[numeric_columns] = scaler.transform(data[numeric_columns])

In [None]:
# X_selected was sliced out of `data` before the numeric columns were
# standardized, so it still holds the unscaled values. Take the selected
# columns from `data` again so the models train on the standardized features.
X_train, X_test, Y_train, Y_test = train_test_split(
    data[X_selected.columns], data['Diagnostic'], test_size=0.2, random_state=42
)

In [None]:
# hyperparameters

# Random forest and extra trees are searched over the same grid.
param_grid_rf = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 10, 20],
    max_features=['sqrt', 'log2'],
)
param_grid_et = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 10, 20],
    max_features=['sqrt', 'log2'],
)
# Gradient boosting: shallower trees plus a learning-rate sweep.
param_grid_gb = dict(
    n_estimators=[100, 200, 300],
    max_depth=[3, 5, 7],
    learning_rate=[0.1, 0.05, 0.01],
)
# Support vector machine: kernel type and regularization strength C.
param_grid_svm = dict(
    kernel=['linear', 'poly', 'rbf'],
    C=[0.1, 1, 10],
)

In [None]:
# Each entry pairs an estimator with the grid searched for it. The tree-based
# estimators are randomized, so they get a fixed random_state to make the
# search reproducible from run to run.
models = {'RandomForestClassifier': (RandomForestClassifier(random_state=42), param_grid_rf),
          'ExtraTreesClassifier': (ExtraTreesClassifier(random_state=42), param_grid_et),
          'XGBClassifier': (xgb.XGBClassifier(random_state=42), param_grid_gb),
          'SVC': (SVC(), param_grid_svm)}

# model name -> best parameters, test-set metrics and confusion matrix
results_dict = {}

for model_name, (model, param_grid) in models.items():
    # 5-fold cross-validated grid search on the training split.
    grid_search = GridSearchCV(model, param_grid, cv=5)
    grid_search.fit(X_train, Y_train)

    # Evaluate the refitted best estimator on the held-out test split.
    best_model = grid_search.best_estimator_
    predictions = best_model.predict(X_test)

    # accuracy is a single number; with average=None the other metrics are
    # arrays holding one value per class.
    accuracy = accuracy_score(Y_test, predictions)
    precision = precision_score(Y_test, predictions, average=None)
    recall = recall_score(Y_test, predictions, average=None)
    f1 = f1_score(Y_test, predictions, average=None)
    cm = confusion_matrix(Y_test, predictions)

    # np.mean / np.var accept both plain Python floats and NumPy values, so
    # this works whichever type accuracy_score returns (a plain float has no
    # .mean()/.var() methods). The variance of the single accuracy value is 0.
    results_dict[model_name] = {
        'best_params': grid_search.best_params_,
        'accuracy_mean': np.mean(accuracy),
        'accuracy_var': np.var(accuracy),
        'precision_mean': np.mean(precision),
        'precision_var': np.var(precision),
        'recall_mean': np.mean(recall),
        'recall_var': np.var(recall),
        'f1_mean': np.mean(f1),
        'f1_var': np.var(f1),
        'confusion_matrix': cm
    }


In [None]:
from IPython.display import display

# One row per model, one column per recorded result.
results_df = pd.DataFrame(results_dict).T

# Best parameters are printed separately below; they do not fit a numeric table.
results_df = results_df.drop(columns='best_params')

# Format the metric columns to two decimals and wrap each column's largest
# value in ** markers.
for col in results_df.columns:
    if 'mean' in col or 'var' in col:
        max_value = results_df[col].max()
        results_df[col] = results_df[col].apply(lambda x: f"**{x:.2f}**" if x == max_value else f"{x:.2f}")

display(results_df)

# results_dict maps each model name to its full result record, so the best
# parameters have to be read from the record's 'best_params' entry.
for model_name, model_results in results_dict.items():
    print(f"\nBest Parameters for {model_name}:")
    print(model_results['best_params'])

# Draw one annotated heatmap per model from its stored confusion matrix.
for model_name, cm in results_df['confusion_matrix'].items():
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
    ax.set_title(f'Confusion Matrix for {model_name}')
    ax.set_xlabel('Predicted Label')
    ax.set_ylabel('True Label')