# Data Exploration

In [None]:
# imports

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import numpy as np

In [None]:
# Load the raw table; the CSV uses ',' as the decimal separator.
data = pd.read_csv('data.csv', decimal=',')

# Target dtype of every numeric attribute. The key order also defines the
# column order of `numeric_attributes` below.
_numeric_dtypes = {
    'Sedentary_hours_daily': float,
    'Regular_fiber_diet': float,
    'Age': float,
    'Est_avg_calorie_intake': int,
    'Main_meals_daily': int,
    'Height': float,
    'Water_daily': int,
    'Weight': float,
    'Physical_activity_level': int,
    'Technology_time_use': int,
}
data = data.astype(_numeric_dtypes)

# Column subsets used by the exploration cells.
numeric_attributes = data[list(_numeric_dtypes)]
categorical_attributes = data[[
    'Transportation',
    'Diagnostic_in_family_history',
    'High_calorie_diet',
    'Alcohol',
    'Snacks',
    'Smoker',
    'Calorie_monitoring',
    'Gender',
]]

In [None]:
# Class balance: number of rows per value of the 'Diagnostic' column.

fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(x='Diagnostic', data=data, ax=ax)
ax.set_title('Class Frequency in the Dataset')
ax.set_xlabel('Diagnosis')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Numerical Attributes

def mad(x):
    """Mean absolute deviation: mean of |x - mean(x)|."""
    return np.mean(np.abs(x - np.mean(x)))


def mead(x):
    """Median absolute deviation: median of |x - median(x)|."""
    return np.median(np.abs(x - np.median(x)))


# Location and spread summary for every numeric column.
for column in numeric_attributes.columns:
    values = numeric_attributes[column]
    print("Statistics for column", column)
    print("Mean:", values.mean())
    print("Std Deviation:", values.std())
    print("Mean Abs Deviation:", mad(values))
    print("Min Val:", values.min())
    print("Max Val:", values.max())
    print("Amplitude:", values.max() - values.min())
    print("Median:", values.median())
    print("Median Abs Deviation:", mead(values))
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    print("Interquantil Interval:", q3 - q1)
    print()

# DataFrame.hist creates its own figure, so no plt.figure() call is made
# beforehand (it would only render an extra, empty figure).
numeric_attributes.hist(figsize=(12, 10))
plt.suptitle('Numerical Attributes Histogram', fontsize=16)
plt.show()

# Categorical Attributes: distinct values per column.
for column in categorical_attributes.columns:
    print("Statistics for column", column)
    print("Unique values:", categorical_attributes[column].unique())
    print()

# One count plot per categorical column, laid out on a 3x3 grid.
plt.figure(figsize=(12, 10))
for i, column in enumerate(categorical_attributes.columns, 1):
    plt.subplot(3, 3, i)
    sns.countplot(x=column, data=data)
    plt.title(column)
    plt.xlabel("")
    plt.ylabel("Count")

plt.tight_layout()
plt.show()

In [None]:
# Correlation matrix
#
# DataFrame.corr() returns pairwise correlation coefficients (Pearson by
# default), not covariances, so the heatmap is labelled accordingly.
# `covariance_matrix` is kept as an alias of the same object so that any
# later cell using the old name keeps working.

correlation_matrix = numeric_attributes.corr()
covariance_matrix = correlation_matrix
plt.figure(figsize=(6, 4))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
plt.title('Correlation Matrix')
plt.show()

# sns.pairplot builds its own figure, so no plt.figure() call precedes it
# (that would only display an empty figure). The title is set with
# plt.suptitle because plt.title would only label the last subplot of the
# grid; y > 1 places it above the top row of axes.
sns.pairplot(data, hue='Diagnostic', diag_kind='kde')
plt.suptitle('Pair Diagram and Class Distribution', y=1.02)
plt.show()

# Learning

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
import xgboost as xgb
from sklearn.svm import SVC
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer
from sklearn.feature_selection import VarianceThreshold, SelectPercentile
from sklearn.preprocessing import StandardScaler

In [None]:
# standardize data

if data.select_dtypes(include=['float64', 'int64']).shape[1] > 0:
    scaler = StandardScaler()
    data[data.select_dtypes(include=['float64', 'int64']).columns] = scaler.fit_transform(data.select_dtypes(include=['float64', 'int64']))

In [None]:
# treat missing values

imputers = [SimpleImputer(missing_values=-1, strategy='mean'), IterativeImputer(max_iter=10)]
data['Weight'] = imputers[0].fit_transform(data[['Weight']])

In [None]:
# feature selection

selectors = [VarianceThreshold(), SelectPercentile()]

# The target column must not be part of the feature matrix, otherwise the
# label leaks into `data_reduced`.
features = data.drop(columns=['Diagnostic'])

# One-hot encode non-numeric columns so the selector and the estimators
# receive a purely numeric matrix; numeric columns pass through unchanged.
features_encoded = pd.get_dummies(features, dtype=float)

# VarianceThreshold with default settings drops zero-variance columns.
data_reduced = selectors[0].fit_transform(features_encoded)

In [None]:
# hyperparameters
#
# Search grids consumed by GridSearchCV, one per model family.
# 'auto' is intentionally absent from max_features: for the forest
# classifiers it was an alias of 'sqrt' (a duplicate candidate) and it is
# rejected by scikit-learn >= 1.3.

# Random forest
param_grid_rf = {'n_estimators': [100, 200, 300],
                 'max_depth': [None, 10, 20],
                 'max_features': ['sqrt', 'log2']}
# Extra trees (same search space as the random forest)
param_grid_et = {'n_estimators': [100, 200, 300],
                 'max_depth': [None, 10, 20],
                 'max_features': ['sqrt', 'log2']}
# Gradient boosting (XGBoost)
param_grid_gb = {'n_estimators': [100, 200, 300],
                 'max_depth': [3, 5, 7],
                 'learning_rate': [0.1, 0.05, 0.01]}
# Support vector machine
param_grid_svm = {'kernel': ['linear', 'poly', 'rbf'],
                  'C': [0.1, 1, 10]}

In [None]:
# Grid-search every candidate model and report its best hyperparameters.

RANDOM_STATE = 42  # fixed seed so repeated runs select the same parameters

models = [RandomForestClassifier(random_state=RANDOM_STATE),
          ExtraTreesClassifier(random_state=RANDOM_STATE),
          xgb.XGBClassifier(random_state=RANDOM_STATE),
          SVC()]
# Grids listed in the same order as `models`; pairing them explicitly
# replaces the isinstance chain, which could silently reuse a stale
# `new_model` for an unmatched estimator.
param_grids = [param_grid_rf, param_grid_et, param_grid_gb, param_grid_svm]

# Encode the class labels as integer codes 0..n_classes-1 (recent xgboost
# releases reject any other label encoding). `class_labels[code]` maps a
# predicted code back to the original 'Diagnostic' value.
target, class_labels = pd.factorize(data['Diagnostic'], sort=True)

# Fitted searches keyed by estimator class name, so no result is lost when
# the loop moves on to the next model.
grid_searches = {}

for model, param_grid in zip(models, param_grids):
    new_model = GridSearchCV(model, param_grid, cv=5)
    new_model.fit(data_reduced, target)
    # A GridSearchCV instance has no __name__; report the wrapped
    # estimator's class name instead.
    model_name = type(model).__name__
    grid_searches[model_name] = new_model
    print(f"{model_name}: {new_model.best_params_}")