In [2]:
from ucimlrepo import fetch_ucirepo 
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, f1_score, precision_score, accuracy_score
from sklearn.metrics import confusion_matrix

import seaborn as sns
from matplotlib import pyplot as plt

from imblearn.over_sampling import SMOTE

# Display configuration: show every column of a frame, cap displayed rows at 50.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 50)

### THORACIC SURGERY

In [None]:
# Load the thoracic surgery dataset from the local CSV copy.
thoracic_surgery_data = pd.read_csv("data/thoracic_surgery/thoracic_surgery.csv")

# Preview the first rows.
thoracic_surgery_data.head()

In [9]:
def preprocess_thoracic(data, pre5_max=15):
    """Encode the thoracic surgery table as an all-float frame.

    Steps, in order: drop the ``id`` column, record which columns are numeric,
    one-hot encode ``DGN`` (first level dropped), map the ``PRE6`` / ``PRE14``
    codes to integers, turn every column containing ``'T'`` / ``'F'`` flags
    (``Risk1Yr`` included) into 1 / 0, remove rows whose ``PRE5`` exceeds
    ``pre5_max`` and cast the result to float. The frame passed in is left
    untouched.

    Parameters
    ----------
    data : pandas.DataFrame
        Table containing at least the columns ``id``, ``DGN``, ``PRE5``,
        ``PRE6`` and ``PRE14``.
    pre5_max : float, default 15
        Rows with ``PRE5`` strictly greater than this value are dropped.

    Returns
    -------
    tuple
        ``(encoded, numerical_cols, non_numerical_cols)``. The two name lists
        describe the frame after ``id`` is dropped but before any encoding.

    Notes
    -----
    ``PRE6`` / ``PRE14`` values missing from the code tables become NaN.
    """
    # drop the ID column (returns a new frame, so the caller's data is not modified)
    data = data.drop(columns=["id"])

    # Classify by actual numeric-ness rather than "not object", so string or
    # categorical dtypes are not reported as numerical.
    numerical_cols = []
    non_numerical_cols = []
    for col in data.columns:
        if pd.api.types.is_numeric_dtype(data[col]):
            numerical_cols.append(col)
        else:
            non_numerical_cols.append(col)

    # print the indices of the numerical and non-numerical columns
    print("Numerical columns indeces: " , [data.columns.get_loc(col) for col in numerical_cols])
    print("Non-numerical columns indeces: " , [data.columns.get_loc(col) for col in non_numerical_cols])

    print("Numerical columns: ", numerical_cols)
    print("Non-numerical columns: ", non_numerical_cols)

    pre_six = {"PRZ0": 0, "PRZ1": 1, "PRZ2": 2}
    pre_14 = {"OC11": 0, "OC12": 1, "OC13": 2, "OC14": 3}

    # get_dummies removes DGN and appends its indicator columns at the end.
    data = pd.get_dummies(data, columns=["DGN"], drop_first=True)

    # Replace whole columns instead of writing ints into object columns via .loc.
    data["PRE6"] = data["PRE6"].map(pre_six)
    data["PRE14"] = data["PRE14"].map(pre_14)

    # Any column holding 'T'/'F' values is a flag column; this also covers the
    # Risk1Yr target. Values other than 'T'/'F' are passed through unchanged.
    flag_codes = {'T': 1, 'F': 0}
    flag_columns = data.columns[data.isin(['T', 'F']).any()]
    for col in flag_columns:
        data[col] = data[col].map(lambda value: flag_codes.get(value, value))

    # Remove PRE5 outliers; NaN compares False and is therefore kept.
    outlier_mask = data["PRE5"] > pre5_max
    data = data.drop(index=data.loc[outlier_mask].index)

    # Indicator, integer-coded and flag columns all end up as floats.
    data = data.astype(float)

    return data, numerical_cols, non_numerical_cols

In [None]:
# Rebinds thoracic_surgery_data to the encoded frame, so the raw frame is no
# longer reachable under this name after the cell has run.
thoracic_surgery_data, numerical_cols, non_numerical_cols = preprocess_thoracic(thoracic_surgery_data)

In [11]:
# Persist the encoded frame.
# NOTE(review): this is the same path the loading cell reads, so the raw CSV is
# overwritten; on a second run preprocess_thoracic receives a file without the
# "id" / "DGN" columns it expects — confirm the overwrite is intended.

thoracic_surgery_data.to_csv("data/thoracic_surgery/thoracic_surgery.csv", index=False)

In [36]:
import json

# Column-role description for the thoracic surgery CSV, stored as JSON.
# NOTE(review): the positions below treat column 16 as the target, while
# pd.get_dummies in preprocess_thoracic appends the DGN indicator columns at the
# end of the frame — verify these indices against the saved CSV.
metadata = dict(
    name="thoracic_surgery",
    task_type="binclass",  # binclass or regression or multiclass
    header="infer",
    column_names=None,
    num_col_idx=[1, 2, 15],  # positions of numerical columns
    cat_col_idx=[0, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 16],  # positions of categorical columns
    target_col_idx=[16],  # positions of the target columns (for MLE)
    file_type="csv",
    data_path="data/thoracic_surgery/thoracic_surgery.csv",
    test_path=None,
)

with open("data/Info/thoracic_surgery.json", "w") as json_file:
    json_file.write(json.dumps(metadata, indent=4))

In [5]:
# Split the frame into the feature matrix and the Risk1Yr target column.
thoracic_X = thoracic_surgery_data.drop(columns=["Risk1Yr"])
thoracic_y = thoracic_surgery_data["Risk1Yr"]

In [None]:
# Inspect the first 20 rows.
thoracic_surgery_data.head(20)

In [None]:
# One histogram per column; p holds the axes returned by hist().
p = thoracic_surgery_data.hist(figsize = (20,20))

In [None]:
# Number of missing values in each feature column.
thoracic_X.isna().sum()

In [None]:
# Annotated heatmap of the pairwise correlations between all columns.
plt.figure(figsize=(20, 20))
sns.heatmap(thoracic_surgery_data.corr(), annot=True, fmt='.2f', cmap='coolwarm')

In [None]:
# Class balance of the target.
thoracic_y.value_counts()

In [None]:
# Baseline: XGBoost on an 80/20 split of the thoracic surgery data.
X_train, X_test, y_train, y_test = train_test_split(thoracic_X, thoracic_y, test_size=0.2, random_state=42)

model = XGBClassifier()

model.fit(X_train, y_train)

# TEST
# Hard class labels feed the thresholded metrics (F1, precision, accuracy,
# confusion matrix).
pred = model.predict(X_test)
# ROC AUC measures ranking quality, so it is scored on the positive-class
# probability; scoring it on 0/1 labels reduces the curve to a single point.
proba = model.predict_proba(X_test)[:, 1]

# AUROC, F1, PRECISION, ACCURACY
roc_auc = roc_auc_score(y_test, proba)
f1 = f1_score(y_test, pred)
precision = precision_score(y_test, pred)
accuracy = accuracy_score(y_test, pred)

print(f"ROC_AUC: {roc_auc}")
print(f"F1: {f1}")
print(f"PRECISION: {precision}")
print(f"ACCURACY: {accuracy}")

# Rows are true classes, columns are predicted classes.
conf_matrix = confusion_matrix(y_test, pred)

print(conf_matrix)

print(f"True Positive: {conf_matrix[1][1]}")
print(f"True Negative: {conf_matrix[0][0]}")
print(f"False Positive: {conf_matrix[0][1]}")
print(f"False Negative: {conf_matrix[1][0]}")

In [None]:
# Same split as the baseline cell, but the training part is oversampled with
# SMOTE; the test part is left untouched.
X_train, X_test, y_train, y_test = train_test_split(thoracic_X, thoracic_y, test_size=0.2, random_state=42)

smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Training the model on the resampled data
model = XGBClassifier()
model.fit(X_train_resampled, y_train_resampled)

# Testing the model
# Hard class labels feed the thresholded metrics; ROC AUC is scored on the
# positive-class probability because it is a ranking metric.
pred = model.predict(X_test)
proba = model.predict_proba(X_test)[:, 1]

# Metrics
roc_auc = roc_auc_score(y_test, proba)
f1 = f1_score(y_test, pred)
precision = precision_score(y_test, pred)
accuracy = accuracy_score(y_test, pred)

print(f"ROC_AUC: {roc_auc}")
print(f"F1: {f1}")
print(f"PRECISION: {precision}")
print(f"ACCURACY: {accuracy}")

# Confusion Matrix (rows: true classes, columns: predicted classes)
conf_matrix = confusion_matrix(y_test, pred)

print(conf_matrix)

print(f"True Positive: {conf_matrix[1][1]}")
print(f"True Negative: {conf_matrix[0][0]}")
print(f"False Positive: {conf_matrix[0][1]}")
print(f"False Negative: {conf_matrix[1][0]}")

### PIMA

In [17]:
# Load the diabetes dataset from the local CSV.
prima_df = pd.read_csv('data/diabetes/diabetes.csv')

In [None]:
# Number of missing values in each column.
prima_df.isna().sum()

In [None]:
# Annotated heatmap of the pairwise correlations between all columns.
plt.figure(figsize=(15, 15))
sns.heatmap(prima_df.corr(), annot=True, fmt='.2f', cmap='coolwarm')

In [None]:
# Preview the first rows.
prima_df.head()

In [None]:
# Partition the columns by dtype: object columns count as non-numerical,
# everything else as numerical.
numerical_pima_cols = []
non_numerical_pima_cols = []
for col in prima_df.columns:
    if prima_df[col].dtype == 'object':
        non_numerical_pima_cols.append(col)
    else:
        numerical_pima_cols.append(col)

# Report the column positions of both groups.
numerical_pima_positions = [prima_df.columns.get_loc(col) for col in numerical_pima_cols]
non_numerical_pima_positions = [prima_df.columns.get_loc(col) for col in non_numerical_pima_cols]

print("Numerical columns indeces: " , numerical_pima_positions)
print("Non-numerical columns indeces: " , non_numerical_pima_positions)

In [None]:
# One histogram per column; p holds the axes returned by hist().
p = prima_df.hist(figsize = (20,20))

In [24]:
import json

# Column-role description for the diabetes CSV, stored as JSON.
# Column 8 is listed both as categorical and as the target.
metadata = dict(
    name="diabetes",
    task_type="binclass",  # binclass or regression or multiclass
    header="infer",
    column_names=None,
    num_col_idx=[0, 1, 2, 3, 4, 5, 6, 7],  # positions of numerical columns
    cat_col_idx=[8],  # positions of categorical columns
    target_col_idx=[8],  # positions of the target columns (for MLE)
    file_type="csv",
    data_path="data/diabetes/diabetes.csv",
    test_path=None,
)

with open("data/Info/diabetes.json", "w") as json_file:
    json_file.write(json.dumps(metadata, indent=4))

In [None]:
# Features / target for the diabetes data. No earlier cell defines prima_X or
# prima_y, so they are derived here from prima_df. The metadata cell lists
# columns 0-7 as features and column 8 (the last one) as the target.
prima_X = prima_df.iloc[:, :-1]
prima_y = prima_df.iloc[:, -1]

X_train, X_test, y_train, y_test = train_test_split(prima_X, prima_y, test_size=0.2, random_state=42)

model = XGBClassifier()

model.fit(X_train, y_train)

# TEST
# Hard class labels feed the thresholded metrics; ROC AUC is scored on the
# positive-class probability because it is a ranking metric.
pred = model.predict(X_test)
proba = model.predict_proba(X_test)[:, 1]

# AUROC, F1, PRECISION, ACCURACY
roc_auc = roc_auc_score(y_test, proba)
f1 = f1_score(y_test, pred)
precision = precision_score(y_test, pred)
accuracy = accuracy_score(y_test, pred)

print(f"ROC_AUC: {roc_auc}")
print(f"F1: {f1}")
print(f"PRECISION: {precision}")
print(f"ACCURACY: {accuracy}")

# Rows are true classes, columns are predicted classes.
conf_matrix = confusion_matrix(y_test, pred)

print(conf_matrix)

### CERVICAL CANCER

In [25]:
  # fetch dataset 
#cervical_cancer_risk_factors = fetch_ucirepo(id=383) 
  
# Load the cervical cancer data from the local CSV.
cervical = pd.read_csv("data/cervical/cervical.csv")

In [None]:
# Build cervical_data from the loaded frame and preview its first rows.
cervical_data = pd.DataFrame(cervical)

cervical_data.head()

In [32]:
# Replace every '?' cell with NaN so that isna() counts it as missing.
import numpy as np
cervical_data = cervical_data.replace('?', np.nan)

In [None]:
# Number of missing values in each column.
cervical_data.isna().sum()

In [None]:
# Annotated heatmap of the pairwise correlations between all columns.
plt.figure(figsize=(20, 20))
sns.heatmap(cervical_data.corr(), annot=True, fmt='.2f', cmap='coolwarm')

In [18]:
# Build features / target from the cleaned frame (cervical_data) rather than the
# raw one, and force every column to a numeric dtype: cells that cannot be
# parsed as numbers (such as a leftover '?') become NaN instead of staying
# strings, which the classifier cannot consume.
cervical_numeric = cervical_data.apply(pd.to_numeric, errors="coerce")

cervical_y = cervical_numeric["Dx:Cancer"]
cervical_X = cervical_numeric.drop(columns=["Dx:Cancer"])

In [None]:
# XGBoost on an 80/20 split of the cervical cancer data.
X_train, X_test, y_train, y_test = train_test_split(cervical_X, cervical_y, test_size=0.2, random_state=42)

model = XGBClassifier()

model.fit(X_train, y_train)

# TEST
# Hard class labels feed the thresholded metrics; ROC AUC is scored on the
# positive-class probability because it is a ranking metric.
pred = model.predict(X_test)
proba = model.predict_proba(X_test)[:, 1]


# AUROC, F1, PRECISION, ACCURACY
roc_auc = roc_auc_score(y_test, proba)
f1 = f1_score(y_test, pred)
precision = precision_score(y_test, pred)
accuracy = accuracy_score(y_test, pred)

print(f"ROC_AUC: {roc_auc}")
print(f"F1: {f1}")
print(f"PRECISION: {precision}")
print(f"ACCURACY: {accuracy}")

# Rows are true classes, columns are predicted classes.
conf_matrix = confusion_matrix(y_test, pred)

print(f"True Positive: {conf_matrix[1][1]}")
print(f"True Negative: {conf_matrix[0][0]}")
print(f"False Positive: {conf_matrix[0][1]}")
print(f"False Negative: {conf_matrix[1][0]}")

### PARKINSON

In [78]:
# fetch dataset 
#parkinsons = fetch_ucirepo(id=174) 
  
# data (as pandas dataframes) 
#parkinsons_X = parkinsons.data.features 
#parkinsons_y = parkinsons.data.targets

#parkinsons_X = parkinsons_X.loc[:, ~parkinsons_X.columns.duplicated()]

In [35]:
#parkinsons = pd.concat([parkinsons_X, parkinsons_y], axis=1)
# Load the parkinsons data from the local CSV.
parkinsons = pd.read_csv("data/parkinsons/parkinsons.csv")

In [None]:
# Class balance of the status column.
parkinsons.status.value_counts()

In [81]:
#parkinsons.to_csv("data/parkinsons/parkinsons.csv", index=False)

In [None]:
# parkinsons_X / parkinsons_y are only assigned in commented-out code, so they
# are derived here from the loaded frame, with "status" as the target.
# NOTE(review): assumes every other column of the CSV is a numeric feature —
# confirm against the file.
parkinsons_X = parkinsons.drop(columns=["status"])
parkinsons_y = parkinsons["status"]

X_train, X_test, y_train, y_test = train_test_split(parkinsons_X, parkinsons_y, test_size=0.2, random_state=42)

model = XGBClassifier()

model.fit(X_train, y_train)

# TEST
# Hard class labels feed the thresholded metrics; ROC AUC is scored on the
# positive-class probability because it is a ranking metric.
pred = model.predict(X_test)
proba = model.predict_proba(X_test)[:, 1]


# AUROC, F1, PRECISION, ACCURACY
roc_auc = roc_auc_score(y_test, proba)
f1 = f1_score(y_test, pred)
precision = precision_score(y_test, pred)
accuracy = accuracy_score(y_test, pred)

print(f"ROC_AUC: {roc_auc}")
print(f"F1: {f1}")
print(f"PRECISION: {precision}")
print(f"ACCURACY: {accuracy}")

# Rows are true classes, columns are predicted classes.
conf_matrix = confusion_matrix(y_test, pred)

print(conf_matrix)

In [None]:
# Show the dtype(s) of the current training target.
print(y_train.dtypes)

### HEART

In [6]:
# Load the heart dataset from the local CSV.
heart_data = pd.read_csv("data/heart/heart.csv")

In [None]:
# Display the frame (the number of rows shown is capped by display.max_rows).
heart_data

In [11]:
# Replace every NaN with 0, then write the result back to the CSV it was loaded from.
# NOTE(review): the source file is overwritten, so the original missing-value
# pattern cannot be recovered from disk afterwards — confirm this is intended.

heart_data = heart_data.fillna(0)

heart_data.to_csv("data/heart/heart.csv", index=False)

In [None]:
# One histogram per column; p holds the axes returned by hist().
p = heart_data.hist(figsize = (20,20))

In [None]:
# Print each column name next to its position, as a reference for the index
# lists used in the metadata.
for i, col in enumerate(heart_data.columns):
    print(i, col)

In [None]:
# Hand-written positions of categorical / numerical columns.
# NOTE(review): column 5 is numerical here but categorical in the heart
# metadata cell, and neither list is read by any other cell in this notebook —
# confirm which assignment is correct.
cat = [0, 2, 3, 6, 7, 8, 15]
num = [1, 4, 5, 9, 10, 11, 12, 13, 14]

In [None]:
# Number of missing values in each column.
heart_data.isna().sum()

In [49]:
# Column-role description for the heart CSV, stored as JSON.
heart_data_info = {
    "name": "heart",
    "task_type": "binclass",  # binclass or regression or multiclass
    "header" : "infer",
    "column_names": None,
    "num_col_idx": [1, 4, 9, 10, 11, 12, 13, 14],
    "cat_col_idx": [0, 2, 3, 5, 6, 7, 8, 15],
    # Index 13 is listed as numerical above, so it cannot be the target of a
    # binary classification task. The target is set to 15, the last column,
    # which is listed as categorical — the same layout the thoracic and
    # diabetes descriptions use.
    # NOTE(review): confirm 15 against the printed column listing.
    "target_col_idx": [15],  # list of indices of the target columns (for MLE)
    "file_type": "csv",
    "data_path": "data/heart/heart.csv",
    "test_path": None,
}

with open("data/Info/heart.json", "w") as json_file:
    json.dump(heart_data_info, json_file, indent=4)
    

### RETINOPATHY

In [68]:
# Fetch dataset 329 through ucimlrepo.
diabetic_retinopathy_debrecen = fetch_ucirepo(id=329) 
  
# Features and targets as pandas dataframes.
retinopathy_X = diabetic_retinopathy_debrecen.data.features 
retinopathy_y = diabetic_retinopathy_debrecen.data.targets 

# Single frame with the target column(s) placed after the features.
retinopathy = pd.concat([retinopathy_X, retinopathy_y], axis=1)

In [None]:
# Preview the first rows of the fetched frame.
retinopathy.head()

In [51]:
# Reload the frame from the local CSV; the export line is kept commented out.
#retinopathy.to_csv("data/retinopathy/diabetic_retinopathy.csv", index=False)
retinopathy = pd.read_csv("data/retinopathy/diabetic_retinopathy.csv")

In [None]:
# Preview the first rows of the frame read from disk.
retinopathy.head()

In [None]:
# Value frequencies of the "quality" column.
retinopathy["quality"].value_counts()

In [None]:
# Annotated heatmap of the pairwise correlations between all columns.
plt.figure(figsize=(20, 20))
sns.heatmap(retinopathy.corr(), annot=True, fmt='.2f', cmap='coolwarm')

In [None]:
# One histogram per column; p holds the axes returned by hist().
p = retinopathy.hist(figsize = (20,20))

In [None]:
# Column-role description for the retinopathy CSV, stored as JSON.
retinopathy_info = {
    "name": "retinopathy",
    "task_type": "binclass",  # binclass or regression or multiclass
    "header" : "infer",
    "column_names": None,
    "num_col_idx": [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17],
    "cat_col_idx": [0, 1, 18, 19],
    # Index 9 is listed as numerical above, so it cannot be the target of a
    # binary classification task. The frame is built as features followed by
    # targets, which puts the target in the last column (19), and 19 is
    # listed as categorical.
    # NOTE(review): confirm that the CSV on disk has this column order.
    "target_col_idx": [19],  # list of indices of the target columns (for MLE)
    "file_type": "csv",
    "data_path": "data/retinopathy/diabetic_retinopathy.csv",
    "test_path": None,
}

with open("data/Info/retinopathy.json", "w") as json_file:
    json.dump(retinopathy_info, json_file, indent=4)

In [None]:
# XGBoost on an 80/20 split of the retinopathy data.
X_train, X_test, y_train, y_test = train_test_split(retinopathy_X, retinopathy_y, test_size=0.2, random_state=42)

model = XGBClassifier()

model.fit(X_train, y_train)

# TEST
# Hard class labels feed the thresholded metrics; ROC AUC is scored on the
# positive-class probability because it is a ranking metric.

pred = model.predict(X_test)
proba = model.predict_proba(X_test)[:, 1]

# AUROC, F1, PRECISION, ACCURACY

roc_auc = roc_auc_score(y_test, proba)
f1 = f1_score(y_test, pred)
precision = precision_score(y_test, pred)
accuracy = accuracy_score(y_test, pred)

print(f"ROC_AUC: {roc_auc}")
print(f"F1: {f1}")
print(f"PRECISION: {precision}")
print(f"ACCURACY: {accuracy}")

# Rows are true classes, columns are predicted classes.
conf_matrix = confusion_matrix(y_test, pred)

print(conf_matrix)

### BREAST CANCER

In [71]:
# Fetch dataset 17 through ucimlrepo.
breast_cancer_wisconsin_diagnostic = fetch_ucirepo(id=17) 
  
# Features and targets as pandas dataframes.
breast_X = breast_cancer_wisconsin_diagnostic.data.features 
breast_Y = breast_cancer_wisconsin_diagnostic.data.targets 

In [72]:
# Features and target side by side, plus a four-column subset of that frame.
# NOTE(review): d is not read by any other cell in this notebook.
full_data = pd.concat([breast_X, breast_Y], axis=1)
d = full_data[["concave_points1", "perimeter3", "concave_points3", "Diagnosis"]]

In [None]:
# Display the combined frame (the number of rows shown is capped by display.max_rows).
full_data

In [None]:
# Encode the diagnosis labels as integers: 'M' -> 1, 'B' -> 0.
# Rebinding (instead of replace(..., inplace=True)) leaves the frame owned by
# the fetched dataset object untouched, and the explicit cast keeps the labels
# integer-typed however replace() handles the resulting dtype.
breast_Y = breast_Y.replace({'M': 1, 'B': 0}).astype(int)

# Class balance of the encoded target.
print(breast_Y.value_counts())

In [None]:
# Features plus the encoded target in one frame, then a preview of its first rows.
breast = pd.concat([breast_X, breast_Y], axis=1)

breast.head()

In [77]:
# Write the combined frame to disk without the index column.
breast.to_csv("data/breast_cancer/breast_cancer.csv", index=False)

In [None]:
# XGBoost on an 80/20 split of the breast cancer data.
X_train, X_test, y_train, y_test = train_test_split(breast_X, breast_Y, test_size=0.2, random_state=42)

model = XGBClassifier()

model.fit(X_train, y_train)

# TEST
# Hard class labels feed the thresholded metrics; ROC AUC is scored on the
# positive-class probability because it is a ranking metric.

pred = model.predict(X_test)
proba = model.predict_proba(X_test)[:, 1]


# AUROC, F1, PRECISION, ACCURACY

roc_auc = roc_auc_score(y_test, proba)
f1 = f1_score(y_test, pred)
precision = precision_score(y_test, pred)
accuracy = accuracy_score(y_test, pred)

print(f"ROC_AUC: {roc_auc}")
print(f"F1: {f1}")
print(f"PRECISION: {precision}")
print(f"ACCURACY: {accuracy}")

# Rows are true classes, columns are predicted classes.
conf_matrix = confusion_matrix(y_test, pred)

print(conf_matrix)

In [None]:
from matplotlib import pyplot as plt

# Importances of the most recently fitted model, paired with the column names
# of the frame it was trained on (X_train must be a DataFrame).
importance = model.feature_importances_
feature_names = X_train.columns

# Table of features ordered from most to least important.
importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': importance})
importance_df = importance_df.sort_values(by='Importance', ascending=False)

# Horizontal bar chart; the y axis is inverted so the top-ranked feature is drawn first.
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(importance_df['Feature'], importance_df['Importance'])
ax.invert_yaxis()
ax.set_title('Feature Importance from XGBoost')
ax.set_xlabel('Importance')
plt.show()

### OBESITY

In [None]:
# Fetch dataset 544 through ucimlrepo.
obesity = fetch_ucirepo(id=544)

# Features and targets as pandas dataframes.
obesity_X = obesity.data.features
obesity_y = obesity.data.targets

# Weight categories listed in the order of their integer codes (0..6).
weight_levels = [
    "Insufficient_Weight",
    "Normal_Weight",
    "Overweight_Level_I",
    "Overweight_Level_II",
    "Obesity_Type_I",
    "Obesity_Type_II",
    "Obesity_Type_III",
]
obesity_y = obesity_y.replace({level: code for code, level in enumerate(weight_levels)})

In [3]:
# Rebinds obesity from the fetched dataset object to a single features + target frame.
obesity = pd.concat([obesity_X, obesity_y], axis=1)

In [None]:
# Preview the first rows.
obesity.head()

In [None]:
# Number of missing values in each column.
obesity.isna().sum()

In [6]:
from sklearn.preprocessing import LabelEncoder

def clean_and_convert(df):
    """Return a numerically encoded copy of the obesity frame.

    The frame passed in is not modified. Encoding steps:

    * ``Gender``, ``family_history_with_overweight``, ``FAVC``, ``SMOKE`` and
      ``SCC`` are replaced by integer codes assigned in sorted order of their
      distinct values (0, 1, ...).
    * ``MTRANS`` is one-hot encoded; its indicator columns are appended.
    * ``CAEC`` and ``CALC`` are mapped on the scale
      ``no=0, Sometimes=1, Frequently=2, Always=3``; any other value is kept.
    * ``NObeyesdad`` is moved to the last position.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing all of the columns named above.

    Returns
    -------
    pandas.DataFrame
        The encoded frame with ``NObeyesdad`` as its last column.

    Raises
    ------
    KeyError
        If one of the columns to encode is missing.
    ValueError
        If ``NObeyesdad`` is missing.
    """
    # Work on a copy so the caller's frame keeps its original values.
    df = df.copy()

    # Integer codes for binary categorical variables. factorize(sort=True)
    # numbers the sorted distinct values 0..k-1, the same codes a label encoder
    # yields, without using a target-label encoder on feature columns.
    binary_columns = ['Gender', 'family_history_with_overweight', 'FAVC', 'SMOKE', 'SCC']
    for col in binary_columns:
        codes, _ = pd.factorize(df[col], sort=True)
        df[col] = codes

    # One-Hot Encoding for non-binary categorical variables
    one_hot_columns = ['MTRANS']
    df = pd.get_dummies(df, columns=one_hot_columns)

    # Likert scale; .get(value, value) passes unknown values through unchanged.
    likert_columns = ['CAEC', 'CALC']
    lickert_dict = {"no": 0, "Sometimes": 1, "Frequently": 2, "Always": 3}
    for col in likert_columns:
        df[col] = df[col].map(lambda value: lickert_dict.get(value, value))

    # Move the "NObeyesdad" column to the end.
    cols = df.columns.tolist()
    cols.remove("NObeyesdad")
    cols.append("NObeyesdad")

    return df[cols]

In [None]:
# Encode the obesity frame and preview the first rows of the result.
obesity_processed = clean_and_convert(obesity)
obesity_processed.head()

In [None]:
# One histogram per column; p holds the axes returned by hist().
p = obesity_processed.hist(figsize = (20,20))

In [12]:
# Create the output directory if it does not exist yet, then write the frame to CSV.
# NOTE(review): this writes `obesity`, not `obesity_processed` — confirm which
# frame is meant to be persisted.
import os
os.makedirs("data/obesity", exist_ok=True)

obesity.to_csv("data/obesity/obesity.csv", index=False)