In [124]:
from ucimlrepo import fetch_ucirepo 
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, f1_score, precision_score, accuracy_score
from sklearn.metrics import confusion_matrix

import seaborn as sns
from matplotlib import pyplot as plt

# Disable display truncation: show every column and every row of a frame.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

### THORACIC SURGERY

In [106]:
# Download dataset id 277 from the UCI ML Repository (thoracic surgery section).
thoracic_surgery_data = fetch_ucirepo(id=277)

# Feature table and target table, as returned by the fetch.
thoracic_X, thoracic_y = (
    thoracic_surgery_data.data.features,
    thoracic_surgery_data.data.targets,
)

In [104]:
import pandas as pd

def preprocess_thoracic(X, y):
    """Encode the thoracic-surgery features and target as floats.

    Steps applied:
      * "DGN" is one-hot encoded (first level dropped).
      * "PRE6" and "PRE14" are mapped to ordinal codes; values missing from
        the mapping become NaN.
      * Every column containing 'T' or 'F' has those values turned into 1 / 0.
      * "Risk1Yr" in ``y`` gets the same 'T'/'F' -> 1/0 encoding.
      * Both tables are cast to float.

    Neither input is modified: the function works on copies, so the caller's
    raw tables stay intact and the function can be called again on them.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table with at least the columns "DGN", "PRE6" and "PRE14".
    y : pandas.DataFrame
        Target table with a "Risk1Yr" column.

    Returns
    -------
    tuple[pandas.DataFrame, pandas.DataFrame]
        The encoded ``(X, y)`` pair, all columns of dtype float.
    """
    pre_six = {"PRZ0": 0, "PRZ1": 1, "PRZ2": 2}
    pre_14 = {"OC11": 0, "OC12": 1, "OC13": 2, "OC14": 3}
    true_false = {"T": 1, "F": 0}

    def _tf_to_int(value):
        # Translate 'T'/'F'; leave any other value (numbers, NaN) untouched.
        return true_false.get(value, value)

    # get_dummies returns a new frame, so the edits below never touch the caller's X.
    X = pd.get_dummies(X, columns=["DGN"], drop_first=True)

    # Replace whole columns instead of writing through .loc[:, col], which tries
    # to store the new numeric values inside the existing string-typed column.
    X["PRE6"] = X["PRE6"].map(pre_six)
    X["PRE14"] = X["PRE14"].map(pre_14)

    # boolean ('T'/'F') columns to 0, 1
    bool_columns = X.columns[X.isin(["T", "F"]).any().to_numpy()]
    for column in bool_columns:
        X[column] = X[column].map(_tf_to_int)

    X = X.astype(float)

    # Work on a copy so the caller's target frame keeps its original labels.
    y = y.copy()
    y["Risk1Yr"] = y["Risk1Yr"].map(_tf_to_int)
    y = y.astype(float)

    return X, y

In [None]:
# Always preprocess from the raw fetched tables rather than from
# thoracic_X / thoracic_y themselves. Feeding the already-encoded frames back in
# (e.g. when this cell is re-run) fails because the "DGN" column is gone.
thoracic_X, thoracic_y = preprocess_thoracic(
    thoracic_surgery_data.data.features,
    thoracic_surgery_data.data.targets,
)

In [None]:
# Hold out 20% of the thoracic data for evaluation (fixed seed for a repeatable split).
X_train, X_test, y_train, y_test = train_test_split(
    thoracic_X, thoracic_y, test_size=0.2, random_state=42
)

model = XGBClassifier()
model.fit(X_train, y_train)

# TEST
pred = model.predict(X_test)
# ROC AUC is a ranking metric: it has to be scored on the positive-class
# probabilities. Scoring it on thresholded labels collapses it to balanced accuracy.
pred_proba = model.predict_proba(X_test)[:, 1]

# AUROC, F1, PRECISION, ACCURACY
roc_auc = roc_auc_score(y_test, pred_proba)
f1 = f1_score(y_test, pred)
precision = precision_score(y_test, pred)
accuracy = accuracy_score(y_test, pred)

print(f"ROC_AUC: {roc_auc}")
print(f"F1: {f1}")
print(f"PRECISION: {precision}")
print(f"ACCURACY: {accuracy}")

# Rows are true classes, columns are predicted classes.
conf_matrix = confusion_matrix(y_test, pred)

print(conf_matrix)

print(f"True Positive: {conf_matrix[1][1]}")
print(f"True Negative: {conf_matrix[0][0]}")
print(f"False Positive: {conf_matrix[0][1]}")
print(f"False Negative: {conf_matrix[1][0]}")

### PIMA (DIABETES)

In [83]:
# Load the diabetes CSV and separate the "Outcome" label from the predictors.
prima_df = pd.read_csv('data/diabetes.csv')
prima_label = prima_df.loc[:, ["Outcome"]]
prima_df = prima_df.drop("Outcome", axis=1)

# Aliases used by the modelling cell.
prima_X, prima_y = prima_df, prima_label

In [None]:
# Hold out 20% of the diabetes data for evaluation (fixed seed for a repeatable split).
X_train, X_test, y_train, y_test = train_test_split(
    prima_X, prima_y, test_size=0.2, random_state=42
)

model = XGBClassifier()
model.fit(X_train, y_train)

# TEST
pred = model.predict(X_test)
# ROC AUC is a ranking metric: it has to be scored on the positive-class
# probabilities. Scoring it on thresholded labels collapses it to balanced accuracy.
pred_proba = model.predict_proba(X_test)[:, 1]

# AUROC, F1, PRECISION, ACCURACY
roc_auc = roc_auc_score(y_test, pred_proba)
f1 = f1_score(y_test, pred)
precision = precision_score(y_test, pred)
accuracy = accuracy_score(y_test, pred)

print(f"ROC_AUC: {roc_auc}")
print(f"F1: {f1}")
print(f"PRECISION: {precision}")
print(f"ACCURACY: {accuracy}")

# Rows are true classes, columns are predicted classes.
conf_matrix = confusion_matrix(y_test, pred)

print(conf_matrix)

### CERVICAL CANCER

In [17]:
  # fetch dataset 
# Download dataset id 383 from the UCI ML Repository.
cervical_cancer_risk_factors = fetch_ucirepo(id=383)

# Only the feature table is used; the label is taken from one of its columns later.
cervical = cervical_cancer_risk_factors.data.features

In [18]:
# "Dx:Cancer" is the label; every remaining column is a predictor.
cervical_X = cervical.drop("Dx:Cancer", axis=1)
cervical_y = cervical.loc[:, "Dx:Cancer"]

In [None]:
# Hold out 20% of the cervical-cancer data for evaluation (fixed seed for a repeatable split).
X_train, X_test, y_train, y_test = train_test_split(
    cervical_X, cervical_y, test_size=0.2, random_state=42
)

model = XGBClassifier()
model.fit(X_train, y_train)

# TEST
pred = model.predict(X_test)
# ROC AUC is a ranking metric: it has to be scored on the positive-class
# probabilities. Scoring it on thresholded labels collapses it to balanced accuracy.
pred_proba = model.predict_proba(X_test)[:, 1]

# AUROC, F1, PRECISION, ACCURACY
roc_auc = roc_auc_score(y_test, pred_proba)
f1 = f1_score(y_test, pred)
precision = precision_score(y_test, pred)
accuracy = accuracy_score(y_test, pred)

print(f"ROC_AUC: {roc_auc}")
print(f"F1: {f1}")
print(f"PRECISION: {precision}")
print(f"ACCURACY: {accuracy}")

# Rows are true classes, columns are predicted classes.
conf_matrix = confusion_matrix(y_test, pred)

print(f"True Positive: {conf_matrix[1][1]}")
print(f"True Negative: {conf_matrix[0][0]}")
print(f"False Positive: {conf_matrix[0][1]}")
print(f"False Negative: {conf_matrix[1][0]}")

### PARKINSON

In [51]:
# Download dataset id 174 from the UCI ML Repository.
parkinsons = fetch_ucirepo(id=174)

# Features: keep only the first occurrence of each column name.
parkinsons_X = parkinsons.data.features
parkinsons_X = parkinsons_X.loc[:, ~parkinsons_X.columns.duplicated(keep="first")]

# Targets are used exactly as fetched.
parkinsons_y = parkinsons.data.targets

In [None]:
# Hold out 20% of the Parkinson's data for evaluation (fixed seed for a repeatable split).
X_train, X_test, y_train, y_test = train_test_split(
    parkinsons_X, parkinsons_y, test_size=0.2, random_state=42
)

model = XGBClassifier()
model.fit(X_train, y_train)

# TEST
pred = model.predict(X_test)
# ROC AUC is a ranking metric: it has to be scored on the positive-class
# probabilities. Scoring it on thresholded labels collapses it to balanced accuracy.
pred_proba = model.predict_proba(X_test)[:, 1]

# AUROC, F1, PRECISION, ACCURACY
roc_auc = roc_auc_score(y_test, pred_proba)
f1 = f1_score(y_test, pred)
precision = precision_score(y_test, pred)
accuracy = accuracy_score(y_test, pred)

print(f"ROC_AUC: {roc_auc}")
print(f"F1: {f1}")
print(f"PRECISION: {precision}")
print(f"ACCURACY: {accuracy}")

# Rows are true classes, columns are predicted classes.
conf_matrix = confusion_matrix(y_test, pred)

print(conf_matrix)

In [None]:
# Debug aid: show the dtype(s) of whatever y_train currently holds.
print(y_train.dtypes)

### RETINOPATHY

In [57]:
# Download dataset id 329 from the UCI ML Repository.
diabetic_retinopathy_debrecen = fetch_ucirepo(id=329)

# Feature table and target table, as returned by the fetch.
retinopathy_X, retinopathy_y = (
    diabetic_retinopathy_debrecen.data.features,
    diabetic_retinopathy_debrecen.data.targets,
)


In [73]:
# Keep only the first occurrence of each column name (drops duplicated feature columns).
retinopathy_X = retinopathy_X.loc[:, ~retinopathy_X.columns.duplicated()]

In [None]:
# Class balance check: count the rows for each distinct target value.
retinopathy_y.value_counts()

In [None]:
# Hold out 20% of the retinopathy data for evaluation (fixed seed for a repeatable split).
X_train, X_test, y_train, y_test = train_test_split(
    retinopathy_X, retinopathy_y, test_size=0.2, random_state=42
)

model = XGBClassifier()
model.fit(X_train, y_train)

# TEST
pred = model.predict(X_test)
# ROC AUC is a ranking metric: it has to be scored on the positive-class
# probabilities. Scoring it on thresholded labels collapses it to balanced accuracy.
pred_proba = model.predict_proba(X_test)[:, 1]

# AUROC, F1, PRECISION, ACCURACY
roc_auc = roc_auc_score(y_test, pred_proba)
f1 = f1_score(y_test, pred)
precision = precision_score(y_test, pred)
accuracy = accuracy_score(y_test, pred)

print(f"ROC_AUC: {roc_auc}")
print(f"F1: {f1}")
print(f"PRECISION: {precision}")
print(f"ACCURACY: {accuracy}")

# Rows are true classes, columns are predicted classes.
conf_matrix = confusion_matrix(y_test, pred)

print(conf_matrix)

### BREAST CANCER

In [49]:
# Download dataset id 17 from the UCI ML Repository.
breast_cancer_wisconsin_diagnostic = fetch_ucirepo(id=17)

# Feature table and target table, as returned by the fetch.
breast_X, breast_Y = (
    breast_cancer_wisconsin_diagnostic.data.features,
    breast_cancer_wisconsin_diagnostic.data.targets,
)

In [82]:
# Show all rows when frames are displayed.
pd.options.display.max_rows = None

# Features and diagnosis side by side, then the handful of columns to inspect.
full_data = pd.concat([breast_X, breast_Y], axis=1)
d = full_data.loc[:, ["concave_points1", "perimeter3", "concave_points3", "Diagnosis"]]

In [None]:
import seaborn as sns

# DataFrame.corr() needs numeric columns, but "Diagnosis" may still hold the raw
# 'M'/'B' strings when this cell runs, which makes corr() raise on recent pandas.
# Encode a copy here (M -> 1, B -> 0); values that are already numeric pass
# through unchanged, so the cell works whichever state the labels are in.
diagnosis_codes = {"M": 1, "B": 0}
d_numeric = d.assign(
    Diagnosis=d["Diagnosis"].map(lambda label: diagnosis_codes.get(label, label)).astype(int)
)

sns.heatmap(d_numeric.corr(), annot=True)

In [None]:
# imbalance in breast_Y

# Encode malignant/benign as 1/0 on a new frame rather than with inplace=True,
# which would silently rewrite the fetched dataset object as well. The explicit
# int cast guarantees numeric labels for the classifier whatever dtype replace()
# leaves behind. Re-running the cell is harmless: 1/0 values are left as they are.
breast_Y = breast_Y.replace({'M': 1, 'B': 0}).astype(int)

print(breast_Y.value_counts())

In [None]:
# Hold out 20% of the breast-cancer data for evaluation (fixed seed for a repeatable split).
X_train, X_test, y_train, y_test = train_test_split(
    breast_X, breast_Y, test_size=0.2, random_state=42
)

model = XGBClassifier()
model.fit(X_train, y_train)

# TEST
pred = model.predict(X_test)
# ROC AUC is a ranking metric: it has to be scored on the positive-class
# probabilities. Scoring it on thresholded labels collapses it to balanced accuracy.
pred_proba = model.predict_proba(X_test)[:, 1]

# AUROC, F1, PRECISION, ACCURACY
roc_auc = roc_auc_score(y_test, pred_proba)
f1 = f1_score(y_test, pred)
precision = precision_score(y_test, pred)
accuracy = accuracy_score(y_test, pred)

print(f"ROC_AUC: {roc_auc}")
print(f"F1: {f1}")
print(f"PRECISION: {precision}")
print(f"ACCURACY: {accuracy}")

# Rows are true classes, columns are predicted classes.
conf_matrix = confusion_matrix(y_test, pred)

print(conf_matrix)

In [None]:
from matplotlib import pyplot as plt

# Importance score of every training column, taken from the fitted model and
# ordered from the highest score to the lowest.
feature_names = X_train.columns  # X_train is expected to be a DataFrame
importance = model.feature_importances_
importance_df = pd.DataFrame(
    {'Feature': feature_names, 'Importance': importance}
).sort_values(by='Importance', ascending=False)

# Horizontal bar chart; the y-axis is flipped so the top-ranked feature sits at the top.
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(importance_df['Feature'], importance_df['Importance'])
ax.invert_yaxis()
ax.set_title('Feature Importance from XGBoost')
ax.set_xlabel('Importance')
plt.show()

### OBESITY

In [122]:
# fetch dataset
estimation_of_obesity_levels_based_on_eating_habits_and_physical_condition = fetch_ucirepo(id=544)

# data (as pandas dataframes)
obesity_X = estimation_of_obesity_levels_based_on_eating_habits_and_physical_condition.data.features
obesity_y = estimation_of_obesity_levels_based_on_eating_habits_and_physical_condition.data.targets

# Encode the weight categories as ordinal codes 0..6.
# The original called `y.replace(...)`, but no `y` exists at notebook scope,
# so the cell raised NameError; the frame to encode is obesity_y.
obesity_y = obesity_y.replace({
    "Insufficient_Weight": 0,
    "Normal_Weight": 1,
    "Overweight_Level_I": 2,
    "Overweight_Level_II": 3,
    "Obesity_Type_I": 4,
    "Obesity_Type_II": 5,
    "Obesity_Type_III": 6,
})