# In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import math

# Suppress every warning category for the remainder of the run.
warnings.filterwarnings('ignore')

# Read the raw CSV and immediately discard the columns that this analysis
# does not use.
df = pd.read_csv('diabetes_unclean.csv').drop(
    columns=['ID', 'No_Pation', 'Gender', 'Urea', 'Cr']
)

# First look at the data: both ends of the table, the column dtypes with
# non-null counts, and summary statistics.
print("First 5 rows:\n", df.head())
print("\nLast 5 rows:\n", df.tail())
print("\nInfo:")
df.info()
print("\nDescription:\n", df.describe())

# Numeric columns explored throughout the script. The outliers visible in
# these box plots are regarded as genuine observations, so they are kept
# unchanged.
num_cols = ['AGE', 'HbA1c', 'Chol', 'TG', 'HDL', 'LDL', 'VLDL', 'BMI']
for column in num_cols:
    # One separate figure per column.
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.boxplot(x=df[column], ax=ax)
    ax.set_title(f'Boxplot of {column}')
    plt.show()

# A value of 0 is not valid for these measurements, so zeros are treated as
# missing and then filled, together with any existing gaps, with the column
# median.
meas = ['HbA1c', 'Chol', 'TG', 'HDL', 'LDL', 'VLDL', 'BMI']
for column in (name for name in meas if name in df.columns):
    without_zeros = df[column].replace(0, np.nan)
    # The median is taken after zeros are blanked, so they do not bias it.
    df[column] = without_zeros.fillna(without_zeros.median())

# AGE only has its missing entries filled; zeros are left untouched.
if 'AGE' in df.columns:
    age_median = df['AGE'].median()
    df['AGE'] = df['AGE'].fillna(age_median)

print("\nMissing values after imputation:")
print(df.isnull().sum())

# CLASS contains the same label in several spellings (e.g. 'y' and 'Y', or
# with surrounding whitespace), so collapse them into one upper-case form.
if 'CLASS' in df.columns:
    class_is_present = df['CLASS'].notna()
    normalized_class = df['CLASS'].astype(str).str.strip().str.upper()
    # astype(str) turns a missing value into the literal text 'nan', which
    # would then show up as a bogus 'NAN' category. Restore those entries to
    # missing so only real labels are normalised.
    df['CLASS'] = normalized_class.where(class_is_present)
    print("Unique CLASS values:", df['CLASS'].unique())


def bar_plot(var):
    """Show a bar chart of value frequencies for one column and print them.

    Reads column ``var`` from the module-level ``df``. Frequencies come from
    ``value_counts()``, and the index is converted to strings for the
    x-axis labels.

    Args:
        var: Name of the column in ``df`` to summarise.
    """
    frequencies = df[var].value_counts()
    labels = frequencies.index.astype(str)

    fig, ax = plt.subplots(figsize=(10, 3))
    ax.bar(labels, frequencies.values, color='skyblue', edgecolor='black')
    ax.set_xlabel(var)
    ax.set_ylabel('Count')
    ax.set_title(f'{var} Distribution')
    plt.show()

    print(f"{var} counts:\n", frequencies)

# Show the class balance, but only when the CLASS column is present.
if 'CLASS' in df:
    bar_plot('CLASS')


def hist_plot(var):
    """Show a 30-bin histogram of one column's non-missing values.

    Reads column ``var`` from the module-level ``df`` and drops missing
    entries before plotting.

    Args:
        var: Name of the column in ``df`` to plot.
    """
    observed = df[var].dropna()

    fig, ax = plt.subplots(figsize=(8, 4))
    ax.hist(observed, bins=30, color='lightgreen', edgecolor='black')
    ax.set(xlabel=var, ylabel='Frequency', title=f'Histogram of {var}')
    plt.show()

# One histogram per numeric column.
for column_name in num_cols:
    hist_plot(column_name)


# Line chart of the mean cholesterol value for each distinct age.
if {'AGE', 'Chol'}.issubset(df.columns):
    mean_chol_by_age = df.groupby('AGE')['Chol'].mean()
    # Turn the AGE index back into a regular column for plotting.
    age_chol = mean_chol_by_age.reset_index()

    fig, ax = plt.subplots(figsize=(10, 5))
    ax.plot(
        age_chol['AGE'],
        age_chol['Chol'],
        marker='o',
        linestyle='-',
        color='#f28e2b',
    )
    ax.set_xlabel('Age')
    ax.set_ylabel('Avg Cholesterol')
    ax.set_title('Avg Cholesterol vs. Age')
    plt.show()


# Scatter plot of BMI against HbA1c, coloured by CLASS when that column exists.
if all(x in df.columns for x in ['BMI', 'HbA1c']):
    plt.figure(figsize=(8, 5))
    if 'CLASS' in df.columns:
        sns.scatterplot(data=df, x='BMI', y='HbA1c', hue='CLASS', palette="viridis")
    else:
        # Without a CLASS column the points are still drawn, just uncoloured;
        # previously this case showed an empty, titled figure.
        sns.scatterplot(data=df, x='BMI', y='HbA1c')
    plt.xlabel('BMI')
    plt.ylabel('HbA1c')
    plt.title('BMI vs. HbA1c')
    plt.show()


# Annotated heatmap of the pairwise correlations between the numeric columns.
fig, ax = plt.subplots(figsize=(10, 8))
corr = df[num_cols].corr()
sns.heatmap(corr, annot=True, cmap='coolwarm', fmt=".2f", ax=ax)
ax.set_title('Correlation Heatmap')
plt.show()
