In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats

In [None]:
# Dataset location. The original machine-specific path is kept as the fallback
# so existing behaviour is unchanged; set the DIABETES_DATA_PATH environment
# variable to run the notebook elsewhere without editing this cell.
DATA_PATH = os.environ.get(
    "DIABETES_DATA_PATH",
    r"C:\Users\ghans\Downloads\diabetes_prediction_dataset.csv",
)
df = pd.read_csv(DATA_PATH)

In [None]:
# Part 1 of the data-quality pass: print summary statistics for the frame.
print("--- Data Quality and Integrity Check (Part 1) ---")
summary_stats = df.describe()
print(summary_stats)

In [None]:
# Missing values per column (isna is the canonical spelling of isnull).
df.isna().sum()

In [None]:
# Print the frame's structural summary (columns, dtypes, non-null counts).
df.info()

In [None]:
# Frequency of each smoking_history value.
smoking_counts = df['smoking_history'].value_counts()
print(smoking_counts)

In [None]:
# Class balance of the target: share of rows per diabetes value.
diabetes_share = df['diabetes'].value_counts(normalize=True)
print(diabetes_share)

In [None]:
print("--- Data Quality and Integrity Check (Part 2: Outlier Detection) ---")
# Continuous measurements screened for outliers; the IQR-removal cell below
# iterates over this same list.
continuous_features = ['bmi', 'HbA1c_level', 'blood_glucose_level']

# One horizontal boxplot per feature, laid out side by side.
fig, axes = plt.subplots(1, len(continuous_features), figsize=(15, 5))
for ax, feature in zip(axes, continuous_features):
    sns.boxplot(x=df[feature], ax=ax)
    ax.set_title(f'Boxplot of {feature}')
plt.show()


In [None]:
 print("--- Data Quality and Integrity Check (Part 3: IQR Outlier Removal) ---")
for col in continuous_features:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    df = df[(df[col] >= lower_bound) & (df[col] <= upper_bound)]

In [None]:
print("\n--- Univariate Analysis (Histograms) ---")
# 2x2 grid of histograms with a KDE overlay, filled in row-major order.
hist_columns = ['age', 'bmi', 'HbA1c_level', 'blood_glucose_level']
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
for ax, column in zip(axes.flat, hist_columns):
    sns.histplot(df[column], kde=True, ax=ax)
    ax.set_title(f'Distribution of {column}')
plt.show()

In [None]:
print("\n--- Univariate Analysis (BMI Categorization) ---")
# right=False makes the intervals left-closed:
# [0, 18.5) Underweight, [18.5, 25) Normal, [25, 30) Overweight, [30, inf) Obese.
labels = ['Underweight', 'Normal', 'Overweight', 'Obese']
bins = [0, 18.5, 25, 30, float('inf')]
bmi_category = pd.cut(df['bmi'], bins=bins, labels=labels, right=False)
df['bmi_category'] = bmi_category
print(df['bmi_category'].value_counts())

In [None]:
print("\n--- Bivariate Analysis (Age and BMI vs. Diabetes) ---")
# Two panels: age and BMI distributions split by diabetes status.
fig, (age_ax, bmi_ax) = plt.subplots(1, 2, figsize=(15, 5))
sns.boxplot(x='diabetes', y='age', data=df, ax=age_ax)
age_ax.set_title('Age vs. Diabetes')
sns.boxplot(x='diabetes', y='bmi', data=df, ax=bmi_ax)
bmi_ax.set_title('BMI vs. Diabetes')
plt.show()

In [None]:
print("\n--- Bivariate Analysis (Smoking History vs. Diabetes) ---")
# Row counts per smoking_history value, split by diabetes status.
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(x='smoking_history', hue='diabetes', data=df, ax=ax)
ax.set_title('Smoking History vs. Diabetes')
# Tilt the category labels so they do not overlap.
plt.xticks(rotation=45)
plt.show()


In [None]:
print("\n--- Multivariate Analysis (Correlation Heatmap) ---")
# Restrict to numeric/bool columns before correlating. Since pandas 2.0,
# DataFrame.corr() no longer drops non-numeric columns silently and raises on
# them instead; the frame holds at least the categorical `bmi_category`
# column (and presumably text columns such as gender / smoking_history).
numeric_df = df.select_dtypes(include=['number', 'bool'])
plt.figure(figsize=(10, 8))
sns.heatmap(numeric_df.corr(), annot=True, cmap='coolwarm')
plt.title('Correlation Heatmap')
plt.show()


In [None]:
print("\n--- Comorbidity Analysis (Hypertension and Heart Disease) ---")
# (column, panel title) pairs, drawn left to right.
panels = [
    ('hypertension', 'Hypertension vs. Diabetes'),
    ('heart_disease', 'Heart Disease vs. Diabetes'),
]
fig, axes = plt.subplots(1, len(panels), figsize=(10, 5))
for ax, (column, title) in zip(axes, panels):
    sns.countplot(x=column, hue='diabetes', data=df, ax=ax)
    ax.set_title(title)
plt.show()

In [None]:
print("\n--- Comorbidity Analysis (Number of Comorbidities) ---")
# Per-row sum of the hypertension and heart_disease columns.
df['comorbidities'] = df['hypertension'].add(df['heart_disease'])

fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(x='comorbidities', hue='diabetes', data=df, ax=ax)
ax.set_title('Number of Comorbidities vs. Diabetes')
plt.show()

In [None]:
print("\n--- Gender and Health Outcome Disparities ---")
# Row counts per gender value, split by diabetes status.
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(x='gender', hue='diabetes', data=df, ax=ax)
ax.set_title('Gender vs. Diabetes')
plt.show()

In [None]:
print("\n--- Anomaly Detection and Risk Stratification (Z-scores) ---")
# Flag rows whose blood glucose lies more than 3 standard deviations from the mean.
# NOTE(review): the absolute value flags both unusually low and unusually high
# readings, while the label printed below mentions only high ones - confirm intent.
glucose_z = stats.zscore(df['blood_glucose_level'])
z_scores = np.abs(glucose_z)
is_anomalous = z_scores > 3
anomalies = df[is_anomalous]
print("Anomalies (high blood glucose):")
print(anomalies)


In [None]:
print("\n--- Anomaly Detection and Risk Stratification (Risk Profiles) ---")
# A row is high-risk only when all three thresholds are exceeded (strictly).
is_senior = df['age'] > 60
is_obese = df['bmi'] > 30
has_high_hba1c = df['HbA1c_level'] > 7
high_risk = df[is_senior & is_obese & has_high_hba1c]
print("\nHigh-Risk Profiles:")
print(high_risk)


In [None]:
print("\n--- Feature Engineering Opportunities (Age Groups) ---")
# Right-closed bins so each interval matches its label:
#   [0, 20] -> '0-20', (20, 40] -> '21-40', (40, 60] -> '41-60', (60, inf) -> '61+'.
# The previous right=False setting produced [0, 20), [20, 40), ... which put
# ages 20, 40 and 60 into '21-40', '41-60' and '61+' respectively.
# include_lowest=True keeps age 0 inside the first bin.
bins_age = [0, 20, 40, 60, float('inf')]
labels_age = ['0-20', '21-40', '41-60', '61+']
df['age_group'] = pd.cut(
    df['age'],
    bins=bins_age,
    labels=labels_age,
    right=True,
    include_lowest=True,
)

In [None]:
print("\n--- Feature Engineering Opportunities (Composite Risk Score) ---")
# Scale each measurement by its column maximum, then average the three ratios
# with equal weight.
bmi_scaled = df['bmi'] / df['bmi'].max()
hba1c_scaled = df['HbA1c_level'] / df['HbA1c_level'].max()
glucose_scaled = df['blood_glucose_level'] / df['blood_glucose_level'].max()
df['risk_score'] = (bmi_scaled + hba1c_scaled + glucose_scaled) / 3


In [None]:
print("\n--- Feature Engineering Opportunities (Interaction Terms) ---")
# Element-wise products used as pairwise interaction features.
df['age_bmi'] = df['age'].mul(df['bmi'])
df['bmi_HbA1c'] = df['bmi'].mul(df['HbA1c_level'])
print(df.head())