In [1]:
import pandas as pd
import numpy as np

# Diabetes risk analysis.
#
# Splits the dataset by its 'Outcome' column (0 = low risk, 1 = high risk),
# prints per-feature summary statistics for both groups, and derives simple
# quartile-based thresholds: the 75th percentile of the low-risk group and the
# 25th percentile of the high-risk group.

DATA_PATH = 'c:/Users/shifttech/Desktop/Univ_M/AIOT/data/diabetes.csv'

features = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']


def split_by_outcome(frame):
    """Split *frame* into ``(low_risk, high_risk)`` row subsets.

    Rows whose 'Outcome' equals 0 form the first subset, rows whose 'Outcome'
    equals 1 form the second. Rows with any other value land in neither.
    """
    return frame[frame['Outcome'] == 0], frame[frame['Outcome'] == 1]


def summarize_feature(values):
    """Return ``(mean, median, 25th percentile, 75th percentile)`` of *values*.

    *values* is a single pandas column (Series).
    """
    return (
        values.mean(),
        values.median(),
        values.quantile(0.25),
        values.quantile(0.75),
    )


def compute_thresholds(low_group, high_group, feature_names):
    """Map each feature name to ``(low_q75, high_q25)``.

    ``low_q75`` is the 75th percentile of the feature in *low_group* and
    ``high_q25`` is the 25th percentile of the feature in *high_group*.
    Deriving the mapping from *feature_names* keeps it in sync with the list
    used for the per-feature report.
    """
    return {
        name: (low_group[name].quantile(0.75), high_group[name].quantile(0.25))
        for name in feature_names
    }


# Guarded so that importing this file does not read the CSV or print the
# report; when run as a script or notebook cell the names assigned below are
# still module-level globals, exactly as before.
if __name__ == "__main__":
    # Load the dataset
    df = pd.read_csv(DATA_PATH)

    # Separate by outcome (0 = No Diabetes/Low Risk, 1 = Diabetes/High Risk)
    low_risk, high_risk = split_by_outcome(df)

    print("=" * 80)
    print("DIABETES RISK ANALYSIS - Dataset Statistics")
    print("=" * 80)

    print("\n📊 Dataset Overview:")
    print(f"   Total samples: {len(df)}")
    print(f"   Low Risk (No Diabetes): {len(low_risk)} ({len(low_risk)/len(df)*100:.1f}%)")
    print(f"   High Risk (Diabetes): {len(high_risk)} ({len(high_risk)/len(df)*100:.1f}%)")

    print("\n" + "=" * 80)
    print("FEATURE RANGES BY RISK LEVEL")
    print("=" * 80)

    for feature in features:
        print(f"\n🔍 {feature}:")
        print(f"   {'LOW RISK (No Diabetes)':<30} {'HIGH RISK (Diabetes)':<30}")
        print(f"   {'-'*30} {'-'*30}")

        low_mean, low_median, low_25, low_75 = summarize_feature(low_risk[feature])
        high_mean, high_median, high_25, high_75 = summarize_feature(high_risk[feature])

        print(f"   Mean: {low_mean:>24.2f}   Mean: {high_mean:>24.2f}")
        print(f"   Median: {low_median:>22.2f}   Median: {high_median:>22.2f}")
        print(f"   25th-75th: {low_25:>9.2f} - {low_75:<9.2f}   25th-75th: {high_25:>9.2f} - {high_75:<9.2f}")

    print("\n" + "=" * 80)
    print("RISK THRESHOLDS (Based on Statistical Analysis)")
    print("=" * 80)

    thresholds = compute_thresholds(low_risk, high_risk, features)

    print("\n📊 Suggested Thresholds:")
    print(f"{'Feature':<30} {'Low Risk (≤)':<20} {'High Risk (≥)':<20}")
    print("-" * 70)
    for feature, (low_thresh, high_thresh) in thresholds.items():
        print(f"{feature:<30} {low_thresh:<20.2f} {high_thresh:<20.2f}")

    print("\n" + "=" * 80)
    print("KEY INDICATORS FOR HIGH RISK")
    print("=" * 80)

    # thresholds[name] is (low-risk 75th percentile, high-risk 25th percentile);
    # reuse it instead of recomputing the same quantiles.
    print("\n🔴 HIGH RISK if:")
    print(f"   • Glucose ≥ {thresholds['Glucose'][1]:.0f} mg/dL")
    print(f"   • BMI ≥ {thresholds['BMI'][1]:.1f}")
    print(f"   • Age ≥ {thresholds['Age'][1]:.0f} years")
    print(f"   • DiabetesPedigreeFunction ≥ {thresholds['DiabetesPedigreeFunction'][1]:.3f}")
    # NOTE(review): the "(if not 0)" wording suggests zero Insulin readings are
    # placeholders, but this quantile still includes them - confirm whether
    # zeros should be excluded before computing the threshold.
    print(f"   • Insulin ≥ {thresholds['Insulin'][1]:.0f} μU/mL (if not 0)")
    print(f"   • Pregnancies ≥ {thresholds['Pregnancies'][1]:.0f}")

    print("\n🟢 LOW RISK if:")
    print(f"   • Glucose ≤ {thresholds['Glucose'][0]:.0f} mg/dL")
    print(f"   • BMI ≤ {thresholds['BMI'][0]:.1f}")
    print(f"   • Age ≤ {thresholds['Age'][0]:.0f} years")
    print(f"   • DiabetesPedigreeFunction ≤ {thresholds['DiabetesPedigreeFunction'][0]:.3f}")

    print("\n" + "=" * 80)

DIABETES RISK ANALYSIS - Dataset Statistics

📊 Dataset Overview:
   Total samples: 768
   Low Risk (No Diabetes): 500 (65.1%)
   High Risk (Diabetes): 268 (34.9%)

FEATURE RANGES BY RISK LEVEL

🔍 Pregnancies:
   LOW RISK (No Diabetes)         HIGH RISK (Diabetes)          
   ------------------------------ ------------------------------
   Mean:                     3.30   Mean:                     4.87
   Median:                   2.00   Median:                   4.00
   25th-75th:      1.00 - 5.00        25th-75th:      1.75 - 8.00     

🔍 Glucose:
   LOW RISK (No Diabetes)         HIGH RISK (Diabetes)          
   ------------------------------ ------------------------------
   Mean:                   109.98   Mean:                   141.26
   Median:                 107.00   Median:                 140.00
   25th-75th:     93.00 - 125.00      25th-75th:    119.00 - 167.00   

🔍 BloodPressure:
   LOW RISK (No Diabetes)         HIGH RISK (Diabetes)          
   -----------