# Diabetes Dataset Download and Initial Setup
## Master's Project: Diabetes Risk Prediction System

This notebook downloads and performs initial checks on the Pima Indians Diabetes Dataset.

In [None]:
# Import required libraries
# Make the parent directory importable so the project-local `src` package
# resolves below. This must stay ahead of the `src` import.
# NOTE(review): assumes the kernel's working directory is the notebook's own
# folder, so that '..' is the project root — confirm.
import sys
sys.path.append('..')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from src.diabetes_data_loader import DiabetesDataLoader
import warnings
# Silences every warning category for the rest of the kernel session,
# including deprecation notices from the libraries imported above.
warnings.filterwarnings('ignore')

# Set plot style
# NOTE(review): the 'seaborn-v0_8-*' style names only exist in newer
# matplotlib releases — confirm against the project's pinned version.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')

print('‚úì Libraries imported successfully')

## 1. Download Dataset

In [None]:
# Location of the raw CSV, relative to this notebook
raw_csv_path = '../data/raw/diabetes.csv'

# Build the loader for that path, then read the dataset into `df`.
# auto_download=True asks the loader to fetch the file when it is not on disk.
loader = DiabetesDataLoader(data_path=raw_csv_path)
df = loader.load_data(auto_download=True)

## 2. Dataset Overview

In [None]:
# Ask the loader for its dataset summary and keep the result in `info`.
# NOTE(review): the return type (and whether this call also prints a report)
# is defined in DiabetesDataLoader, which is not visible here — confirm.
info = loader.get_data_info()

In [None]:
# Preview the first five rows of the loaded DataFrame.
# The heading is printed first; the bare `df.head()` must remain the last
# expression in the cell so the notebook renders it as a rich table.
print('\nüìä First 5 rows of the dataset:')
df.head()

In [None]:
# Print the DataFrame's structural summary (columns, non-null counts, dtypes).
# `df.info()` writes straight to stdout and returns None, so the cell shows
# only the printed text.
print('\nüìã Dataset Information:')
df.info()

In [None]:
# Descriptive statistics for the DataFrame's columns; the bare `df.describe()`
# is the cell's displayed output.
# NOTE(review): presumably the zero placeholders examined in the data-quality
# section are still present here and therefore feed into these statistics —
# confirm what `loader.load_data` returns.
print('\nüìà Statistical Summary:')
df.describe()

## 3. Quick Data Quality Check

In [None]:
# Count zeros per clinical measurement; in this dataset a zero in these
# columns is treated as a stand-in for a missing value.
cols_to_check = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
zero_counts = {
    name: (df[name] == 0).sum()
    for name in cols_to_check
    if name in df.columns
}

# Report each count alongside its share of all rows
print('\n‚ö†Ô∏è  Zero Values (Potential Missing Data):')
n_rows = len(df)
for name, n_zero in zero_counts.items():
    share = (n_zero / n_rows) * 100
    print(f'   {name}: {n_zero} ({share:.1f}%)')

# Bar chart of the zero counts, one bar per checked feature
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(list(zero_counts.keys()), list(zero_counts.values()), color='coral')
ax.set_xlabel('Features')
ax.set_ylabel('Number of Zero Values')
ax.set_title('Zero Values by Feature (Potential Missing Data)')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

In [None]:
# Class distribution of the binary target column `Outcome`.
#
# value_counts() orders its result by frequency, not by class label, while the
# tick and pie labels below are assigned by position. Reindexing to [0, 1]
# pins position 0 to class 0 and position 1 to class 1, so the labels cannot
# be swapped when class 1 happens to be the majority. fill_value=0 also keeps
# the label lookups valid if one class is absent from the data.
class_labels = ['No Diabetes (0)', 'Diabetes (1)']
class_colors = ['skyblue', 'salmon']
outcome_counts = df['Outcome'].value_counts().reindex([0, 1], fill_value=0)
n_samples = len(df)

print('\nüéØ Target Variable (Outcome) Distribution:')
print(outcome_counts)
print('\nClass Balance:')
# .loc makes it explicit that 0 / 1 are class labels, not positions
print(f'  No Diabetes (0): {outcome_counts.loc[0]} ({outcome_counts.loc[0]/n_samples*100:.1f}%)')
print(f'  Diabetes (1): {outcome_counts.loc[1]} ({outcome_counts.loc[1]/n_samples*100:.1f}%)')

# Visualize class distribution: counts on the left, shares on the right
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

# Bar plot
outcome_counts.plot(kind='bar', ax=ax1, color=class_colors)
ax1.set_title('Class Distribution')
ax1.set_xlabel('Outcome')
ax1.set_ylabel('Count')
ax1.set_xticklabels(class_labels, rotation=0)

# Pie chart
ax2.pie(outcome_counts, labels=class_labels,
        autopct='%1.1f%%', startangle=90, colors=class_colors)
ax2.set_title('Class Distribution (Percentage)')

plt.tight_layout()
plt.show()

## 4. Quick Feature Visualization

In [None]:
# Distribution of key features, overlaid per outcome class.
#
# Both classes are drawn with the SAME bin edges (computed once per feature
# from all rows). With independent `bins=30` per class, each histogram gets
# its own edges and bar widths, so the overlaid bars are not comparable.
key_features = ['Glucose', 'BMI', 'Age', 'Insulin']
outcome_styles = [
    (0, 'No Diabetes', 'skyblue'),
    (1, 'Diabetes', 'salmon'),
]

fig, axes = plt.subplots(2, 2, figsize=(14, 10))
axes = axes.flatten()

for idx, feature in enumerate(key_features):
    ax = axes[idx]
    if feature not in df.columns:
        # Hide the panel rather than leaving an empty, unlabeled axes
        ax.set_visible(False)
        continue

    # Shared edges for both classes; NaNs are dropped because
    # histogram_bin_edges cannot derive a range from them
    bin_edges = np.histogram_bin_edges(df[feature].dropna(), bins=30)

    for outcome_value, label, color in outcome_styles:
        df.loc[df['Outcome'] == outcome_value, feature].hist(
            ax=ax, bins=bin_edges, alpha=0.6, label=label, color=color
        )

    ax.set_title(f'Distribution of {feature} by Outcome')
    ax.set_xlabel(feature)
    ax.set_ylabel('Frequency')
    ax.legend()

plt.tight_layout()
plt.show()

## 5. Save Dataset Info

In [None]:
# Create a summary report.
# Relies on `df`, `outcome_counts` and `zero_counts` from the earlier cells,
# so those cells must have been run first.
summary_report = f"""
{'='*60}
DIABETES DATASET SUMMARY REPORT
{'='*60}

Dataset: Pima Indians Diabetes Database
Date: {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')}

BASIC INFORMATION:
- Total Samples: {len(df)}
- Features: {df.shape[1] - 1}
- Target Variable: Outcome (0: No Diabetes, 1: Diabetes)

CLASS DISTRIBUTION:
- No Diabetes (0): {outcome_counts[0]} ({outcome_counts[0]/len(df)*100:.1f}%)
- Diabetes (1): {outcome_counts[1]} ({outcome_counts[1]/len(df)*100:.1f}%)
- Imbalance Ratio: {outcome_counts[0]/outcome_counts[1]:.2f}:1

DATA QUALITY:
- Missing Values: {df.isnull().sum().sum()}
- Zero Values (Potential Missing):
"""

# One indented line per checked column, appended under the heading above
for col, count in zero_counts.items():
    summary_report += f"  - {col}: {count} ({count/len(df)*100:.1f}%)\n"

summary_report += f"""
KEY STATISTICS:
- Glucose Mean: {df['Glucose'].mean():.2f} mg/dL
- BMI Mean: {df['BMI'].mean():.2f}
- Age Mean: {df['Age'].mean():.2f} years

STATUS: ‚úì Dataset downloaded and validated
NEXT STEPS: Proceed to exploratory data analysis (EDA)

{'='*60}
"""

print(summary_report)

# Save report to file.
# The report contains non-ASCII characters, so the encoding is fixed to UTF-8.
# Without it, open() uses the platform's locale encoding, which makes the file
# platform-dependent and can raise UnicodeEncodeError on a legacy code page.
with open('../data/raw/dataset_summary.txt', 'w', encoding='utf-8') as f:
    f.write(summary_report)

print('\n‚úì Summary report saved to data/raw/dataset_summary.txt')

## 6. Data Preparation for Modeling (Preview)

In [None]:
# Options forwarded to the loader's train/test preparation step
split_options = dict(
    test_size=0.2,
    random_state=42,
    handle_zeros=True,
    scale_features=True,
)

# Unpack the four pieces returned by the loader: features and targets,
# each split into train and test
X_train, X_test, y_train, y_test = loader.prepare_for_modeling(**split_options)

# Confirm completion and show the shape of each feature split
print('\n‚úì Data prepared and ready for modeling!')
print(
    f'\nTraining set: {X_train.shape}',
    f'Testing set: {X_test.shape}',
    sep='\n',
)

## Summary

### Dataset Status: ✓ Ready

**Key Findings:**
1. Dataset loaded successfully with 768 instances
2. Class imbalance detected (65% No Diabetes, 35% Diabetes)
3. Missing values identified (encoded as zeros)
4. Data preprocessed and ready for modeling

**Next Steps:**
1. ✅ Dataset downloaded and validated
2. → Proceed to comprehensive EDA (Notebook 01)
3. → Feature engineering and selection
4. → Model development and evaluation

**Files Created:**
- `data/raw/diabetes.csv` - Original dataset
- `data/raw/dataset_summary.txt` - Summary report

---
**Project:** Machine Learning-Based Diabetes Risk Prediction System  
**Student:** Mohammed Azhar  
**Date:** January 2026