# **1. Perkenalan Dataset**

Dataset yang digunakan: **Diabetes Prediction Dataset**
- Sumber: Healthcare/Medical Data
- Jumlah sampel: bervariasi (bergantung pada ukuran berkas dataset yang dimuat)
- Jumlah fitur: 8 (gender, age, hypertension, heart_disease, smoking_history, bmi, HbA1c_level, blood_glucose_level)
- Target: diabetes (0 = No Diabetes, 1 = Diabetes)
- Tipe data: campuran (numerik untuk sebagian besar fitur; kategorikal untuk gender dan smoking_history; hypertension dan heart_disease berupa indikator biner 0/1)

# **2. Import Library**

In [None]:
# Data manipulation and analysis
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Machine Learning
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Utilities
import warnings
# Silence every warning category for the rest of the session so cell output
# stays uncluttered.
# NOTE(review): this also hides deprecation/future warnings raised by the
# libraries used below — consider narrowing the filter.
warnings.filterwarnings('ignore')

# Set style for plots: matplotlib's 'default' style sheet, with seaborn's
# "husl" palette selected as the colour palette.
plt.style.use('default')
sns.set_palette("husl")

# Confirmation that the import/setup cell ran to completion.
print("Libraries imported successfully!")

# **3. Memuat Dataset**

In [None]:
# Load the Diabetes dataset from the working directory.
df = pd.read_csv('diabetes_prediction_dataset.csv')

# Column groups reused by the EDA and preprocessing cells that follow.
# hypertension and heart_disease are listed with the numerical columns and are
# treated as such everywhere these lists are used.
categorical_features = ['gender', 'smoking_history']
numerical_features = ['age', 'hypertension', 'heart_disease', 'bmi', 'HbA1c_level', 'blood_glucose_level']
target_column = 'diabetes'

# Save the data exactly as loaded, before any cleaning is applied.
df.to_csv('diabetes_raw.csv', index=False)

print(f"Dataset shape: {df.shape}")
print(f"\nDataset columns: {list(df.columns)}")
print("\nDataset info:")
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() would append a stray "None" line to the output.
df.info()
print("\nFirst 5 rows:")
# Last expression of the cell: rendered by the notebook as a table.
df.head()

# **4. Exploratory Data Analysis (EDA)**

In [None]:
# Basic statistics: descriptive summary, missing-value counts, class balance
# and category frequencies of the raw data.
print("=== BASIC STATISTICS ===")
print("\nNumerical Features:")
print(df[numerical_features].describe())

print("\n=== MISSING VALUES ===")
missing_values = df.isnull().sum()
print(missing_values)
print(f"Total missing values: {missing_values.sum()}")

print("\n=== TARGET DISTRIBUTION ===")
target_dist = df[target_column].value_counts()
print(target_dist)
# Count positives with a comparison instead of target_dist[1]: the lookup
# raises KeyError when the data contains no row labelled 1, whereas the
# comparison simply yields 0. An empty frame reports 0% rather than dividing
# by zero.
positive_count = (df[target_column] == 1).sum()
prevalence = positive_count / len(df) * 100 if len(df) else 0.0
print(f"Diabetes prevalence: {prevalence:.2f}%")

print("\n=== CATEGORICAL FEATURES DISTRIBUTION ===")
for col in categorical_features:
    print(f"\n{col}:")
    print(df[col].value_counts())

In [None]:
# Comprehensive visualizations: one 3x3 figure, flattened so the nine panels
# can be addressed as axes[0] .. axes[8].
fig, axes = plt.subplots(3, 3, figsize=(20, 18))
axes = axes.flatten()

# 1. Target distribution
# value_counts() orders classes by frequency, so the fixed label list below
# would be attached to the wrong wedges whenever class 1 outnumbers class 0.
# Sorting by class label pins the order to 0 then 1, matching the labels.
target_counts = df[target_column].value_counts().sort_index()
axes[0].pie(target_counts.values, labels=['No Diabetes', 'Diabetes'], autopct='%1.1f%%',
            colors=['lightblue', 'lightcoral'])
axes[0].set_title('Diabetes Distribution')

# 2, 3, 6, 7. Histograms of the continuous features.
# Each entry: (panel slot, column, bin count, bar colour, title, x-axis label).
histogram_specs = [
    (1, 'age', 30, 'skyblue', 'Age Distribution', 'Age'),
    (2, 'bmi', 30, 'lightgreen', 'BMI Distribution', 'BMI'),
    (5, 'HbA1c_level', 20, 'purple', 'HbA1c Level Distribution', 'HbA1c Level'),
    (6, 'blood_glucose_level', 30, 'red', 'Blood Glucose Level Distribution', 'Blood Glucose Level'),
]
for slot, column, n_bins, bar_color, title, x_label in histogram_specs:
    panel = axes[slot]
    panel.hist(df[column], bins=n_bins, alpha=0.7, color=bar_color, edgecolor='black')
    panel.set_title(title)
    panel.set_xlabel(x_label)
    panel.set_ylabel('Frequency')

# 4. Gender distribution: one bar per category, labelled by the category itself.
gender_counts = df['gender'].value_counts()
gender_panel = axes[3]
gender_panel.bar(gender_counts.index, gender_counts.values, color=['pink', 'lightblue'])
gender_panel.set_title('Gender Distribution')
gender_panel.set_xlabel('Gender')
gender_panel.set_ylabel('Count')
gender_panel.tick_params(axis='x', rotation=45)

# 5. Smoking history distribution: bars are placed at integer positions and
# the category names are attached afterwards as rotated tick labels.
smoking_counts = df['smoking_history'].value_counts()
smoking_positions = range(len(smoking_counts))
smoking_panel = axes[4]
smoking_panel.bar(smoking_positions, smoking_counts.values, color='orange', alpha=0.7)
smoking_panel.set_xticks(smoking_positions)
smoking_panel.set_xticklabels(smoking_counts.index, rotation=45, ha='right')
smoking_panel.set_title('Smoking History Distribution')
smoking_panel.set_ylabel('Count')

# 8 & 9. Hypertension and heart disease against the diabetes outcome.
# Each crosstab has one row per flag value and one column per target class.
hyp_diabetes = pd.crosstab(df['hypertension'], df[target_column])
heart_diabetes = pd.crosstab(df['heart_disease'], df[target_column])

# Each entry: (panel, crosstab, bar colours, title, x-axis label).
comorbidity_panels = (
    (axes[7], hyp_diabetes, ['lightblue', 'lightcoral'],
     'Hypertension vs Diabetes', 'Hypertension (0=No, 1=Yes)'),
    (axes[8], heart_diabetes, ['lightgreen', 'orange'],
     'Heart Disease vs Diabetes', 'Heart Disease (0=No, 1=Yes)'),
)
for panel, counts, bar_colors, title, x_label in comorbidity_panels:
    counts.plot(kind='bar', ax=panel, color=bar_colors)
    panel.set_title(title)
    panel.set_xlabel(x_label)
    panel.set_ylabel('Count')
    panel.legend(['No Diabetes', 'Diabetes'])
    panel.tick_params(axis='x', rotation=0)

# Lay out and render the whole 3x3 figure.
plt.tight_layout()
plt.show()

In [None]:
# Correlation analysis: pairwise correlations between the numerical features
# and the target, drawn as an annotated heatmap.
plt.figure(figsize=(12, 8))
all_numerical = [*numerical_features, target_column]
correlation_matrix = df[all_numerical].corr()

heatmap_options = dict(annot=True, cmap='coolwarm', center=0,
                       square=True, linewidths=0.5)
sns.heatmap(correlation_matrix, **heatmap_options)
plt.title('Correlation Matrix - Numerical Features')
plt.tight_layout()
plt.show()

In [None]:
# Feature analysis by diabetes status: mean and standard deviation of every
# numerical feature within each class, plus the gap between the class means.
print("=== FEATURE STATISTICS BY DIABETES STATUS ===")

# The class masks do not depend on the feature, so they are built once.
is_negative = df[target_column] == 0
is_positive = df[target_column] == 1

for feature in numerical_features:
    no_diabetes = df.loc[is_negative, feature]
    diabetes = df.loc[is_positive, feature]

    print(f"\n{feature.upper()}:")
    print(f"No Diabetes - Mean: {no_diabetes.mean():.2f}, Std: {no_diabetes.std():.2f}")
    print(f"Diabetes - Mean: {diabetes.mean():.2f}, Std: {diabetes.std():.2f}")
    mean_gap = diabetes.mean() - no_diabetes.mean()
    print(f"Difference: {mean_gap:.2f}")

In [None]:
# Box plots for numerical features by diabetes status
# The grid is sized from the feature list (three panels per row) instead of a
# fixed 2x3 layout, so extending numerical_features cannot run past the end of
# the axes array. Six features still give the 2x3, 18x12 figure.
n_cols = 3
n_rows = max(1, -(-len(numerical_features) // n_cols))  # ceiling division
fig, axes = plt.subplots(n_rows, n_cols, figsize=(18, 6 * n_rows))
axes = axes.flatten()

for i, feature in enumerate(numerical_features):
    sns.boxplot(data=df, x=target_column, y=feature, ax=axes[i])
    axes[i].set_title(f'{feature} by Diabetes Status')
    axes[i].set_xlabel('Diabetes (0=No, 1=Yes)')

# Hide panels left over when the feature count is not a multiple of n_cols.
for unused_ax in axes[len(numerical_features):]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.show()

# **5. Data Preprocessing**

In [None]:
print("=== DATA PREPROCESSING STEPS ===")

# 1. Check for missing values
print("\n1. Checking for missing values:")
missing_values = df.isnull().sum()
print(missing_values)
print(f"Total missing values: {missing_values.sum()}")

# Handle missing values if any. All cleaning happens on a copy so that df
# keeps the data as loaded.
df_clean = df.copy()
if missing_values.sum() > 0:
    print("Handling missing values...")
    # Fill numerical missing values with median
    for col in numerical_features:
        if df_clean[col].isnull().any():
            median_val = df_clean[col].median()
            # Assign the filled column back. Calling fillna(inplace=True) on
            # df_clean[col] operates on a selected column and is not
            # guaranteed to write through to df_clean (it does not under
            # pandas Copy-on-Write).
            df_clean[col] = df_clean[col].fillna(median_val)
            print(f"  - Filled {col} with median: {median_val:.2f}")

    # Fill categorical missing values with mode
    for col in categorical_features:
        if df_clean[col].isnull().any():
            # mode() can return several values; the first one is used.
            mode_val = df_clean[col].mode()[0]
            df_clean[col] = df_clean[col].fillna(mode_val)
            print(f"  - Filled {col} with mode: {mode_val}")
else:
    print("No missing values found.")

# 2. Check for duplicates: count fully identical rows and drop the repeats.
print("\n2. Checking for duplicates:")
duplicates = df_clean.duplicated().sum()
print(f"Number of duplicate rows: {duplicates}")

if duplicates == 0:
    print("No duplicates found.")
else:
    print("Removing duplicates...")
    deduplicated = df_clean.drop_duplicates()
    # Renumber the rows so the index is contiguous again.
    df_clean = deduplicated.reset_index(drop=True)
    print(f"Shape after removing duplicates: {df_clean.shape}")

In [None]:
# 3. Data type cleaning and validation
print("\n3. Data type cleaning and validation:")

# Normalise categorical labels: cast to str, trim surrounding whitespace and
# lower-case, so spelling variants collapse into one category.
for col in categorical_features:
    as_text = df_clean[col].astype(str)
    df_clean[col] = as_text.str.strip().str.lower()
    print(f"Cleaned {col}: {df_clean[col].unique()}")

# Validate numerical ranges
print("\nValidating numerical ranges:")
original_len = len(df_clean)

# Age validation: keep rows with 0 < age <= 120.
if 'age' in df_clean.columns:
    age_in_range = df_clean['age'].gt(0) & df_clean['age'].le(120)
    df_clean = df_clean[age_in_range]
    print(f"Age validation: {original_len} → {len(df_clean)} rows")

# BMI validation: keep rows with 0 < BMI <= 70, as the message below states.
if 'bmi' in df_clean.columns:
    bmi_in_range = df_clean['bmi'].gt(0) & df_clean['bmi'].le(70)
    df_clean = df_clean[bmi_in_range]
    print("BMI validation: kept rows with BMI > 0 and <= 70")

# Filtering leaves gaps in the index; renumber before later cells use it.
df_clean = df_clean.reset_index(drop=True)
print(f"Final shape after validation: {df_clean.shape}")

In [None]:
# 4. Outlier detection using IQR method
print("\n4. Outlier Detection using IQR method:")

def detect_outliers_iqr(data, column):
    """Flag rows whose value in ``column`` lies outside the 1.5 * IQR fences.

    Args:
        data: DataFrame that holds the column.
        column: Name of the column to inspect.

    Returns:
        A tuple ``(outlier_index, lower_bound, upper_bound)``: the index labels
        of the rows strictly below the lower fence or strictly above the upper
        fence, followed by the two fence values.
    """
    values = data[column]
    first_quartile = values.quantile(0.25)
    third_quartile = values.quantile(0.75)
    spread = third_quartile - first_quartile

    lower_bound = first_quartile - 1.5 * spread
    upper_bound = third_quartile + 1.5 * spread

    outside_fences = (values < lower_bound) | (values > upper_bound)
    return data.loc[outside_fences].index, lower_bound, upper_bound

# Per-feature outlier tally, keyed by feature name. Every entry stores the
# number of flagged rows, their share of df_clean in percent, and the fences.
outlier_summary = {}
total_rows = len(df_clean)
for feature in numerical_features:
    flagged_index, fence_low, fence_high = detect_outliers_iqr(df_clean, feature)
    n_flagged = len(flagged_index)
    share = n_flagged/total_rows*100

    outlier_summary[feature] = dict(
        count=n_flagged,
        percentage=share,
        bounds=(fence_low, fence_high),
    )

    print(f"{feature}: {n_flagged} outliers ({share:.2f}%) - Bounds: [{fence_low:.2f}, {fence_high:.2f}]")

# Outliers are only reported here; no rows are removed.
print("\nNote: Keeping outliers as they may be important for diabetes prediction.")

In [None]:
# 5. Feature encoding
print("\n5. Feature Encoding:")

# Separate features and target
y = df_clean[target_column]
X = df_clean.drop(columns=target_column)

# Encode categorical features using Label Encoding. One fitted encoder per
# column is kept in label_encoders, keyed by column name.
label_encoders = {}
X_encoded = X.copy()

for col in categorical_features:
    if col not in X_encoded.columns:
        continue

    encoder = LabelEncoder()
    X_encoded[col] = encoder.fit_transform(X_encoded[col])
    label_encoders[col] = encoder

    # Show encoding mapping (original label -> integer code)
    codes = encoder.transform(encoder.classes_)
    mapping = dict(zip(encoder.classes_, codes))
    print(f"Encoded {col}: {mapping}")

print(f"\nFeatures after encoding: {list(X_encoded.columns)}")

In [None]:
# 6. Feature scaling
print("\n6. Feature Scaling:")

# Only the numerical columns that are actually present get scaled.
numerical_cols_to_scale = [col for col in numerical_features if col in X_encoded.columns]

print("Original ranges for numerical features:")
for feature in numerical_cols_to_scale:
    print(f"  {feature}: {X_encoded[feature].min():.2f} - {X_encoded[feature].max():.2f}")

# Apply StandardScaler to numerical features only; the encoded categorical
# columns are carried over unchanged.
# NOTE(review): the scaler is fit on every row here, before the train/test
# split performed in the validation cell — confirm that this is intended.
scaler = StandardScaler()
X_scaled = X_encoded.copy()
scaled_values = scaler.fit_transform(X_encoded[numerical_cols_to_scale])
X_scaled[numerical_cols_to_scale] = scaled_values

print("\nAfter scaling - ranges for numerical features:")
for feature in numerical_cols_to_scale:
    print(f"  {feature}: {X_scaled[feature].min():.2f} - {X_scaled[feature].max():.2f}")

unscaled_categoricals = [col for col in categorical_features if col in X_scaled.columns]
print(f"\nCategorical features (not scaled): {unscaled_categoricals}")

In [None]:
# 7. Create final preprocessed dataset
print("\n7. Creating Final Preprocessed Dataset:")

# Combine scaled features with target. X_scaled and y both derive from
# df_clean, so the column assignment lines up on the same index.
processed_df = X_scaled.copy()
processed_df[target_column] = y

print(f"Final preprocessed dataset shape: {processed_df.shape}")
# The target was appended last, so every column but the final one is a feature.
print(f"Features: {list(processed_df.columns[:-1])}")
print(f"Target column: {target_column}")

print("\nTarget distribution in preprocessed data:")
target_dist = processed_df[target_column].value_counts().sort_index()
print(target_dist)
# Count positives with a comparison instead of target_dist[1]: the lookup
# raises KeyError when no row labelled 1 survived cleaning, whereas the
# comparison yields 0. An empty frame reports 0% rather than dividing by zero.
positive_count = (processed_df[target_column] == 1).sum()
prevalence = positive_count / len(processed_df) * 100 if len(processed_df) else 0.0
print(f"Diabetes prevalence: {prevalence:.2f}%")

print("\nFirst 5 rows of preprocessed data:")
print(processed_df.head())

# Save preprocessed data
processed_df.to_csv('diabetes_preprocessed.csv', index=False)
print("\nPreprocessed data saved as 'diabetes_preprocessed.csv'")

# Verification: read the file back and report the shape that was written.
verification_df = pd.read_csv('diabetes_preprocessed.csv')
print(f"Verification - loaded file shape: {verification_df.shape}")

In [None]:
# 8. Quick model validation to ensure preprocessing quality
print("\n8. Quick Model Validation:")

# Hold out 20% of the rows, stratified on the target, with a fixed seed.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, **split_options)

print(f"Training set shape: {X_train.shape}")
print(f"Test set shape: {X_test.shape}")
for split_name, split_labels in (("Training", y_train), ("Test", y_test)):
    class_counts = split_labels.value_counts().sort_index().to_dict()
    print(f"{split_name} set target distribution: {class_counts}")

# Train a Random Forest model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Score the held-out rows
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"\nQuick validation accuracy: {accuracy:.4f}")

# Classification report
target_names = ['No Diabetes', 'Diabetes']
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=target_names))

# Feature importance analysis: one row per input column, highest first.
importance_table = pd.DataFrame({
    'feature': X_scaled.columns,
    'importance': model.feature_importances_
})
feature_importance = importance_table.sort_values('importance', ascending=False)

print("\nTop 5 Most Important Features:")
for ranked in feature_importance.head().itertuples(index=False):
    print(f"  {ranked.feature}: {ranked.importance:.4f}")

In [None]:
# Confusion Matrix Visualization: counts of actual (rows) versus predicted
# (columns) classes on the held-out set, drawn as an annotated heatmap.
cm = confusion_matrix(y_test, y_pred)

plt.figure(figsize=(8, 6))
cm_heatmap_options = dict(annot=True, fmt='d', cmap='Blues',
                          xticklabels=target_names, yticklabels=target_names)
sns.heatmap(cm, **cm_heatmap_options)
plt.title('Confusion Matrix - Diabetes Prediction')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

# Feature importance visualization
plt.figure(figsize=(10, 6))
# Passing palette without hue is deprecated in seaborn 0.13, so the bars are
# coloured by assigning the category column to hue as well. dodge=False keeps
# each bar at full width on seaborn versions that would otherwise make room
# for one bar per hue level.
importance_ax = sns.barplot(data=feature_importance.head(8), x='importance', y='feature',
                            hue='feature', dodge=False, palette='viridis')
# Some seaborn versions add a legend for hue; it only repeats the y-axis
# labels, so remove it when present.
importance_legend = importance_ax.get_legend()
if importance_legend is not None:
    importance_legend.remove()
plt.title('Top 8 Feature Importance - Diabetes Prediction')
plt.xlabel('Importance')
plt.ylabel('Features')
plt.tight_layout()
plt.show()

In [None]:
# Final summary: recap of the shapes, preprocessing steps, validation result
# and output files produced by the cells above.
banner = "="*70
print("\n" + banner)
print("DIABETES DATASET PREPROCESSING SUMMARY")
print(banner)
print("Author: alpian_khairi_C1BO")
print("Dataset: Diabetes Prediction Dataset")

print("\n📊 DATA OVERVIEW:")
print(f"✓ Original dataset shape: {df.shape}")
print(f"✓ Final preprocessed shape: {processed_df.shape}")
# The target is one of the columns of processed_df, hence the -1.
print(f"✓ Features: {len(processed_df.columns)-1} ({len(numerical_features)} numerical, {len(categorical_features)} categorical)")
print(f"✓ Target: {target_column} (Binary: 0=No Diabetes, 1=Diabetes)")

print("\n🔧 PREPROCESSING STEPS:")
print(f"✓ Missing values handled: {df.isnull().sum().sum()} → 0")
print(f"✓ Duplicates removed: {duplicates}")
print("✓ Data type validation: ✓")
print("✓ Outlier detection completed: ✓")
print(f"✓ Categorical encoding (Label Encoding): {len(categorical_features)} features")
print(f"✓ Numerical scaling (StandardScaler): {len(numerical_features)} features")

print("\n🎯 VALIDATION RESULTS:")
print(f"✓ Model validation accuracy: {accuracy:.4f}")
print(f"✓ Most important feature: {feature_importance.iloc[0]['feature']}")
print(f"✓ Diabetes prevalence: {processed_df[target_column].value_counts()[1]/len(processed_df)*100:.2f}%")

print("\n💾 OUTPUT FILES:")
for output_note in (
    "diabetes_raw.csv (original data backup)",
    "diabetes_preprocessed.csv (ready for ML modeling)",
):
    print(f"✓ {output_note}")

print("\n🚀 NEXT STEPS:")
for suggestion in (
    "The dataset is now ready for machine learning modeling",
    "Consider trying different algorithms (SVM, XGBoost, Neural Networks)",
    "Perform hyperparameter tuning for better performance",
    "Consider feature selection techniques",
    "Apply cross-validation for robust model evaluation",
):
    print(f"• {suggestion}")

print(banner)
print("PREPROCESSING COMPLETED SUCCESSFULLY! 🎉")
print(banner)