Connected to Python 3.12.5

 # Preprocessing the Pima Indians Diabetes Dataset

 This code preprocesses the Pima Indians Diabetes dataset (769 records) to prepare it for training machine learning models (Neural Network + Fuzzy Logic). It includes steps like data cleaning, handling missing values, removing outliers, and feature engineering. Modified to retain fuzzy logic features (Glucose, BMI, BloodPressure, Age).

 import libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import IsolationForest
import seaborn as sns
import matplotlib.pyplot as plt
import os

 Load the Pima Indians Diabetes Dataset from local file

In [None]:
data_path = 'pima-indians-diabetes.data.csv'
columns = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 
           'DiabetesPedigreeFunction', 'Age', 'Outcome']
try:
    data = pd.read_csv(data_path, names=columns)
    print(f"Dataset loaded successfully from {os.path.abspath(data_path)}")
except FileNotFoundError:
    print(f"Error: File {data_path} not found. Please download the dataset and update the path.")
    exit(1)

Dataset loaded successfully from c:\Users\fawzy\OneDrive\Desktop\soft_4\pima-indians-diabetes.data.csv


 Debugging: Print first few rows and data types

In [None]:
print("First 5 rows of loaded data:\n", data.head())
print("Data types:\n", data.dtypes)

First 5 rows of loaded data:
    Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0  Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI   
1            6      148             72             35        0  33.6   
2            1       85             66             29        0  26.6   
3            8      183             64              0        0  23.3   
4            1       89             66             23       94  28.1   

   DiabetesPedigreeFunction  Age  Outcome  
0  DiabetesPedigreeFunction  Age  Outcome  
1                     0.627   50        1  
2                     0.351   31        0  
3                     0.672   32        1  
4                     0.167   21        0  
Data types:
 Pregnancies                 object
Glucose                     object
BloodPressure               object
SkinThickness               object
Insulin                     object
BMI                         object
DiabetesPedigreeFunction    object
Age            

 Check for non-numeric values in Pregnancies

In [None]:
non_numeric = data[data['Pregnancies'].astype(str).str.contains(r'[^0-9]')]
if not non_numeric.empty:
    print("Non-numeric values in Pregnancies:\n", non_numeric['Pregnancies'])
    print("Replacing non-numeric values in Pregnancies with median.")
    data['Pregnancies'] = pd.to_numeric(data['Pregnancies'], errors='coerce')
    median_pregnancies = data['Pregnancies'].median()
    data['Pregnancies'].fillna(median_pregnancies, inplace=True)
else:
    data['Pregnancies'] = data['Pregnancies'].astype(int)

Non-numeric values in Pregnancies:
 0    Pregnancies
Name: Pregnancies, dtype: object
Replacing non-numeric values in Pregnancies with median.


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Pregnancies'].fillna(median_pregnancies, inplace=True)


 Ensure other columns are numeric

In [None]:
for col in columns[1:]:  # Skip Pregnancies, already handled
    data[col] = pd.to_numeric(data[col], errors='coerce')
    if data[col].isna().any():
        print(f"Warning: Non-numeric values found in {col}. Replacing with median.")
        data[col].fillna(data[col].median(), inplace=True)



The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[col].fillna(data[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[col].fillna(data[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are settin

 Verify data types after conversion

In [None]:
print("Data types after conversion:\n", data.dtypes)

Data types after conversion:
 Pregnancies                 float64
Glucose                     float64
BloodPressure               float64
SkinThickness               float64
Insulin                     float64
BMI                         float64
DiabetesPedigreeFunction    float64
Age                         float64
Outcome                     float64
dtype: object


 # Step 1: Handle Missing Values

In [None]:
# Features with implausible zeros: Glucose, BloodPressure, SkinThickness, Insulin, BMI
features_with_zeros = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
for feature in features_with_zeros:
    # Replace 0 with NaN for imputation
    data[feature] = data[feature].replace(0, np.nan)

# Use Iterative Imputation to fill missing values
imputer = IterativeImputer(max_iter=10, random_state=42)
try:
    data[features_with_zeros] = imputer.fit_transform(data[features_with_zeros])
except ValueError as e:
    print(f"Error during imputation: {e}")
    print("Ensure all values in features_with_zeros are numeric.")
    exit(1)

In [None]:
# Verify no missing values remain
print("Missing values after imputation:\n", data.isnull().sum())

Missing values after imputation:
 Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64


 # Step 2: Outlier Detection

In [None]:
# Use Isolation Forest to detect and remove outliers
iso_forest = IsolationForest(contamination=0.05, random_state=42)
outliers = iso_forest.fit_predict(data[features_with_zeros])
data = data[outliers == 1]  # Keep non-outliers
print(f"Removed {sum(outliers == -1)} outliers from the dataset.")

Removed 39 outliers from the dataset.


 # Step 3: Feature Engineering

In [None]:
# Interaction features
data['Glucose_BMI'] = data['Glucose'] * data['BMI']
data['Insulin_Pedigree'] = data['Insulin'] * data['DiabetesPedigreeFunction']

In [None]:
# Non-linear transformations
data['Log_Insulin'] = np.log1p(data['Insulin'])
data['Log_DiabetesPedigree'] = np.log1p(data['DiabetesPedigreeFunction'])

In [None]:
# Create categorical feature: Obesity Category (Derived from BMI)
def categorize_bmi(bmi):
    if bmi < 18.5:
        return 'Underweight'
    elif 18.5 <= bmi < 25:
        return 'Normal'
    elif 25 <= bmi < 30:
        return 'Overweight'
    else:
        return 'Obese'
data['Obesity_Category'] = data['BMI'].apply(categorize_bmi)

In [None]:
# Create categorical feature: Age Group (Derived from Age)
def categorize_age(age):
    if age <= 30:
        return 'Young'
    elif 30 < age <= 50:
        return 'Middle-Aged'
    else:
        return 'Senior'
data['Age_Group'] = data['Age'].apply(categorize_age)

In [None]:
# One-hot encode categorical features
data = pd.get_dummies(data, columns=['Obesity_Category', 'Age_Group'], prefix=['Obesity', 'Age'])

 # Step 4: Apply SMOTE for Class Balancing

In [None]:
X = data.drop('Outcome', axis=1)
y = data['Outcome']
smote = SMOTE(random_state=42, sampling_strategy=0.8)
X_balanced, y_balanced = smote.fit_resample(X, y)
data_balanced = pd.concat([pd.DataFrame(X_balanced, columns=X.columns), pd.Series(y_balanced, name='Outcome')], axis=1)
print("Class distribution after SMOTE:\n", data_balanced['Outcome'].value_counts(normalize=True))



Class distribution after SMOTE:
 Outcome
0.0    0.555683
1.0    0.444317
Name: proportion, dtype: float64


 # Step 5: Feature Selection for Neural Network

In [None]:
# Pre-select BloodPressure and DiabetesPedigreeFunction
pre_selected_features = ['BloodPressure', 'DiabetesPedigreeFunction']

In [None]:
# Verify pre-selected features exist
missing_pre_selected = [feat for feat in pre_selected_features if feat not in data_balanced.columns]
if missing_pre_selected:
    print(f"Error: The following pre-selected features are missing: {missing_pre_selected}")
    exit(1)

In [None]:
# Define features required for fuzzy logic
fuzzy_required_features = ['Glucose', 'BMI', 'BloodPressure', 'Age']

In [None]:
# Use mutual information for feature selection (excluding pre-selected and fuzzy features)
X = data_balanced.drop(['Outcome'] + pre_selected_features + fuzzy_required_features, axis=1)
y = data_balanced['Outcome']
selector = SelectKBest(score_func=mutual_info_classif, k=6)  # Select top 6 features
X_selected = selector.fit_transform(X, y)

In [None]:
# Get selected feature names
selected_indices = selector.get_support(indices=True)
selected_features = X.columns[selected_indices].tolist()

In [None]:
# Combine pre-selected and selected features for neural network
nn_selected_features = pre_selected_features + selected_features
# Remove Glucose and DiabetesPedigreeFunction as per original logic, but ensure fuzzy features are retained separately
if 'Glucose' in nn_selected_features:
    nn_selected_features.remove('Glucose')
if 'DiabetesPedigreeFunction' in nn_selected_features:
    nn_selected_features.remove('DiabetesPedigreeFunction')
print("Neural network selected features:", nn_selected_features)

Neural network selected features: ['BloodPressure', 'Pregnancies', 'SkinThickness', 'Insulin', 'Glucose_BMI', 'Insulin_Pedigree', 'Log_Insulin']


In [None]:
# Combine neural network and fuzzy features for the final dataset
final_selected_features = list(set(nn_selected_features + fuzzy_required_features))
print("Final selected features (including fuzzy features):", final_selected_features)

Final selected features (including fuzzy features): ['Log_Insulin', 'BloodPressure', 'Age', 'Insulin_Pedigree', 'SkinThickness', 'Pregnancies', 'BMI', 'Glucose_BMI', 'Insulin', 'Glucose']


In [None]:
# Update dataset with selected features (for both neural network and fuzzy logic)
data_selected = pd.concat([data_balanced[final_selected_features], data_balanced['Outcome']], axis=1)

 # Step 6: Correlation Heatmap

In [None]:
numeric_cols = data_selected.select_dtypes(include=[np.number]).columns
if len(numeric_cols) > 1:
    plt.figure(figsize=(10, 8))
    sns.heatmap(data_selected[numeric_cols].corr(), annot=True, cmap='coolwarm', fmt='.2f')
    plt.title('Correlation Matrix of Selected Features')
    output_path = './correlation_matrix.png'
    plt.savefig(output_path)
    plt.close()
    print(f"Correlation heatmap saved to {os.path.abspath(output_path)}")
else:
    print("Warning: Not enough numeric columns for correlation heatmap.")

Correlation heatmap saved to c:\Users\fawzy\OneDrive\Desktop\soft_4\correlation_matrix.png


 # Step 7: Feature Scaling

In [None]:
scaler = StandardScaler()
features_to_scale = [col for col in data_selected.drop('Outcome', axis=1).columns 
                     if not col.startswith(('Obesity_', 'Age_'))]
data_selected[features_to_scale] = scaler.fit_transform(data_selected[features_to_scale])

 # Step 8: Train-Test Split

In [None]:
X_final = data_selected.drop('Outcome', axis=1)
y_final = data_selected['Outcome']
X_train, X_test, y_train, y_test = train_test_split(
    X_final, y_final, test_size=0.2, random_state=42, stratify=y_final
)

In [None]:
# Verify class balance in train and test sets
print("Train set class distribution:\n", y_train.value_counts(normalize=True))
print("Test set class distribution:\n", y_test.value_counts(normalize=True))

Train set class distribution:
 Outcome
0.0    0.556034
1.0    0.443966
Name: proportion, dtype: float64
Test set class distribution:
 Outcome
0.0    0.554286
1.0    0.445714
Name: proportion, dtype: float64


In [None]:
# Save preprocessed data
X_train.to_csv('X_train_preprocessed.csv', index=False)
X_test.to_csv('X_test_preprocessed.csv', index=False)
y_train.to_csv('y_train_preprocessed.csv', index=False)
y_test.to_csv('y_test_preprocessed.csv', index=False)