In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import VarianceThreshold

In [2]:
# Read the raw diabetes table from a CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer="Diabetes.csv")

In [3]:
# Trim leading/trailing whitespace from every header so later lookups by
# exact column name (e.g. "Age (years)") succeed.
df.columns = df.columns.map(str.strip)

In [4]:
# Numeric feature frame: keep number-typed columns and drop the target column.
# errors="ignore" means a missing 'Class variable' column is skipped silently.
numerical_cols = (
    df.select_dtypes(include="number")
    .drop(columns="Class variable", errors="ignore")
)

In [9]:
# Per-feature spread, using the sample estimators (ddof=1 is the pandas default).
variance = numerical_cols.var(ddof=1)
std_dev = numerical_cols.std(ddof=1)

In [11]:
# Pairwise feature relationships: Pearson correlation and sample covariance matrices.
correlation = numerical_cols.corr(method="pearson")
covariance = numerical_cols.cov()


In [13]:
# Low-variance filter: fit a VarianceThreshold with cut-off 0.1 and keep the
# names of the columns it marks as supported.
# NOTE(review): the threshold is applied to the unscaled columns, so the result
# depends on each feature's units; in the recorded output all eight features pass.
selector = VarianceThreshold(threshold=0.1).fit(numerical_cols)
kept_mask = selector.get_support()
independent_features = numerical_cols.columns[kept_mask]

In [15]:
# Collect feature pairs whose absolute correlation exceeds the threshold.
# Only the strict lower triangle of the matrix is scanned, so each pair is
# reported once, as (later column, earlier column), and the diagonal is skipped.
correlation_threshold = 0.9  # Adjust threshold if needed
feature_names = correlation.columns
high_correlation_pairs = {
    (feature_names[row], feature_names[col])
    for row in range(len(feature_names))
    for col in range(row)
    if abs(correlation.iloc[row, col]) > correlation_threshold
}


In [17]:
# Equal-frequency discretization of age into 5 quantile bins.
# labels=False yields integer bin codes; retbins=True also returns the bin edges.
age_codes, bins = pd.qcut(df["Age (years)"], q=5, labels=False, retbins=True)
df["Age_Binned"] = age_codes

In [19]:
# Normalization Methods: Min-Max, Z-Score, and Decimal Scaling
# For each selected column, three rescaled copies are appended to df:
#   <col>_MinMax  : (x - min) / (max - min)
#   <col>_ZScore  : (x - mean) / sample standard deviation
#   <col>_Decimal : x / 10**j, with j the smallest integer giving max(|x|) / 10**j < 1
# NOTE(review): the recorded MinMax/Decimal sample output implies the glucose
# minimum is 0, possibly a missing-value placeholder; confirm before relying
# on these scalings.
for col in ["Plasma glucose concentration", "Diastolic blood pressure", "Body mass index"]:
    values = df[col]
    lowest, highest = values.min(), values.max()
    df[f"{col}_MinMax"] = (values - lowest) / (highest - lowest)
    df[f"{col}_ZScore"] = (values - values.mean()) / values.std()

    # floor(log10(m)) + 1 rather than ceil(log10(m)): the two agree unless m is
    # an exact power of ten (e.g. 100), where ceil would leave the largest scaled
    # value at exactly 1.0 instead of strictly below 1. A zero maximum skips
    # log10(0) = -inf and leaves the column unscaled (divided by 1).
    max_abs = values.abs().max()
    exponent = np.floor(np.log10(max_abs)) + 1 if max_abs > 0 else 0
    df[f"{col}_Decimal"] = values / (10.0 ** exponent)

In [21]:
# Display Results
# Show the per-feature standard deviations (header line, then the Series).
print(f"Standard Deviation:\n {std_dev}")

Standard Deviation:
 Number of times pregnant          3.369578
Plasma glucose concentration     31.972618
Diastolic blood pressure         19.355807
Triceps skin fold thickness      15.952218
2-Hour serum insulin            115.244002
Body mass index                   7.884160
Diabetes pedigree function        0.331329
Age (years)                      11.760232
dtype: float64


In [23]:
# Show the per-feature sample variances.
print(f"\nVariance:\n {variance}")


Variance:
 Number of times pregnant           11.354056
Plasma glucose concentration     1022.248314
Diastolic blood pressure          374.647271
Triceps skin fold thickness       254.473245
2-Hour serum insulin            13281.180078
Body mass index                    62.159984
Diabetes pedigree function          0.109779
Age (years)                       138.303046
dtype: float64


In [25]:
# Show the feature-by-feature covariance matrix as plain text.
print(f"\nCovariance:\n {covariance}")


Covariance:
                               Number of times pregnant  \
Number of times pregnant                     11.354056   
Plasma glucose concentration                 13.947131   
Diastolic blood pressure                      9.214538   
Triceps skin fold thickness                  -4.390041   
2-Hour serum insulin                        -28.555231   
Body mass index                               0.469774   
Diabetes pedigree function                   -0.037426   
Age (years)                                  21.570620   

                              Plasma glucose concentration  \
Number of times pregnant                         13.947131   
Plasma glucose concentration                   1022.248314   
Diastolic blood pressure                         94.430956   
Triceps skin fold thickness                      29.239183   
2-Hour serum insulin                           1220.935799   
Body mass index                                  55.726987   
Diabetes pedigree function   

In [27]:
# Show the feature-by-feature correlation matrix as plain text.
print(f"\nCorrelation:\n {correlation}")


Correlation:
                               Number of times pregnant  \
Number of times pregnant                      1.000000   
Plasma glucose concentration                  0.129459   
Diastolic blood pressure                      0.141282   
Triceps skin fold thickness                  -0.081672   
2-Hour serum insulin                         -0.073535   
Body mass index                               0.017683   
Diabetes pedigree function                   -0.033523   
Age (years)                                   0.544341   

                              Plasma glucose concentration  \
Number of times pregnant                          0.129459   
Plasma glucose concentration                      1.000000   
Diastolic blood pressure                          0.152590   
Triceps skin fold thickness                       0.057328   
2-Hour serum insulin                              0.331357   
Body mass index                                   0.221071   
Diabetes pedigree function  

In [29]:
# List the column names that passed the variance threshold.
print(f"\nIndependent Features:\n {list(independent_features)}")


Independent Features:
 ['Number of times pregnant', 'Plasma glucose concentration', 'Diastolic blood pressure', 'Triceps skin fold thickness', '2-Hour serum insulin', 'Body mass index', 'Diabetes pedigree function', 'Age (years)']


In [31]:
# Show the set of feature pairs flagged as highly correlated (may be empty).
print(f"\nUnwanted Features (High Correlation):\n {high_correlation_pairs}")


Unwanted Features (High Correlation):
 set()


In [33]:
# Show the quantile bin edges returned by the age discretization.
print(f"\nAge Bins:\n {bins}")


Age Bins:
 [21.  23.  27.  33.  42.6 81. ]


In [35]:
# Preview the first rows of the three rescaled glucose columns.
glucose_scaled_cols = [
    "Plasma glucose concentration_MinMax",
    "Plasma glucose concentration_ZScore",
    "Plasma glucose concentration_Decimal",
]
normalized_sample = df[glucose_scaled_cols].head()
print(f"\nNormalized Data Sample:\n {normalized_sample}")


Normalized Data Sample:
    Plasma glucose concentration_MinMax  Plasma glucose concentration_ZScore  \
0                             0.743719                             0.847771   
1                             0.427136                            -1.122665   
2                             0.919598                             1.942458   
3                             0.447236                            -0.997558   
4                             0.688442                             0.503727   

   Plasma glucose concentration_Decimal  
0                                 0.148  
1                                 0.085  
2                                 0.183  
3                                 0.089  
4                                 0.137  


In [38]:
# Save processed dataset (optional)
# Writes df, including the binned and rescaled columns added above, without
# the row index.
df.to_csv(path_or_buf="Processed_Diabetes.csv", index=False)