<a href="https://colab.research.google.com/github/aaryanamrute/skill/blob/main/skill_7_%268_ipynb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Experiments 7 & 8: Feature-importance scoring on the diabetes dataset

In [9]:
# Required Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import f_classif, chi2, mutual_info_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Read the diabetes CSV into a DataFrame.
# The path below points at a Colab upload location; change it if the file lives elsewhere.
file_path = "/content/diabetes (3).csv"  # Update this if needed
df = pd.read_csv(file_path)

# "Outcome" is the target; every remaining column is treated as a predictor.
y = df["Outcome"]
X = df.drop("Outcome", axis=1)

# Zero-mean / unit-variance copy of the predictors.
# It is the input to the ANOVA and mutual-information scoring further down.
scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X)

# Copy of the predictors rescaled to [0, 1].
# It is the input to the chi-square scoring, which does not accept negative values.
minmax_scaler = MinMaxScaler()
X_normalized = minmax_scaler.fit(X).transform(X)

# One seed shared by every stochastic step below, so that Restart & Run All
# reproduces the same score table.
RANDOM_STATE = 42

# --- 1. ANOVA (F-Test) ---
# f_classif returns (F-statistics, p-values); only the F-statistics are kept.
anova_scores, _ = f_classif(X_scaled, y)
anova_results = dict(zip(X.columns, anova_scores))

# --- 2. Chi-Square Test ---
# chi2 returns (statistics, p-values); only the statistics are kept.
# It is run on the min-max scaled copy because chi2 rejects negative inputs.
chi_scores, _ = chi2(X_normalized, y)
chi_results = dict(zip(X.columns, chi_scores))

# --- 3. Mutual Information (Information Gain) ---
# The estimator adds small random noise to continuous features, so without a
# fixed random_state these scores change from run to run.
mi_scores = mutual_info_classif(X_scaled, y, random_state=RANDOM_STATE)
mi_results = dict(zip(X.columns, mi_scores))

# --- 4. Pearson's Correlation with target ---
# Computed on the raw (unscaled) columns of df. The absolute value is taken so
# features are ranked by strength of linear association, regardless of sign.
pearson_corr = {col: abs(df[col].corr(df["Outcome"])) for col in X.columns}

# --- 5. Random Forest Feature Importance ---
# Impurity-based importances from a forest fit on the full, unscaled data.
rf_model = RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE)
rf_model.fit(X, y)
rf_importance = dict(zip(X.columns, rf_model.feature_importances_))

# --- 6. Permutation Feature Importance ---
# NOTE(review): the importances are measured on the same rows the forest was
# fit on, so they describe what the fitted model relies on rather than what
# generalises. train_test_split is imported but unused; consider scoring on a
# held-out split instead.
perm_importance = permutation_importance(
    rf_model, X, y, n_repeats=10, random_state=RANDOM_STATE
)
perm_results = dict(zip(X.columns, perm_importance.importances_mean))

# Gather the per-method score dictionaries: one table column per scoring
# method, one row per feature.
scores_by_method = {
    "ANOVA_F": anova_results,
    "Chi_Square": chi_results,
    "Mutual_Info": mi_results,
    "Pearson_Corr": pearson_corr,
    "Random_Forest": rf_importance,
    "Permutation_Importance": perm_results,
}
feature_scores = pd.DataFrame(scores_by_method)

# Rank features from most to least important according to the random forest.
feature_scores_sorted = feature_scores.sort_values(
    "Random_Forest",
    ascending=False,
)

# Show the ranked score table.
print(feature_scores_sorted)


                             ANOVA_F  Chi_Square  Mutual_Info  Pearson_Corr  \
Glucose                   213.161752    7.094910     0.114818      0.466581   
BMI                        71.772072    1.902673     0.073949      0.292695   
Age                        46.140611    8.205691     0.057618      0.238356   
DiabetesPedigreeFunction   23.871300    2.758584     0.013494      0.173844   
BloodPressure               3.256950    0.144306     0.000000      0.065068   
Pregnancies                39.670227    6.559982     0.056912      0.221898   
Insulin                    13.281108    2.571590     0.038562      0.130548   
SkinThickness               4.304381    0.536445     0.023480      0.074752   

                          Random_Forest  Permutation_Importance  
Glucose                        0.267142                0.208984  
BMI                            0.168769                0.106380  
Age                            0.131567                0.089453  
DiabetesPedigreeFunction