<a href="https://colab.research.google.com/github/aaryanamrute/skill/blob/main/skill_7_%26%208_ipynb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

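Experiments 7 & 8: filter-based feature selection (ANOVA, Chi-Square, Mutual Information, Pearson correlation) on the diabetes dataset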

In [8]:
# Required Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import f_classif, chi2, mutual_info_classif
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Load the diabetes dataset
file_path = "/content/diabetes.csv"  # Update this if needed
df = pd.read_csv(file_path)

# Split the frame into the predictor columns and the "Outcome" target column
X = df.drop(columns=["Outcome"])
y = df["Outcome"]

# Standardized copy of the features, used by ANOVA and Mutual Information.
# (Pearson's correlation below is computed on the raw columns of df.)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Min-max normalized copy for Chi-Square, which requires non-negative values
minmax_scaler = MinMaxScaler()
X_normalized = minmax_scaler.fit_transform(X)

# 1. ANOVA F-statistic of every feature against the target (p-values discarded)
anova_scores, _ = f_classif(X_scaled, y)
anova_results = dict(zip(X.columns, anova_scores))

# 2. Chi-Square statistic of every feature against the target (p-values discarded)
chi_scores, _ = chi2(X_normalized, y)
chi_results = dict(zip(X.columns, chi_scores))

# 3. Mutual Information (Information Gain)
# The estimator adds small random noise to continuous features, so without a
# fixed seed the scores -- and therefore the set of features that clears the
# threshold -- can differ from one run to the next.
mi_scores = mutual_info_classif(X_scaled, y, random_state=42)
mi_results = dict(zip(X.columns, mi_scores))

# 4. Absolute Pearson correlation of every raw feature column with the target
pearson_corr = {col: abs(df[col].corr(df["Outcome"])) for col in X.columns}

# Combine all results into one DataFrame: one row per feature, one column per method
feature_scores = pd.DataFrame({
    "ANOVA_F": anova_results,
    "Chi_Square": chi_results,
    "Mutual_Info": mi_results,
    "Pearson_Corr": pearson_corr,
})

# Sort by ANOVA F score (highest first) for better readability
feature_scores_sorted = feature_scores.sort_values(by="ANOVA_F", ascending=False)

# Function to get selected and removed features based on a threshold
def select_features(feature_scores_df, threshold=0.05, score_column="ANOVA_F"):
    """Split the features into those above and those not above a score threshold.

    Args:
        feature_scores_df: DataFrame indexed by feature name, with one column
            of scores per scoring method.
        threshold: Cut-off value; a feature is selected only when its score is
            strictly greater than this value.
        score_column: Name of the column in ``feature_scores_df`` to compare
            against ``threshold``.

    Returns:
        A ``(selected_features, removed_features)`` tuple of lists of index
        labels, each in the row order of ``feature_scores_df``. Every feature
        appears in exactly one of the two lists.

    Raises:
        KeyError: If ``score_column`` is not a column of ``feature_scores_df``.
    """
    # Evaluate the comparison once. A NaN score compares False here, so it is
    # treated as "not selected".
    keep_mask = feature_scores_df[score_column] > threshold

    selected_features = feature_scores_df[keep_mask].index.tolist()

    # Use the complement of the mask rather than a separate `<=` test: both
    # `>` and `<=` are False for NaN, which would drop such a feature from
    # both lists.
    removed_features = feature_scores_df[~keep_mask].index.tolist()

    return selected_features, removed_features

# Report, for every scoring method, which features clear the cut-off

threshold = 0.05  # Cut-off compared against every score column
# NOTE(review): the same cut-off is applied to every column of feature_scores,
# although the columns hold different kinds of scores -- confirm this is intended.

for score_column in feature_scores.columns:
    # Split the (ANOVA-sorted) features on the current method's scores
    selected_features, removed_features = select_features(
        feature_scores_sorted,
        threshold=threshold,
        score_column=score_column,
    )

    selected_header = f"\nSelected Features based on {score_column} (Threshold > {threshold}):\n"
    removed_header = f"\nRemoved Features based on {score_column} (Threshold <= {threshold}):\n"
    print(selected_header, selected_features)
    print(removed_header, removed_features)

# Finish with the complete score table, ordered by ANOVA F score
print("\nFinal Sorted Feature Scores:\n", feature_scores_sorted)



Selected Features based on ANOVA_F (Threshold > 0.05):
 ['Glucose', 'BMI', 'Age', 'Pregnancies', 'DiabetesPedigreeFunction', 'Insulin', 'SkinThickness', 'BloodPressure']

Removed Features based on ANOVA_F (Threshold <= 0.05):
 []

Selected Features based on Chi_Square (Threshold > 0.05):
 ['Glucose', 'BMI', 'Age', 'Pregnancies', 'DiabetesPedigreeFunction', 'Insulin', 'SkinThickness', 'BloodPressure']

Removed Features based on Chi_Square (Threshold <= 0.05):
 []

Selected Features based on Mutual_Info (Threshold > 0.05):
 ['Glucose', 'BMI', 'Age']

Removed Features based on Mutual_Info (Threshold <= 0.05):
 ['Pregnancies', 'DiabetesPedigreeFunction', 'Insulin', 'SkinThickness', 'BloodPressure']

Selected Features based on Pearson_Corr (Threshold > 0.05):
 ['Glucose', 'BMI', 'Age', 'Pregnancies', 'DiabetesPedigreeFunction', 'Insulin', 'SkinThickness', 'BloodPressure']

Removed Features based on Pearson_Corr (Threshold <= 0.05):
 []

Final Sorted Feature Scores:
                        