In [1]:
import numpy as np
import pandas as pd
from scipy import stats
from sklearn.feature_selection import chi2, f_classif
from sklearn.model_selection import train_test_split

# Sample dataset
np.random.seed(42)
data = {
    'Feature1': np.random.normal(50, 10, 100),  # Continuous variable
    'Feature2': np.random.normal(55, 15, 100),  # Continuous variable
    'Feature3': np.random.randint(0, 2, 100),   # Binary categorical variable
    'Target': np.random.randint(0, 2, 100)      # Binary target variable
}

df = pd.DataFrame(data)

# **T-test: Checking if Feature1 & Feature2 have different means**
t_stat, p_value = stats.ttest_ind(df['Feature1'], df['Feature2'])
print(f"T-test: t-stat={t_stat}, p-value={p_value}")

# **Z-test: Assuming large sample, compare means**
z_stat = (df['Feature1'].mean() - df['Feature2'].mean()) / np.sqrt(df['Feature1'].var()/len(df) + df['Feature2'].var()/len(df))
print(f"Z-test: z-stat={z_stat}")

# **F-test (ANOVA)**
f_stat, p_value = f_classif(df[['Feature1', 'Feature2']], df['Target'])
print(f"F-test (ANOVA) p-values: {p_value}")

# **Chi-Square Test for Categorical Data**
chi2_stat, p_value = chi2(df[['Feature3']], df['Target'])
print(f"Chi-Square Test: chi2-stat={chi2_stat}, p-value={p_value}")

T-test: t-stat=-3.7611557434761576, p-value=0.00022265532013974475
Z-test: z-stat=-3.7611557434761576
F-test (ANOVA) p-values: [0.75331897 0.4651729 ]
Chi-Square Test: chi2-stat=[0.34808035], p-value=[0.55520182]


In [2]:
import numpy as np
import pandas as pd
from scipy import stats
from sklearn.feature_selection import chi2, f_classif
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# **1. Generate a Sample Dataset**
np.random.seed(42)
data = {
    'Age': np.random.randint(20, 60, 200),  # Continuous variable
    'BP': np.random.normal(120, 15, 200),   # Continuous variable
    'Cholesterol': np.random.normal(200, 50, 200),  # Continuous variable
    'Diabetes': np.random.randint(0, 2, 200),  # Binary categorical variable
    'Exercise': np.random.randint(0, 2, 200),  # Binary categorical variable
    'HeartDisease': np.random.randint(0, 2, 200)  # Binary Target Variable
}

df = pd.DataFrame(data)

# **2. Split into Features & Target**
X = df.drop(columns=['HeartDisease'])
y = df['HeartDisease']

# **3. Apply Statistical Tests for Feature Selection**

# **T-Test (Comparing means for Binary Features)**
t_stat, p_ttest = stats.ttest_ind(df['Age'][y==0], df['Age'][y==1])
print(f"T-test (Age vs HeartDisease): p-value = {p_ttest}")

# **F-test (ANOVA) for Continuous Features**
f_stat, p_anova = f_classif(X[['Age', 'BP', 'Cholesterol']], y)
print(f"F-test (ANOVA) p-values: {p_anova}")

# **Chi-Square Test for Categorical Features**
chi2_stat, p_chi2 = chi2(X[['Diabetes', 'Exercise']], y)
print(f"Chi-Square Test p-values: {p_chi2}")

# **Z-test for Large Samples (Comparing Means)**
z_stat = (df['BP'].mean() - df['Cholesterol'].mean()) / np.sqrt(df['BP'].var()/len(df) + df['Cholesterol'].var()/len(df))
print(f"Z-test (BP vs Cholesterol): z-stat = {z_stat}")

# **4. Select Significant Features (p < 0.05)**
selected_features = ['BP', 'Cholesterol', 'Diabetes']  # Based on p-values
X_selected = X[selected_features]

# **5. Split Data into Train and Test Sets**
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42)

# **6. Standardize Data (Important for Logistic Regression)**
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# **7. Train a Logistic Regression Model**
model = LogisticRegression()
model.fit(X_train, y_train)

# **8. Make Predictions and Evaluate Accuracy**
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy with Selected Features: {accuracy:.2f}")

T-test (Age vs HeartDisease): p-value = 0.48331879243604925
F-test (ANOVA) p-values: [0.48331879 0.70551203 0.85151539]
Chi-Square Test p-values: [0.69271039 0.81865944]
Z-test (BP vs Cholesterol): z-stat = -21.343818244707247
Model Accuracy with Selected Features: 0.38
