<a href="https://colab.research.google.com/github/YuvrajChauhan1303/C-Programs/blob/main/DAV_MINI_PROJ.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
# STEP 0: Imports
import warnings

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import skew, kurtosis, entropy
from scipy.stats import f_oneway, chi2_contingency, ttest_ind, mannwhitneyu, pearsonr
from scipy.signal import welch, find_peaks
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE

# Set seaborn's global plot style to "whitegrid" for the figures drawn below.
sns.set(style="whitegrid")


In [7]:
# Colab-only: mount Google Drive at /content/drive so files stored there can be read.
# force_remount=True requests a fresh mount even if the drive is already mounted.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [10]:
# Read the dataset CSV from the mounted Google Drive into a DataFrame.
# NOTE(review): the recorded run of this cell raised FileNotFoundError for this
# path - confirm a01.csv exists at this location before running the cells below.
file_path = '/content/drive/My Drive/a01.csv'
df = pd.read_csv(file_path)

FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/My Drive/a01.csv'

In [None]:
# Preview the first rows of the loaded data.
df.head()

In [None]:
# Preview the last rows of the loaded data.
df.tail()

In [None]:
# (number of rows, number of columns) of the loaded data.
df.shape

In [None]:
# Count the rows whose last-column value equals 1 (0 when there are none).
last_column = df.iloc[:, -1]
ones_count = int((last_column == 1).sum())
print(f"Number of 1s in the last column: {ones_count}")


In [None]:
# Every column except the last is signal data; the last column is the label.
X_raw, y = df.iloc[:, :-1], df.iloc[:, -1]

In [None]:
# Line plot of the first 1000 points of each of the first three rows.
for sample_idx in range(3):
    fig, ax = plt.subplots(figsize=(12, 3))
    ax.plot(X_raw.iloc[sample_idx].values[:1000])
    ax.set_title(f"ECG Signal - Sample {sample_idx}")
    ax.set_xlabel("Sample Index")
    ax.set_ylabel("Amplitude")
    plt.show()

# Bar chart of how many rows carry each label value.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(x=y, ax=ax)
ax.set_title("Sleep Apnea Label Distribution")
ax.set_xlabel("Label")
ax.set_ylabel("Count")
plt.show()


In [None]:
# STEP 3: Feature Extraction

def extract_features(signal, fs=250):
    """Compute a flat dictionary of descriptive features for one 1-D signal.

    Parameters
    ----------
    signal : array-like
        One-dimensional sequence of numeric samples (converted to a float array).
    fs : float, default 250
        Sampling rate in Hz. It is used for the Welch PSD, for the minimum
        peak spacing (0.2 s) and for converting the mean peak-to-peak interval
        into beats per minute. The default reproduces the previously
        hard-coded value.
        NOTE(review): confirm 250 Hz matches the recording's real sampling rate.

    Returns
    -------
    dict
        30 entries, in a fixed order: time-domain statistics, histogram
        entropy, spectral powers, and peak-interval ("rr") statistics.
    """
    signal = np.asarray(signal, dtype=float)
    features = {}

    # --- Time-domain statistics -------------------------------------------
    features['mean'] = np.mean(signal)
    features['std'] = np.std(signal)
    features['var'] = np.var(signal)
    features['median'] = np.median(signal)
    features['min'] = np.min(signal)
    features['max'] = np.max(signal)
    features['range'] = np.ptp(signal)
    features['q1'] = np.percentile(signal, 25)
    features['q3'] = np.percentile(signal, 75)
    features['iqr'] = features['q3'] - features['q1']
    features['skewness'] = skew(signal)
    features['kurtosis'] = kurtosis(signal)
    features['rms'] = np.sqrt(np.mean(signal**2))
    # Number of adjacent-sample pairs whose sign differs.
    features['zero_crossings'] = np.count_nonzero(np.diff(np.sign(signal)))
    abs_diff = np.abs(np.diff(signal))
    features['abs_diff_mean'] = np.mean(abs_diff)
    features['abs_diff_std'] = np.std(abs_diff)
    # Mean absolute difference between the signal and its time-reversed copy.
    features['symmetry'] = np.mean(np.abs(signal - signal[::-1]))
    features['signal_energy'] = np.sum(signal**2)
    hist, _ = np.histogram(signal, bins=100, density=True)
    # Small offset keeps empty histogram bins from contributing log(0).
    features['signal_entropy'] = entropy(hist + 1e-6)

    # --- Frequency-domain powers ------------------------------------------
    # Use one segment spanning the whole signal so the frequency resolution is
    # fs / len(signal). With welch's default 256-sample segment the bins are
    # ~1 Hz apart at fs=250, so no bin falls inside the sub-0.4 Hz bands below
    # and every band power would be exactly 0.
    # NOTE(review): these band edges are applied to the PSD of the raw signal;
    # confirm that is the intent rather than a PSD of the peak-interval series.
    freqs, psd = welch(signal, fs=fs, nperseg=len(signal))
    total_power = np.sum(psd)
    vlf = np.sum(psd[(freqs >= 0.003) & (freqs < 0.04)])
    lf = np.sum(psd[(freqs >= 0.04) & (freqs < 0.15)])
    hf = np.sum(psd[(freqs >= 0.15) & (freqs < 0.4)])
    features['total_power'] = total_power
    features['vlf_power'] = vlf
    features['lf_power'] = lf
    features['hf_power'] = hf
    features['lf_hf_ratio'] = lf / (hf + 1e-6)  # offset avoids division by zero

    # --- Peak-interval statistics -----------------------------------------
    # Peaks must rise above the signal mean and be at least 0.2 s apart
    # (50 samples at the default fs=250, matching the previous constant).
    min_peak_distance = max(1, int(round(0.2 * fs)))
    peaks, _ = find_peaks(signal, distance=min_peak_distance, height=np.mean(signal))
    # With fewer than two peaks there is no interval; fall back to a single 0
    # so every statistic below evaluates to 0.
    rr_intervals = np.diff(peaks) if len(peaks) > 1 else np.array([0])
    features['num_beats'] = len(peaks)
    features['mean_rr'] = np.mean(rr_intervals)
    features['std_rr'] = np.std(rr_intervals)
    features['min_rr'] = np.min(rr_intervals)
    features['max_rr'] = np.max(rr_intervals)
    # mean_rr is in samples; dividing by fs gives seconds per beat.
    features['heart_rate'] = (60 / (features['mean_rr'] / fs)) if features['mean_rr'] > 0 else 0
    return features

# Build one feature dictionary per row of X_raw, then stack them into a table
# (one row per recording, one column per feature).
feature_list = list(map(extract_features, X_raw.values))
X_features = pd.DataFrame(feature_list)


In [None]:
# STEP 4: Feature Visualization

# One histogram (30 bins, with KDE overlay) per hand-picked feature.
for feature_name in ['mean', 'std', 'rms', 'signal_entropy', 'heart_rate']:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.histplot(X_features[feature_name], bins=30, kde=True, ax=ax)
    ax.set_title(f"Distribution of {feature_name}")
    ax.set_xlabel(feature_name)
    ax.set_ylabel("Frequency")
    plt.show()

# Pairwise correlation of all extracted features, colour scale centred on 0.
fig, ax = plt.subplots(figsize=(16, 12))
correlation_matrix = X_features.corr()
sns.heatmap(correlation_matrix, cmap='coolwarm', center=0, ax=ax)
ax.set_title("Feature Correlation Heatmap")
plt.show()


In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif, VarianceThreshold

In [None]:
# Feature-selection stage: drop constant columns, standardise, then keep the
# k columns with the highest ANOVA F-score against the labels.
# NOTE(review): every transformer here is fitted on ALL rows, before the
# train/test split performed further down, so held-out rows influence the
# scaling and the feature choice - confirm this is acceptable.

# Step 1: Remove constant features
constant_filter = VarianceThreshold(threshold=0.0)
X_no_constants = constant_filter.fit_transform(X_features)
# Column names that survived the variance filter (same order as X_no_constants).
filtered_feature_names = X_features.columns[constant_filter.get_support()]

# Step 2: Scale the filtered features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_no_constants)

# Step 3: Select top K features (adjust k if needed)
k = min(15, X_scaled.shape[1])  # Cap k at the number of columns left after filtering
selector = SelectKBest(score_func=f_classif, k=k)
X_selected = selector.fit_transform(X_scaled, y)

# Step 4: Get names of selected features
selected_mask = selector.get_support()
selected_feature_names = filtered_feature_names[selected_mask]

print("Selected Features:\n", list(selected_feature_names))

In [None]:
# STEP 6: Dimensionality Reduction (PCA)

# A float n_components asks PCA to keep as many components as needed to
# explain that fraction of the variance. The output columns are principal
# components, not the named features selected above.
pca = PCA(n_components=0.95)  # retain 95% variance
X_reduced = pca.fit_transform(X_selected)
print(f"PCA reduced shape: {X_reduced.shape}")


In [None]:
# STEP 7: Split, then Resample the Training Portion to Fix Imbalance
#
# The hold-out split is taken BEFORE oversampling. SMOTE creates synthetic
# rows by interpolating between existing rows; running it on the full data
# first would let points derived from test rows end up in the training set
# (and would also make the test set artificially balanced), which inflates
# the reported scores.

# stratify=y keeps the original class proportions in both portions.
X_train_raw, X_test, y_train_raw, y_test = train_test_split(
    X_reduced, y, test_size=0.2, random_state=42, stratify=y
)

# Oversample the minority class using training rows only.
sm = SMOTE(random_state=42)
X_resampled, y_resampled = sm.fit_resample(X_train_raw, y_train_raw)

# The classifier is trained on the balanced training set; X_test / y_test
# stay untouched, real-world-distributed data.
X_train, y_train = X_resampled, y_resampled


In [None]:
# STEP 8: Train Classifier

# Fit a 100-tree random forest (fixed seed) on the training split.
clf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predict the held-out split and report overall and per-class metrics.
y_pred = clf.predict(X_test)
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred))
print(classification_report(y_true=y_test, y_pred=y_pred))


In [None]:
# STEP 9: Plot Importances of the Classifier's Inputs (Principal Components)
#
# clf was trained on the PCA output, so clf.feature_importances_ has one entry
# per principal component - NOT one per selected feature. Labelling the bars
# with the SelectKBest feature names would attach the wrong name to each bar,
# so the bars are labelled PC1, PC2, ... instead. To relate a component back
# to the named features, inspect the matching row of pca.components_.

# Get importances from classifier (one value per principal component)
importances = clf.feature_importances_
indices = np.argsort(importances)[::-1]  # most important first

# Names of the features that fed the PCA (kept for reference; not bar labels)
selected_feature_names = filtered_feature_names[selector.get_support()]

# Label each classifier input by its component number, then sort by importance
component_names = [f"PC{i + 1}" for i in range(len(importances))]
sorted_feature_names = [component_names[i] for i in indices]
sorted_importances = importances[indices]

# Plot (minimum height keeps the figure readable when few components remain)
plt.figure(figsize=(10, max(3, len(sorted_feature_names) * 0.4)))
sns.barplot(x=sorted_importances, y=sorted_feature_names, palette="viridis", hue=sorted_feature_names)
plt.title("Feature Importances (Principal Components)")
plt.xlabel("Importance Score")
plt.ylabel("Principal Component")
plt.tight_layout()
plt.show()


In [None]:
def _run_stat_test(test_name, feature, test_fn):
    """Run one statistical test and return its ``(statistic, p_value)`` pair.

    The table is best-effort: a failing test yields ``(nan, nan)`` so the other
    tests still run, but the failure is reported through ``warnings.warn``
    instead of being silently discarded.
    """
    try:
        result = test_fn()
        return result[0], result[1]
    except Exception as exc:
        warnings.warn(
            f"{test_name} failed for feature {feature!r}: {exc}",
            RuntimeWarning,
            stacklevel=3,
        )
        return np.nan, np.nan


def perform_hypothesis_tests(X_features: pd.DataFrame, y: pd.Series, top_n=5):
    """Compare label-0 and label-1 rows on the first ``top_n`` feature columns.

    For each column five tests are run: one-way ANOVA, a chi-square test on the
    feature cut into 4 equal-width bins, Welch's t-test, Mann-Whitney U, and
    Pearson correlation between the feature and the label.

    Parameters
    ----------
    X_features : pd.DataFrame
        One row per sample, one column per feature.
    y : pd.Series or array-like
        Labels aligned with the rows of ``X_features`` by position. Rows with
        label 0 and label 1 form the two groups.
    top_n : int, default 5
        Number of leading columns (in column order - this is not a ranking)
        to test.

    Returns
    -------
    pd.DataFrame
        One row per tested feature with the statistic and p-value of every
        test, rounded to 4 decimals. A test that could not be computed is
        reported as NaN and a ``RuntimeWarning`` is emitted.
    """
    labels = np.asarray(y)
    results = []
    for col in X_features.columns[:top_n]:
        values = X_features[col].to_numpy()
        group0 = values[labels == 0]
        group1 = values[labels == 1]

        def _chi2_on_binned():
            # Equal-width binning turns the continuous feature into 4 categories.
            binned = pd.cut(values, bins=4, labels=False)
            contingency = pd.crosstab(binned, labels)
            return chi2_contingency(contingency)

        f_stat, f_p = _run_stat_test('ANOVA', col, lambda: f_oneway(group0, group1))
        chi2_stat, chi2_p = _run_stat_test('Chi-square', col, _chi2_on_binned)
        t_stat, t_p = _run_stat_test(
            'T-test', col, lambda: ttest_ind(group0, group1, equal_var=False)
        )
        u_stat, u_p = _run_stat_test('Mann-Whitney U', col, lambda: mannwhitneyu(group0, group1))
        r, r_p = _run_stat_test('Pearson', col, lambda: pearsonr(values, labels))

        results.append({
            'Feature': col,
            'ANOVA_F': f_stat, 'ANOVA_p': f_p,
            'Chi2': chi2_stat, 'Chi2_p': chi2_p,
            'T_Stat': t_stat, 'T_p': t_p,
            'U_Stat': u_stat, 'U_p': u_p,
            'Pearson_r': r, 'Pearson_p': r_p
        })
    return pd.DataFrame(results).round(4)

# Run tests
# top_n=5 tests the first five columns of X_features (column order, not a ranking).
results_df = perform_hypothesis_tests(X_features, y, top_n=5)
display(results_df)
