In [None]:
import numpy as np
import pandas as pd
import time
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from matplotlib_venn import venn2

بخش 1

In [None]:
# Read the space-delimited sensor matrix and the label file; neither file has a header row.
X = pd.read_csv('secom.data', header=None, sep=' ')
label_table = pd.read_csv('secom_labels.data', header=None, sep=' ')
y = label_table[0]  # only column 0 of the label file is kept as the target vector
print(f"Dataset shape: {X.shape}")
print(f"Label vector shape: {y.shape}")

In [None]:
# Drop features without positive variance, then fill remaining gaps with the column median.
# Wrapping `X` in a DataFrame keeps this cell re-runnable: after the first run `X` is
# rebound to the ndarray returned by the imputer, and an ndarray has no `.loc`.
X_frame = pd.DataFrame(X)
# A NaN variance (e.g. an all-missing column) compares False and is dropped as well.
X_frame = X_frame.loc[:, X_frame.var() > 0]
imputer = SimpleImputer(strategy='median')
X = imputer.fit_transform(X_frame)
print(f"Shape after preprocessing: {X.shape}")

In [None]:
# Standardise every feature, then print the mean/std of the first five columns as a sanity check.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
first_five = X_scaled[:, :5]
print("Mean of first 5 features:", first_five.mean(axis=0))
print("Std of first 5 features:", first_five.std(axis=0))

بخش 2

In [None]:
# Score each feature by mutual information with the label and keep the 20 highest;
# the stopwatch spans scoring, ranking and column extraction.
start = time.time()
mi = pd.Series(
    mutual_info_classif(X_scaled, y, random_state=42),
    index=range(X_scaled.shape[1]),
)
mi_ranked = mi.sort_values(ascending=False)
top20_mi = mi_ranked.index[:20]
X_mi = X_scaled[:, top20_mi]
mi_time = time.time() - start
print(f"MI execution time: {mi_time:.4f} seconds")
print(f"Shape after MI feature selection: {X_mi.shape}")

In [None]:
# Recursive feature elimination driven by a random forest: 10 features are removed
# per round (step=10) until 20 remain. Only the fit itself is timed.
rf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
start = time.time()
rfe = RFE(rf, n_features_to_select=20, step=10)
rfe.fit(X_scaled, y)
rfe_time = time.time() - start
# Store column indices, not the boolean support mask: later cells build sets from
# `top20_rfe`, and a set made from a mask would contain only True/False.
top20_rfe = np.where(rfe.support_)[0]
X_rfe = X_scaled[:, top20_rfe]  # selects the same columns, in the same order, as the mask would
print(f"RFE execution time: {rfe_time:.2f} seconds")
print(f"Shape after RFE feature selection: {X_rfe.shape}")

بخش 3

In [None]:
# Thin (economy-size) SVD of the standardised matrix, followed by a shape report for each factor.
U, S, Vt = np.linalg.svd(X_scaled, full_matrices=False)
for factor_name, factor in (("U", U), ("S", S), ("Vt", Vt)):
    print(f"{factor_name} shape: {factor.shape}")

In [None]:
k = 20       # number of leading singular components that contribute to a feature's score
n_keep = 20  # number of top-scoring features to retain
# NOTE: the stopwatch covers only scoring and ranking; the decomposition that produced
# S and Vt ran in an earlier cell and is not included in svd_time.
start = time.time()
# score[j] = sum over the first k components of S[i]^2 * |Vt[i, j]|,
# computed for all features at once instead of a Python double loop.
component_weights = S[:k] ** 2
scores = (component_weights[:, np.newaxis] * np.abs(Vt[:k, :])).sum(axis=0)
top20_svd = np.argsort(scores)[::-1][:n_keep]  # indices sorted by descending score
X_svd = X_scaled[:, top20_svd]
svd_time = time.time() - start
print(f"SVD execution time: {svd_time:.4f} seconds")
print(f"Shape after SVD feature selection: {X_svd.shape}")

بخش 4

In [None]:
# Scatter every feature's coefficient in the first row of Vt against its coefficient in the second row.
pc1, pc2 = Vt[0, :], Vt[1, :]
fig, ax = plt.subplots()
ax.scatter(pc1, pc2, s=10)
ax.set_xlabel("PC1")
ax.set_ylabel("PC2")
ax.set_title("Loadings Plot (PC1 vs PC2)")
plt.show()

In [None]:
# Perturb the standardised data with Gaussian noise (std 0.05) for the stability check.
# A seeded generator makes the perturbation, and therefore the overlap counts derived
# from it, identical on every Restart & Run All.
noise_rng = np.random.default_rng(42)
noise = 0.05 * noise_rng.standard_normal(X_scaled.shape)
X_noisy = X_scaled + noise
print(f"Noisy data shape: {X_noisy.shape}")

In [None]:
# Repeat the random-forest RFE on the noise-perturbed matrix and record the surviving column indices.
rf = RandomForestClassifier(random_state=42, n_estimators=100, n_jobs=-1)
rfe_noisy = RFE(estimator=rf, step=10, n_features_to_select=20).fit(X_noisy, y)
rfe_features_noisy = np.where(rfe_noisy.support_)[0]
print(f"Number of selected features by RFE (noisy): {rfe_features_noisy.size}")

In [None]:
# Re-run the SVD-based scoring on the noisy matrix, using the same rule as for the clean
# data: score[j] = sum over the leading components of S[i]^2 * |Vt[i, j]|.
n_components_noisy = 20  # leading singular components that contribute to the score
n_keep_noisy = 20        # number of top-scoring features to retain
U_n, S_n, Vt_n = np.linalg.svd(X_noisy, full_matrices=False)
# Vectorised over all features instead of a Python double loop.
weights_n = S_n[:n_components_noisy] ** 2
scores_n = (weights_n[:, np.newaxis] * np.abs(Vt_n[:n_components_noisy, :])).sum(axis=0)
svd_features_noisy = np.argsort(scores_n)[::-1][:n_keep_noisy]  # descending by score
print(f"Number of selected features by SVD (noisy): {len(svd_features_noisy)}")

In [None]:
# Count how many selected features survive the added noise, per method.
# The clean RFE selection is rebuilt as column indices from the fitted selector: a set
# made from a boolean support mask holds only True/False and cannot be meaningfully
# intersected with the index array of the noisy run.
rfe_features_clean = np.where(rfe.support_)[0]
common_rfe = len(set(rfe_features_clean).intersection(rfe_features_noisy))
common_svd = len(set(top20_svd).intersection(svd_features_noisy))
print(f"Common features (RFE vs noisy RFE): {common_rfe}")
print(f"Common features (SVD vs noisy SVD): {common_svd}")

بخش 5

In [None]:
def train_eval(X, y):
    """Fit a class-weighted logistic regression on a 70/30 split and score the held-out part.

    Parameters
    ----------
    X : feature matrix, passed unchanged to ``train_test_split``.
    y : labels aligned with the rows of ``X``.

    Returns
    -------
    tuple
        ``(accuracy, f1)`` measured on the 30% test partition.

    Notes
    -----
    The split is seeded with ``random_state=42`` and is not stratified.
    """
    partitions = train_test_split(X, y, test_size=0.3, random_state=42)
    train_features, test_features, train_labels, test_labels = partitions

    classifier = LogisticRegression(class_weight='balanced', max_iter=1000)
    predictions = classifier.fit(train_features, train_labels).predict(test_features)

    accuracy = accuracy_score(test_labels, predictions)
    f1 = f1_score(test_labels, predictions)
    return accuracy, f1

# Evaluate the three reduced feature sets with the same classifier and split.
(acc_mi, f1_mi), (acc_rfe, f1_rfe), (acc_svd, f1_svd) = (
    train_eval(X_mi, y),
    train_eval(X_rfe, y),
    train_eval(X_svd, y),
)
# One report line per method, metrics rounded to three decimals.
for prefix, acc_value, f1_value in (
    ("MI  -> Accuracy:", acc_mi, f1_mi),
    ("RFE -> Accuracy:", acc_rfe, f1_rfe),
    ("SVD -> Accuracy:", acc_svd, f1_svd),
):
    print(prefix, round(acc_value, 3), "F1:", round(f1_value, 3))

In [None]:

# Side-by-side summary of the three selection methods.
# The timing column reports the measured stopwatch values for every method instead of
# placeholder zeros. Note that svd_time was measured around the scoring step only.
results = pd.DataFrame({
    "Method": ["MI", "RFE", "SVD"],
    "Accuracy": [acc_mi, acc_rfe, acc_svd],
    "F1-score": [f1_mi, f1_rfe, f1_svd],
    "Feature Selection Time (s)": [mi_time, rfe_time, svd_time]
})
results

In [None]:
# Venn diagram of the feature indices chosen by RFE and by the SVD-based ranking.
# The RFE set is built from column indices taken from the fitted selector; a set built
# from a boolean support mask would contain only True/False.
set_rfe = set(np.where(rfe.support_)[0])
set_svd = set(top20_svd)

venn2([set_rfe, set_svd], set_labels=('RFE', 'SVD'))
plt.show()