# **Feature Selection and Dataset Balancing Using BOOST Methods**

In [None]:
# Separate the label column from the predictor columns.
y = df["target"]
X = df.drop(columns=["target"])

In [None]:
from functools import partial

import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif

# NOTE(review): scoring below uses the full X/y, i.e. it also sees rows that a
# later train/test split assigns to the test set. Consider scoring on the
# training split only to avoid selection leakage.

# **Univariate Feature Selection (UFS)**
# ANOVA F-test of each feature against the target. k='all' retains every
# column, so the selector is used purely as a scorer here.
ufs_selector = SelectKBest(score_func=f_classif, k='all')
ufs_selector.fit(X, y)

# Per-feature F-scores (p-values are not used in the ranking below)
ufs_scores = ufs_selector.scores_
ufs_feature_scores = pd.DataFrame({
    'Feature': X.columns,
    'UFS_Score': ufs_scores
})

# **Information Gain Selection (IGS)**
# mutual_info_classif draws random numbers (noise added to continuous
# features), so the seed is pinned to make the ranking reproducible.
igs_selector = SelectKBest(
    score_func=partial(mutual_info_classif, random_state=42),
    k='all',
)
igs_selector.fit(X, y)

# Per-feature mutual-information scores
igs_scores = igs_selector.scores_
igs_feature_scores = pd.DataFrame({
    'Feature': X.columns,
    'IGS_Score': igs_scores
})

# **Merge Scores from Both Methods**
combined_scores = pd.merge(ufs_feature_scores, igs_feature_scores, on="Feature", how="inner")

# **Compute Combined Score**
# F-statistics and mutual information live on very different scales, so adding
# the raw values lets the F-score decide the ranking almost on its own.
# Each method is therefore converted to a percentile rank in (0, 1] first;
# features whose score is NaN (e.g. constant columns under f_classif) get 0.
combined_scores['UFS_Rank'] = combined_scores['UFS_Score'].rank(pct=True).fillna(0.0)
combined_scores['IGS_Rank'] = combined_scores['IGS_Score'].rank(pct=True).fillna(0.0)
combined_scores['Total_Score'] = combined_scores['UFS_Rank'] + combined_scores['IGS_Rank']

# **Sort by Combined Score & Select Top 10**
# A stable sort keeps tied features in their original column order, so the
# selection is deterministic.
top_10_combined_features = combined_scores.sort_values(
    by="Total_Score", ascending=False, kind="stable"
).head(10)
top_10_feature_names = top_10_combined_features['Feature'].tolist()

# **Filter Original Dataset with Selected Features**
df_top_10 = df[top_10_feature_names]

# Print Results
print("Top 10 Features (Combined from UFS & IGS):", top_10_feature_names)


Top 10 Features (Combined from UFS & IGS): ['TT4', 'tumor', 'TSH_measured', 'FTI', 'pregnant', 'TSH', 'query_hyperthyroid', 'T4U', 'on_antithyroid_meds', 'query_hypothyroid']


In [None]:
# View the DataFrame restricted to the top-10 selected features
df_top_10

Unnamed: 0,TT4,tumor,TSH_measured,FTI,pregnant,TSH,query_hyperthyroid,T4U,on_antithyroid_meds,query_hypothyroid
0,48.000000,0,1,47.000000,0,68.000000,0,1.020000,0,0
1,157.000000,0,1,176.000000,0,0.050000,0,0.890000,0,0
2,33.000000,0,1,31.000000,0,140.000000,0,1.070000,0,0
3,114.000000,0,1,136.000000,0,9.799999,0,0.840000,0,0
4,7.500000,0,1,7.500000,0,90.000000,0,0.940000,0,0
...,...,...,...,...,...,...,...,...,...,...
975,44.150872,0,1,30.319927,0,49.547225,0,1.068935,0,0
976,34.385919,0,1,46.113455,0,94.968805,0,1.005763,0,0
977,41.198013,0,1,40.893985,0,84.329749,0,0.961568,0,0
978,24.521011,0,1,20.754870,0,85.019698,0,0.949468,0,0


In [None]:
# Restrict the feature matrix to the 10 selected columns. The target `y`
# assigned earlier is reused unchanged, so it is not reassigned here.
X = df_top_10

In [None]:
# Import required libraries
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import TomekLinks
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.datasets import make_classification

In [None]:
# Hold out 30% of the rows for testing; stratifying on y keeps the class
# proportions the same in both splits, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

In [None]:
# Step 1: Boosting Stage (BS)
# AdaBoost ensemble of 100 depth-2 decision trees, fitted on the training split
weak_learner = DecisionTreeClassifier(max_depth=2)
boosting_model = AdaBoostClassifier(
    estimator=weak_learner,
    n_estimators=100,
    random_state=42,
)
boosting_model.fit(X_train, y_train)

In [None]:
# Step 2: SMOTE oversampling, applied to the training split only
smote = SMOTE(
    sampling_strategy='auto',
    k_neighbors=5,
    random_state=42,
)
X_smote, y_smote = smote.fit_resample(X_train, y_train)

In [None]:
# Step 3: Tomek Links cleaning of the SMOTE output (default TomekLinks settings)
tomek = TomekLinks()
X_resampled, y_resampled = tomek.fit_resample(X=X_smote, y=y_smote)

In [None]:
# Rebuild labelled pandas objects from the resampler output. Both pieces get a
# fresh 0..n-1 index so the column-wise concat below pairs rows by position
# instead of depending on whatever index the resamplers returned.
X_train_res_df = pd.DataFrame(X_resampled, columns=X_train.columns).reset_index(drop=True)
# Naming the Series explicitly (rather than selecting a column by name) works
# whether y_resampled is a Series or a plain array.
y_train_res_df = (
    pd.Series(y_resampled, name=y_train.name)
    .reset_index(drop=True)
    .to_frame()
)

# Features and target must describe the same rows before they are joined.
assert len(X_train_res_df) == len(y_train_res_df), "resampled X and y differ in length"

# Combine the features and target into one DataFrame (preprocessed data)
df_resampled = pd.concat([X_train_res_df, y_train_res_df], axis=1)

# NOTE(review): `df` is rebound to the resampled training data, as in the
# original cell, so any later cell that reads `df` keeps working. This
# overwrites the dataset that the feature-selection cells read from `df`;
# re-running those cells after this one would score the resampled rows
# instead of the original data. Prefer `df_resampled` in new code and reload
# the source data before re-running earlier cells.
df = df_resampled