# Model Training

In [None]:
# Separate the label column from the predictor columns
y = df["target"]  # Target
X = df.drop(columns="target")  # Features

# Hold out 30% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both partitions, and the fixed seed makes the
# split repeatable.
# NOTE(review): train_test_split is imported in a later cell of this section;
# confirm it is also imported before this cell runs on a fresh kernel.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

In [None]:
# Training features (bare last expression so the notebook renders the frame)
X_train

Unnamed: 0,TT4,tumor,TSH_measured,FTI,pregnant,TSH,query_hyperthyroid,T4U,on_antithyroid_meds,query_hypothyroid
687,158.564358,0,1,803.659311,0,0.034718,0,0.515604,0,0
2486,35.863171,0,1,48.878423,0,45.375180,0,1.025927,0,0
1525,55.319101,0,1,134.992816,0,3.597135,0,0.844334,0,0
2320,51.917141,0,1,173.248335,0,0.971284,0,1.006672,0,0
1307,134.455349,1,1,85.061499,0,0.919122,0,1.580600,0,0
...,...,...,...,...,...,...,...,...,...,...
791,215.519365,0,1,218.038730,0,0.168508,0,0.987403,0,0
2477,31.347996,0,1,36.642790,0,91.455630,0,1.011677,0,0
1387,229.014088,0,1,280.888268,0,22.347887,0,0.820822,0,0
922,138.642593,0,1,121.449206,0,0.065459,1,1.141016,0,0


In [None]:
# Training target (bare last expression so the notebook renders the labels)
y_train

Unnamed: 0,target
687,0
2486,8
1525,3
2320,7
1307,2
...,...
791,0
2477,8
1387,3
922,1


In [None]:
# Testing features (bare last expression so the notebook renders the frame)
X_test

Unnamed: 0,TT4,tumor,TSH_measured,FTI,pregnant,TSH,query_hyperthyroid,T4U,on_antithyroid_meds,query_hypothyroid
2555,14.691105,0,1,38.858245,0,91.148653,0,1.035202,0,0
2503,44.016242,0,1,44.946328,0,57.271672,0,1.054205,0,0
511,198.000000,0,1,211.000000,0,0.100000,0,0.940000,0,0
986,128.919270,0,1,128.219726,0,0.028700,1,1.009596,0,0
1195,132.720318,1,1,83.133687,0,0.852612,0,1.593131,0,0
...,...,...,...,...,...,...,...,...,...,...
1548,148.646265,0,1,396.137116,0,12.246367,0,0.428478,0,0
743,134.888934,0,1,165.793677,0,0.014762,0,0.813968,0,0
2257,56.542312,0,1,778.677091,0,5.738975,0,1.614459,0,0
1790,16.000000,0,0,15.000000,0,160.281366,0,1.100000,0,1


In [None]:
# Testing target (bare last expression so the notebook renders the labels)
y_test

Unnamed: 0,target
2555,8
2503,8
511,0
986,1
1195,2
...,...
1548,3
743,0
2257,7
1790,4


In [None]:
pip install catboost


Collecting catboost
  Downloading catboost-1.2.7-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp311-cp311-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.7


# # Algorithm Implementation

In [None]:
# Import necessary libraries
import xgboost as xgb
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, StackingClassifier, BaggingClassifier, VotingClassifier
from sklearn.inspection import permutation_importance
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils.validation import has_fit_parameter

In [None]:
# Intentionally disabled: catboost is already installed by the pip cell earlier in this section.
#pip install catboost

In [None]:


# Define the candidate base classifiers, keyed by display name.
#
# Every learner that has a stochastic component receives the same seed (42,
# matching the train/test split) so the scores and the ranking derived from
# them are reproducible across kernel restarts. KNN has no random component.
classifiers = {
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    # probability=True enables predict_proba, which soft voting needs.
    'SVM': SVC(probability=True, random_state=42),
    'KNN': KNeighborsClassifier(),
    'Random Forest': RandomForestClassifier(random_state=42),
    'AdaBoost': AdaBoostClassifier(random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    # `use_label_encoder` was dropped: the installed XGBoost reports it as an
    # unused parameter. The target has more than two classes, so the
    # multiclass log-loss ('mlogloss') is the matching evaluation metric.
    'Xgboost': xgb.XGBClassifier(eval_metric='mlogloss', random_state=42),
    'CatBoost': CatBoostClassifier(
        iterations=500,
        learning_rate=0.1,
        depth=6,
        verbose=0,  # silence per-iteration training logs
        random_seed=42,
    ),
}


"Permutation Feature Importance (PFI) evaluates how important each feature is to a trained machine learning model by measuring how much the model's score drops when that feature's values are randomly shuffled."

In [None]:
# Step 1: Fit every candidate classifier and summarise its Permutation
# Feature Importance (PFI) as a single number: the mean, over all features,
# of the per-feature mean importance across 10 shuffles.
# NOTE(review): importances are measured on the training split, so a model
# that memorises the training data may score higher here — confirm this is
# the intended selection criterion.
pfi_results = {}
for name, clf in classifiers.items():
    pfi = permutation_importance(
        clf.fit(X_train, y_train),  # fit() returns the fitted estimator itself
        X_train,
        y_train,
        n_repeats=10,
        random_state=42,
    )
    pfi_results[name] = pfi.importances_mean.mean()
    print(f"Classifier: {name}, PFI Score: {pfi_results[name]}")

Classifier: Decision Tree, PFI Score: 0.175805387575591
Classifier: SVM, PFI Score: 0.10108301264431008
Classifier: KNN, PFI Score: 0.1505827377680044
Classifier: Random Forest, PFI Score: 0.12839472237493127
Classifier: AdaBoost, PFI Score: 0.08170973062122047
Classifier: Gradient Boosting, PFI Score: 0.1418581638262782


Parameters: { "use_label_encoder" } are not used.



Classifier: Xgboost, PFI Score: 0.1289114898295767
Classifier: CatBoost, PFI Score: 0.14774051676745464


In [None]:
# Step 2: Select the Half-Most Effective Classifiers (HEC).
# Rank the (name, score) pairs from highest to lowest PFI score, then keep the
# names in the top half of that ranking (rounded down for odd counts).
sorted_classifiers = sorted(
    pfi_results.items(),
    key=lambda pair: pair[1],
    reverse=True,
)
top_half = sorted_classifiers[:len(sorted_classifiers) // 2]
hec_classifiers = [name for name, _score in top_half]
print(f"Selected HEC Classifiers: {hec_classifiers}")

Selected HEC Classifiers: ['Decision Tree', 'KNN', 'CatBoost', 'Gradient Boosting']


In [None]:
# Step 3: Define the ensemble methods using the selected classifiers
# (name, estimator) pairs in HEC ranking order, best first.
estimators = [(name, classifiers[name]) for name in hec_classifiers]

# AdaBoost re-weights the training samples every round, so its base learner
# must accept `sample_weight` in fit(). The top-ranked classifier is not
# guaranteed to (KNN, for instance, does not), so take the best-ranked one that
# does. If none qualifies, None makes AdaBoost fall back to its default base
# learner instead of failing at fit time.
boosting_base = next(
    (clf for _, clf in estimators if has_fit_parameter(clf, "sample_weight")),
    None,
)

# Define ensemble methods. The randomised ensembles are seeded (42, matching
# the train/test split) so their accuracies are reproducible.
boosting = AdaBoostClassifier(estimator=boosting_base, random_state=42)
bagging = BaggingClassifier(estimator=estimators[0][1], random_state=42)
# Soft voting averages predicted class probabilities across the estimators.
voting = VotingClassifier(estimators=estimators, voting='soft')
stacking = StackingClassifier(estimators=estimators)

ensemble_methods = {
    'Boosting': boosting,
    'Bagging': bagging,
    'Voting': voting,
    'Stacking': stacking
}

In [None]:
# Step 4: Fit each ensemble on the training split and record its accuracy
# on the held-out test split, keyed by ensemble name.
accuracy_results = {}
for name, ensemble in ensemble_methods.items():
    # fit() returns the fitted estimator, so prediction can be chained.
    y_pred = ensemble.fit(X_train, y_train).predict(X_test)
    accuracy_results[name] = accuracy = accuracy_score(y_test, y_pred)
    print(f"Ensemble Method: {name}, Accuracy: {accuracy}")

Ensemble Method: Boosting, Accuracy: 0.9794871794871794
Ensemble Method: Bagging, Accuracy: 0.9833333333333333
Ensemble Method: Voting, Accuracy: 0.9871794871794872
Ensemble Method: Stacking, Accuracy: 0.9910256410256411


In [None]:
# Step 5: Select the Most Efficient Ensemble Method (EEM):
# the name whose recorded test accuracy is highest (first one wins on ties).
best_ensemble = max(accuracy_results.items(), key=lambda item: item[1])[0]
print(f"Most Efficient Ensemble Method: {best_ensemble}")

Most Efficient Ensemble Method: Stacking
