In [1]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
from sklearn import svm
from sklearn import tree
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score,recall_score,f1_score,accuracy_score
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

In [2]:
# Feature tables for the train / validation / test splits, read in that order.
X_train, X_val, X_test = (
    pd.read_csv(csv_path) for csv_path in ('X_train.csv', 'X_val.csv', 'X_test.csv')
)

# Matching target tables for the same three splits (kept as DataFrames).
y_train, y_val, y_test = (
    pd.read_csv(csv_path) for csv_path in ('y_train.csv', 'y_val.csv', 'y_test.csv')
)

In [6]:
# Name of the feature listing to load, and of the report that collects the metrics.
selected_features_list = 'logistic_regression_lasso.txt'
results_file = 'feature_selection_logistic_lasso_results.txt'

# Write mode: an existing report with this name is overwritten. The handle
# stays open across the following cells until it is closed explicitly.
f = open(results_file, mode="w")

# Get selected features

The `important_features` folder contains one .txt file per feature selection method; each file lists the features that method selected, one feature name per line.

In [12]:
# Read the selected feature names, one per line of the listing file.
# Only the trailing newline is stripped (rather than unconditionally dropping
# the last character), so a final line that is not newline-terminated keeps
# its complete name. Blank lines are skipped so they never end up as empty
# column labels in the selections below.
selected_feats=[]
with open(r'important_features/'+selected_features_list, 'r') as fp:
    for line in fp:
        x = line.rstrip('\n')
        if x:
            selected_feats.append(x)

# Keep only the selected feature columns in every split.
X_train = X_train[selected_feats]
X_val = X_val[selected_feats]
X_test = X_test[selected_feats]

In [13]:
# Sanity check: number of feature names read from the selection file
# (shown as the cell output).
len(selected_feats)

95

# Testing against validation set only

In [8]:
# Standardise the features: the scaler is fitted on the training split only,
# and the validation split is transformed with that same fitted scaler.
scaler = StandardScaler()
s = scaler.fit(X_train)  # `s` keeps whatever fit() returns
X_train, X_val = scaler.transform(X_train), scaler.transform(X_val)

In [9]:
def print_metrics(y_val,y_pred,f,classifier):
    """Write precision, recall, F1 and accuracy for one classifier to ``f``.

    Despite the name, nothing is printed: each metric is written as its own
    line to ``f``, followed by a dashed separator line.

    Parameters
    ----------
    y_val : true labels the predictions are scored against.
    y_pred : predicted labels to score.
    f : object exposing ``write(str)``, e.g. an open text file.
    classifier : label placed at the start of every metric line.
    """
    # (label, scoring function) pairs in the order they appear in the report.
    report_rows = (
        ('Precision', precision_score),
        ('Recall', recall_score),
        ('f1_score', f1_score),
        ('Accuracy', accuracy_score),
    )
    # Each score is computed right before its line is written, one at a time.
    for label, scorer in report_rows:
        f.write(f'{classifier} {label}: {scorer(y_val,y_pred)}\n')
    f.write('---------------------------------------------------------------------------\n')

# Logistic Regression Test

In [12]:
# Fit logistic regression on the scaled training split, then score its
# validation predictions into the results file.
clf = LogisticRegression(max_iter=1000, random_state=0)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_val)
print_metrics(y_val, y_pred, f, classifier="logistic regression")

# SVM test

In [13]:
# Support vector classifier with default settings, evaluated on the
# validation split.
clf1 = svm.SVC().fit(X_train, y_train)
y_pred = clf1.predict(X_val)
print_metrics(y_val, y_pred, f, classifier="SVM")

# Naive Bayes

In [14]:
# Gaussian Naive Bayes with default settings, evaluated on the validation split.
clf2 = GaussianNB().fit(X_train, y_train)
y_pred = clf2.predict(X_val)
print_metrics(y_val, y_pred, f, classifier="Naive Bayes")

# KNN

In [15]:
# k-nearest-neighbours classifier using 3 neighbours, evaluated on the
# validation split.
clf3 = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)
y_pred = clf3.predict(X_val)
print_metrics(y_val, y_pred, f, classifier="KNN")

# Decision Tree

In [16]:
# Decision tree evaluated on the validation split.
# The seed is fixed (matching the random_state=0 used by the other seeded
# estimators in this notebook) so the fitted tree, and therefore the metrics
# written to the results file, are the same on every run.
clf4 = tree.DecisionTreeClassifier(random_state=0)
clf4.fit(X_train, y_train)
y_pred = clf4.predict(X_val)
print_metrics(y_val,y_pred,f,"Decision Tree")

# LDA

In [17]:
# Linear discriminant analysis with default settings, evaluated on the
# validation split.
clf5 = LinearDiscriminantAnalysis().fit(X_train, y_train)
y_pred = clf5.predict(X_val)
print_metrics(y_val, y_pred, f, classifier="LDA")

# QDA

In [18]:
# Quadratic discriminant analysis with default settings, evaluated on the
# validation split.
clf6 = QuadraticDiscriminantAnalysis().fit(X_train, y_train)
y_pred = clf6.predict(X_val)
print_metrics(y_val, y_pred, f, classifier="QDA")

# ADABOOST

In [19]:
# AdaBoost ensemble of 100 estimators with a fixed seed, evaluated on the
# validation split. AdaBoostClassifier is imported in the notebook's import
# cell at the top rather than here, so all dependencies are declared in one place.
clf7 = AdaBoostClassifier(n_estimators=100, random_state=0)
clf7.fit(X_train, y_train)
y_pred = clf7.predict(X_val)
print_metrics(y_val,y_pred,f,"ADABOOST")

In [20]:
# Close the results file handle that the classifier cells have been writing to.
f.close()