In [198]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

In [194]:
from ucimlrepo import fetch_ucirepo

# Fetch dataset id 697 from the UCI ML Repository and separate the feature
# table from the target table.
repo = fetch_ucirepo(id=697)
X, y = repo.data.features, repo.data.targets

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [172]:
class ModelComparison:
    """Fit a few baseline classifiers on one train/test split and report accuracy.

    Parameters
    ----------
    X_train, X_test
        Feature matrices used for fitting and for scoring, respectively.
    y_train, y_test
        Labels that correspond row-for-row to ``X_train`` and ``X_test``.
    """

    def __init__(self, X_train, X_test, y_train, y_test):
        self.X_train = X_train
        self.X_test = X_test
        self.y_train = y_train
        self.y_test = y_test

    def _fit_and_score(self, model, X_train, X_test):
        """Fit ``model`` on ``X_train`` with the stored training labels and
        return its accuracy on ``X_test`` against the stored test labels."""
        model.fit(X_train, self.y_train)
        predictions = model.predict(X_test)
        return accuracy_score(self.y_test, predictions)

    def logistic_regression(self):
        """Return the test accuracy of ``LogisticRegression(max_iter=10000)``."""
        return self._fit_and_score(LogisticRegression(max_iter=10000),
                                   self.X_train, self.X_test)

    def svm(self):
        """Return the test accuracy of a linear-kernel ``SVC``."""
        return self._fit_and_score(SVC(kernel='linear'),
                                   self.X_train, self.X_test)

    def knn(self, k):
        """Return the test accuracy of a ``k``-nearest-neighbours classifier.

        The labels come from this instance (``self.y_train`` / ``self.y_test``),
        not from module-level variables, so the result always matches the data
        the instance was constructed with.
        """
        # The feature matrices are converted with np.asarray before fitting, as
        # in the original implementation (presumably to hand KNN plain arrays
        # rather than DataFrames -- confirm before removing).
        return self._fit_and_score(KNeighborsClassifier(n_neighbors=k),
                                   np.asarray(self.X_train),
                                   np.asarray(self.X_test))

    def evaluate(self):
        """Print the accuracy of every model, with KNN at k = 1, 3, 5, 7, 9."""
        print(f"logregacc: {self.logistic_regression()}")
        print(f"svmacc: {self.svm()}")
        for k in [1, 3, 5, 7, 9]:
            print(f"knn-{k}: {self.knn(k)}")

In [173]:
# Baseline: compare the classifiers on the raw, unscaled features.
vanilla = ModelComparison(X_train=X_train, X_test=X_test,
                          y_train=y_train, y_test=y_test)
vanilla.evaluate()

logregacc: 0.7502824858757062
svmacc: 0.752542372881356
knn-1: 0.5649717514124294
knn-3: 0.5988700564971752
knn-5: 0.6090395480225989
knn-7: 0.5966101694915255
knn-9: 0.6011299435028249


In [174]:
'''applying minmax feature scaling'''
# Learn the per-feature range from the training split only, then apply that
# same mapping to both splits so no test-set statistics leak into the scaler.
min_max_scaler = MinMaxScaler().fit(X_train)
X_train_minmax = min_max_scaler.transform(X_train)
X_test_minmax = min_max_scaler.transform(X_test)

minmax = ModelComparison(X_train=X_train_minmax, X_test=X_test_minmax,
                         y_train=y_train, y_test=y_test)
minmax.evaluate()

logregacc: 0.7491525423728813
svmacc: 0.7423728813559322
knn-1: 0.6282485875706215
knn-3: 0.6632768361581921
knn-5: 0.6870056497175141
knn-7: 0.6779661016949152
knn-9: 0.6870056497175141


In [116]:
'''applying sklearn standard feature scaling'''
from sklearn.preprocessing import StandardScaler

# Estimate mean and variance on the training split only, then standardize
# both splits with those training statistics.
stdsc = StandardScaler().fit(X_train)
X_train_std = stdsc.transform(X_train)
X_test_std = stdsc.transform(X_test)

std = ModelComparison(X_train=X_train_std, X_test=X_test_std,
                      y_train=y_train, y_test=y_test)
std.evaluate()

logregacc: 0.768361581920904
svmacc: 0.7559322033898305
knn-1: 0.6305084745762712
knn-3: 0.6644067796610169
knn-5: 0.6677966101694915
knn-7: 0.6734463276836158
knn-9: 0.6937853107344633


In [208]:
'''L1 regularization for logistic regression models'''
# liblinear is used here because it supports the L1 penalty.
lr = LogisticRegression(
    penalty='l1',
    C=1.0,
    solver='liblinear',
    multi_class='ovr',
).fit(X_train, y_train)

predictions = lr.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy: {accuracy}")

Accuracy: 0.7593220338983051


logregacc: 0.7502824858757062
svmacc: 0.752542372881356
knn-1: 0.5649717514124294
knn-3: 0.5988700564971752
knn-5: 0.6090395480225989
knn-7: 0.5966101694915255
knn-9: 0.6011299435028249


In [126]:
'''Varying regularization strength by varying C'''
for c in np.arange(-4., 6.):
    lr = LogisticRegression(penalty='l2',
                       C=10.**c,
                       solver='liblinear',
                       multi_class='ovr',
                       random_state=42)
    lr.fit(X_train, y_train)
    predictions = lr.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    print(f"{c}: {accuracy}") 

-4.0: 0.7005649717514124
-3.0: 0.7412429378531074
-2.0: 0.7457627118644068
-1.0: 0.7638418079096045
0.0: 0.7593220338983051
1.0: 0.7638418079096045
2.0: 0.7638418079096045
3.0: 0.7604519774011299
4.0: 0.7615819209039548
5.0: 0.7627118644067796


In [176]:
'''use a random forest classifier to assess feature importance'''
forest = RandomForestClassifier(n_estimators=500, random_state=42).fit(X_train, y_train)
features = X.columns
importances = forest.feature_importances_

# Column positions ordered from the most to the least important feature.
indices = np.argsort(importances)[::-1]

# One line per feature: name followed by its importance score.
for name, score in zip(features[indices], importances[indices]):
    print(name, score)

Curricular units 2nd sem (approved) 0.14251911655035635
Curricular units 2nd sem (grade) 0.10304204177962786
Curricular units 1st sem (approved) 0.09744701755827954
Curricular units 1st sem (grade) 0.06443600957126977
Curricular units 2nd sem (evaluations) 0.04290551807912366
Tuition fees up to date 0.04275928457015821
Admission grade 0.04219932756780902
Previous qualification (grade) 0.037744396720256354
Curricular units 1st sem (evaluations) 0.03608591810760967
Age at enrollment 0.03562577054617887
Course 0.033006553890450464
Father's occupation 0.02909368048710706
Mother's occupation 0.025796796750648825
GDP 0.023500176431770523
Curricular units 2nd sem (enrolled) 0.022047780891788556
Unemployment rate 0.021709800975935136
Father's qualification 0.02125354681135768
Mother's qualification 0.020722141806487886
Application mode 0.020715168315078976
Inflation rate 0.02065063559298854
Curricular units 1st sem (enrolled) 0.01812900581811509
Scholarship holder 0.016914406546804567
Applicat

In [145]:
'''limit features to top four, eight, and sixteen most important features'''
# Prefixes of the importance-sorted column positions.
top_four, top_eight, top_sixteen = (indices[:n] for n in (4, 8, 16))

In [183]:
# Re-run the model comparison on growing prefixes of the importance ranking:
# the top 3, 9, 15, 21, 27 and 33 features.
for i in range(3, 36, 6):
    top_cols = list(indices[:i])
    new_X_train = X_train.iloc[:, top_cols]
    new_X_test = X_test.iloc[:, top_cols]
    model = ModelComparison(new_X_train, new_X_test, y_train, y_test)
    model.evaluate()
    

logregacc: 0.6903954802259887
svmacc: 0.6915254237288135
knn-1: 0.6508474576271186
knn-3: 0.6813559322033899
knn-5: 0.688135593220339
knn-7: 0.6745762711864407
knn-9: 0.6734463276836158
logregacc: 0.7231638418079096
svmacc: 0.7062146892655368
knn-1: 0.615819209039548
knn-3: 0.632768361581921
knn-5: 0.6531073446327683
knn-7: 0.6644067796610169
knn-9: 0.6666666666666666
logregacc: 0.751412429378531
svmacc: 0.7536723163841808
knn-1: 0.6056497175141243
knn-3: 0.6440677966101694
knn-5: 0.6338983050847458
knn-7: 0.632768361581921
knn-9: 0.6440677966101694
logregacc: 0.7502824858757062
svmacc: 0.7548022598870057
knn-1: 0.5570621468926553
knn-3: 0.5954802259887005
knn-5: 0.6090395480225989
knn-7: 0.5988700564971752
knn-9: 0.6135593220338983
logregacc: 0.7593220338983051
svmacc: 0.7502824858757062
knn-1: 0.5570621468926553
knn-3: 0.5966101694915255
knn-5: 0.6135593220338983
knn-7: 0.6011299435028249
knn-9: 0.607909604519774
logregacc: 0.751412429378531
svmacc: 0.7502824858757062
knn-1: 0.566101

In [196]:
'''unsupervised dimensionality reduction with principal component analysis'''
# PCA is constructed with its defaults (no explicit n_components), paired with
# a one-vs-rest logistic regression that is trained on the projected data.
pca = PCA()
lr = LogisticRegression(solver='lbfgs', multi_class='ovr', random_state=1)

In [197]:
# Fit PCA on the training split only, then project both splits with it.
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

# Train and predict in the PCA-projected space.
lr.fit(X_train_pca, y_train)
predictions = lr.predict(X_test_pca)

# Report the test accuracy; without this line the cell produces no output.
print(accuracy_score(y_test, predictions))

0.7028248587570621


In [207]:
'''supervised data compression via linear discriminant analysis'''
lda = LDA(n_components=2)
X_train_lda = lda.fit_transform(X_train, y_train)
X_test_lda = lda.fit(X_train_lda, y_train)
lr = LogisticRegression(multi_class='ovr', random_state=1,
                        solver='lbfgs')
lr = lr.fit(X_train, y_train)
predictions = lr.predict(X_test_lda)
print(accuracy_score(y_test, predictions))

ValueError: Expected 2D array, got scalar array instead:
array=LinearDiscriminantAnalysis(n_components=2).
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [204]:
# Score the classifier on the LDA-projected test set; X_test_lda must be the
# projected feature matrix for predict() to accept it.
predictions = lr.predict(X_test_lda)
print(accuracy_score(y_true=y_test, y_pred=predictions))

ValueError: Expected 2D array, got scalar array instead:
array=LinearDiscriminantAnalysis(n_components=2).
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.