In [1]:
from sklearn.ensemble import BaggingClassifier
#from sklearn.ensemble import BalancedBaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
import pandas as pd
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.utils import resample
import numpy as np
import random
import re
import warnings; warnings.simplefilter('ignore')

In [61]:
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
adult = fetch_ucirepo(id=2) 
  
# data (as pandas dataframes) 
X = adult.data.features 
y = adult.data.targets 
#'<=50K', '>50K', '<=50K.', '>50K.'
y['income']=y['income'].replace({'<=50K.':'<=50K'})
y['income']=y['income'].replace({'>50K.':'>50K'})
df = pd.concat([X,y],axis=1)
# metadata 
print(adult.metadata) 
  
# variable information 
print(adult.variables) 
df.fillna("?")
df = pd.get_dummies(df, drop_first=True).values
X = df[0:, :-1]
y = df[0:, -1]
y=y.astype('int')


{'uci_id': 2, 'name': 'Adult', 'repository_url': 'https://archive.ics.uci.edu/dataset/2/adult', 'data_url': 'https://archive.ics.uci.edu/static/public/2/data.csv', 'abstract': 'Predict whether income exceeds $50K/yr based on census data. Also known as "Census Income" dataset. ', 'area': 'Social Science', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 48842, 'num_features': 14, 'feature_types': ['Categorical', 'Integer'], 'demographics': ['Age', 'Income', 'Education Level', 'Other', 'Race', 'Sex'], 'target_col': ['income'], 'index_col': None, 'has_missing_values': 'yes', 'missing_values_symbol': 'NaN', 'year_of_dataset_creation': 1996, 'last_updated': 'Mon Aug 07 2023', 'dataset_doi': '10.24432/C5XW20', 'creators': ['Barry Becker', 'Ronny Kohavi'], 'intro_paper': None, 'additional_info': {'summary': 'Extraction was done by Barry Becker from the 1994 Census database.  A set of reasonably clean records was extracted using the following conditions: ((AAG

In [62]:
rows = []
def add_to_rows(classifier_name, data):
    for split, values in data.items():
        train_acc, val_acc, test_acc, hyperparams = values
        rows.append([classifier_name, split, train_acc, val_acc, test_acc, hyperparams])


In [63]:
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
import numpy as np


X, y = shuffle(X, y, random_state=0)
X_D, X_test, y_D, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Define the function to perform training and validation
def train_and_validate(X, y, test_size,clf,X_test=X_test,y_test=y_test):
    # Splitting the data according to the given test_size
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=1-test_size, random_state=1)

    # Fit the model
    clf.fit(X_train, y_train)

    # Evaluate the model
    train_accuracy = clf.score(X_train, y_train)
    val_accuracy = clf.score(X_val, y_val)
    test_accuracy = clf.score(X_test, y_test)
    best_params = clf.best_params_

    return train_accuracy, val_accuracy, test_accuracy, best_params

def get_results(X_D,y_D,clf,X_test=X_test,y_test=y_test,redo_hyperparam=False):
    splits = [0.2, 0.5, 0.8]
    results = {}
    for split in splits:
        print("doing split: "+str(split))
        a1,b1,c1,d1 = train_and_validate(X_D, y_D, split,clf,X_test=X_test,y_test=y_test)
        print(a1)
        #only optimize hyperparameters once for performance
        d_copy=d1
        for key, value in d_copy.items():
            d_copy[key] = [value]
        if not redo_hyperparam:
            clf.param_grid=d_copy
            print("found hyperparams")
        a2,b2,c2,d2 = train_and_validate(X_D, y_D, split,clf,X_test=X_test,y_test=y_test)
        print(a2)
        a3,b3,c3,d3 = train_and_validate(X_D, y_D, split,clf,X_test=X_test,y_test=y_test)
        print(a3)
        
        results[f"{int(split*100)}:{int((100-split*100))}"] = ((a1+a2+a3)/3,(b1+b2+b3)/3,(c1+c2+c3)/3,d1)
        #results[f"{int(split*100)}:{int((100-split*100))}"] = train_and_validate(X_D, y_D, split,clf)
    return results


In [64]:
C_list = [.000001,.00001,.0001,0.001, 0.01, 0.1, 1, 10,100,1000]
param_grid = {'C': C_list}
classifier = LogisticRegression(solver="liblinear", class_weight="balanced")
clf = GridSearchCV(classifier, param_grid, cv=3)
r = get_results(X_D,y_D,clf,X_test=X_test,y_test=y_test)
add_to_rows("Logistic Regression", r)
print(r)

doing split: 0.2
0.8012541592014333
found hyperparams
0.8012541592014333
0.8012541592014333
doing split: 0.5
0.7984234234234234
found hyperparams
0.7984234234234234
0.7984234234234234
doing split: 0.8
0.7921492098022906
found hyperparams
0.7921492098022906
0.7921492098022906
{'20:80': (0.8012541592014332, 0.8024888832016379, 0.802333913399529, {'C': [100]}), '50:50': (0.7984234234234234, 0.8002252136970874, 0.7993653393387246, {'C': [100]}), '80:20': (0.7921492098022908, 0.7974408189379399, 0.7937352850854745, {'C': [100]})}


In [65]:
classifier = LinearSVC(class_weight="balanced")
clf = GridSearchCV(classifier, param_grid, cv=3)
r = get_results(X_D,y_D,clf,X_test=X_test,y_test=y_test,redo_hyperparam=True)
add_to_rows("Linear SVC", r)
print(r)

doing split: 0.2
0.776938827745073
0.7787304837471205
0.772843614026107
doing split: 0.5
0.7810196560196561
0.786496723996724
0.7856265356265356
doing split: 0.8
0.7845351589992962
0.7974918420884254
0.2600614242753855
{'20:80': (0.7761709751727667, 0.7732173134137369, 0.7737059405602756, {'C': [0.1]}), '50:50': (0.7843809718809719, 0.7789664056235178, 0.7820315965469001, {'C': [1e-06]}), '80:20': (0.6140294751210357, 0.6098528470889315, 0.6153819906506978, {'C': [0.01]})}


In [66]:
from sklearn.ensemble import RandomForestClassifier
max_features_options = [1, 2, 4, 8, 16]
param_grid = {'max_features': max_features_options}
classifier = RandomForestClassifier(n_estimators=1024, random_state=42)
clf = GridSearchCV(classifier, param_grid, cv=3)
r = get_results(X_D,y_D,clf,X_test=X_test,y_test=y_test)
add_to_rows("Random Forest", r)
print(r)

doing split: 0.2
1.0
found hyperparams
1.0
1.0
doing split: 0.5
1.0
found hyperparams
1.0
1.0
doing split: 0.8
1.0
found hyperparams
1.0
1.0
{'20:80': (1.0, 0.8548897917399789, 0.8565871634763026, {'max_features': [16]}), '50:50': (1.0, 0.8552490146900752, 0.8522878493192753, {'max_features': [16]}), '80:20': (1.0, 0.8573256557901473, 0.8547446002661481, {'max_features': [16]})}


In [67]:
from sklearn.tree import DecisionTreeClassifier

# Define the parameter grid
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 5, 10, 15],
    #'min_samples_split': [2, 5, 10],
    #'min_samples_leaf': [1, 2, 4]
}

# Create the base model
classifier = DecisionTreeClassifier(random_state=42)

# Instantiate the grid search model
clf = GridSearchCV(estimator=classifier, param_grid=param_grid, cv=3)

r = get_results(X_D,y_D,clf,X_test=X_test,y_test=y_test)
add_to_rows("Decision Tree", r)
print(r)


doing split: 0.2
0.8594829792679806
found hyperparams
0.8594829792679806
0.8594829792679806
doing split: 0.5
0.8524774774774775
found hyperparams
0.8524774774774775
0.8524774774774775
doing split: 0.8
0.8536374688079852
found hyperparams
0.8536374688079852
0.8536374688079852
{'20:80': (0.8594829792679807, 0.8514027959947535, 0.8548469648889344, {'criterion': ['gini'], 'max_depth': [5]}), '50:50': (0.8524774774774775, 0.8472129804985412, 0.8497287337496161, {'criterion': ['gini'], 'max_depth': [5]}), '80:20': (0.8536374688079852, 0.8496481126039668, 0.8517760262053434, {'criterion': ['gini'], 'max_depth': [5]})}


In [68]:
from sklearn.neural_network import MLPClassifier

# Define the list of hidden unit sizes and momentum values
hidden_units_list = [(1,), (4,), (32,), (128,)]  # Each tuple represents a layer configuration
momentum_list = [0, 0.2, 0.5, 0.9]

# Create the parameter grid
param_grid = {
    'hidden_layer_sizes': hidden_units_list#,
    #'momentum': momentum_list
}

# Initialize the ANN classifier
# solver='sgd' is used here to enable the momentum parameter
ann_classifier = MLPClassifier(solver='sgd',max_iter=5)

# Create the GridSearchCV object
clf = GridSearchCV(ann_classifier, param_grid, cv=3)
r = get_results(X_D,y_D,clf,X_test=X_test,y_test=y_test)
add_to_rows("ANN", r)
print(r)

doing split: 0.2
0.7637573585871512
found hyperparams
0.7637573585871512
0.7637573585871512
doing split: 0.5
0.7660217035217035
found hyperparams
0.765509828009828
0.7642301392301393
doing split: 0.8
0.7629086953739843
found hyperparams
0.764508285878815
0.7629406871840809
{'20:80': (0.7637573585871512, 0.7598771553792508, 0.7609786057938376, {'hidden_layer_sizes': [(32,)]}), '50:50': (0.7652538902538902, 0.7583559400112607, 0.7618998873989149, {'hidden_layer_sizes': [(32,)]}), '80:20': (0.7634525561456268, 0.7541480059714226, 0.762070495103559, {'hidden_layer_sizes': [(32,)]})}


In [69]:
from sklearn.neighbors import KNeighborsClassifier
import numpy as np

# Define the range of K values
K_values = np.linspace(1, 1000, 13, dtype=int)

# Define the weighting functions

# Create the parameter grid
param_grid = {
    'n_neighbors': K_values,
}

# Initialize the KNN classifier
knn_classifier = KNeighborsClassifier(metric='euclidean')

# Create the GridSearchCV object
clf = GridSearchCV(knn_classifier, param_grid, cv=3)
r = get_results(X_D,y_D,clf,X_test=X_test,y_test=y_test)
add_to_rows("K Neighbors", r)
print(r)

doing split: 0.2
0.7761709751727668
found hyperparams
0.7761709751727668
0.7761709751727668
doing split: 0.5
0.7883394758394758
found hyperparams
0.7883394758394758
0.7883394758394758
doing split: 0.8
0.7941327020282808
found hyperparams
0.7941327020282808
0.7941327020282808
{'20:80': (0.7761709751727667, 0.7720656450942128, 0.7735694543965606, {'n_neighbors': [84]}), '50:50': (0.7883394758394758, 0.7821057480677688, 0.7855461152625652, {'n_neighbors': [84]}), '80:20': (0.7941327020282808, 0.781573896353167, 0.7946565666905517, {'n_neighbors': [84]})}


In [71]:
df = pd.DataFrame(rows, columns=['Classifier', 'Train/Test Split', 'Training Acc', 'Validation Acc', 'Testing Acc', 'Hyperparameters'])
# Display the DataFrame
df

Unnamed: 0,Classifier,Train/Test Split,Training Acc,Validation Acc,Testing Acc,Hyperparameters
0,Logistic Regression,20:80,0.801254,0.802489,0.802334,{'C': [100]}
1,Logistic Regression,50:50,0.798423,0.800225,0.799365,{'C': [100]}
2,Logistic Regression,80:20,0.792149,0.797441,0.793735,{'C': [100]}
3,Linear SVC,20:80,0.776171,0.773217,0.773706,{'C': [0.1]}
4,Linear SVC,50:50,0.784381,0.778966,0.782032,{'C': [1e-06]}
5,Linear SVC,80:20,0.614029,0.609853,0.615382,{'C': [0.01]}
6,Random Forest,20:80,1.0,0.85489,0.856587,{'max_features': [16]}
7,Random Forest,50:50,1.0,0.855249,0.852288,{'max_features': [16]}
8,Random Forest,80:20,1.0,0.857326,0.854745,{'max_features': [16]}
9,Decision Tree,20:80,0.859483,0.851403,0.854847,"{'criterion': ['gini'], 'max_depth': [5]}"


In [20]:
df = df.style.set_caption("Adult dataset model performance (accuracies are averaged over 3 trials)")
df

Unnamed: 0,Classifier,Train/Test Split,Training Acc,Validation Acc,Testing Acc,Hyperparameters
0,Logistic Regression,20:80,0.808549,0.805368,0.808988,{'C': [10]}
1,Logistic Regression,50:50,0.795403,0.787838,0.793121,{'C': [10]}
2,Logistic Regression,80:20,0.801939,0.802815,0.806121,{'C': [10]}
3,Linear SVC,20:80,0.776726,0.780309,0.785717,{'C': [10]}
4,Linear SVC,50:50,0.787401,0.787907,0.793121,{'C': [10]}
5,Linear SVC,80:20,0.654371,0.654255,0.658307,{'C': [10]}
6,Decision Tree,20:80,0.850141,0.842829,0.847579,"{'criterion': ['gini'], 'max_depth': [5]}"
7,Decision Tree,50:50,0.849969,0.844705,0.853209,"{'criterion': ['gini'], 'max_depth': [5]}"
8,Decision Tree,80:20,0.85079,0.847985,0.855359,"{'criterion': ['gini'], 'max_depth': [5]}"
9,ANN,20:80,0.765677,0.763983,0.762753,"{'hidden_layer_sizes': [(128,)]}"


In [40]:
wine_red = pd.read_csv('winequality-red.csv',sep=';')
wine_white = pd.read_csv('winequality-white.csv',sep=';')

In [41]:
median_quality = wine_white['quality'].median()
print(median_quality)
wine_white['quality'] = wine_white['quality'].apply(lambda x: 'good' if x >= median_quality else 'bad')
print(wine_white['quality'].value_counts())

median_quality_red = wine_red['quality'].median()
print(median_quality_red)
wine_red['quality'] = wine_red['quality'].apply(lambda x: 'good' if x >= median_quality_red else 'bad')
print(wine_red['quality'].value_counts())


6.0
quality
good    3258
bad     1640
Name: count, dtype: int64
6.0
quality
good    855
bad     744
Name: count, dtype: int64


In [42]:
wine_white[15:300]

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
15,6.6,0.17,0.38,1.5,0.032,28.0,112.0,0.9914,3.25,0.55,11.4,good
16,6.3,0.48,0.04,1.1,0.046,30.0,99.0,0.9928,3.24,0.36,9.6,good
17,6.2,0.66,0.48,1.2,0.029,29.0,75.0,0.9892,3.33,0.39,12.8,good
18,7.4,0.34,0.42,1.1,0.033,17.0,171.0,0.9917,3.12,0.53,11.3,good
19,6.5,0.31,0.14,7.5,0.044,34.0,133.0,0.9955,3.22,0.50,9.5,bad
...,...,...,...,...,...,...,...,...,...,...,...,...
295,6.3,0.33,0.27,1.2,0.046,34.0,175.0,0.9934,3.37,0.54,9.4,good
296,8.3,0.39,0.70,10.6,0.045,33.0,169.0,0.9976,3.09,0.57,9.4,bad
297,7.2,0.19,0.46,3.8,0.041,82.0,187.0,0.9932,3.19,0.60,11.2,good
298,7.5,0.17,0.44,11.3,0.046,65.0,146.0,0.9970,3.17,0.45,10.0,good


In [43]:
wine_red = shuffle(wine_red)
X = wine_red.drop('quality', axis = 1)
y = wine_red['quality']
rows=[]


In [52]:
wine_white = shuffle(wine_white)
X = wine_white.drop('quality', axis = 1)
y = wine_white['quality']
rows=[]

In [53]:
X_D, X_test, y_D, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [54]:
C_list = [.000001,.00001,.0001,0.001, 0.01, 0.1, 1, 10,100,1000]
param_grid = {'C': C_list}
classifier = LogisticRegression(solver="liblinear", class_weight="balanced")
clf = GridSearchCV(classifier, param_grid, cv=3)
r = get_results(X_D,y_D,clf,X_test=X_test,y_test=y_test)
add_to_rows("Logistic Regression", r)
print(r)

doing split: 0.2
0.6934865900383141
found hyperparams
0.6934865900383141
0.6934865900383141
doing split: 0.5
0.7182235834609495
found hyperparams
0.7182235834609495
0.7182235834609495
doing split: 0.8
0.7198468410976387
found hyperparams
0.7198468410976387
0.7198468410976387
{'20:80': (0.6934865900383141, 0.7119617224880382, 0.7040816326530613, {'C': [100]}), '50:50': (0.7182235834609495, 0.7100561510974988, 0.6979591836734694, {'C': [100]}), '80:20': (0.7198468410976387, 0.7117346938775512, 0.7020408163265306, {'C': [100]})}


In [55]:
classifier = LinearSVC(class_weight="balanced")
clf = GridSearchCV(classifier, param_grid, cv=3)
r = get_results(X_D,y_D,clf,X_test=X_test,y_test=y_test,redo_hyperparam=True)
add_to_rows("Linear SVC", r)
print(r)

doing split: 0.2
0.6807151979565773
0.6973180076628352
0.685823754789272
doing split: 0.5
0.6937212863705973
0.698825931597754
0.6641143440530883
doing split: 0.8
0.6576260370134014
0.7169751116783663
0.6898532227185705
{'20:80': (0.6879523201362282, 0.6749601275917065, 0.6680272108843536, {'C': [0.01]}), '50:50': (0.6855538540071465, 0.6932108218478815, 0.6921768707482993, {'C': [100]}), '80:20': (0.6881514571367794, 0.6870748299319728, 0.6863945578231293, {'C': [0.01]})}


In [56]:
from sklearn.ensemble import RandomForestClassifier
max_features_options = [1, 2, 4, 8, 16]
param_grid = {'max_features': max_features_options}
classifier = RandomForestClassifier(n_estimators=1024, random_state=42)
clf = GridSearchCV(classifier, param_grid, cv=3)
r = get_results(X_D,y_D,clf,X_test=X_test,y_test=y_test)
add_to_rows("Random Forest", r)
print(r)

doing split: 0.2
1.0
found hyperparams
1.0
1.0
doing split: 0.5
1.0
found hyperparams
1.0
1.0
doing split: 0.8
1.0
found hyperparams
1.0
1.0
{'20:80': (1.0, 0.7808612440191386, 0.7744897959183673, {'max_features': [1]}), '50:50': (1.0, 0.8080653394589076, 0.8030612244897958, {'max_features': [1]}), '80:20': (1.0, 0.8290816326530613, 0.8275510204081633, {'max_features': [1]})}


In [57]:
from sklearn.tree import DecisionTreeClassifier

# Define the parameter grid
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 5, 10, 15],
    #'min_samples_split': [2, 5, 10],
    #'min_samples_leaf': [1, 2, 4]
}

# Create the base model
classifier = DecisionTreeClassifier(random_state=42)

# Instantiate the grid search model
clf = GridSearchCV(estimator=classifier, param_grid=param_grid, cv=3)

r = get_results(X_D,y_D,clf,X_test=X_test,y_test=y_test)
add_to_rows("Decision Tree", r)
print(r)


doing split: 0.2
0.9438058748403576
found hyperparams
0.9438058748403576
0.9438058748403576
doing split: 0.5
0.9101582440020418
found hyperparams
0.9101582440020418
0.9101582440020418
doing split: 0.8
0.8841735800893427
found hyperparams
0.8841735800893427
0.8841735800893427
{'20:80': (0.9438058748403576, 0.6979266347687401, 0.6561224489795918, {'criterion': ['entropy'], 'max_depth': [10]}), '50:50': (0.9101582440020418, 0.7274119448698316, 0.7377551020408163, {'criterion': ['entropy'], 'max_depth': [10]}), '80:20': (0.8841735800893428, 0.7602040816326531, 0.75, {'criterion': ['entropy'], 'max_depth': [10]})}


In [58]:
from sklearn.neural_network import MLPClassifier

# Define the list of hidden unit sizes and momentum values
hidden_units_list = [(1,), (4,), (32,), (128,)]  # Each tuple represents a layer configuration
momentum_list = [0, 0.2, 0.5, 0.9]

# Create the parameter grid
param_grid = {
    'hidden_layer_sizes': hidden_units_list#,
    #'momentum': momentum_list
}

# Initialize the ANN classifier
# solver='sgd' is used here to enable the momentum parameter
ann_classifier = MLPClassifier(solver='sgd',max_iter=5)

# Create the GridSearchCV object
clf = GridSearchCV(ann_classifier, param_grid, cv=3)
r = get_results(X_D,y_D,clf,X_test=X_test,y_test=y_test)
add_to_rows("ANN", r)
print(r)

doing split: 0.2
0.669220945083014
found hyperparams
0.6641123882503193
0.6500638569604087
doing split: 0.5
0.6559469116896376
found hyperparams
0.6375701888718734
0.5829504849412965
doing split: 0.8
0.6678366305041481
found hyperparams
0.6675175494575623
0.671665603063178
{'20:80': (0.6611323967645807, 0.6707070707070707, 0.6891156462585034, {'hidden_layer_sizes': [(32,)]}), '50:50': (0.6254891951676026, 0.6171516079632465, 0.6095238095238096, {'hidden_layer_sizes': [(32,)]}), '80:20': (0.6690065943416293, 0.6683673469387754, 0.6795918367346938, {'hidden_layer_sizes': [(32,)]})}


In [59]:
from sklearn.neighbors import KNeighborsClassifier
import numpy as np

# Define the range of K values
K_values = np.linspace(1, 1000, 13, dtype=int)

# Define the weighting functions

# Create the parameter grid
param_grid = {
    'n_neighbors': K_values,
}

# Initialize the KNN classifier
knn_classifier = KNeighborsClassifier(metric='euclidean')

# Create the GridSearchCV object
clf = GridSearchCV(knn_classifier, param_grid, cv=3)
r = get_results(X_D,y_D,clf,X_test=X_test,y_test=y_test)
add_to_rows("K Neighbors", r)
print(r)

doing split: 0.2
0.6743295019157088
found hyperparams
0.6743295019157088
0.6743295019157088
doing split: 0.5
0.6697294538029607
found hyperparams
0.6697294538029607
0.6697294538029607
doing split: 0.8
0.6818761965539247
found hyperparams
0.6818761965539247
0.6818761965539247
{'20:80': (0.6743295019157088, 0.6602870813397129, 0.6642857142857143, {'n_neighbors': [84]}), '50:50': (0.6697294538029607, 0.6758550280755488, 0.6806122448979591, {'n_neighbors': [84]}), '80:20': (0.6818761965539247, 0.6568877551020408, 0.6897959183673471, {'n_neighbors': [84]})}


In [11]:
C_list = [.000001,.00001,.0001,0.001, 0.01, 0.1, 1, 10,100,1000]
param_grid = {'C': C_list}
classifier = LogisticRegression(solver="liblinear", class_weight="balanced")
clf = GridSearchCV(classifier, param_grid, cv=3)
r = get_results(X_D,y_D,clf,X_test=X_test,y_test=y_test)
add_to_rows("Logistic Regression", r)
print(r)

{'20:80': (0.7607843137254902, 0.7119140625, 0.703125, {'C': [1000]}), '50:50': (0.755868544600939, 0.7484375, 0.7312499999999998, {'C': [1000]}), '80:20': (0.7605083088954055, 0.73046875, 0.740625, {'C': [1000]})}


In [12]:
from sklearn.ensemble import RandomForestClassifier
max_features_options = [1, 2, 4, 6, 8, 12, 16, 20]
param_grid = {'max_features': max_features_options}
classifier = RandomForestClassifier(n_estimators=1024, random_state=42)
clf = GridSearchCV(classifier, param_grid, cv=3)
r = get_results(X_D,y_D,clf,X_test=X_test,y_test=y_test)
print(r)

{'20:80': (1.0, 0.748046875, 0.7624999999999998, {'max_features': [6]}), '50:50': (1.0, 0.7453125000000002, 0.78125, {'max_features': [6]}), '80:20': (1.0, 0.79296875, 0.8156250000000002, {'max_features': [6]})}


In [None]:
from sklearn.svm import SVC
param_grid = {
    #'C': [10**i for i in range(-4, 3)], 
    'kernel': ['linear', 'poly', 'rbf'],  # Kernel types
    'degree': [2, 3],  # Polynomial degree (only relevant for 'poly' kernel)
    'gamma': [ 0.005, 0.05, 0.5, 2]  # Kernel coefficient for 'rbf'
}

# Initialize the SVM classifier
svm_classifier = SVC()

# Initialize GridSearchCV
clf = GridSearchCV(svm_classifier, param_grid, cv=3)
r = get_results(X_D,y_D,clf,X_test=X_test,y_test=y_test)
add_to_rows("SVC", r)
print(r)

In [None]:
classifier = LinearSVC(class_weight="balanced")
clf = GridSearchCV(classifier, param_grid, cv=3)
r = get_results(X_D,y_D,clf,X_test=X_test,y_test=y_test)
add_to_rows("Linear SVC", r)
print(r)

In [60]:
df = pd.DataFrame(rows, columns=['Classifier', 'Train/Test Split', 'Training Acc', 'Validation Acc', 'Testing Acc', 'Hyperparameters'])
df

Unnamed: 0,Classifier,Train/Test Split,Training Acc,Validation Acc,Testing Acc,Hyperparameters
0,Logistic Regression,20:80,0.693487,0.711962,0.704082,{'C': [100]}
1,Logistic Regression,50:50,0.718224,0.710056,0.697959,{'C': [100]}
2,Logistic Regression,80:20,0.719847,0.711735,0.702041,{'C': [100]}
3,Linear SVC,20:80,0.687952,0.67496,0.668027,{'C': [0.01]}
4,Linear SVC,50:50,0.685554,0.693211,0.692177,{'C': [100]}
5,Linear SVC,80:20,0.688151,0.687075,0.686395,{'C': [0.01]}
6,Random Forest,20:80,1.0,0.780861,0.77449,{'max_features': [1]}
7,Random Forest,50:50,1.0,0.808065,0.803061,{'max_features': [1]}
8,Random Forest,80:20,1.0,0.829082,0.827551,{'max_features': [1]}
9,Decision Tree,20:80,0.943806,0.697927,0.656122,"{'criterion': ['entropy'], 'max_depth': [10]}"
