In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import RadiusNeighborsClassifier

ADULT - https://urldefense.com/v3/__http://archive.ics.uci.edu/dataset/20/census*income__;Kw!!Mih3wA!BW0GBzh4QR1xoMbAhYzMoFTAdkFgEDkXyIMIDJhfKyCTpKpGW-SYpIw118DtvJGDPZaDGkcdIo_KKK6TihQiBR9xnA$ 

In [2]:
income = pd.read_csv("data/adult.data", header=None)
income.columns = [
    'age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 
    'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 
    'hours-per-week', 'native-country','income']

In [3]:
# Assuming 'income' is your DataFrame
label_encoder = LabelEncoder()

# List of categorical columns to label encode
categorical_columns = ['workclass', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country', 'income']

# Label encode each categorical column
for column in categorical_columns:
    income[column] = label_encoder.fit_transform(income[column])

# Drop the 'education' column
income.drop('education', axis=1, inplace=True)

In [6]:
chess = pd.read_csv("data/krkopt.data", header=None)
chess.columns = ['w_king_col', 'w_king_row', 'w_rook_col', 'w_rook_row', 'b_king_col', 'b_king_row', 'turns_win']

mapping_dict = {'draw': -1,
    'zero': 0, 'one': 1, 'two': 2, 'three': 3, 'four': 4, 'five': 5, 'six': 6, 'seven': 7,
    'eight': 8, 'nine': 9, 'ten': 10, 'eleven': 11, 'twelve': 12, 'thirteen': 13, 'fourteen': 14,
    'fifteen': 15, 'sixteen': 16
}

chess['turns_win'] = chess['turns_win'].replace(mapping_dict).astype(int)

mapping_dict = {'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e':5, 'f':6, 'g':7, 'h':8}

# Apply mapping to '_col' columns
col_columns = ['w_king_col', 'w_rook_col', 'b_king_col']

for col in col_columns:
    chess[col] = chess[col].map(mapping_dict).astype(float)

In [7]:
car = pd.read_csv("data/car.data", header=None)
car.columns = ['buying','maint','doors','persons','lug_boot','safety','class']

mapping_dict = {'vhigh':3,'high':2,'med':1,'low':0}
col_columns = ['buying','maint']
for col in col_columns:
    car[col] = car[col].map(mapping_dict).astype(int)

mapping_dict = {'2':2,'3':3,'4':4,'5more':5}
car['doors'] = car['doors'].map(mapping_dict).astype(int)

mapping_dict = {'2':2,'4':4,'more':6}
car['persons'] = car['persons'].map(mapping_dict).astype(int)

mapping_dict = {'small':0,'med':1,'big':2}
car['lug_boot'] = car['lug_boot'].map(mapping_dict).astype(int)

mapping_dict = {'low':0,'med':1,'high':2}
car['safety'] = car['safety'].map(mapping_dict).astype(int)

mapping_dict = {'unacc':0,'acc':1,'good':2,'vgood':3}
car['class'] = car['class'].map(mapping_dict).astype(int)

In [10]:
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV

# Define datasets and their corresponding target columns
datasets_info = {
    'income': {'data': income, 'target_column': 'income'},
    'chess': {'data': chess, 'target_column': 'turns_win'},
    'car': {'data': car, 'target_column': 'class'}
}

classifiers = [
    ('Random Forest', RandomForestClassifier(), {'n_estimators': [10, 50, 100, 500, 1000]}),
    ('KNN', KNeighborsClassifier(), {'n_neighbors': list(range(1, 26)), 'weights': ['uniform', 'distance']}),
    ('Naive Bayes', GaussianNB(), {})
]

# Iterate over datasets
for dataset_name, dataset_info in datasets_info.items():
    dataset = dataset_info['data']
    target_column = dataset_info['target_column']

    X = dataset.drop(target_column, axis=1)
    y = dataset[target_column]

    # Split the dataset into training and testing sets for each partition
    for test_size in [0.2, 0.5, 0.8]:
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)

        # Iterate over classifiers
        for classifier_name, classifier, param_grid in classifiers:
            print(f"Dataset: {dataset_name}, Test Size: {test_size}, Classifier: {classifier_name}")

            # Perform hyperparameter tuning using GridSearchCV
            grid_search = GridSearchCV(estimator=classifier, param_grid=param_grid, scoring='accuracy', cv=3)
            grid_search.fit(X_train, y_train)

            # Get the best hyperparameters
            best_params = grid_search.best_params_

            print(f"Best Hyperparameters: {best_params}")

            # Train the classifier on the training set with the best hyperparameters
            best_classifier = grid_search.best_estimator_
            best_classifier.fit(X_train, y_train)

            # Evaluate on the training set
            train_accuracy = best_classifier.score(X_train, y_train)
            print(f"Training Accuracy: {train_accuracy}")

            # Evaluate on the validation set
            validation_accuracy = best_classifier.score(X_test, y_test)
            print(f"Validation Accuracy: {validation_accuracy}")

            # Evaluate on the test set
            test_accuracy = best_classifier.score(X_test, y_test)
            print(f"Test Accuracy: {test_accuracy}")

            # Perform any additional reporting or analysis as needed
            # ...

            print("\n")

Dataset: income, Test Size: 0.2, Classifier: Random Forest
Best Hyperparameters: {'n_estimators': 1000}
Training Accuracy: 1.0
Validation Accuracy: 0.8642714570858283
Test Accuracy: 0.8642714570858283


Dataset: income, Test Size: 0.2, Classifier: KNN
Best Hyperparameters: {'n_neighbors': 19, 'weights': 'uniform'}
Training Accuracy: 0.8033246314496314
Validation Accuracy: 0.8016275142023646
Test Accuracy: 0.8016275142023646


Dataset: income, Test Size: 0.2, Classifier: Naive Bayes
Best Hyperparameters: {}
Training Accuracy: 0.7943796068796068
Validation Accuracy: 0.7994779671426377
Test Accuracy: 0.7994779671426377


Dataset: income, Test Size: 0.5, Classifier: Random Forest
Best Hyperparameters: {'n_estimators': 1000}
Training Accuracy: 1.0
Validation Accuracy: 0.8592838277747067
Test Accuracy: 0.8592838277747067


Dataset: income, Test Size: 0.5, Classifier: KNN
Best Hyperparameters: {'n_neighbors': 17, 'weights': 'uniform'}
Training Accuracy: 0.8031326781326781
Validation Accuracy: