In [1]:
%config InlineBackend.figure_format = 'retina'

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import datasets
from ucimlrepo import fetch_ucirepo 
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report 
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score

# Training methods

In [2]:
def get_train_test(X, y, test_size):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size, random_state=42)
    
    return X_train, X_test, y_train, y_test

In [3]:
def train_evaluate_decision_tree(X_train, y_train, X_test, y_test, param):

    # Initialize the classifier
    clf = DecisionTreeClassifier(**param)

    # Train the classifier
    clf.fit(X_train, y_train)

    # Predict on the test data
    y_pred = clf.predict(X_test)

    # Evaluate the classifier
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)

    return accuracy, report

In [4]:
def train_evaluate_svm(X_train, y_train, X_test, y_test, param):

    # Initialize the Support Vector Classifier
    svm_clf = SVC(**param)

    # Train the classifier
    svm_clf.fit(X_train, y_train)

    # Predict on the test data
    y_pred = svm_clf.predict(X_test)

    # Evaluate the classifier
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)

    return accuracy, report

In [5]:
def train_evaluate_logistic_regression(X_train, y_train, X_test, y_test):
    # Initialize the Logistic Regression Classifier
    log_reg = LogisticRegression()

    # Train the classifier on the training data
    log_reg.fit(X_train, y_train)

    # Predict on the test data
    y_pred = log_reg.predict(X_test)

    # Evaluate the classifier
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)

    return accuracy, report

In [6]:
def train_neural_nets(X_train, y_train, X_test, y_test, param):

    # Initialize the Multi-layer Perceptron Classifier
    mlp = MLPClassifier(**param)

    # Train the classifier on the training data
    mlp.fit(X_train, y_train)

    # Predict on the test data
    y_pred = mlp.predict(X_test)

    # Evaluate the classifier
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)

    return accuracy, report

In [7]:
def train_evaluate_random_forest(X_train, y_train, X_test, y_test,param):

    # Initialize the Random Forest Classifier
    rf_clf = RandomForestClassifier(**param)

    # Train the classifier on the training data
    rf_clf.fit(X_train, y_train)

    # Predict on the test data
    y_pred = rf_clf.predict(X_test)

    # Evaluate the classifier
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)

    return accuracy, report

#
#

In [8]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

def param_dt(X,y):
    # Parameter grid for Decision Tree
    param_grid_dt = {
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    }

    # Grid search with cross-validation
    grid_search_dt = GridSearchCV(DecisionTreeClassifier(random_state=42), param_grid_dt, cv=5)
    grid_search_dt.fit(X, y)

    return grid_search_dt.best_params_


In [9]:
from sklearn.svm import SVC

def param_svm(X,y):
    param_grid_svm = {
        'C': [0.1, 1],
        'gamma': [1, 0.1, 0.01],
        'kernel': ['rbf', 'linear']
    }

    # Grid search with cross-validation
    grid_search_svm = GridSearchCV(SVC(random_state=42), param_grid_svm, cv=2)
    grid_search_svm.fit(X, y)

    return grid_search_svm.best_params_

In [10]:
"""from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegressionCV

# Scaling the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Use LogisticRegressionCV to find the best parameters with internal cross-validation
log_reg_cv = LogisticRegressionCV(cv=5, max_iter=1000, random_state=42, solver='lbfgs')
log_reg_cv.fit(X_train_scaled, y_train)"""

"from sklearn.preprocessing import StandardScaler\nfrom sklearn.linear_model import LogisticRegressionCV\n\n# Scaling the features\nscaler = StandardScaler()\nX_train_scaled = scaler.fit_transform(X_train)\nX_test_scaled = scaler.transform(X_test)\n\n# Use LogisticRegressionCV to find the best parameters with internal cross-validation\nlog_reg_cv = LogisticRegressionCV(cv=5, max_iter=1000, random_state=42, solver='lbfgs')\nlog_reg_cv.fit(X_train_scaled, y_train)"

In [43]:
from sklearn.neural_network import MLPClassifier

def param_nn(X,y):
    # Parameter grid for Neural Network
    param_grid_nn = {
        'hidden_layer_sizes': [(50,), (100,),(200,)],
        'activation': ['tanh', 'relu'],
        'alpha': [0.0001, 0.001, 0.01]
    }

    # Grid search with cross-validation
    grid_search_nn = GridSearchCV(MLPClassifier(random_state=42), param_grid_nn, cv=3)
    grid_search_nn.fit(X, y)

    return grid_search_nn.best_params_


In [12]:
from sklearn.ensemble import RandomForestClassifier

def param_rf(X,y):
    # Parameter grid for Random Forest
    param_grid_rf = {
        'n_estimators': [100, 200],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5],
        'min_samples_leaf': [1, 2]
    }

    # Grid search with cross-validation
    grid_search_rf = GridSearchCV(RandomForestClassifier(random_state=42), param_grid_rf, cv=5)
    grid_search_rf.fit(X, y)

    return grid_search_rf.best_params_


#
#

In [13]:
# fetch dataset 
mushroom = fetch_ucirepo(id=73) 
  
# data (as pandas dataframes) 
X_1 = mushroom.data.features 
y_1 = mushroom.data.targets 

# One-Hot Encoding for Features
encoder = OneHotEncoder()
X_1_encoded = encoder.fit_transform(X_1)

# Label Encoding for Target
label_encoder = LabelEncoder()
y_1_encoded = label_encoder.fit_transform(y_1.values.ravel()) 




# Target Variable Encoding
label_encoder = LabelEncoder()
y_dataset_1 = label_encoder.fit_transform(y_1['poisonous'])

# One-Hot Encoding for Categorical Features
X_1_encoded = pd.get_dummies(X_1)

# Handle Missing Values (if any)
# Assuming '?' represents missing values in the dataset
# Replace '?' with NaN and then with the most frequent value in each column
X_1_encoded = X_1_encoded.replace('?', pd.NA)
X_dataset_1 = X_1_encoded.apply(lambda x: x.fillna(x.mode()[0]), axis=0)

In [14]:
x1 = (X_dataset_1,y_dataset_1)
x1[0]

Unnamed: 0,cap-shape_b,cap-shape_c,cap-shape_f,cap-shape_k,cap-shape_s,cap-shape_x,cap-surface_f,cap-surface_g,cap-surface_s,cap-surface_y,...,population_s,population_v,population_y,habitat_d,habitat_g,habitat_l,habitat_m,habitat_p,habitat_u,habitat_w
0,0,0,0,0,0,1,0,0,1,0,...,1,0,0,0,0,0,0,0,1,0
1,0,0,0,0,0,1,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
2,1,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
3,0,0,0,0,0,1,0,0,0,1,...,1,0,0,0,0,0,0,0,1,0
4,0,0,0,0,0,1,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,0,0,0,1,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
8120,0,0,0,0,0,1,0,0,1,0,...,0,1,0,0,0,1,0,0,0,0
8121,0,0,1,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
8122,0,0,0,1,0,0,0,0,0,1,...,0,1,0,0,0,1,0,0,0,0


In [15]:
# fetch dataset 
automobile = fetch_ucirepo(id=10) 
  
# data (as pandas dataframes) 
X_2 = automobile.data.features 
y_2 = automobile.data.targets 

# Define the columns by type
continuous_cols = X_2.select_dtypes(include=['float64', 'int64']).columns
categorical_cols = X_2.select_dtypes(include=['object']).columns

# Preprocessing for continuous features
continuous_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())])

# Preprocessing for categorical features
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Bundle preprocessing for numerical and categorical features
preprocessor = ColumnTransformer(
    transformers=[
        ('num', continuous_transformer, continuous_cols),
        ('cat', categorical_transformer, categorical_cols)])

# Preprocess the data
X_2_preprocessed = preprocessor.fit_transform(X_2)

# Get feature names for transformed categorical columns
cat_cols_transformed = preprocessor.named_transformers_['cat']['onehot'].get_feature_names_out(categorical_cols)
all_cols = np.append(continuous_cols, cat_cols_transformed)

# Convert to DataFrame
X_dataset_2 = pd.DataFrame(X_2_preprocessed, columns=all_cols)


y_dataset_2 = (y_2 >= 0).astype(int)

In [16]:
x2 = (X_dataset_2,y_dataset_2)
x2[0]

Unnamed: 0,price,highway-mpg,city-mpg,peak-rpm,horsepower,compression-ratio,stroke,bore,engine-size,num-of-cylinders,...,make_nissan,make_peugot,make_plymouth,make_porsche,make_renault,make_saab,make_subaru,make_toyota,make_volkswagen,make_volvo
0,0.036674,-0.546059,-0.646553,-0.263484,0.171065,-0.288349,-1.839404,0.519089,0.074449,-0.352887,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.419498,-0.546059,-0.646553,-0.263484,0.171065,-0.288349,-1.839404,0.519089,0.074449,-0.352887,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.419498,-0.691627,-0.953012,-0.263484,1.261807,-0.288349,0.685920,-2.404862,0.604046,1.502032,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.094639,-0.109354,-0.186865,0.787346,-0.057230,-0.035973,0.462157,-0.517248,-0.431076,-0.352887,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.540524,-1.273900,-1.106241,0.787346,0.272529,-0.540725,0.462157,-0.517248,0.218885,0.574572,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200,0.463449,-0.400490,-0.340094,0.577180,0.247163,-0.162161,-0.336996,1.666463,0.339248,-0.352887,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
201,0.743720,-0.837195,-0.953012,0.367014,1.414003,-0.364062,-0.336996,1.666463,0.339248,-0.352887,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
202,1.054566,-1.128332,-1.106241,0.787346,0.754485,-0.338824,-1.232047,0.926222,1.109571,1.502032,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
203,1.180051,-0.546059,0.119594,-0.683816,0.044234,3.244916,0.462157,-1.183465,0.435538,1.502032,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [31]:
# fetch dataset 
credit_approval = fetch_ucirepo(id=27) 
  
# data (as pandas dataframes) 
X_3 = credit_approval.data.features 
y_3 = credit_approval.data.targets 
  


categorical_cols = X_3.select_dtypes (include = ['object', 'category']).columns
numerical_cols = X_3.select_dtypes(include=['int64', 'float64']).columns

# Preprocessing for continuous features
continuous_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())])

# Preprocessing for categorical features
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Bundle preprocessing for numerical and categorical features
preprocessor = ColumnTransformer(
    transformers=[
        ('num', continuous_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)])


# One-Hot Encoding for Features
encoder = OneHotEncoder()
X_3_encoded = encoder.fit_transform(X_3)


# One-Hot Encoding for Categorical Features
X_3_encoded = pd.get_dummies(X_3)

X_dataset_3 = X_3_encoded.apply(lambda x: x.fillna(x.mode()[0]), axis=0)

y_dataset_3 = y_3.replace({'+': 1, '-': 0})

In [37]:
# Fetch dataset
credit_approval = fetch_ucirepo(id=27)

# Data (as pandas dataframes)
X_3 = credit_approval.data.features
y_3 = credit_approval.data.targets

# Identify categorical and numerical columns
categorical_cols = X_3.select_dtypes(include=['object', 'category']).columns
numerical_cols = X_3.select_dtypes(include=['int64', 'float64']).columns

# numerical data
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),  # Imputing missing values with mean
    ('scaler', StandardScaler())  # Scaling numerical features
])

# categorical data
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Imputing missing values with most frequent value
    ('onehot', OneHotEncoder(handle_unknown='ignore'))  # One-hot encoding
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

X_dataset_3_transformed = preprocessor.fit_transform(X_3)

X_dataset_3 = pd.DataFrame(X_dataset_3_transformed, columns=preprocessor.get_feature_names_out())

y_dataset_3 = y_3.replace({'+': 1, '-': 0})

In [38]:
x3 = (X_dataset_3,y_dataset_3)
x3[0]

Unnamed: 0,num__A15,num__A14,num__A11,num__A8,num__A3,num__A2,cat__A13_g,cat__A13_p,cat__A13_s,cat__A12_f,...,cat__A6_w,cat__A6_x,cat__A5_g,cat__A5_gg,cat__A5_p,cat__A4_l,cat__A4_u,cat__A4_y,cat__A1_a,cat__A1_b
0,-0.195413,0.104544,-0.288101,-0.291083,-0.956613,-0.062321,1.0,0.0,0.0,1.0,...,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
1,-0.087852,-0.819689,0.740830,0.244190,-0.060051,2.288101,1.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
2,-0.037144,0.557942,-0.493887,-0.216324,-0.856102,-0.596738,1.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
3,-0.194837,-0.488360,0.535044,0.456505,-0.647038,-0.315599,1.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
4,-0.195413,-0.372104,-0.493887,-0.153526,0.174141,-0.962303,0.0,0.0,1.0,1.0,...,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
685,-0.195413,0.441686,-0.493887,-0.291083,1.070704,-0.885475,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
686,-0.119736,0.092919,-0.082314,-0.066806,-0.805846,-0.751238,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
687,-0.195221,0.092919,-0.288101,-0.066806,1.757198,-0.533418,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0
688,-0.051358,0.557942,-0.493887,-0.652915,-0.915403,-1.152262,1.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0


## Here ans is used to store the data:

In [39]:
ans= []
for i in [x1,x2,x3]:
    ans.append(list(map(lambda x: train_test_split(i[0], i[1], test_size=x, random_state=42), [0.2,0.5,0.8])))

### Find the parameters:

In [44]:
params = [param_dt,param_svm,param_nn,param_rf]
best_params = []

In [45]:
for dataset in ans:
    best_params_for_dataset = []
    for partition in dataset:
        best_params_for_partition = []
        for param in params:
            best_params_for_partition.append(param(partition[0],np.ravel(partition[2])))
        best_params_for_dataset.append(best_params_for_partition)
    best_params.append(best_params_for_dataset)
    





















In [46]:
best_params

[[[{'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2},
   {'C': 1, 'gamma': 1, 'kernel': 'linear'},
   {'activation': 'tanh', 'alpha': 0.0001, 'hidden_layer_sizes': (50,)},
   {'max_depth': None,
    'min_samples_leaf': 1,
    'min_samples_split': 2,
    'n_estimators': 100}],
  [{'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2},
   {'C': 1, 'gamma': 1, 'kernel': 'linear'},
   {'activation': 'relu', 'alpha': 0.0001, 'hidden_layer_sizes': (50,)},
   {'max_depth': None,
    'min_samples_leaf': 1,
    'min_samples_split': 2,
    'n_estimators': 100}],
  [{'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2},
   {'C': 1, 'gamma': 1, 'kernel': 'linear'},
   {'activation': 'tanh', 'alpha': 0.0001, 'hidden_layer_sizes': (100,)},
   {'max_depth': None,
    'min_samples_leaf': 1,
    'min_samples_split': 2,
    'n_estimators': 100}]],
 [[{'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2},
   {'C': 1, 'gamma': 0.1, 'kernel': 'rbf'},
 

In [94]:
a = best_params[2][0][3]
a

{'max_depth': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 100}

In [54]:
len(best_params[0][0])

4

In [55]:
methods = [train_evaluate_decision_tree,train_evaluate_svm,train_neural_nets,train_evaluate_random_forest]

In [56]:
ans[0][0]

[      cap-shape_b  cap-shape_c  cap-shape_f  cap-shape_k  cap-shape_s  \
 7873            0            0            0            1            0   
 6515            0            0            0            0            0   
 6141            0            0            1            0            0   
 2764            0            0            1            0            0   
 438             1            0            0            0            0   
 ...           ...          ...          ...          ...          ...   
 5226            0            0            0            0            0   
 5390            0            0            0            1            0   
 860             0            0            1            0            0   
 7603            0            0            0            1            0   
 7270            0            0            0            1            0   
 
       cap-shape_x  cap-surface_f  cap-surface_g  cap-surface_s  cap-surface_y  \
 7873            0          

### The following report is for test data

In [167]:
reports_testing = []
for i in range(3):
    temp1 = []
    for j in range(3):
        temp2 = []
        for k in range(4):
            method = methods[k]
            data = ans[i][j]
            param = best_params[i][j][k]
            temp2.append(method(data[0],np.ravel(data[2]),data[1],np.ravel(data[3]),param))
        temp2.append(train_evaluate_logistic_regression(data[0],data[2],data[1],data[3]))
        temp1.append(temp2)
    reports_testing.append(temp1)

            
        


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


'reports' here have 4 layers. First layer - dataset (mushroom, car or xxx). Second layer - different partitions for the dataset. Third layer - different learning methods (in the order of decisiontree, svm, mlp, randomforest, logistic regression),the forth layer is a tuple, with the accuracy being the 0th element and the report being the 1st.

In [273]:
reports_testing[1][0][0][0]

0.9024390243902439

### The following report is for training data

In [235]:
reports_training = []
for i in range(3):
    temp1 = []
    for j in range(3):
        temp2 = []
        for k in range(4):
            method = methods[k]
            data = ans[i][j]
            param = best_params[i][j][k]
            temp2.append(method(data[0],np.ravel(data[2]),data[0],np.ravel(data[2]),param))
        temp2.append(train_evaluate_logistic_regression(data[0],data[2],data[1],data[3]))
        temp1.append(temp2)
    reports_training.append(temp1)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [312]:
reports_training[2][2][4][0]

0.8768115942028986

### The following is for CV:

In [121]:
met = [DecisionTreeClassifier,SVC,MLPClassifier,RandomForestClassifier,LogisticRegression]

In [122]:
reports_cv = []
for i in range(3):
    temp1 = []
    for j in range(3):
        temp2 = []
        for k in range(4):
            data = ans[i][j]
            param = best_params[i][j][k]
            method = met[k](**param)
            temp2.append(cross_val_score(method, data[0], np.ravel(data[2]), cv=2, scoring='accuracy'))
        temp2.append(cross_val_score(met[4](), data[0], np.ravel(data[2]), cv=2, scoring='accuracy'))
        temp1.append(temp2)
    reports_cv.append(temp1)



'reports_cv' have 4 layers. 1st layer(len 3) being the different datasets, 2nd layer(len 3) being the different partitions for the specific dataset, 3rd layer (length 5) being the 5 different learning methods (in the order of decisiontree, svc, mlp, randomforest, logistic regression). The final layer is a 2d array with the accuracy score of each cross-validation. To find out the mean of a specific method, use np.mean.

In [374]:
np.mean(reports_cv[2][0][4])

0.8641304347826086