# COGS 118A Final Project Source Code 

## Imports

Here are all the packages we use for data retrieval, data pre-processing, and the classifier implementations.

In [1]:
# UCI Data Retrieval
from ucimlrepo import fetch_ucirepo 

# For data preprocessing
import pandas as pd
from sklearn.model_selection import train_test_split  
from sklearn.preprocessing import StandardScaler      
from sklearn.model_selection import GridSearchCV      

# Classifiers
from sklearn.ensemble import RandomForestClassifier   
from sklearn.svm import SVC                           
from sklearn.linear_model import LogisticRegression   

# For evaluation
from sklearn.metrics import accuracy_score

# For Results
from tabulate import tabulate

## Function to Create Partitions of Data

This function partitions the data in each of the three train/test ratios the paper calls for: 20/80, 50/50, and 80/20.

In [3]:
# Partition the data in three ways and store the partitions in a list of dictionaries

def partition_data(X, y, random_state=42):
    """Build the 20/80, 50/50 and 80/20 train/test partitions of ``(X, y)``.

    Each partition first holds out the given fraction of rows as the test
    set, then holds out a further 20% of the remaining rows as a validation
    set. Both splits are seeded with the same ``random_state``.

    Args:
        X: Feature table accepted by ``train_test_split``.
        y: Targets aligned row-for-row with ``X``.
        random_state: Seed forwarded to every ``train_test_split`` call.

    Returns:
        A list of three dicts, ordered 20/80, 50/50, 80/20. Each dict has the
        keys ``X_train``, ``X_val``, ``X_test``, ``y_train``, ``y_val``,
        ``y_test`` and ``name``.
    """
    validation_fraction = 0.2
    # (partition label, fraction of all rows held out as the test set)
    layouts = (("20/80", 0.8), ("50/50", 0.5), ("80/20", 0.2))

    partitions = []
    for label, test_fraction in layouts:
        X_rest, X_test, y_rest, y_test = train_test_split(
            X, y, test_size=test_fraction, random_state=random_state
        )
        # The validation set is carved out of the non-test rows only.
        X_fit, X_val, y_fit, y_val = train_test_split(
            X_rest, y_rest, test_size=validation_fraction, random_state=random_state
        )
        partitions.append(dict(
            X_train=X_fit,
            X_val=X_val,
            X_test=X_test,
            y_train=y_fit,
            y_val=y_val,
            y_test=y_test,
            name=label,
        ))
    return partitions


## Process Models Function

This function runs every partition of a dataset through one classifier, using grid-search cross-validation to choose the best hyperparameters, and prints a table of the results.

In [5]:
def _mean(values):
    """Return the arithmetic mean of a non-empty sequence of numbers."""
    return sum(values) / len(values)


def process_model(partitions, model_type, param_grid, scaler_required=False):
    """Grid-search one classifier on every partition and print a results table.

    For each partition, ``GridSearchCV`` (5-fold, accuracy scoring) is fitted
    on the training split three times. The refitted best estimator of each
    run is scored on the training, validation and test splits, and the three
    scores per split are averaged. One table row is printed per partition,
    followed by an "Overall Averages" row across all partitions.

    Args:
        partitions: Non-empty list of dicts with the keys ``X_train``,
            ``X_val``, ``X_test``, ``y_train``, ``y_val``, ``y_test`` and
            ``name``. The ``y_*`` values must provide ``.to_numpy()``.
        model_type: Estimator class. It is constructed with
            ``random_state=42`` unless it is ``SVC``, which is constructed
            with its default arguments.
        param_grid: Parameter grid handed to ``GridSearchCV``.
        scaler_required: If true, a ``StandardScaler`` is fitted on the
            training split and applied to all three splits.

    Returns:
        None. The table is printed to stdout.

    Raises:
        ValueError: If ``partitions`` is empty.
    """
    if not partitions:
        # Fail early with a clear message instead of a ZeroDivisionError
        # when the overall averages are computed.
        raise ValueError("partitions must contain at least one partition")

    print(f"\033[1m{model_type.__name__} Results\033[0m")

    table = []
    overall_training_accuracies = []
    overall_validation_accuracies = []
    overall_test_accuracies = []

    for partition in partitions:
        # Labels and (optionally scaled) features do not change between
        # iterations, so they are prepared once per partition.
        y_train_flat = partition["y_train"].to_numpy().ravel()
        y_val_flat = partition["y_val"].to_numpy().ravel()
        y_test_flat = partition["y_test"].to_numpy().ravel()

        if scaler_required:
            # NOTE(review): the scaler is fitted on the whole training split
            # before cross-validation, so each CV fold is scaled with
            # statistics that include its own held-out rows. Wrapping the
            # scaler and model in a Pipeline would avoid this.
            scaler = StandardScaler()
            X_train_scaled = scaler.fit_transform(partition["X_train"])
            X_val_scaled = scaler.transform(partition["X_val"])
            X_test_scaled = scaler.transform(partition["X_test"])
        else:
            X_train_scaled = partition["X_train"]
            X_val_scaled = partition["X_val"]
            X_test_scaled = partition["X_test"]

        training_accuracies = []
        validation_accuracies = []
        test_accuracies = []
        best_params_overall = None
        best_cv_score_overall = float("-inf")

        # NOTE(review): every iteration sees the same data, the same fixed
        # random_state and the same unshuffled CV folds, so the three runs
        # appear to repeat one computation rather than give independent
        # trials - confirm whether varying the seed/split was intended.
        for _ in range(3):
            model = model_type() if model_type is SVC else model_type(random_state=42)
            grid_search = GridSearchCV(
                model,
                param_grid,
                cv=5,
                scoring='accuracy',
                verbose=0,
                n_jobs=-1
            )
            grid_search.fit(X_train_scaled, y_train_flat)
            best_model = grid_search.best_estimator_

            # Always accept the first run so best_params_overall can never
            # stay None (previously it did whenever no score exceeded 0).
            if (best_params_overall is None
                    or grid_search.best_score_ > best_cv_score_overall):
                best_params_overall = grid_search.best_params_
                best_cv_score_overall = grid_search.best_score_

            training_accuracies.append(
                accuracy_score(y_train_flat, best_model.predict(X_train_scaled))
            )
            validation_accuracies.append(
                accuracy_score(y_val_flat, best_model.predict(X_val_scaled))
            )
            test_accuracies.append(
                accuracy_score(y_test_flat, best_model.predict(X_test_scaled))
            )

        avg_training_accuracy = _mean(training_accuracies)
        avg_validation_accuracy = _mean(validation_accuracies)
        avg_test_accuracy = _mean(test_accuracies)
        overall_training_accuracies.append(avg_training_accuracy)
        overall_validation_accuracies.append(avg_validation_accuracy)
        overall_test_accuracies.append(avg_test_accuracy)

        formatted_best_params = "\n".join(
            f"{key}: {value}" for key, value in best_params_overall.items()
        )
        table.append([
            partition['name'],
            formatted_best_params,
            f"{best_cv_score_overall:.4f}",
            f"{avg_training_accuracy:.4f}",
            f"{avg_validation_accuracy:.4f}",
            f"{avg_test_accuracy:.4f}"
        ])

    # Final row: unweighted mean of the per-partition averages.
    table.append([
        "Overall Averages",
        "-",
        "-",
        f"{_mean(overall_training_accuracies):.4f}",
        f"{_mean(overall_validation_accuracies):.4f}",
        f"{_mean(overall_test_accuracies):.4f}"
    ])
    headers = [
        "Partition (3 Iterations)",
        "Best Params",
        "CV Accuracy",
        "Avg Training Accuracy",
        "Avg Validation Accuracy",
        "Avg Test Accuracy"
    ]
    print(tabulate(table, headers=headers, tablefmt="grid"))


## Parameter Grids

Here we define the parameter grids that we search with cross-validation. We do not include non-linear kernels for the SVM because of their high computational cost.

In [7]:
# Define parameter grids for each model
# NOTE(review): only C is searched for the SVM, so the kernel stays at
# scikit-learn's SVC default ('rbf'). Add 'kernel': ['linear'] to this grid
# if a linear SVM is what should be evaluated.
svm_param_grid = {
    'C': [10**i for i in range(-7, 4)],    # Regularization parameter: 1e-7 ... 1e3
}

rf_param_grid = {
    'n_estimators': [100, 200, 300],       # Number of trees
    'max_depth': [None, 10, 20, 30],      # Maximum depth (None = unlimited)
    'min_samples_split': [2, 5, 10],      # Minimum samples to split
    'min_samples_leaf': [1, 2, 4]         # Minimum samples at a leaf node
}

logreg_param_grid = {
    'C': [10**i for i in range(-4, 4)],   # Inverse regularization strength: 1e-4 ... 1e3
    'penalty': ['l2'],                    # L2 regularization
    'solver': ['lbfgs'],                  # Solver
    'max_iter': [500, 1000]               # Max iterations
}

## Dataset 1: Credit Approval

Retrieving the dataset from the UCI Repository.

In [9]:
# Fetch dataset id=27 from the UCI repository
credit_approval = fetch_ucirepo(id=27) 
  
# Features and targets exposed by the fetched dataset object
X_1 = credit_approval.data.features 
y_1 = credit_approval.data.targets 
   
# Print the per-variable metadata table (name, role, type, missing values, ...)
print(credit_approval.variables) 

   name     role         type demographic description units missing_values
0   A16   Target  Categorical        None        None  None             no
1   A15  Feature   Continuous        None        None  None             no
2   A14  Feature   Continuous        None        None  None            yes
3   A13  Feature  Categorical        None        None  None             no
4   A12  Feature  Categorical        None        None  None             no
5   A11  Feature   Continuous        None        None  None             no
6   A10  Feature  Categorical        None        None  None             no
7    A9  Feature  Categorical        None        None  None             no
8    A8  Feature   Continuous        None        None  None             no
9    A7  Feature  Categorical        None        None  None            yes
10   A6  Feature  Categorical        None        None  None            yes
11   A5  Feature  Categorical        None        None  None            yes
12   A4  Feature  Categor

In [13]:
# Preview the first rows of the raw feature table
print(X_1.head())

   A15    A14 A13 A12  A11 A10 A9    A8 A7 A6 A5 A4     A3     A2 A1
0    0  202.0   g   f    1   t  t  1.25  v  w  g  u  0.000  30.83  b
1  560   43.0   g   f    6   t  t  3.04  h  q  g  u  4.460  58.67  a
2  824  280.0   g   f    0   f  t  1.50  h  q  g  u  0.500  24.50  a
3    3  100.0   g   t    5   t  t  3.75  v  w  g  u  1.540  27.83  b
4    0  120.0   s   f    0   f  t  1.71  v  w  g  u  5.625  20.17  b


### Data Preprocessing

Dropping any rows with missing values, one-hot encoding the categorical features, and mapping the target labels `+`/`-` to 1/0.

In [15]:
# Rows with at least one missing feature value are removed from both the
# features and the targets so the two stay aligned, then re-indexed from 0.
nan_indices = X_1[X_1.isna().any(axis=1)].index

X_1 = X_1.drop(index=nan_indices).reset_index(drop=True)
y_1 = y_1.drop(index=nan_indices).reset_index(drop=True)

non_numerical_columns = ['A1','A4','A5','A6','A7','A9','A10','A12','A13']

# dtype=int produces 0/1 integer indicator columns directly, so no
# bool -> int replace() pass is needed afterwards (that replace is one of
# the two statements echoed as warnings in this cell's recorded output).
X_1_encoded = pd.get_dummies(X_1, columns=non_numerical_columns, dtype=int)

# Map the '+'/'-' labels to 1/0 in a new frame instead of assigning values of
# a different dtype into the existing column in place. Any label other than
# '+' or '-' becomes NaN and makes astype(int) fail loudly.
y_1 = y_1.assign(A16=y_1['A16'].map({'+': 1, '-': 0})).astype(int)

print(X_1_encoded.head())
print(y_1.head())

   A15    A14  A11    A8     A3     A2  A1_a  A1_b  A4_l  A4_u  ...  A7_z  \
0    0  202.0    1  1.25  0.000  30.83     0     1     0     1  ...     0   
1  560   43.0    6  3.04  4.460  58.67     1     0     0     1  ...     0   
2  824  280.0    0  1.50  0.500  24.50     1     0     0     1  ...     0   
3    3  100.0    5  3.75  1.540  27.83     0     1     0     1  ...     0   
4    0  120.0    0  1.71  5.625  20.17     0     1     0     1  ...     0   

   A9_f  A9_t  A10_f  A10_t  A12_f  A12_t  A13_g  A13_p  A13_s  
0     0     1      0      1      1      0      1      0      0  
1     0     1      0      1      1      0      1      0      0  
2     0     1      1      0      1      0      1      0      0  
3     0     1      0      1      0      1      1      0      0  
4     0     1      1      0      1      0      0      0      1  

[5 rows x 46 columns]
   A16
0    1
1    1
2    1
3    1
4    1


  X_1_encoded = X_1_encoded.replace({True: 1, False: 0})
  y_1.loc[:, 'A16'] = y_1['A16'].replace({'+': 1, '-': 0})


## Partitioning and Running Models

We partition the data and run the partitions through the process model function to obtain the results.

In [17]:
# Define partitions from the one-hot encoded features and the 0/1 labels
partitions_1 = partition_data(X_1_encoded, y_1, random_state=42)

In [19]:
# Run SVM (features are standardized: scaler_required=True)
process_model(partitions_1, SVC, svm_param_grid, scaler_required=True)

# Run Random Forest (features are left unscaled)
process_model(partitions_1, RandomForestClassifier, rf_param_grid, scaler_required=False)

# Run Logistic Regression (features are standardized: scaler_required=True)
process_model(partitions_1, LogisticRegression, logreg_param_grid, scaler_required=True)

[1mSVC Results[0m
+----------------------------+---------------+---------------+-------------------------+---------------------------+---------------------+
| Partition (3 Iterations)   | Best Params   | CV Accuracy   |   Avg Training Accuracy |   Avg Validation Accuracy |   Avg Test Accuracy |
| 20/80                      | C: 1          | 0.8257        |                  0.9519 |                    0.8462 |              0.8432 |
+----------------------------+---------------+---------------+-------------------------+---------------------------+---------------------+
| 50/50                      | C: 1          | 0.8615        |                  0.9269 |                    0.7576 |              0.8716 |
+----------------------------+---------------+---------------+-------------------------+---------------------------+---------------------+
| 80/20                      | C: 1          | 0.8777        |                  0.9209 |                    0.819  |              0.855  |
+------

## Dataset 2: Parkinson's

Retrieving the dataset from the UCI Repository.

In [22]:
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset id=174 from the UCI repository
parkinsons = fetch_ucirepo(id=174) 
  
# data (as pandas dataframes) 
# NOTE(review): the printed variables table and X_2.head() show 'MDVP:Jitter'
# and 'MDVP:Shimmer' each appearing twice, with identical values in both
# copies for the previewed rows - verify whether these are accidental
# duplicate columns before relying on the results.
X_2 = parkinsons.data.features 
y_2 = parkinsons.data.targets 
  
# variable information (name, role, type, units, ...)
print(parkinsons.variables) 

            name     role         type demographic description units  \
0           name       ID  Categorical        None        None  None   
1        MDVP:Fo  Feature   Continuous        None        None    Hz   
2       MDVP:Fhi  Feature   Continuous        None        None    Hz   
3       MDVP:Flo  Feature   Continuous        None        None    Hz   
4    MDVP:Jitter  Feature   Continuous        None        None     %   
5    MDVP:Jitter  Feature   Continuous        None        None   Abs   
6       MDVP:RAP  Feature   Continuous        None        None  None   
7       MDVP:PPQ  Feature   Continuous        None        None  None   
8     Jitter:DDP  Feature   Continuous        None        None  None   
9   MDVP:Shimmer  Feature   Continuous        None        None  None   
10  MDVP:Shimmer  Feature   Continuous        None        None    dB   
11  Shimmer:APQ3  Feature   Continuous        None        None  None   
12  Shimmer:APQ5  Feature   Continuous        None        None  

In [23]:
# Preview the first rows of the feature table
print(X_2.head())

   MDVP:Fo  MDVP:Fhi  MDVP:Flo  MDVP:Jitter  MDVP:Jitter  MDVP:RAP  MDVP:PPQ  \
0  119.992   157.302    74.997      0.00784      0.00784   0.00370   0.00554   
1  122.400   148.650   113.819      0.00968      0.00968   0.00465   0.00696   
2  116.682   131.111   111.555      0.01050      0.01050   0.00544   0.00781   
3  116.676   137.871   111.366      0.00997      0.00997   0.00502   0.00698   
4  116.014   141.781   110.655      0.01284      0.01284   0.00655   0.00908   

   Jitter:DDP  MDVP:Shimmer  MDVP:Shimmer  ...  MDVP:APQ  Shimmer:DDA  \
0     0.01109       0.04374       0.04374  ...   0.02971      0.06545   
1     0.01394       0.06134       0.06134  ...   0.04368      0.09403   
2     0.01633       0.05233       0.05233  ...   0.03590      0.08270   
3     0.01505       0.05492       0.05492  ...   0.03772      0.08771   
4     0.01966       0.06425       0.06425  ...   0.04465      0.10470   

       NHR     HNR      RPDE       DFA   spread1   spread2        D2       PPE  

In [24]:
# Preview the first rows of the target column ('status')
print(y_2.head())

   status
0       1
1       1
2       1
3       1
4       1


### Partitioning and Running Models

We partition the data and run the partitions through the process model function to obtain the results.

In [26]:
# Define partitions directly from the fetched features and targets
partitions_2 = partition_data(X_2, y_2, random_state=42)

In [27]:
# Run SVM (features are standardized: scaler_required=True)
process_model(partitions_2, SVC, svm_param_grid, scaler_required=True)

# Run Random Forest (features are left unscaled)
process_model(partitions_2, RandomForestClassifier, rf_param_grid, scaler_required=False)

# Run Logistic Regression (features are standardized: scaler_required=True)
process_model(partitions_2, LogisticRegression, logreg_param_grid, scaler_required=True)

[1mSVC Results[0m
+----------------------------+---------------+---------------+-------------------------+---------------------------+---------------------+
| Partition (3 Iterations)   | Best Params   | CV Accuracy   |   Avg Training Accuracy |   Avg Validation Accuracy |   Avg Test Accuracy |
| 20/80                      | C: 10         | 0.8667        |                  1      |                    0.875  |              0.8141 |
+----------------------------+---------------+---------------+-------------------------+---------------------------+---------------------+
| 50/50                      | C: 1          | 0.8567        |                  0.8571 |                    0.65   |              0.8469 |
+----------------------------+---------------+---------------+-------------------------+---------------------------+---------------------+
| 80/20                      | C: 100        | 0.8870        |                  1      |                    0.9375 |              0.9231 |
+------

## Dataset 3: Phishing Websites

Retrieving the dataset from the UCI Repository.

In [29]:
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset id=327 from the UCI repository
phishing_websites = fetch_ucirepo(id=327) 
  
# data (as pandas dataframes) 
# NOTE(review): the previewed target column 'result' holds -1 and 1 rather
# than 0/1; it is passed to the classifiers as-is.
X_3 = phishing_websites.data.features 
y_3 = phishing_websites.data.targets 
  
# variable information (name, role, type, ...)
print(phishing_websites.variables) 

                          name     role     type demographic description  \
0            having_ip_address  Feature  Integer        None        None   
1                   url_length  Feature  Integer        None        None   
2           shortining_service  Feature  Integer        None        None   
3             having_at_symbol  Feature  Integer        None        None   
4     double_slash_redirecting  Feature  Integer        None        None   
5                prefix_suffix  Feature  Integer        None        None   
6            having_sub_domain  Feature  Integer        None        None   
7               sslfinal_state  Feature  Integer        None        None   
8   domain_registration_length  Feature  Integer        None        None   
9                      favicon  Feature  Integer        None        None   
10                        port  Feature  Integer        None        None   
11                 https_token  Feature  Integer        None        None   
12          

In [30]:
# Preview the first rows of the feature table
print(X_3.head())

   having_ip_address  url_length  shortining_service  having_at_symbol  \
0                 -1           1                   1                 1   
1                  1           1                   1                 1   
2                  1           0                   1                 1   
3                  1           0                   1                 1   
4                  1           0                  -1                 1   

   double_slash_redirecting  prefix_suffix  having_sub_domain  sslfinal_state  \
0                        -1             -1                 -1              -1   
1                         1             -1                  0               1   
2                         1             -1                 -1              -1   
3                         1             -1                 -1              -1   
4                         1             -1                  1               1   

   domain_registration_length  favicon  ...  rightclick  popupwindow

In [31]:
# Preview the first rows of the target column ('result')
print(y_3.head())

   result
0      -1
1      -1
2      -1
3      -1
4       1


### Partitioning and Running Models

We partition the data and run the partitions through the process model function to obtain the results.

In [32]:
# Define partitions directly from the fetched features and targets
partitions_3 = partition_data(X_3, y_3, random_state=42)

In [33]:
# Run SVM (features are standardized: scaler_required=True)
process_model(partitions_3, SVC, svm_param_grid, scaler_required=True)

# Run Random Forest (features are left unscaled)
process_model(partitions_3, RandomForestClassifier, rf_param_grid, scaler_required=False)

# Run Logistic Regression (features are standardized: scaler_required=True)
process_model(partitions_3, LogisticRegression, logreg_param_grid, scaler_required=True)

[1mSVC Results[0m
+----------------------------+---------------+---------------+-------------------------+---------------------------+---------------------+
| Partition (3 Iterations)   | Best Params   | CV Accuracy   |   Avg Training Accuracy |   Avg Validation Accuracy |   Avg Test Accuracy |
| 20/80                      | C: 1          | 0.9383        |                  0.9598 |                    0.9278 |              0.9359 |
+----------------------------+---------------+---------------+-------------------------+---------------------------+---------------------+
| 50/50                      | C: 10         | 0.9591        |                  0.9817 |                    0.9421 |              0.9484 |
+----------------------------+---------------+---------------+-------------------------+---------------------------+---------------------+
| 80/20                      | C: 10         | 0.9621        |                  0.9785 |                    0.9633 |              0.962  |
+------