# Step 1 Import needed libraries and scripts

The imports below only work when the working directory is the top-level folder 'machine-learning-assisted-khovanov-homology', so set the working directory to that folder first.

In [244]:
# import useful libraries for data preprocessing
import math
import numpy as np
import pandas as pd
from numpy import linalg as LA
import seaborn as sns
import os
from sklearn.model_selection import train_test_split
from collections import Counter

In [245]:
#Import functions built in class

from scripts.polynomial import add_poly_terms
from scripts.GDLinearReg import J, DJ, GD_linreg_improved

Run this cell to check your current working directory. It should return the top folder "machine-learning-assisted-khovanov-homology"

In [287]:
# Display the kernel's current working directory so it can be compared with the expected top folder.
os.getcwd()

'/mnt/c/Users/wuwj2/Desktop/jupyter/MAT_180_ML_Projects/machine-learning-assisted-khovanov-homology'

In [139]:
# Move up to the top folder, but only while the kernel is still inside the
# notebooks folder. The guard makes this cell safe to re-run: an unconditional
# os.chdir("../") climbs one level higher in the directory tree on every execution.
# NOTE(review): assumes the notebooks folder is literally named "notebooks" - confirm.
if os.path.basename(os.getcwd()) == "notebooks":
    os.chdir("../")

# Step 2 Data Parsing

Parse the data from a .csv file containing the free parts and torsion count

In [288]:
df = pd.read_csv('data/dataset_no_repeats.csv')

# Every entry of the 'free_part' column is the string form of a dictionary
# {bigrading: value}; turn each string back into a dictionary.
# NOTE(review): eval executes whatever code the CSV contains, so only load
# trusted files. If the entries are plain literals, ast.literal_eval would be
# a safer parser - confirm before switching.
fp_list = [eval(entry) for entry in df['free_part'].to_list()]

# Collect every bigrading key that occurs in the dataset (repeats included),
# then keep each one once, sorted with Python's default ordering
# (dictionary order).
bigrading = [key for fp in fp_list for key in fp]
bigrading_list = sorted(set(bigrading))

In [253]:
# Create the matrix X: one row per dictionary in fp_list and one column per
# bigrading, with the columns in the same (dictionary) order as bigrading_list.
# Bigradings missing from a dictionary stay at 0.

m, n = len(fp_list), len(bigrading_list)
X = np.zeros((m, n))

# Look up each bigrading's column through a dictionary built once, instead of
# calling bigrading_list.index(key) (a linear scan) for every single entry.
# bigrading_list holds each bigrading exactly once, so the columns are unchanged.
column_of = {bg: col for col, bg in enumerate(bigrading_list)}

for i, fp in enumerate(fp_list):
    for key, val in fp.items():
        X[i, column_of[key]] = val

In [255]:
# Labels of the data set: the 'torsion_part_count' column as a column array
# of shape (number of rows, 1).
y = df['torsion_part_count'].to_numpy()[:, None]

In [283]:
# Split the data in two stages: first hold out half of the samples as the
# testing set, then set aside 20% of the remaining half as the validation set.
# Both splits use random_state=1, so they are reproducible.

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=1
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=1
)

print(f'Size of training set: {X_train.shape[0]}')
print(f'Size of validation set: {X_val.shape[0]}')
print(f'Size of testing set: {X_test.shape[0]}')

Size of training set: 432
Size of validation set: 108
Size of testing set: 541


In [284]:
def prediction(x,v):
    """Return the rounded linear prediction for one sample.

    Computes the product x @ v, takes entry 0 of the result, and rounds it
    with the built-in round().
    """
    product = x @ v
    first_entry = product[0]
    return round(first_entry)

def accuracy(X,v,y):
    """Return the fraction of rows of X that are predicted correctly.

    Row i of X is passed to prediction() together with v, and the result is
    compared with y[i][0]. The number of matches is divided by len(X), so an
    empty X raises ZeroDivisionError.
    """
    hits = sum(1 for i, row in enumerate(X) if prediction(row, v) == y[i][0])
    return hits / len(X)

def fit(X, y, epsilon, lambda_, max_iters = 10000, poly_terms = 1):
    """Expand X with add_poly_terms and run GD_linreg_improved on the result.

    Args:
        X: feature matrix passed to add_poly_terms.
        y: labels passed to GD_linreg_improved.
        epsilon, lambda_, max_iters: forwarded unchanged, in that order, to
            GD_linreg_improved; their exact meaning is defined in
            scripts.GDLinearReg.
        poly_terms: second argument of add_poly_terms; printed as the
            polynomial degree.

    Returns:
        The pair (v, costs) produced by GD_linreg_improved. costs[-1] is
        printed as the final cost before returning.
    """
    print(f'Running polynomial regression of degree {poly_terms} \n')

    X_poly = add_poly_terms(X, poly_terms)
    result = GD_linreg_improved(X_poly, y, epsilon, lambda_, max_iters)
    v, costs = result

    print(f'\nFinal cost is {costs[-1]}\n')
    return v, costs

In [285]:
# Degree of the polynomial features; it is used both for fitting and for
# expanding each data split before measuring accuracy.
degree = 1

v, costs = fit(
    X_train,
    y_train,
    epsilon=1e-10,
    lambda_=0,
    max_iters=10000,
    poly_terms=degree,
)

# Report the accuracy on every split, applying the same feature expansion
# that fit() applied to the training data.
for _label, _X_part, _y_part in (
    ('training', X_train, y_train),
    ('validation', X_val, y_val),
    ('test', X_test, y_test),
):
    print(f'Accuracy of {_label} set is: {accuracy(add_poly_terms(_X_part, degree), v, _y_part)}')

Running polynomial regression of degree 1 

After 0 steps the cost is 34.35648148148148
After 1000 steps the cost is 0.002707408596290094
After 2000 steps the cost is 0.0005345475763770855
After 3000 steps the cost is 0.000167915735706277
After 4000 steps the cost is 7.158472693385065e-05
After 5000 steps the cost is 3.6843536467093885e-05
After 6000 steps the cost is 2.127459822182962e-05
After 7000 steps the cost is 1.3193059355040984e-05
After 8000 steps the cost is 8.55737754635532e-06
After 9000 steps the cost is 5.713723623941485e-06
After 9999 steps the cost is 3.890854183807217e-06

Final cost is 3.889386876403748e-06

Accuracy of training set is: 1.0
Accuracy of validation set is: 1.0
Accuracy of test set is: 1.0


In [286]:
# Print the parameter vector v returned by fit() in the previous cell.
print(v)

[[-7.92165961e-01]
 [ 6.52440136e-02]
 [ 6.97436545e-01]
 [ 8.15772175e-01]
 [ 1.50366297e+00]
 [ 6.04112866e-01]
 [ 7.62680558e-01]
 [ 8.12478832e-01]
 [ 8.60260067e-01]
 [ 1.52159391e-01]
 [ 6.23490836e-01]
 [ 4.35173533e-01]
 [ 4.61647394e-01]
 [-9.48977880e-02]
 [ 5.15354934e-01]
 [ 1.52159391e-01]
 [ 6.16633602e-01]
 [-7.66923393e-02]
 [ 4.20825713e-01]
 [ 3.99696724e-01]
 [ 5.85825357e-01]
 [ 3.40080363e-01]
 [ 6.43142623e-01]
 [ 2.92090208e-01]
 [ 8.04771718e-01]
 [ 8.14606330e-01]
 [ 4.56519221e-01]
 [-5.45277154e-01]
 [ 3.45857800e-01]
 [ 3.56775166e-01]
 [ 9.17490063e-01]
 [ 5.18951462e-01]
 [ 3.92271120e-01]
 [ 3.25957609e-01]
 [ 6.94033909e-01]
 [ 6.78581158e-01]
 [ 1.08843925e-01]
 [ 3.72913178e-01]
 [ 5.87447594e-01]
 [ 4.16784393e-01]
 [ 3.20738487e-01]
 [ 3.16218211e-01]
 [ 3.76854153e-01]
 [ 6.28118048e-01]
 [ 3.46486741e-01]
 [ 6.43756201e-02]
 [ 6.76647521e-01]
 [ 5.98330413e-01]
 [ 1.08208553e-01]
 [ 3.73226002e-01]
 [ 5.27844878e-01]
 [ 8.98854259e-01]
 [ 4.2611173