# Step 1. Import needed libraries and scripts

This also requires setting the working directory to the top-level folder 'machine-learning-assisted-khovanov-homology'.

In [1]:
# import useful libraries for data preprocessing
import ast
import math
import os
from collections import Counter

import numpy as np
import pandas as pd
import seaborn as sns
from numpy import linalg as LA
from sklearn.model_selection import train_test_split

Run this cell to check your current working directory. It should return the path of the top-level folder "machine-learning-assisted-khovanov-homology".

In [5]:
# Show the current working directory as this cell's output.
os.getcwd()

'/mnt/c/Users/12428/Documents/GitHub/MAT_180_ML_Projects/machine-learning-assisted-khovanov-homology'

In [4]:
# Move from the notebooks folder up to the top folder of the repository.
# The top folder is recognised by its `scripts` folder, which the imports in
# the next cell rely on. When `scripts` is already visible from the current
# directory this cell does nothing, so running it several times is safe and
# can no longer climb too high in the directory tree.
if not os.path.isdir("scripts"):
    os.chdir("../")

In [15]:
#Import functions built in class
from scripts.getGridsDimensions import find_max_min_row, find_max_min_col
from scripts.polynomial import add_poly_terms
from scripts.GDLinearReg import J, DJ, GD_linreg_improved

# Step 2. Train and test with a dataset (links with 1 component only)

Parse the data from a .csv file containing the free parts and the torsion count.

In [41]:
df = pd.read_csv('data/dataset_A.csv')

# Obtain the list of free_part dictionaries.
# They are in the form {bigrading: value}.
# The column stores each dictionary as text. ast.literal_eval only accepts
# Python literals, so (unlike eval) the CSV contents cannot execute code.
fp_list = [ast.literal_eval(fp_text) for fp_text in df['free_part']]

find_max_row, find_min_row = find_max_min_row(df)
find_max_col, find_min_col = find_max_min_col(df)

# Every (row, column) pair between the bounds returned by the helpers above,
# in dictionary order: the row index varies slowest, the column index fastest.
bigrading_list = [
    (i, j)
    for i in range(find_min_row, find_max_row+1)
    for j in range(find_min_col, find_max_col+1)
]

print(bigrading_list)


[(-25, -9), (-25, -8), (-25, -7), (-25, -6), (-25, -5), (-25, -4), (-25, -3), (-25, -2), (-25, -1), (-25, 0), (-25, 1), (-25, 2), (-25, 3), (-25, 4), (-25, 5), (-25, 6), (-25, 7), (-25, 8), (-25, 9), (-24, -9), (-24, -8), (-24, -7), (-24, -6), (-24, -5), (-24, -4), (-24, -3), (-24, -2), (-24, -1), (-24, 0), (-24, 1), (-24, 2), (-24, 3), (-24, 4), (-24, 5), (-24, 6), (-24, 7), (-24, 8), (-24, 9), (-23, -9), (-23, -8), (-23, -7), (-23, -6), (-23, -5), (-23, -4), (-23, -3), (-23, -2), (-23, -1), (-23, 0), (-23, 1), (-23, 2), (-23, 3), (-23, 4), (-23, 5), (-23, 6), (-23, 7), (-23, 8), (-23, 9), (-22, -9), (-22, -8), (-22, -7), (-22, -6), (-22, -5), (-22, -4), (-22, -3), (-22, -2), (-22, -1), (-22, 0), (-22, 1), (-22, 2), (-22, 3), (-22, 4), (-22, 5), (-22, 6), (-22, 7), (-22, 8), (-22, 9), (-21, -9), (-21, -8), (-21, -7), (-21, -6), (-21, -5), (-21, -4), (-21, -3), (-21, -2), (-21, -1), (-21, 0), (-21, 1), (-21, 2), (-21, 3), (-21, 4), (-21, 5), (-21, 6), (-21, 7), (-21, 8), (-21, 9), (-20

In [31]:
# Create the matrix X, where each column represents a bigrading
# in dictionary order (the order of bigrading_list) and each row is one
# free_part dictionary.

m, n = len(fp_list), len(bigrading_list)
X = np.zeros([m,n])

# Map each bigrading to its column once, instead of scanning bigrading_list
# with list.index for every entry. bigrading_list is built from nested ranges
# and has no repeated entries, so the column numbers match list.index.
column_of = {bigrading: col for col, bigrading in enumerate(bigrading_list)}

for i, fp in enumerate(fp_list):
    for key, val in fp.items():
        if key not in column_of:
            # Same exception type list.index raised, with a clearer message.
            raise ValueError(f'bigrading {key} in row {i} is not in bigrading_list')
        X[i, column_of[key]] = val

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [32]:
# Labels of the data set: the torsion_part_count column as a column vector
# (one row per sample, a single column).
y = np.reshape(df['torsion_part_count'].to_numpy(), (-1, 1))

In [33]:
# Splitting data into training, validation, and testing sets.
# First hold out half of the samples for testing, then take 20% of the
# remaining half as the validation set; the rest is used for training.

X_trainval, X_test, y_trainval, y_test = train_test_split(
    X, y, test_size=0.5, random_state=1
)
X_train, X_val, y_train, y_val = train_test_split(
    X_trainval, y_trainval, test_size=0.2, random_state=1
)

for split_name, split_X in (
    ('training', X_train),
    ('validation', X_val),
    ('testing', X_test),
):
    print(f'Size of {split_name} set: {len(split_X)}')

Size of training set: 432
Size of validation set: 108
Size of testing set: 541


In [34]:
def prediction(x,v):
    """Return the first entry of the product ``x @ v`` rounded with ``round``.

    x -- one feature row.
    v -- the weights; ``x @ v`` must be indexable, and only its entry at
         index 0 is used.
    """
    product = x @ v
    return round(product[0])

def accuracy(X,v,y):
    """Return the fraction of rows of X whose prediction matches the label.

    X -- iterable of feature rows; each row is passed to ``prediction``.
    v -- the weights passed to ``prediction``.
    y -- labels; the label of row i is read as ``y[i][0]``.
    """
    hits = sum(
        1
        for idx, row in enumerate(X)
        if prediction(row, v) == y[idx][0]
    )
    return hits / len(X)

def fit(X, y, epsilon, lambda_, max_iters = 10000, poly_terms = 1):
    """Expand X with ``add_poly_terms`` and run ``GD_linreg_improved`` on it.

    X, y       -- features and labels of the set to fit.
    epsilon    -- forwarded unchanged to ``GD_linreg_improved``.
    lambda_    -- forwarded unchanged to ``GD_linreg_improved``.
    max_iters  -- forwarded unchanged to ``GD_linreg_improved``.
    poly_terms -- second argument of ``add_poly_terms``; reported as the
                  polynomial degree in the printed message.

    Prints the degree before fitting and the last recorded cost afterwards.
    Returns the pair ``(v, costs)`` produced by ``GD_linreg_improved``.
    """
    print(f'Running polynomial regression of degree {poly_terms} \n')

    expanded = add_poly_terms(X, poly_terms)
    v, costs = GD_linreg_improved(expanded, y, epsilon, lambda_, max_iters)

    final_cost = costs[-1]
    print(f'\nFinal cost is {final_cost}\n')
    return v, costs

In [38]:
# Fit a degree-1 model with max_iters = 500 and report the accuracy on
# each of the three splits.
degree = 1

v, costs = fit(X_train, y_train, epsilon = 1e-10, lambda_ = 0, max_iters = 500, poly_terms = degree)

for split_name, split_X, split_y in (
    ('training', X_train, y_train),
    ('validation', X_val, y_val),
    ('test', X_test, y_test),
):
    split_accuracy = accuracy(add_poly_terms(split_X, degree),v,split_y)
    print(f'Accuracy of {split_name} set is: {split_accuracy}')

Running polynomial regression of degree 1 

After 0 steps the cost is 34.35648148148148
After 499 steps the cost is 0.008717812499697913

Final cost is 0.008692644175094924

Accuracy of training set is: 0.9930555555555556
Accuracy of validation set is: 0.9814814814814815
Accuracy of test set is: 0.988909426987061


In [39]:
# Same fit as before but with max_iters = 1000; v and costs are overwritten
# and the accuracy on each split is reported again.
degree = 1

v, costs = fit(X_train, y_train, epsilon = 1e-10, lambda_ = 0, max_iters = 1000, poly_terms = degree)

splits = {
    'training': (X_train, y_train),
    'validation': (X_val, y_val),
    'test': (X_test, y_test),
}
for split_name, (split_X, split_y) in splits.items():
    split_accuracy = accuracy(add_poly_terms(split_X, degree),v,split_y)
    print(f'Accuracy of {split_name} set is: {split_accuracy}')

Running polynomial regression of degree 1 

After 0 steps the cost is 34.35648148148148
After 999 steps the cost is 0.0027127383321814873

Final cost is 0.0027074085962902786

Accuracy of training set is: 1.0
Accuracy of validation set is: 1.0
Accuracy of test set is: 0.9981515711645101


# Step 3. Test with another dataset (links with 1, 2, and 3 components)

In [40]:
df2 = pd.read_csv('data/dataset_B.csv')

# Obtain the list of free_part dictionaries.
# They are in the form {bigrading: value}.
# The column stores each dictionary as text. ast.literal_eval only accepts
# Python literals, so (unlike eval) the CSV contents cannot execute code.
fp_list2 = [ast.literal_eval(fp_text) for fp_text in df2['free_part']]

find_max_row2, find_min_row2 = find_max_min_row(df2)
find_max_col2, find_min_col2 = find_max_min_col(df2)

# Every (row, column) pair between the bounds returned by the helpers above,
# in dictionary order.
bigrading_list2 = [
    (i, j)
    for i in range(find_min_row2, find_max_row2+1)
    for j in range(find_min_col2, find_max_col2+1)
]

# print(bigrading_list2)

# bigrading_list is expected to contain bigrading_list2, so the second data
# set can be indexed with the column layout of the first one.

[(-23, -8), (-23, -7), (-23, -6), (-23, -5), (-23, -4), (-23, -3), (-23, -2), (-23, -1), (-23, 0), (-23, 1), (-23, 2), (-23, 3), (-23, 4), (-23, 5), (-23, 6), (-23, 7), (-23, 8), (-23, 9), (-22, -8), (-22, -7), (-22, -6), (-22, -5), (-22, -4), (-22, -3), (-22, -2), (-22, -1), (-22, 0), (-22, 1), (-22, 2), (-22, 3), (-22, 4), (-22, 5), (-22, 6), (-22, 7), (-22, 8), (-22, 9), (-21, -8), (-21, -7), (-21, -6), (-21, -5), (-21, -4), (-21, -3), (-21, -2), (-21, -1), (-21, 0), (-21, 1), (-21, 2), (-21, 3), (-21, 4), (-21, 5), (-21, 6), (-21, 7), (-21, 8), (-21, 9), (-20, -8), (-20, -7), (-20, -6), (-20, -5), (-20, -4), (-20, -3), (-20, -2), (-20, -1), (-20, 0), (-20, 1), (-20, 2), (-20, 3), (-20, 4), (-20, 5), (-20, 6), (-20, 7), (-20, 8), (-20, 9), (-19, -8), (-19, -7), (-19, -6), (-19, -5), (-19, -4), (-19, -3), (-19, -2), (-19, -1), (-19, 0), (-19, 1), (-19, 2), (-19, 3), (-19, 4), (-19, 5), (-19, 6), (-19, 7), (-19, 8), (-19, 9), (-18, -8), (-18, -7), (-18, -6), (-18, -5), (-18, -4), (-18

In [45]:
# Build the feature matrix of the second data set with the column layout of
# the first one (bigrading_list), so that its columns line up with the
# weights v fitted above.
m, n = len(fp_list2), len(bigrading_list)
X2 = np.zeros([m,n])

# Map each bigrading to its column once, instead of scanning bigrading_list
# with list.index for every entry. bigrading_list is built from nested ranges
# and has no repeated entries, so the column numbers match list.index.
column_of = {bigrading: col for col, bigrading in enumerate(bigrading_list)}

for i, fp in enumerate(fp_list2):
    for key, val in fp.items():
        if key not in column_of:
            # Same exception type list.index raised, but naming the bigrading
            # of this data set that the first data set's layout cannot hold.
            raise ValueError(f'bigrading {key} in row {i} is not in bigrading_list')
        X2[i, column_of[key]] = val

# Labels of the second data set as a column vector.
y2 = df2['torsion_part_count'].to_numpy().reshape(-1,1)

In [46]:
# Score the second data set with the weights v and the degree set in the
# training cells above.
X2_expanded = add_poly_terms(X2, degree)
accuracy_B = accuracy(X2_expanded,v,y2)
print(f'Accuracy of this data set is: {accuracy_B}')

Accuracy of training set is: 0.4083969465648855
