# Step 1. Import needed libraries and scripts

The working directory must be the repository's top folder, 'machine-learning-assisted-khovanov-homology', so that the `scripts` imports and the `data/` paths used below resolve.

In [1]:
# import useful libraries for data preprocessing
import math
import random
import numpy as np
import pandas as pd
from numpy import linalg as LA
import seaborn as sns
import os
from sklearn.model_selection import train_test_split
from collections import Counter

Run this cell to check your current working directory. It should return the top folder "machine-learning-assisted-khovanov-homology"

In [4]:
# Show the kernel's current working directory as the cell output.
os.getcwd()

'/mnt/c/Users/12428/Documents/GitHub/MAT_180_ML_Projects/machine-learning-assisted-khovanov-homology'

In [3]:
# Move up one level to the repository's top folder, but only when we are not
# already there. The top folder is recognised by its `scripts` directory (the
# next cell imports from it), so re-running this cell no longer climbs too high
# in the directory tree.
if not os.path.isdir("scripts"):
    os.chdir("../")

In [5]:
#Import functions built in class
from scripts.getGridsDimensions import find_max_min_row, find_max_min_col
from scripts.polynomial import add_poly_terms
from scripts.GDLinearReg import J, DJ, GD_linreg_improved

In [6]:
# Load the dataset; the relative path is resolved against the current working directory.
df = pd.read_csv('data/dataset_C.csv')

# Step 2. Train and test with a dataset (links with 1,2 and 3 components) 

Parse the data from a .csv file containing the free parts and torsion count.

Since we want each bigrading to be its own feature, we first need to find a bounding box that contains every bigrading occurring in the dataset, using the getGridsDimensions script.

In [7]:
# Load the dataset of links.
df = pd.read_csv('data/dataset_C.csv')

# Every 'free_part' cell stores the text of a dictionary {bigrading: value};
# turn each one back into a dictionary.
# NOTE(review): eval executes the CSV text as Python code -- only run this on
# trusted data files.
free_part_strings = df['free_part'].to_list()
fp_list = list(map(eval, free_part_strings))

# Row and column bounds of the bounding box, as reported by the helper script.
max_row, min_row = find_max_min_row(df)
max_col, min_col = find_max_min_col(df)

# Enumerate every (row, col) position inside the bounding box, row by row.
# The position of a bigrading in this list is its feature index.
bigrading_list = [
    (row, col)
    for row in range(min_row, max_row + 1)
    for col in range(min_col, max_col + 1)
]

Train a separate model for each number of link components (1, 2 and 3), as well as one model on all links together.

In [8]:
def _free_parts_and_labels(frame):
    """Return the parsed free parts and the torsion counts of `frame`.

    frame -- a DataFrame with 'free_part' and 'torsion_part_count' columns

    Returns a pair (free_parts, labels): free_parts is a list with one
    evaluated 'free_part' entry per row, labels is the 'torsion_part_count'
    column reshaped to a single-column array.
    """
    # NOTE(review): eval executes the CSV text as Python code -- only run this
    # on trusted data files.
    free_parts = list(map(eval, frame['free_part'].to_list()))
    labels = frame['torsion_part_count'].to_numpy().reshape(-1,1)
    return free_parts, labels


# Whole dataset first, then one subset per number of components. Each subset
# is filtered and parsed exactly once (the 1-component list used to be parsed
# twice by a duplicated line).
L_fp_list, y = _free_parts_and_labels(df)
L1_fp_list, y1 = _free_parts_and_labels(df[df.components == 1])
L2_fp_list, y2 = _free_parts_and_labels(df[df.components == 2])
L3_fp_list, y3 = _free_parts_and_labels(df[df.components == 3])

In [9]:
# n: number of feature columns, one per entry of bigrading_list.
n = len(bigrading_list)
# m, m1, m2, m3: number of examples in the full dataset and in the
# 1-, 2- and 3-component subsets.
m, m1, m2, m3 = map(len, (y, y1, y2, y3))

In [10]:
# Look-up table bigrading -> column index, built once. Calling
# bigrading_list.index(key) for every entry scans the list each time, which is
# needlessly slow. setdefault keeps the first position of a bigrading, which is
# what list.index would have returned.
bigrading_column = {}
for column, bigrading in enumerate(bigrading_list):
    bigrading_column.setdefault(bigrading, column)


def _build_feature_matrix(free_parts, n_rows):
    """Return an (n_rows, n) array holding the free-part values of each link.

    free_parts -- list of {bigrading: value} dictionaries, one per link
    n_rows     -- number of rows to allocate

    Row i receives the values of free_parts[i], each written to the column of
    its bigrading; all other entries stay 0. A bigrading that is not in
    bigrading_list raises KeyError.
    """
    features = np.zeros([n_rows, n])
    for row, free_part in enumerate(free_parts):
        for bigrading, value in free_part.items():
            features[row, bigrading_column[bigrading]] = value
    return features


# Raw feature matrices for the full dataset and for each component count.
X = _build_feature_matrix(L_fp_list, m)
X1 = _build_feature_matrix(L1_fp_list, m1)
X2 = _build_feature_matrix(L2_fp_list, m2)
X3 = _build_feature_matrix(L3_fp_list, m3)

# Apply add_poly_terms with second argument 1 (see scripts.polynomial for what
# exactly it adds).
X = add_poly_terms(X,1)
X1 = add_poly_terms(X1,1)
X2 = add_poly_terms(X2,1)
X3 = add_poly_terms(X3,1)

In [12]:
from scripts.predictAccuracy import prediction, accuracy


# This fit function has been modified so that add_poly_terms is NOT built in:
# the number of features in X is too large for it to be applied here.
def fit(X, y, epsilon, lambda_, max_iters = 10000):
    """Run GD_linreg_improved on (X, y) and report the last recorded cost.

    All arguments are forwarded unchanged, in this order, to
    GD_linreg_improved. The last element of the returned costs is printed.

    Returns the pair (v, costs) exactly as produced by GD_linreg_improved.
    """
    weights, cost_history = GD_linreg_improved(X, y, epsilon, lambda_, max_iters)
    final_cost = cost_history[-1]

    print(f'\nFinal cost is {final_cost}\n')
    return weights, cost_history

## Step 2a. Training a model using only knots (single component links)

In [120]:
# Two-stage split of the 1-component data: hold out 20% for testing, then hold
# out 20% of the remainder for validation. random_state fixes both shuffles.
X1_rest, X1_test, y1_rest, y1_test = train_test_split(X1, y1, test_size=0.2, random_state=1)
X1_train, X1_val, y1_train, y1_val = train_test_split(X1_rest, y1_rest, test_size=0.2, random_state=1)

# Report how many examples ended up in each split.
for split_name, split_X in (('training', X1_train), ('validation', X1_val), ('testing', X1_test)):
    print(f'Size of {split_name} set for 1-Links: {len(split_X)}')

Size of training set for 1-Links: 706
Size of validation set for 1-Links: 177
Size of testing set for 1-Links: 221


In [121]:
# Fit the 1-component model on the training split only.
v1, costs1 = fit(X1_train, y1_train, epsilon = 1e-5, lambda_ = 0, max_iters = 1000)

# Score the fitted parameters on each of the three splits.
for split_name, split_X, split_y in (
    ('training', X1_train, y1_train),
    ('validation', X1_val, y1_val),
    ('test', X1_test, y1_test),
):
    print(f'Accuracy of {split_name} set is: {accuracy(split_X,v1,split_y)}')

After 0 steps the cost is 35.23512747875354
After 817 steps the cost is 0.004400263527908442

Final cost is 0.004390308430671085

Accuracy of training set is: 0.9957507082152974
Accuracy of validation set is: 0.9774011299435028
Accuracy of test set is: 0.9728506787330317


## Step 2b. Training a model using only links with 2 components

In [122]:
# Two-stage split of the 2-component data: hold out 20% for testing, then hold
# out 20% of the remainder for validation. random_state fixes both shuffles.
X2_rest, X2_test, y2_rest, y2_test = train_test_split(X2, y2, test_size=0.2, random_state=1)
X2_train, X2_val, y2_train, y2_val = train_test_split(X2_rest, y2_rest, test_size=0.2, random_state=1)

# Report how many examples ended up in each split.
for split_name, split_X in (('training', X2_train), ('validation', X2_val), ('testing', X2_test)):
    print(f'Size of {split_name} set for 2-Links: {len(split_X)}')

Size of training set for 2-Links: 196
Size of validation set for 2-Links: 50
Size of testing set for 2-Links: 62


In [123]:
# Fit the 2-component model on the training split only.
v2, costs2 = fit(X2_train, y2_train, epsilon = 1e-5, lambda_ = 0, max_iters = 1000)

# Score the fitted parameters on each of the three splits.
for split_name, split_X, split_y in (
    ('training', X2_train, y2_train),
    ('validation', X2_val, y2_val),
    ('test', X2_test, y2_test),
):
    print(f'Accuracy of {split_name} set is: {accuracy(split_X,v2,split_y)}')

After 0 steps the cost is 26.535714285714285
After 808 steps the cost is 0.00476627365888708

Final cost is 0.004756324068640731

Accuracy of training set is: 1.0
Accuracy of validation set is: 0.98
Accuracy of test set is: 0.967741935483871


## Step 2c. Training a model using only links with 3 components

In [124]:
# Two-stage split of the 3-component data: hold out 20% for testing, then hold
# out 20% of the remainder for validation. random_state fixes both shuffles.
X3_rest, X3_test, y3_rest, y3_test = train_test_split(X3, y3, test_size=0.2, random_state=1)
X3_train, X3_val, y3_train, y3_val = train_test_split(X3_rest, y3_rest, test_size=0.2, random_state=1)

# Report how many examples ended up in each split.
for split_name, split_X in (('training', X3_train), ('validation', X3_val), ('testing', X3_test)):
    print(f'Size of {split_name} set for 3-Links: {len(split_X)}')

Size of training set for 3-Links: 716
Size of validation set for 3-Links: 179
Size of testing set for 3-Links: 224


In [125]:
# Fit the 3-component model on the training split only.
# NOTE(review): the saved output of this cell reports the cost after 999 steps,
# so this run may have stopped at the max_iters limit -- TODO confirm whether
# more iterations are needed.
v3, costs3 = fit(X3_train, y3_train, epsilon = 1e-5, lambda_ = 0, max_iters = 1000)

# Score the fitted parameters on each of the three splits.
for split_name, split_X, split_y in (
    ('training', X3_train, y3_train),
    ('validation', X3_val, y3_val),
    ('test', X3_test, y3_test),
):
    print(f'Accuracy of {split_name} set is: {accuracy(split_X,v3,split_y)}')

After 0 steps the cost is 38.92039106145252
After 999 steps the cost is 0.2077239147315949

Final cost is 0.20766693604842407

Accuracy of training set is: 0.9259776536312849
Accuracy of validation set is: 0.9608938547486033
Accuracy of test set is: 0.9241071428571429


Here are some random batches:

In [134]:
# Draw ten random row indices (with replacement) for each component count,
# in the order 1-, 2-, 3-component.
batch1, batch2, batch3 = (random.choices(range(size), k = 10) for size in (m1, m2, m3))

# For every sampled link print its label next to the model's prediction,
# with an empty line between the groups.
samples = (
    (1, batch1, X1, y1, v1),
    (2, batch2, X2, y2, v2),
    (3, batch3, X3, y3, v3),
)
for position, (n_components, batch, batch_X, batch_y, weights) in enumerate(samples):
    if position > 0:
        print('')
    for i in batch:
        print(f'{n_components}-Link, the {i}th point has label {batch_y[i]} and is predicted to be {prediction(batch_X[i],weights)}')

1-Link, the 774th point has label [2] and is predicted to be 2
1-Link, the 513th point has label [7] and is predicted to be 7
1-Link, the 979th point has label [3] and is predicted to be 3
1-Link, the 561th point has label [4] and is predicted to be 4
1-Link, the 418th point has label [9] and is predicted to be 9
1-Link, the 467th point has label [4] and is predicted to be 4
1-Link, the 958th point has label [8] and is predicted to be 8
1-Link, the 500th point has label [2] and is predicted to be 2
1-Link, the 246th point has label [1] and is predicted to be 2
1-Link, the 113th point has label [7] and is predicted to be 7

2-Link, the 210th point has label [4] and is predicted to be 4
2-Link, the 165th point has label [1] and is predicted to be 1
2-Link, the 65th point has label [1] and is predicted to be 1
2-Link, the 131th point has label [2] and is predicted to be 2
2-Link, the 197th point has label [2] and is predicted to be 2
2-Link, the 256th point has label [7] and is predicted 

## Step 2d. Training a single model using links of all component counts (1, 2 and 3) together

In [14]:
# Two-stage split of the full dataset: hold out 20% for testing, then hold out
# 20% of the remainder for validation. random_state fixes both shuffles.
X_rest, X_test, y_rest, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
X_train, X_val, y_train, y_val = train_test_split(X_rest, y_rest, test_size=0.2, random_state=1)

# Report how many examples ended up in each split.
for split_name, split_X in (('training', X_train), ('validation', X_val), ('testing', X_test)):
    print(f'Size of {split_name} set for multiple-Links: {len(split_X)}')

Size of training set for multiple-Links: 1619
Size of validation set for multiple-Links: 405
Size of testing set for multiple-Links: 507


In [15]:
# Fit one model on the training split of the full dataset.
# NOTE(review): the saved output of this cell reports the cost after 999 steps,
# so this run may have stopped at the max_iters limit -- TODO confirm whether
# more iterations are needed.
v, costs = fit(X_train, y_train, epsilon = 1e-5, lambda_ = 0, max_iters = 1000)

# Score the fitted parameters on each of the three splits.
for split_name, split_X, split_y in (
    ('training', X_train, y_train),
    ('validation', X_val, y_val),
    ('test', X_test, y_test),
):
    print(f'Accuracy of {split_name} set is: {accuracy(split_X,v,split_y)}')

After 0 steps the cost is 38.36380481778877
After 999 steps the cost is 0.10799511073957528

Final cost is 0.10795037586762624

Accuracy of training set is: 0.9524397776405188
Accuracy of validation set is: 0.9481481481481482
Accuracy of test set is: 0.9408284023668639
