In [1]:
import ast
import os

import numpy as np
import pandas as pd

Run the next cell to check your current working directory. It should end with the top-level folder "machine-learning-assisted-khovanov-homology"; if it ends with "notebooks" instead, run the `os.chdir("../")` cell below once.

In [2]:
# Echo the kernel's current working directory as the cell output.
os.getcwd()

'/mnt/c/Users/wuwj2/Desktop/jupyter/MAT_180_ML_Projects/machine-learning-assisted-khovanov-homology/notebooks'

In [3]:
#Run this cell if the kernel is still in the notebooks folder.
#The guard makes the cell safe to re-run: it only moves up one level while the
#working directory is named "notebooks", so repeated runs cannot climb higher
#in the directory tree.
if os.path.basename(os.getcwd()) == "notebooks":
    os.chdir("../")

In [4]:
def free_part(kh):
    """Count the infinite-order generators of every group stored in ``kh``.

    Parameters
    ----------
    kh : dict of dict
        Two-level mapping ``kh[key1][key2] -> group`` (for example the value
        returned by ``link.khovanov_homology()`` in ``generate_dataset``).
        Each group must provide ``gens()``, and each generator must provide
        ``additive_order()``.

    Returns
    -------
    dict
        Maps ``(key1, key2)`` to the number of generators whose additive
        order equals ``+Infinity``. Pairs without such a generator are left
        out.
    """
    # TODO: improve this function. It is not optimal, but kh is small and this
    # is not the computational bottleneck.
    ranks = {}
    for outer, groups in kh.items():
        for inner, group in groups.items():
            free_gens = [
                g for g in group.gens() if g.additive_order() == +Infinity
            ]
            if free_gens:
                ranks[(outer, inner)] = len(free_gens)
    return ranks

In [5]:
def torsion_part(kh):
    """Tally the finite-order generators of every group stored in ``kh``.

    Parameters
    ----------
    kh : dict of dict
        Two-level mapping ``kh[key1][key2] -> group`` (for example the value
        returned by ``link.khovanov_homology()`` in ``generate_dataset``).
        Each group must provide ``gens()``, and each generator must provide
        ``additive_order()``.

    Returns
    -------
    dict
        Maps ``(key1, key2)`` to a plain dict whose keys are the orders of
        the generators and whose values are the number of generators with
        that order. Generators of order ``+Infinity`` are ignored, and pairs
        with no finite-order generator are left out.
    """
    # TODO: improve this function. It is not optimal, but kh is small and this
    # is not the computational bottleneck.
    torsion = {}
    for outer, groups in kh.items():
        for inner, group in groups.items():
            order_counts = {}
            for generator in group.gens():
                order = generator.additive_order()
                if order == +Infinity:
                    continue
                order_counts[order] = order_counts.get(order, 0) + 1
            if order_counts:
                torsion[(outer, inner)] = order_counts
    return torsion

In [14]:
from scripts.dataGeneration import getRandomWord, count_FP, count_TP

# Pool of integers handed to getRandomWord(m, S) when sampling braid words.
# NOTE(review): presumably signed generator indices accepted by B — confirm
# against getRandomWord.
S = [-3, -2, -1, 1, 2, 3]

# Braid group used to turn each sampled word W into a link via Link(B(W)).
B = BraidGroup(4)

In [6]:
def generate_dataset(n,m):
    """Collect ``n`` distinct single-component links and save them as a CSV.

    Words are drawn with ``getRandomWord(m, S)`` and turned into links with
    ``Link(B(W))``. A sample is kept only if the link has exactly one
    component and its Khovanov homology has not been recorded already.

    Parameters
    ----------
    n : int
        Desired size of the dataset (number of rows).
    m : int
        Determines how long the words used to generate the links are.
        WARNING: m >= 9 will make the Khovanov homology very long, so be
        careful. The original author used m = 8.

    Returns
    -------
    None
        The dataset is written to ``data/{n}_{m}_dataset_new.csv`` relative
        to the current working directory.

    Notes
    -----
    Relies on the notebook-level names ``S``, ``B``, ``Link``,
    ``getRandomWord``, ``free_part``, ``torsion_part``, ``count_FP``,
    ``count_TP``, ``count_FP_per_row`` and ``count_FP_per_col``.

    The loop stops only once ``n`` distinct homologies have been collected,
    so it cannot finish if fewer than ``n`` are reachable for the given ``m``.
    """
    data = {
        'braid': [],
        'khovanov_homology': [],
        'free_part': [],
        'torsion_part': [],
        'free_part_count': [],
        'torsion_part_count': [],
        'total_num_FP_per_row': [],
        'total_num_FP_per_column': []
    }

    # Bug fix: the original `while range(n):` tested the truthiness of a range
    # object, which is always True for n > 0, so the loop never ended and the
    # CSV below was never written. Stop as soon as n rows have been collected.
    while len(data["braid"]) < n:
        W = getRandomWord(m, S)
        link = Link(B(W))

        # Only keep links with a single component.
        if link.number_of_components() != 1:
            continue

        kh = link.khovanov_homology()

        # Skip homologies that are already in the dataset.
        if kh in data["khovanov_homology"]:
            continue

        FP = free_part(kh)
        TP = torsion_part(kh)

        data["braid"].append(W)
        data["khovanov_homology"].append(kh)
        data["free_part"].append(FP)
        data["torsion_part"].append(TP)
        data["free_part_count"].append(count_FP(FP))
        data["torsion_part_count"].append(count_TP(TP))
        # NOTE(review): count_FP_per_row / count_FP_per_col are not among the
        # names imported from scripts.dataGeneration in this notebook; confirm
        # they are defined before this function is called.
        data["total_num_FP_per_row"].append(count_FP_per_row(FP))
        data["total_num_FP_per_column"].append(count_FP_per_col(FP))

    df = pd.DataFrame(data)

    # The index is written as well (no index=False), so the file gains an
    # unnamed leading column.
    df.to_csv(f'data/{n}_{m}_dataset_new.csv')

In [None]:
# Request a dataset of size n = 10000 built from words generated with m = 9.
# NOTE: generate_dataset's own warning says m >= 9 makes the Khovanov homology
# very long.
generate_dataset(10000, 9)

In [38]:
# Load the parsed dataset and keep only the rows with a non-empty torsion part.
df = pd.read_csv("data/10000_9_parsed.csv")
df = df[df.torsion_part != '{}']

# Flag the rows whose braid gives a link with exactly one component.
# The braid column holds the word as text, so it has to be parsed back.
# ast.literal_eval only accepts Python literals, whereas the previous eval()
# would have executed any expression stored in the CSV.
is_knot = np.array(
    [
        Link(B(ast.literal_eval(braid))).number_of_components() == 1
        for braid in df['braid']
    ],
    dtype=bool,
)

# A single boolean selection replaces the row-by-row df.drop(index), which
# copied the whole frame for every rejected row.
df = df[is_knot]

# Drop the index column that was saved into the CSV.
df = df.drop('Unnamed: 0', axis=1)

In [42]:
# Save the cleaned frame without writing the index column.
# NOTE(review): this writes to the current working directory, whereas the input
# was read from data/ — confirm the intended location.
df.to_csv('10000_9_cleaned.csv', index=False)