In [1]:
from bs4 import BeautifulSoup, Tag
from pathlib import Path
from typing import TypedDict, List, Tuple, Dict
import os
import json
import re

class Subject(TypedDict):
    name: str
    keywords: List[str]

class Major(TypedDict):
    code: str
    subjects: List[Subject]

cwd = Path().resolve()
result_path = cwd.joinpath('result')

result : List[Major] = []

In [2]:
import spacy


spacy.require_gpu()

2022-12-06 14:51:23.013244: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-12-06 14:51:23.507512: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2022-12-06 14:51:24.716445: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64:
2022-12-06 14:51:24.716690: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerr

True

In [3]:
keyword_banlist = set()
subject_banlist = set()

with open('keyword_banlist.txt') as f:
    keyword_banlist.update([word.lower() for word in f.read().split("\n")])

with open('subject_banlist.txt') as f:
    subject_banlist.update([word.lower() for word in f.read().split("\n")])

In [4]:

nlp = spacy.load("en_core_web_lg")

def process_major(major: str) -> Major:
    global subject_banlist, keyword_banlist, nlp
    numbering_pattern = f"\(\d+\)|\d+\)|\d+\.|\([a-zA-Z]\)"

    result_path = Path().resolve().joinpath('result')
    wajib = result_path.joinpath(f"{major}/wajib")
    pilihan = result_path.joinpath(f"{major}/pilihan")

    subjects = [wajib.joinpath(filename) for filename in os.listdir(wajib.__str__())] + [pilihan.joinpath(filename) for filename in os.listdir(pilihan.__str__())]

    subjects_list: List[Subject] = []

    subject_text: List[Tuple[str, Dict[str, str]]] = []
    for subject in subjects:
        with open(subject, "r", encoding="utf-8") as f:
            txt = f.read().encode('utf-8', errors='ignore').decode('utf-8')
            soup = BeautifulSoup(txt)

            table = soup.find("tbody")

            if table is not None:
                rows: List[Tag] = table.find_all("tr")

                name = rows[3].find_all("td")[1].text

                if name.lower() in subject_banlist:
                    continue

                syllable = rows[5].find_all("td")[1].text
                syllable  = re.sub(numbering_pattern, " ", syllable)
                syllable = re.sub("\s+", " ", syllable)
                syllable = re.sub("[^0-9a-zA-Z ]+", "", syllable)
                syllable = syllable.encode("ascii", errors='ignore').__str__()
                syllable = re.sub("b'\s*", "", syllable)

                subject_text.append((syllable.lower(), {"subject_name": name.lower()}))
            else:
                print(f"Fail to find table on file {subject}")
    
    docs = nlp.pipe(subject_text, as_tuples=True)
    
    stopwords = nlp.Defaults.stop_words

    for doc, context in docs:
        
        name = context["subject_name"]

        phrases : set[str] = set()

        for noun in doc.noun_chunks:
            text = " ".join([word for word in noun.lemma_.lower().split() if word not in stopwords])
            
            if text != "" and text not in keyword_banlist and text.count(' ') >= 1: # add only two words
                phrases.add(text)

        subjects_list.append(Subject(name=name, keywords=list(phrases)))

    
    return Major(code=major, subjects=subjects_list)

In [5]:
majors = os.listdir(result_path.__str__())

for major in majors:
    result.append(process_major(major))

In [6]:
with open("major_subjects_keyword.json", "w") as w:
    json.dump(result, w)

In [7]:
import itertools

vertex: List[tuple[str, str]] = []

for major in result:
    docs = []

    for subject in major["subjects"]:
        keywords = subject["keywords"]
        keywords.sort(reverse=True)

        if keywords.__len__() >= 5:
            keywords = keywords[:5]

        text = subject["name"] + " " + " ".join(keywords)
        docs.append(text)

    vertex.append((major["code"], " ".join(docs)))

edges: List[Tuple[str, str, float]] = []

pairs = list(itertools.combinations(vertex, 2))
i = 0
length = len(pairs)

for combination in pairs:
    vertex1, vertex2 = combination

    doc1 = nlp(vertex1[1])
    doc2 = nlp(vertex2[1])
    similarity = doc1.similarity(doc2)

    res = (vertex1[0], vertex2[0], similarity)

    edges.append(res)
    i+=1
    print(f"{res} progress {i} out of {length}")

# print(edges)

('STEI-183', 'SITH-112', 0.9789344072341919) progress 1 out of 1225
('STEI-183', 'SITH-115', 0.9301908016204834) progress 2 out of 1225
('STEI-183', 'FTI-144', 0.972428560256958) progress 3 out of 1225
('STEI-183', 'FTSL-157', 0.9342047572135925) progress 4 out of 1225
('STEI-183', 'STEI-135', 0.9772424101829529) progress 5 out of 1225
('STEI-183', 'FSRD-173', 0.9340152740478516) progress 6 out of 1225
('STEI-183', 'FSRD-170', 0.860014796257019) progress 7 out of 1225
('STEI-183', 'SF-107', 0.9781938791275024) progress 8 out of 1225
('STEI-183', 'FTSL-153', 0.9424229264259338) progress 9 out of 1225
('STEI-183', 'SF-116', 0.961985170841217) progress 10 out of 1225
('STEI-183', 'FITB-128', 0.9531862139701843) progress 11 out of 1225
('STEI-183', 'FSRD-175', 0.9423863887786865) progress 12 out of 1225
('STEI-183', 'STEI-182', 0.9665733575820923) progress 13 out of 1225
('STEI-183', 'FTSL-158', 0.9399986267089844) progress 14 out of 1225
('STEI-183', 'STEI-180', 0.9763698577880859) progre

In [8]:
with open("weight.txt", "w") as w:
    w.write("\n".join(edge[0] + " " + edge[1] + " " + str(edge[2]) for edge in edges))

In [9]:
import numpy as np

weights = np.array([edge[2] for edge in edges])

normalized = (weights - np.min(weights)) * 100 / (np.max(weights) - np.min(weights))

with open('weight_normalized.txt', "w") as w:
    w.write("\n".join(edges[i][0] + " " + edges[i][1] + " " + str(normalized[i]) for i in range(len(edges))))