In [15]:
from bs4 import BeautifulSoup, Tag
from pathlib import Path
from typing import TypedDict, List, Tuple, Dict
from nltk.tokenize import RegexpTokenizer
from nltk import RegexpParser, Tree
from nltk.corpus import stopwords
from nltk import WordNetLemmatizer
from multiprocessing import Pool, cpu_count
import os
import json
import re
import string

class Subject(TypedDict):
    """A single subject together with the keyword phrases extracted for it."""
    # Subject name exactly as read from the subject's HTML table.
    name: str
    # Keyword phrases that survived stop-word and banlist filtering.
    keywords: List[str]

class Major(TypedDict):
    """All processed subjects that belong to one major."""
    # Identifier of the major; this is the name of its directory under result/.
    code: str
    # One entry per subject file that was successfully parsed and not banned.
    subjects: List[Subject]

# Absolute path of the directory the notebook runs in; the per-major input
# folders are expected under ./result.
cwd = Path().resolve()
result_path = cwd / 'result'

# Collects one Major entry per processed major; serialised to JSON at the end.
result: List[Major] = []

In [16]:
import nltk
import spacy

# Device selection for spaCy. The GPU alternatives are kept for reference;
# the notebook currently forces CPU execution.
# spacy.prefer_gpu()
# spacy.require_gpu()
spacy.require_cpu()
# Make sure the NLTK resources are present locally (already-current packages
# are reported as up-to-date rather than fetched again).
# NOTE(review): in the cells visible here these resources are referenced only
# by commented-out code — confirm whether the downloads are still needed.
nltk.download("stopwords")
nltk.download('omw-1.4')
nltk.download("wordnet")
nltk.download('punkt')
# Last expression of the cell: its return value is what the cell displays.
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/barcode/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/barcode/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to /home/barcode/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/barcode/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/barcode/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [17]:
# def process_sentence(sentence: str) -> List[str]:
#     sentence = nltk.sent_tokenize(sentence)
#     tokenized_sentence = [nltk.word_tokenize(s) for s in sentence]
#     tagged_sentence = [nltk.pos_tag(s) for s in tokenized_sentence]

#     grammar = ("NP: {<DT>?<JJ>*<NN>}")
#     chunker = RegexpParser(grammar)

#     phrases: List[str] = []

#     for tag in tagged_sentence:
#         tree:Tree = chunker.parse(tag)
#         for subtree in tree.subtrees():
#             if subtree.label() == "NP":
#                 phrases.append(subtree._pformat_flat("", "()", False))

#     return phrases

# def spacy_processor(text: str) -> List[str]:
#     nlp = spacy.load("en_core_web_sm")

#     doc = nlp(text)
#     phrases : List[str] = []

#     for noun in doc.noun_chunks:
#         phrases.append(noun.text)

#     return phrases

In [18]:
def _read_banlist(path: str) -> set:
    """Load a banlist file into a set of lower-cased entries.

    The file holds one entry per line. Blank lines (including the empty
    "line" produced by a trailing newline) are skipped so that the empty
    string never ends up in the set; all other entries are kept verbatim
    apart from lower-casing, so existing matches are unchanged.

    Args:
        path: Location of the banlist text file.

    Returns:
        The set of lower-cased, non-blank entries.

    Raises:
        OSError: If the file cannot be opened.
    """
    # UTF-8 is stated explicitly, matching how the subject files are read.
    with open(path, encoding="utf-8") as f:
        return {entry.lower() for entry in f.read().split("\n") if entry.strip()}


# Keyword phrases that must never be emitted for any subject.
keyword_banlist = _read_banlist('keyword_banlist.txt')
# Subject names (lower-cased) that are skipped entirely.
subject_banlist = _read_banlist('subject_banlist.txt')

In [19]:
# List numbering that is stripped from syllabus text: "(1)", "1)", "1." and "(a)".
_NUMBERING_PATTERN = re.compile(r"\(\d+\)|\d+\)|\d+\.|\([a-zA-Z]\)")


def _clean_syllabus_text(raw: str) -> str:
    """Reduce raw syllabus text to ASCII letters, digits and spaces."""
    text = _NUMBERING_PATTERN.sub(" ", raw)
    text = re.sub(r"\s+", " ", text)
    # Only ASCII alphanumerics and spaces survive this step, so no separate
    # ASCII encode/decode pass is needed afterwards. (The previous
    # str(bytes) round-trip left a stray closing quote on the text and could
    # swallow a final letter "b".)
    text = re.sub(r"[^0-9a-zA-Z ]+", "", text)
    return text.strip()


def process_major(major: str) -> Major:
    """Extract keyword phrases for every subject file of one major.

    Subject files are read from ``result/<major>/wajib`` and
    ``result/<major>/pilihan`` below the current working directory. For each
    file, the subject name and syllabus text are taken from fixed rows of the
    first ``<tbody>``; the cleaned syllabus is run through spaCy and its noun
    chunks (lemmatised, lower-cased, stop words removed) become the keywords.
    Subjects listed in ``subject_banlist`` and phrases listed in
    ``keyword_banlist`` are dropped.

    Args:
        major: Name of the major's directory under ``result/``.

    Returns:
        A ``Major`` whose ``code`` is ``major`` and whose ``subjects`` holds
        one ``Subject`` per parsed, non-banned file. Keywords are sorted so
        the output is reproducible between runs.

    Raises:
        OSError: If either subject directory or a subject file cannot be read.
    """
    major_dir = Path().resolve().joinpath('result', major)
    subject_files = [
        folder.joinpath(filename)
        for folder in (major_dir.joinpath("wajib"), major_dir.joinpath("pilihan"))
        for filename in os.listdir(folder)
    ]

    # (cleaned syllabus text, context) pairs, in the shape nlp.pipe expects
    # when called with as_tuples=True.
    subject_text: List[Tuple[str, Dict[str, str]]] = []

    for subject in subject_files:
        with open(subject, "r", encoding="utf-8") as f:
            # The round-trip drops any character that cannot be encoded as UTF-8.
            txt = f.read().encode('utf-8', errors='ignore').decode('utf-8')

        # NOTE(review): no parser is named, so bs4 picks whichever is
        # installed; pin one once the intended parser is confirmed.
        soup = BeautifulSoup(txt)
        table = soup.find("tbody")

        if table is None:
            print(f"Fail to find table on file {subject}")
            continue

        rows = table.find_all("tr")
        try:
            # Second cell of the 4th row is the name, of the 6th row the syllabus.
            name = rows[3].find_all("td")[1].text
            raw_syllabus = rows[5].find_all("td")[1].text
        except IndexError:
            # Same best-effort policy as a missing table: report and move on
            # instead of aborting the whole run.
            print(f"Unexpected table layout in file {subject}")
            continue

        if name.lower() in subject_banlist:
            continue

        subject_text.append((_clean_syllabus_text(raw_syllabus), {"subject_name": name}))

    nlp = spacy.load("en_core_web_sm")
    # Named stop_words so the module-level nltk `stopwords` import is not shadowed.
    stop_words = nlp.Defaults.stop_words
    docs = nlp.pipe(subject_text, n_process=2, batch_size=100, as_tuples=True)

    subjects_list: List[Subject] = []

    for doc, context in docs:
        phrases: set[str] = set()

        for chunk in doc.noun_chunks:
            text = " ".join(
                word for word in chunk.lemma_.lower().split() if word not in stop_words
            )
            if text and text not in keyword_banlist:
                phrases.add(text)

        # sorted() gives a stable order; plain set iteration varies per run.
        subjects_list.append(Subject(name=context["subject_name"], keywords=sorted(phrases)))

    return Major(code=major, subjects=subjects_list)

In [20]:
# Every sub-directory of result/ is one major. Sorting makes the output order
# reproducible instead of depending on the filesystem's listing order, and
# stray non-directory entries are ignored.
majors = sorted(entry.name for entry in result_path.iterdir() if entry.is_dir())

# Rebuild the list from scratch so that re-running this cell does not append
# duplicates to entries left over from a previous run.
# (A multiprocessing variant, pool.map(process_major, majors), was tried
# earlier; process_major already passes n_process=2 to spaCy.)
result = [process_major(major) for major in majors]

In [21]:
# Persist every processed major as a single JSON document.
with open("major_subjects_keyword.json", "w") as out_file:
    out_file.write(json.dumps(result))