In [16]:
import os
import json
import unicodedata
import math
import re
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Name of the directory, relative to the current working directory, that is
# walked recursively for the ".json" input documents.
FOLDER_NAME = "TrainingSet"

In [17]:
# Load every ".json" file found anywhere under ./FOLDER_NAME into `json_data`.
# Each parsed object additionally gets a "FileName" key holding the bare name
# (no directory part) of the file it was read from.
json_data = []
for path, dirs, files in os.walk(f"./{FOLDER_NAME}"):
    for file in files:
        if not file.endswith(".json"):
            continue
        # "utf-8-sig" transparently drops a leading byte-order mark if present.
        # The context manager closes the handle even when json.load() raises.
        with open(os.path.join(path, file), encoding="utf-8-sig") as f:
            document = json.load(f)
        document["FileName"] = file
        json_data.append(document)

In [18]:
# Per-character normalisation table used by clean_text(), keyed by Unicode
# code point (as returned by ord()). clean_text() lowercases the text before
# consulting this table, so only lowercase letters need an entry.
#
# Value conventions, as consumed by clean_text():
#   * positive int -> code point of the replacement character (32 is a space,
#                     written "\s" in the comments below; 39 is ', 45 is -)
#   * -1           -> the character is dropped
#   * dict         -> only for combining marks: maps the code point of the
#                     preceding character to the code point that replaces it
CHAR_CONVERSION = {
    9:32,           # \t -> \s
    10:32,          # \n -> \s
    33:32,          # ! -> \s
    35:32,          # # -> \s
    38:32,          # & -> \s
    40:32,          # ( -> \s
    41:32,          # ) -> \s
    42:32,          # * -> \s
    43:32,          # + -> \s
    47:45,          # / -> -
    58:32,          # : -> \s
    59:32,          # ; -> \s
    60:-1,          # < ->
    61:32,          # = -> \s
    62:-1,          # > ->
    63:32,          # ? -> \s
    64:32,          # @ -> \s
    91:-1,          # [ ->
    93:-1,          # ] ->
    94:39,          # ^ -> '
    95:45,          # _ -> -
    96:39,          # ` -> '
    123:-1,         # { ->
    124:32,         # | -> \s
    125:-1,         # } ->
    160:32,         # no-break space (U+00A0) -> \s
    167:32,         # § -> \s
    170:-1,         # ª ->
    171:32,         # « -> \s
    176:-1,         # ° ->
    180:39,         # ´ -> '
    182:-1,         # ¶ ->
    184:-1,         # ¸ ->
    187:32,         # » -> \s
    223:-1,         # ß ->
    224:97,         # à -> a
    225:97,         # á -> a
    226:97,         # â -> a
    228:97,         # ä -> a
    229:97,         # å -> a
    230:101,        # æ -> e
    232:101,        # è -> e
    233:101,        # é -> e
    234:101,        # ê -> e
    235:101,        # ë -> e
    236:105,        # ì -> i
    237:105,        # í -> i
    238:105,        # î -> i
    239:105,        # ï -> i
    241:110,        # ñ -> n
    242:111,        # ò -> o
    243:111,        # ó -> o
    244:111,        # ô -> o
    245:111,        # õ -> o
    248:111,        # ø -> o
    249:117,        # ù -> u
    250:117,        # ú -> u
    251:117,        # û -> u
    257:97,         # ā -> a
    261:97,         # ą -> a
    263:99,         # ć -> c
    269:99,         # č -> c
    279:101,        # ė -> e
    299:105,        # ī -> i
    322:108,        # ł -> l
    324:110,        # ń -> n
    353:115,        # š -> s
    363:117,        # ū -> u
    369:252,        # ű -> ü
    382:122,        # ž -> z
    523:105,        # ȋ -> i
    537:351,        # ș (comma below) -> ş (cedilla)
    601:101,        # ə -> e
    699:39,         # ʻ -> '
    700:39,         # ʼ -> '
    703:39,         # ʿ -> '
    706:-1,         # ˂ ->
    714:39,         # ˊ -> '
    727:-1,         # ˗ ->
    774:{103: 287}, # g + U+0306 (combining breve) -> ğ
    775:{105: 105}, # i + U+0307 (combining dot above) -> i
    8201:32,        # thin space (U+2009) -> \s
    8208:45,        # ‐ -> -
    8211:45,        # – -> -
    8212:45,        # — -> -
    8216:39,        # ‘ -> '
    8217:39,        # ’ -> '
    8220:-1,        # “ ->
    8221:-1,        # ” ->
    8232:32,        # line separator (U+2028) -> \s

}

In [19]:
# Literal, case-sensitive substring replacements applied by clean_text()
# BEFORE the text is lowercased.
#
# ORDER MATTERS: clean_text() applies the entries one after another in dict
# (insertion) order, each on the result of the previous one. A key must
# therefore appear before any other key that would rewrite part of it:
#   * "ﬂ " (ligature + space) precedes the bare "ﬂ" ligature;
#   * "VII." precedes "I" - otherwise "I" -> "ı" turns "VII." into "Vıı."
#     first and the roman-numeral rule can never match.
STRING_CONVERSION = {
    "$": "dolar",
    "…": " ",
    "ﬀ": "ff",
    "ﬁ": "fi",
    "ﬂ ": "fl",
    "ﬂ": "fl",
    "ﬄ": "ffl",
    "n°": "no ",
    "N°": "no ",
    "Anahtar Kelimeler:": "",
    "SMK": "sınai mülkiyet kanunu",
    # Must run before the "I" rule below (see ORDER MATTERS above).
    "VII.": "7.",
    # Turkish casing: dotless capital I lowers to ı, dotted capital İ lowers
    # to i. Done here because str.lower() does not follow Turkish rules.
    "I": "ı",
    "İ": "i",
    # Vulgar fractions written as (rounded) percentages.
    "⅔": "%67",
    "⅕": "%20",
    "⅘": "%80",
    "⅚": "%83",
    "¼": "%25",
    "½": "%50",
    "¾": "%75",
}

In [20]:
# Regular-expression substitutions applied by clean_text() to the already
# lowercased, character-normalised text. They run in dict (insertion) order,
# each on the result of the previous one, so the abbreviation rules must stay
# ahead of the rules that blank out "." and "-".
REGEX_CONVERSION = {
    # Spelling variants of "covid-19"; "ı" is accepted because the
    # capital-I -> ı replacement has already been applied to "COVID".
    # (The class used to be "[ı,i]", which also matched a literal comma.)
    r"cov[ıi]d-*\s*19": "covid-19",
    # "tmk" / "mk" followed by whitespace or a dot. \b keeps the rule from
    # firing on the tail of a longer word that merely ends in "mk".
    r"\bt?mk[\s\.]": "türk medeni kanunu ",
    # "m.5", "m. 5", "m 5", "m5" -> "madde 5". \b keeps the rule from firing on
    # words that merely end in "m" (e.g. "dönem 2" used to become "dönemadde 2").
    r"\b(m\.?\s?)((\d)+)": r"madde \2",
    # Opening (not self-closed) lowercase HTML-like tags.
    r"<([a-z]+)(?![^>]*\/>)[^>]*>": "",
    r"\.": " ",
    r"-": " ",
    r"'": "",
    r"%": " yüzde ",
}

In [21]:
def clean_text(string):
    """Normalise one raw text fragment.

    Processing order:
      1. Apply every STRING_CONVERSION literal replacement (case-sensitive,
         in dict order).
      2. Lowercase the text.
      3. Map the text character by character through CHAR_CONVERSION:
         * a combining mark is merged into the previously kept character when
           the table holds a rule for that (mark, character) pair, otherwise
           the mark is silently dropped;
         * a mapped character is replaced, or dropped when mapped to -1;
         * any unmapped character with code point >= 942 is dropped;
         * everything else is kept unchanged.
      4. Collapse runs of spaces and strip leading/trailing ".", "'", "&", "-".
      5. Apply every REGEX_CONVERSION substitution (in dict order).
      6. Collapse runs of spaces again (the substitutions in step 5 insert
         spaces) and strip surrounding whitespace.

    Args:
        string: the text to clean (a ``str``).

    Returns:
        The cleaned text; words are separated by single spaces.
    """
    for key, replacement in STRING_CONVERSION.items():
        string = string.replace(key, replacement)
    string = string.lower()

    result = []
    for char in string:
        code = ord(char)
        if unicodedata.combining(char):
            # Explicit lookups replace the former bare `except:`. The mark is
            # dropped when there is no previous character, no table entry for
            # the mark, or no rule for this particular base character.
            rule = CHAR_CONVERSION.get(code)
            if result and isinstance(rule, dict):
                merged = rule.get(ord(result[-1]))
                if merged is not None:
                    result[-1] = chr(merged)
            continue
        if code in CHAR_CONVERSION:
            mapped = CHAR_CONVERSION[code]
            if mapped != -1:        # -1 marks "delete this character"
                result.append(chr(mapped))
        elif code >= 942:
            # Unmapped characters from U+03AE upwards (the original comment
            # names the Greek and Cyrillic alphabets) are discarded.
            continue
        else:
            result.append(char)

    result_str = ' '.join(w for w in ''.join(result).split(" ") if w != "")
    # Same effect as the former startswith()/endswith() while-loops.
    result_str = result_str.strip(".'&-")

    for pattern, replacement in REGEX_CONVERSION.items():
        result_str = re.sub(pattern, replacement, result_str)

    # The regex step replaces ".", "-" and "%" with spaces, so normalise the
    # spacing once more before returning.
    return ' '.join(w for w in result_str.split(" ") if w != "").strip()

In [22]:
def clean_list(li):
    """Clean a list of text items and return the resulting flat list of words.

    Each item is first broken into fragments: on "," if it contains a comma,
    otherwise on ";" if it contains a semicolon, otherwise it is kept whole
    (empty fragments are discarded). Every fragment goes through clean_text(),
    the cleaned fragments are joined with single spaces, and the joined text
    is split on spaces again. Duplicates are kept and order is preserved.

    Args:
        li: iterable of strings.

    Returns:
        List of non-empty, cleaned words.
    """
    fragments = []
    for item in li:
        # A comma takes precedence: an item containing both separators is
        # split on commas only.
        if "," in item:
            separator = ","
        elif ";" in item:
            separator = ";"
        else:
            fragments.append(item)
            continue
        fragments.extend(piece for piece in item.split(separator) if piece != "")

    joined = ' '.join(map(clean_text, fragments))
    return [token for token in joined.split(" ") if token != ""]

In [23]:
def unique_chars_and_words(list_flat):
    """Summarise the distinct words and distinct characters of a flat word list.

    Args:
        list_flat: iterable of sequences whose element 0 is a word (``str``)
            and whose element 1 is the source that word came from.

    Returns:
        A tuple ``(list_chars, list_unique)``:
          * ``list_chars``  - one ``[char, ord(char), word]`` entry per distinct
            character, in first-seen order; ``word`` is the first word in
            which the character occurred.
          * ``list_unique`` - one ``[word, count, sources]`` entry per distinct
            word, in first-seen order; ``sources`` lists element 1 of every
            occurrence (duplicates included) and ``count == len(sources)``.
    """
    # Dicts keep insertion order, so they give the same first-seen ordering
    # as the former lists while making every membership test O(1) instead of
    # O(n).
    sources_by_word = {}
    first_word_by_char = {}
    for entry in list_flat:
        word, source = entry[0], entry[1]
        sources_by_word.setdefault(word, []).append(source)
        for char in word:
            # setdefault only records the first word a character appears in.
            first_word_by_char.setdefault(char, word)

    list_chars = [[char, ord(char), word] for char, word in first_word_by_char.items()]
    list_unique = [[word, len(sources), sources] for word, sources in sources_by_word.items()]
    return (list_chars, list_unique)

In [24]:
# Pre-processing of the raw document text stored under "Metin": e-mail
# addresses, http(s) URLs and opening (not self-closed) lowercase HTML tags
# are removed, in that order. The text is overwritten in place on each object.
_PRE_CLEANING_PATTERNS = (
    r"[-\w\.]+@([-\w]+\.)+[-\w]{2,4}",
    r"https?:\/\/(?:www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b(?:[-a-zA-Z0-9()@:%_\+.~#?&\/=]*)",
    r"<([a-z]+)(?![^>]*\/>)[^>]*>",
)

for data_obj in json_data:
    for _pattern in _PRE_CLEANING_PATTERNS:
        data_obj["Metin"] = re.sub(_pattern, "", data_obj["Metin"])

In [25]:
# One (file name, cleaned word list) pair per loaded document. The text is
# split on single spaces before being handed to clean_list().
words = []
for data_obj in json_data:
    file_name = data_obj["FileName"]
    tokens = clean_list(data_obj["Metin"].split(" "))
    words.append((file_name, tokens))

In [26]:
# Reshape each entry from (file name, word list) to
# (file name, word count, word list).
# The word list is addressed as w[-1] rather than w[1] so that re-running this
# cell is harmless: on an already reshaped entry w[1] is the integer count and
# len(w[1]) would raise TypeError, whereas w[-1] is the word list in both shapes.
words = [(w[0], len(w[-1]), w[-1]) for w in words]

In [27]:
# Flatten the per-document word lists into one list of (word, file name)
# pairs, keeping document order and word order within each document.
w_flat = []
for wvec in words:
    source_name = wvec[0]
    for w in wvec[2]:
        w_flat.append((w, source_name))

In [None]:
# Collect the distinct words of w_flat in first-seen order.
# Membership is tested against a set: the former `not in words_unique` list
# scan made this loop quadratic in the number of distinct words.
words_unique = []
_seen_words = set()
for i, w in enumerate(w_flat):
    if i%10000 == 0:
        print(i)            # coarse progress indicator
    if w[0] not in _seen_words:
        _seen_words.add(w[0])
        words_unique.append(w[0])

# Character inventory and per-word occurrence summary for the document text.
w_chars, w_unique = unique_chars_and_words(w_flat)

In [None]:
# Build, for every distinct word of w_flat (in first-seen order), an entry
# [word, number of occurrences, list of file names it occurred in].
# `list_dict` maps each word to that list of file names. Dicts keep insertion
# order and give O(1) lookups, replacing the former quadratic
# `not in words_unique` list scan.
list_dict = {}
for w in w_flat:
    list_dict.setdefault(w[0], []).append(w[1])
words_unique = [[word, len(sources), sources] for word, sources in list_dict.items()]

In [28]:
# Cleaned keyword words per document, their flattened (word, file name) pairs,
# and the character / distinct-word summaries of those pairs.
keywords = [(data_obj["FileName"], clean_list(data_obj["Anahtar Kelimeler"])) for data_obj in json_data]
kw_flat = [(kw, kwvec[0]) for kwvec in keywords for kw in kwvec[1]]
kw_chars, kw_unique = unique_chars_and_words(kw_flat)

# All distinct document words joined into one string. This does not depend on
# the keyword being checked, so it is built once instead of once per keyword.
s = ' '.join([w[0] for w in w_unique])

# Keywords with at least one part that does not occur in the document text.
# NOTE: `not in s` is a substring test on the joined string, so a part counts
# as matched when it occurs inside a longer word.
unmatched_kw = []
for kw in kw_unique:
    for kww in kw[0].split(" "):
        if kww not in s:
            unmatched_kw.append((kww, kw[0]))
            break   # one missing part is enough; record the keyword once

# Same (file name, cleaned word list) construction as `words`, restricted to
# one specific document - presumably kept for spot-checking the cleaning.
_selected_docs = [
    data_obj
    for data_obj in json_data
    if data_obj["FileName"]=="Hacettepe Üniversitesi 20.pdf.json"
]
words1 = [
    (data_obj["FileName"], clean_list(data_obj["Metin"].split(" ")))
    for data_obj in _selected_docs
]