In [1]:
import os
import json
import unicodedata
import re

In [2]:
# Load every *.json file found under ./TrainingSet (searched recursively) and
# tag each parsed record with the name of the file it came from.
json_data = []
for path, dirs, files in os.walk("./TrainingSet"):
    for file in files:
        if not file.endswith(".json"):
            continue
        # "utf-8-sig" transparently drops a leading BOM; the context manager
        # closes the handle even when json.load raises on a malformed file.
        with open(os.path.join(path, file), encoding="utf-8-sig") as f:
            record = json.load(f)
        record["FileName"] = file
        json_data.append(record)

In [3]:
# Per-character rewrite table keyed by Unicode code point; consumed by clean_text.
#   int value  -> code point of the replacement character (e.g. 32 is a space)
#   -1         -> the character is dropped
#   dict value -> used for combining marks: maps the code point of the
#                 preceding character to the code point that replaces it
CHAR_CONVERSION = {9:32,10:32,33:32,35:32,38:32,40:32,41:32,42:32,43:32,47:45,58:32,59:32,60:-1,61:32,62:-1,63:32,64:32,91:-1,93:-1,94:39,95:45,96:39,123:-1,124:32,125:-1,160:32,167:32,170:-1,171:32,176:-1,180:39,182:-1,184:-1,187:32,223:-1,224:97,225:97,226:97,228:97,229:97,230:101,232:101,233:101,234:101,235:101,236:105,237:105,238:105,239:105,241:110,242:111,243:111,244:111,245:111,248:111,249:117,250:117,251:117,257:97,261:97,263:99,269:99,279:101,299:105,322:108,324:110,353:115,363:117,369:252,382:122,523:105,537:351,601:101,699:39,700:39,703:39,706:-1,714:39,727:-1,774:{103: 287},775:{105: 105},8201:32,8208:45,8211:45,8212:45,8216:39,8217:39,8220:-1,8221:-1,8232:32, }
# Literal substring replacements that clean_text applies, in insertion order,
# BEFORE lower-casing the text (so "I" and "İ" are mapped here rather than by
# str.lower()).
STRING_CONVERSION = {"$": "dolar","…": " ","ﬀ": "ff","ﬁ": "fi","ﬂ ": "fl","ﬂ": "fl","ﬄ": "ffl","n°": "no","N°": "no","Anahtar Kelimeler:": "","I": "ı","İ": "i","⅔": "%67","⅕": "%20","⅘": "%80","⅚": "%83","¼": "%25","½": "%50","¾": "%75",}
# Regex pattern -> replacement, applied in insertion order with re.sub.
# clean_text applies these when kw=True; clean_list applies them to the joined
# text when kw=False.
# NOTE(review): the class [ı,i] in the first pattern also matches a literal
# comma — confirm whether [ıi] was intended.
REGEX_CONVERSION = {r"cov[ı,i]d-*\s*19": "covid19",r"(\smd?\.?\s{0,2})((\d)+)": r" madde \2",r"<([a-z]+)(?![^>]*\/>)[^>]*>": "",r"\.": " ",r"-": " ",r"'": " ",r"(\s(%)(\s){0,2})((\d)*)": r" yüzde \4", r"%": "", r"hükumet": "hükümet", r"ışletme": "işletme", r"prepayment": "önödeme", r"arabulucui": "arabulucu"}
# Regex pattern -> replacement that expands abbreviations. Every pattern
# requires start-of-string or whitespace on the left and end-of-string,
# whitespace or "." on the right, and the replacement re-emits both delimiters.
# clean_text applies these after lower-casing, in insertion order.
ABBREVIATIONS = {r"(^|\s)(smk)($|\s|\.)":r"\1sınai mülkiyet kanunu\3", r"(^|\s)(t?mk)($|\s|\.)": r"\1türk medeni kanunu\3", r"(^|\s)(t?ck)($|\s|\.)": r"\1türk ceza kanunu\3", r"(^|\s)(t?bk)($|\s|\.)": r"\1türk borçlar kanunu\3", r"(^|\s)(t?tk)($|\s|\.)": r"\1türk ticaret kanunu\3", r"(^|\s)(t?vk)($|\s|\.)": r"\1türk vergi kanunu\3", r"(^|\s)(a(b|t)ad)($|\s|\.)": r"\1avrupa birliği adalet divanı\4", r"(^|\s)(abd)($|\s|\.)": r"\1amerika birleşik devletleri\3", r"(^|\s)(ab)($|\s|\.)": r"\1avrupa birliği\3", r"(^|\s)(bm)($|\s|\.)": r"\1birleşmiş milletler\3", r"(^|\s)(a(i|ı)hm)($|\s|\.)": r"\1avrupa insan hakları mahkemesi\4", r"(^|\s)(a(i|ı)hs)($|\s|\.)": r"\1avrupa insan hakları sözleşmesi\4", r"(^|\s)(möhuk)($|\s|\.)": r"\1milletlerarası özel hukuk ve usul hukuku\3", r"(^|\s)(fsek)($|\s|\.)": r"\1fikir ve sanat eserleri kanunu\3", r"(^|\s)(kktc)($|\s|\.)": r"\1kuzey kıbrıs türk cumhuriyeti\3", r"(^|\s)([ey]?tkhk)($|\s|\.)": r"\1tüketicinin korunması hakkında kanun\3", r"(^|\s)(khk)($|\s|\.)": r"\1kanun hükmünde kararname\3"}

In [4]:
# Read the stop-word file and split its contents on newlines.
# The context manager guarantees the handle is closed even if read() fails.
# NOTE(review): no encoding is passed, so the platform default applies —
# confirm it matches how stopwords2.txt was saved.
with open('./stopwords2.txt') as sw_file:
    stop_words = sw_file.read().split("\n")

In [5]:
def clean_text(string, kw=False):
    """Normalise one piece of text using the module-level conversion tables.

    Steps, in order:
      1. Apply every literal replacement in STRING_CONVERSION.
      2. Lower-case the text.
      3. Rewrite character by character:
         - a combining mark is merged into the previously kept character via
           the nested dict in CHAR_CONVERSION; if no such mapping applies the
           mark is dropped;
         - a character listed in CHAR_CONVERSION is replaced, or dropped when
           its entry is -1;
         - any other character whose code point is >= 942 is dropped;
         - everything else is kept unchanged.
      4. Collapse runs of spaces and apply the ABBREVIATIONS regexes.
      5. When ``kw`` is true, also apply the REGEX_CONVERSION regexes.

    Args:
        string: Text to normalise.
        kw: Whether REGEX_CONVERSION should be applied here.

    Returns:
        The normalised text with leading/trailing whitespace stripped.
    """
    for old, new in STRING_CONVERSION.items():
        string = string.replace(old, new)
    string = string.lower()

    result = []
    for char in string:
        code = ord(char)
        if unicodedata.combining(char):
            try:
                result[-1] = chr(CHAR_CONVERSION[code][ord(result[-1])])
            except (KeyError, IndexError, TypeError):
                # KeyError: no mapping for this mark / base character.
                # IndexError: the mark is the first kept character.
                # TypeError: the table entry is not a nested mapping.
                # In every case the combining mark is simply dropped.
                continue
        elif code in CHAR_CONVERSION:
            replacement = CHAR_CONVERSION[code]
            if replacement != -1:
                result.append(chr(replacement))
        elif code >= 942:
            # Drops Greek and Cyrillic letters — and every other unmapped
            # character from this code point upwards.
            continue
        else:
            result.append(char)

    result_str = ''.join(result)
    # Collapse consecutive spaces into one.
    result_str = ' '.join([w for w in result_str.split(" ") if w != ""])
    for pattern, replacement in ABBREVIATIONS.items():
        result_str = re.sub(pattern, replacement, result_str)
    if kw:
        for pattern, replacement in REGEX_CONVERSION.items():
            result_str = re.sub(pattern, replacement, result_str)
    return result_str.strip()

In [6]:
def clean_list(li, kw=False):
    """Clean a list of raw strings into normalised tokens or keywords.

    Each item is first split on "," when it contains one, otherwise on ";"
    when it contains one, otherwise kept whole. Every non-empty piece is then
    passed through ``clean_text``.

    * ``kw=False``: the cleaned pieces are joined with spaces, the
      REGEX_CONVERSION substitutions are applied to the joined text, and the
      result is split on spaces; empty strings and entries of ``stop_words``
      are removed.
    * ``kw=True``: the cleaned pieces are de-duplicated, keeping the first
      occurrence of each and the original order.

    Args:
        li: Iterable of raw strings.
        kw: Whether the items are keywords (see above).

    Returns:
        List of non-empty cleaned strings.
    """
    pieces = []
    for item in li:
        # "," takes precedence; ";" is only tried when no comma is present.
        for separator in (",", ";"):
            parts = item.split(separator)
            if len(parts) > 1:
                pieces.extend(part for part in parts if part != "")
                break
        else:
            pieces.append(item)

    cleaned = [clean_text(text, kw) for text in pieces if text != ""]
    if kw:
        # dict.fromkeys removes duplicates while preserving first-seen order.
        cleaned = list(dict.fromkeys(cleaned))
    else:
        joined = ' '.join(cleaned)
        for pattern, replacement in REGEX_CONVERSION.items():
            joined = re.sub(pattern, replacement, joined)
        # A set gives constant-time stop-word checks instead of scanning the
        # stop_words list once per token.
        stop_set = set(stop_words)
        cleaned = [part for part in joined.split(" ")
                   if part != "" and part not in stop_set]
    return [text for text in cleaned if text != ""]

In [7]:
# Remove e-mail addresses, URLs and opening HTML tags from each document's
# "Metin" field, in that order, overwriting the field in place.
# NOTE(review): the tag pattern requires a letter right after "<", so closing
# tags such as "</p>" are not matched — confirm that is intended.
for data_obj in json_data:
    text = data_obj["Metin"]
    for pattern in (
        r"[-\w\.]+@([-\w]+\.)+[-\w]{2,4}",
        r"https?:\/\/(?:www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b(?:[-a-zA-Z0-9()@:%_\+.~#?&\/=]*)",
        r"<([a-z]+)(?![^>]*\/>)[^>]*>",
    ):
        text = re.sub(pattern, "", text)
    data_obj["Metin"] = text

In [8]:
# One (file name, token count, tokens) triple per document; the tokens come
# from splitting "Metin" on spaces and cleaning the pieces with clean_list.
words = []
for data_obj in json_data:
    tokens = clean_list(data_obj["Metin"].split(" "))
    words.append((data_obj["FileName"], len(tokens), tokens))

In [9]:
# One (file name, keyword count, keywords) triple per document; keywords are
# cleaned in keyword mode (kw=True).
keywords = []
for data_obj in json_data:
    cleaned_kws = clean_list(data_obj["Anahtar Kelimeler"], True)
    keywords.append((data_obj["FileName"], len(cleaned_kws), cleaned_kws))

# Flatten to (item, file name) pairs, one pair per token / keyword occurrence.
w_flat = [(token, fname) for fname, _, tokens in words for token in tokens]
kw_flat = [(kw, fname) for fname, _, kws in keywords for kw in kws]

def unique_chars_and_words(list_flat):
    """Summarise the distinct items and distinct characters of a flat list.

    Args:
        list_flat: Iterable of pairs ``(item, source)`` where ``item`` is a
            string and ``source`` identifies where it occurred.

    Returns:
        A tuple ``(list_chars, list_unique)``:

        * ``list_chars``: ``[char, ord(char), first_item]`` for every distinct
          character, in first-seen order; ``first_item`` is the item in which
          the character first appeared.
        * ``list_unique``: ``[item, occurrence_count, sources]`` for every
          distinct item, in first-seen order; ``sources`` lists the source of
          each occurrence in encounter order.
    """
    # Dicts keep insertion order, so they provide first-seen ordering together
    # with constant-time membership checks (the list-based version rescanned
    # the whole list for every item and every character).
    sources_by_item = {}
    first_item_by_char = {}
    for entry in list_flat:
        item, source = entry[0], entry[1]
        sources_by_item.setdefault(item, []).append(source)
        for char in item:
            if char not in first_item_by_char:
                first_item_by_char[char] = item
    list_chars = [[char, ord(char), item]
                  for char, item in first_item_by_char.items()]
    list_unique = [[item, len(sources), sources]
                   for item, sources in sources_by_item.items()]
    return (list_chars, list_unique)

# Distinct characters and distinct entries (with occurrence counts and source
# files) for the keyword list and for the document-token list respectively.
kw_chars, kw_unique = unique_chars_and_words(kw_flat)
w_chars, w_unique = unique_chars_and_words(w_flat)

In [10]:
# X: token list per document; y: cleaned keyword list per document.
# Both follow the order of json_data, so X[i] and y[i] describe the same file.
X = [tokens for _, _, tokens in words]
y = [kws for _, _, kws in keywords]

In [11]:
# Every individual word of every keyword (multi-word keywords are split on spaces).
allkws = [w for row in y for kw in row for w in kw.split(" ")]
# Every token of every document.
allws = [w for row in X for w in row]
# Keyword words that never occur in any document. Membership is tested against
# a set so each lookup is constant-time instead of a scan over all tokens.
allws_set = set(allws)
nomatchkws = [kw for kw in allkws if kw not in allws_set]

In [12]:
import gensim



In [13]:
# Embedding dimensionality: passed to Word2Vec as vector_size and used as the
# width of the per-document feature matrix.
DIM_SIZE = 300

In [14]:
# Train Word2Vec on the tokenised documents.
# NOTE(review): min_count=1 presumably keeps every token in the vocabulary so
# that later per-token vector lookups cannot miss — confirm.
word2vec_model = gensim.models.Word2Vec(
    X,
    min_count=1,
    vector_size=DIM_SIZE,
    window=10,
)

In [16]:
# Number of entries in the trained model's keyed vectors.
len(word2vec_model.wv)

242458

In [15]:
# Sanity check: look up the vector for "hukuku" and list the entries most
# similar to that vector.
asd = word2vec_model.wv["hukuku"]
word2vec_model.wv.most_similar(positive=[asd])

[('hukuku', 1.0000001192092896),
 ('hukukunun', 0.7701903581619263),
 ('hukukuna', 0.7288190126419067),
 ('geçirilmiş', 0.632036566734314),
 ('hukukundaki', 0.6297780275344849),
 ('hukukuyla', 0.6232560873031616),
 ('hukukunda', 0.6179437637329102),
 ('hukukunu', 0.6174837946891785),
 ('vazedilmeye', 0.616786777973175),
 ('hukukundan', 0.6048429608345032)]

In [16]:
import numpy as np

In [17]:
# Represent each document as the mean of its tokens' Word2Vec vectors.
X_np = np.zeros((len(X), DIM_SIZE))
for i, row in enumerate(X):
    if not row:
        # No tokens survived cleaning: keep the zero vector. Dividing by
        # len(row) == 0 would turn the whole row into NaN.
        continue
    for word in row:
        X_np[i] += word2vec_model.wv[word]
    X_np[i] /= len(row)

# Extend the existing model vocabulary with the token "noword".
# NOTE(review): presumably a placeholder for out-of-vocabulary words — confirm.
word2vec_model.build_vocab([["noword"]], update=True)

In [18]:
from sklearn.preprocessing import MultiLabelBinarizer

In [19]:
# Encode the per-document keyword lists as a binary indicator matrix
# (one column per distinct keyword). y_mlb is kept to decode predictions later.
y_mlb = MultiLabelBinarizer()
y_binary = y_mlb.fit_transform(y)

In [22]:
# Binary bag-of-words encoding of the documents (one column per distinct token).
# NOTE(review): X_binary is not referenced by any later cell shown here; the
# model below is trained on X_np instead.
X_mlb = MultiLabelBinarizer()
X_binary = X_mlb.fit_transform(X)

In [29]:
from sklearn.model_selection import train_test_split
# Hold out 5% of the documents for evaluation. random_state is fixed (to the
# same seed the classifier uses) so the split is identical on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X_np, y_binary, test_size=0.05, random_state=1
)

In [30]:
from sklearn.preprocessing import MinMaxScaler
# Scale each feature using statistics fitted on the training split only; the
# test split is transformed with the same fitted scaler.
min_max_scaler = MinMaxScaler()
X_train = min_max_scaler.fit_transform(X_train)
X_test = min_max_scaler.transform(X_test)

In [31]:
from sklearn.neural_network import MLPClassifier
# Multi-label MLP with a single hidden layer of 15 units.
# hidden_layer_sizes is written as a one-element tuple: "(15)" is just the
# integer 15, which hides the intent of "one layer of 15 units".
clf = MLPClassifier(
    solver='lbfgs',
    activation='logistic',
    max_iter=2000,
    # NOTE(review): per the scikit-learn docs, learning_rate is only used when
    # solver='sgd', so it has no effect with lbfgs.
    learning_rate='adaptive',
    hidden_layer_sizes=(15,),
    random_state=1,
)
clf.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


In [24]:
# Predict the keyword indicator rows for the held-out documents.
y_pred = clf.predict(X_test)

In [25]:
from sklearn.metrics import accuracy_score
# NOTE(review): for multilabel indicator targets, scikit-learn documents
# accuracy_score as subset accuracy — a sample counts only if its whole label
# set matches exactly — so this is a very strict metric here.
accuracy_score(y_test, y_pred)


0.0

In [127]:
# Print "yes" once for every positive entry in the prediction matrix.
for value in (entry for row in y_pred for entry in row):
    if value > 0:
        print("yes")

In [26]:
# Decode the first predicted indicator row back into keyword labels.
y_mlb.inverse_transform(y_pred[0:1,:])

[()]

In [27]:
# First prediction row, kept two-dimensional by the slice and then transposed.
a = y_pred[0:1,:].T

In [None]:
import torch
from torch import nn
from torch.utils.data import DataLoader