In [1]:
import os
import json
import unicodedata
import pandas as pd
import numpy as np
import re
import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.model_selection import KFold
from sklearn import preprocessing
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier
from sklearn.decomposition import TruncatedSVD

In [2]:
# Recursively collect every *.json document found under ./TrainingSet.
# Each parsed object is stored in `json_data` and additionally records the
# bare name of the file it came from under the "FileName" key.
json_data = []
for path, dirs, files in os.walk("./TrainingSet"):
    for file in files:
        if not file.endswith(".json"):
            continue
        # "utf-8-sig" drops a leading byte-order mark when one is present.
        # The context manager closes the handle even if json.load raises.
        with open(os.path.join(path, file), encoding="utf-8-sig") as f:
            record = json.load(f)
        record["FileName"] = file
        json_data.append(record)

In [3]:
# Per-character normalisation table used by clean_text, keyed by Unicode code
# point (ord(char)). An int value is the replacement code point; -1 means the
# character is dropped. A dict value is used for combining marks: it maps the
# code point of the preceding character to the code point that replaces it.
CHAR_CONVERSION = {9:32,10:32,33:32,35:32,38:32,40:32,41:32,42:32,43:32,47:45,58:32,59:32,60:-1,61:32,62:-1,63:32,64:32,91:-1,93:-1,94:39,95:45,96:39,123:-1,124:32,125:-1,160:32,167:32,170:-1,171:32,176:-1,180:39,182:-1,184:-1,187:32,223:-1,224:97,225:97,226:97,228:97,229:97,230:101,232:101,233:101,234:101,235:101,236:105,237:105,238:105,239:105,241:110,242:111,243:111,244:111,245:111,248:111,249:117,250:117,251:117,257:97,261:97,263:99,269:99,279:101,299:105,322:108,324:110,353:115,363:117,369:252,382:122,523:105,537:351,601:101,699:39,700:39,703:39,706:-1,714:39,727:-1,774:{103: 287},775:{105: 105},8201:32,8208:45,8211:45,8212:45,8216:39,8217:39,8220:-1,8221:-1,8232:32, }
# Literal substring replacements that clean_text applies with str.replace, in
# dict order, BEFORE lower-casing (which is why upper-case keys such as "N°",
# "I" and "İ" appear here).
STRING_CONVERSION = {"$": "dolar","…": " ","ﬀ": "ff","ﬁ": "fi","ﬂ ": "fl","ﬂ": "fl","ﬄ": "ffl","n°": "no","N°": "no","Anahtar Kelimeler:": "","I": "ı","İ": "i","⅔": "%67","⅕": "%20","⅘": "%80","⅚": "%83","¼": "%25","½": "%50","¾": "%75",}
# Regex pattern -> replacement pairs applied with re.sub, in dict order, to the
# already normalised text (inside clean_text for keywords, inside clean_list on
# the joined text otherwise).
# NOTE(review): the character class `[ı,i]` in the first pattern also matches a
# literal comma — confirm whether `[ıi]` was intended.
REGEX_CONVERSION = {r"cov[ı,i]d-*\s*19": "covid19",r"(\smd?\.?\s{0,2})((\d)+)": r" madde \2",r"<([a-z]+)(?![^>]*\/>)[^>]*>": "",r"\.": " ",r"-": " ",r"'": " ",r"(\s(%)(\s){0,2})((\d)*)": r" yüzde \4", r"%": "", r"hükumet": "hükümet", r"ışletme": "işletme", r"prepayment": "önödeme", r"arabulucui": "arabulucu"}
# Regex pattern -> expansion pairs for abbreviations. Every pattern requires the
# abbreviation to be delimited by start/end of string, whitespace or a trailing
# dot, and the replacement re-inserts those captured delimiters.
ABBREVIATIONS = {r"(^|\s)(smk)($|\s|\.)":r"\1sınai mülkiyet kanunu\3", r"(^|\s)(t?mk)($|\s|\.)": r"\1türk medeni kanunu\3", r"(^|\s)(t?ck)($|\s|\.)": r"\1türk ceza kanunu\3", r"(^|\s)(t?bk)($|\s|\.)": r"\1türk borçlar kanunu\3", r"(^|\s)(t?tk)($|\s|\.)": r"\1türk ticaret kanunu\3", r"(^|\s)(t?vk)($|\s|\.)": r"\1türk vergi kanunu\3", r"(^|\s)(a(b|t)ad)($|\s|\.)": r"\1avrupa birliği adalet divanı\4", r"(^|\s)(abd)($|\s|\.)": r"\1amerika birleşik devletleri\3", r"(^|\s)(ab)($|\s|\.)": r"\1avrupa birliği\3", r"(^|\s)(bm)($|\s|\.)": r"\1birleşmiş milletler\3", r"(^|\s)(a(i|ı)hm)($|\s|\.)": r"\1avrupa insan hakları mahkemesi\4", r"(^|\s)(a(i|ı)hs)($|\s|\.)": r"\1avrupa insan hakları sözleşmesi\4", r"(^|\s)(möhuk)($|\s|\.)": r"\1milletlerarası özel hukuk ve usul hukuku\3", r"(^|\s)(fsek)($|\s|\.)": r"\1fikir ve sanat eserleri kanunu\3", r"(^|\s)(kktc)($|\s|\.)": r"\1kuzey kıbrıs türk cumhuriyeti\3", r"(^|\s)([ey]?tkhk)($|\s|\.)": r"\1tüketicinin korunması hakkında kanun\3", r"(^|\s)(khk)($|\s|\.)": r"\1kanun hükmünde kararname\3"}

In [4]:
# Load the stop-word list: one entry per "\n"-separated line of the file.
# The context manager guarantees the handle is closed even if read() fails.
# NOTE(review): no encoding is passed, so the platform default is used —
# confirm the file's encoding and pass it explicitly if it is not the default.
with open('./stopwords2.txt') as sw_file:
    stop_words = sw_file.read().split("\n")

In [5]:
def clean_text(string, kw=False):
    """Normalise a text fragment using the module-level conversion tables.

    Steps, in order:
      1. apply every literal replacement in STRING_CONVERSION;
      2. lower-case the text;
      3. rewrite it character by character via CHAR_CONVERSION;
      4. collapse runs of spaces into a single space;
      5. expand the abbreviations listed in ABBREVIATIONS;
      6. if ``kw`` is true, additionally apply REGEX_CONVERSION.

    Args:
        string: The text to clean.
        kw: When true, the REGEX_CONVERSION substitutions are applied here.

    Returns:
        The cleaned text with leading/trailing whitespace stripped.
    """
    for old, new in STRING_CONVERSION.items():
        string = string.replace(old, new)
    string = string.lower()

    result = []
    for char in string:
        code = ord(char)
        if unicodedata.combining(char):
            # Merge a combining mark into the character before it when the
            # table has an entry for that (mark, base) pair; otherwise the
            # mark is dropped. Only the lookup failures are swallowed:
            #   IndexError - there is no preceding character,
            #   KeyError   - mark or base character is not in the table,
            #   TypeError  - the table entry for the mark is not a mapping.
            try:
                result[-1] = chr(CHAR_CONVERSION[code][ord(result[-1])])
            except (IndexError, KeyError, TypeError):
                continue
        elif code in CHAR_CONVERSION:
            # -1 marks characters that are removed outright.
            if CHAR_CONVERSION[code] != -1:
                result.append(chr(CHAR_CONVERSION[code]))
        elif code >= 942:
            # Unmapped characters at or above code point 942 are discarded.
            continue
        else:
            result.append(char)

    result_str = ''.join(result)
    # Split on the space character only (not on arbitrary whitespace) and
    # re-join, which removes empty pieces caused by repeated spaces.
    result_str = ' '.join([w for w in result_str.split(" ") if w != ""])
    for pattern, expansion in ABBREVIATIONS.items():
        result_str = re.sub(pattern, expansion, result_str)
    if kw:
        for pattern, replacement in REGEX_CONVERSION.items():
            result_str = re.sub(pattern, replacement, result_str)
    return result_str.strip()

In [6]:
def clean_list(li, kw=False):
    """Clean a list of raw strings into a list of normalised tokens/keywords.

    Each item is first split on "," (or, if it contains no comma, on ";"),
    empty pieces are discarded and every remaining piece goes through
    ``clean_text``.

    * ``kw`` false: the cleaned pieces are joined with spaces, the
      REGEX_CONVERSION substitutions are applied to the joined text, the text
      is split on spaces again and entries found in ``stop_words`` are removed.
    * ``kw`` true: the cleaned pieces are kept as whole phrases and
      de-duplicated while preserving first-seen order.

    Args:
        li: Iterable of strings to clean.
        kw: Treat the items as keywords (see above).

    Returns:
        A list of non-empty cleaned strings.
    """
    pieces = []
    for item in li:
        if "," in item:
            pieces.extend(part for part in item.split(",") if part != "")
        elif ";" in item:
            pieces.extend(part for part in item.split(";") if part != "")
        else:
            pieces.append(item)

    cleaned = [clean_text(text, kw) for text in pieces if text != ""]

    if kw:
        # dict.fromkeys removes duplicates but keeps insertion order.
        cleaned = list(dict.fromkeys(cleaned))
    else:
        merged = ' '.join(cleaned)
        for pattern, replacement in REGEX_CONVERSION.items():
            merged = re.sub(pattern, replacement, merged)
        # A set makes each stop-word test O(1) instead of a scan of the list.
        stop_set = set(stop_words)
        cleaned = [
            token for token in merged.split(" ")
            if token != "" and token not in stop_set
        ]

    return [text for text in cleaned if text != ""]

In [7]:
# Remove, in this order, e-mail addresses, http(s) URLs and opening
# (non self-closing) lower-case markup tags from every abstract ("Özet").
# The cleaned text is written back onto the same record.
for data_obj in json_data:
    ozet = data_obj["Özet"]
    for noise_pattern in (
        r"[-\w\.]+@([-\w]+\.)+[-\w]{2,4}",
        r"https?:\/\/(?:www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b(?:[-a-zA-Z0-9()@:%_\+.~#?&\/=]*)",
        r"<([a-z]+)(?![^>]*\/>)[^>]*>",
    ):
        ozet = re.sub(noise_pattern, "", ozet)
    data_obj["Özet"] = ozet

In [8]:
# One (file name, token count, token list) triple per document, built from
# the space-split abstract after cleaning.
words = []
for data_obj in json_data:
    source_name = data_obj["FileName"]
    abstract_tokens = clean_list(data_obj["Özet"].split(" "))
    words.append((source_name, len(abstract_tokens), abstract_tokens))

In [9]:
# One (file name, keyword count, keyword list) triple per document, built
# from the "Anahtar Kelimeler" field cleaned in keyword mode.
keywords = []
for data_obj in json_data:
    source_name = data_obj["FileName"]
    cleaned_keywords = clean_list(data_obj["Anahtar Kelimeler"], True)
    keywords.append((source_name, len(cleaned_keywords), cleaned_keywords))

In [10]:
# X: token lists, y: keyword lists, data: [text, tags] string pairs where the
# tokens are space-joined and the keywords comma-joined.
X = [tokens for _, _, tokens in words]
y = [tags for _, _, tags in keywords]
data = [
    [' '.join(tokens), ','.join(tags)]
    for (_, _, tokens), (_, _, tags) in zip(words, keywords)
]

In [11]:
# Two-column frame: cleaned abstract text and its comma-joined keywords.
df = pd.DataFrame({
    'text': [pair[0] for pair in data],
    'tags': [pair[1] for pair in data],
})
df.head()

Unnamed: 0,text,tags
0,anonim ortaklıklarda uzun yıllardır kabul edil...,"menfaat sahipliği teorisi,insan sermayesi,takı..."
1,uluslararası kamu hukukunun asli kaynakları ul...,"ısrarlı itirazcı,itiraz,teamül hukuku,uluslara..."
2,vesayet velayet altında bulunmayan küçük kısıt...,"vesayet,vasi,vesayet makamı,vesayet altındaki ..."
3,dolandırıcılık suçunu malvarlığına karşı işlen...,"türk ceza kanunu,suç,ceza,hile,dolandırıcılık,..."
4,bankacılık sistemi dışında gerçekleşen yolcu b...,"gümrük,döviz,mülkiyet hakkı,idari para cezası,..."


In [12]:
def _split_tags(tag_string):
    """Split a comma-joined tag string into its individual tags."""
    return tag_string.split(",")


# Binary document-by-tag indicator matrix: one column per distinct tag.
vectorizer_binary = CountVectorizer(binary=True, tokenizer=_split_tags)
y_multilabel = vectorizer_binary.fit_transform(df['tags'])

In [13]:
# Column totals of the tag indicator matrix, i.e. how many documents carry
# each tag.
tags_sum = np.asarray(y_multilabel.sum(axis=0)).ravel().tolist()
# Column indices ordered by descending total; ties keep their column order.
sorted_tags_i = sorted(range(len(tags_sum)), key=tags_sum.__getitem__, reverse=True)
# Keep only the columns of the 1000 most frequent tags.
top_tag_columns = sorted_tags_i[:1000]
yn_multilabel = y_multilabel[:, top_tag_columns]

In [14]:
# One row per document, one column per tag position.
df_tags = pd.DataFrame([tag_string.split(",") for tag_string in df["tags"]])
# Stack the tag columns one after another into a single Series, then keep the
# distinct values in order of first appearance.
df_tags_flat = pd.concat([column for _, column in df_tags.items()])
df_tags_unique = pd.unique(df_tags_flat)

In [15]:
# Fit an integer encoding over the distinct tag values, then encode the tag
# table column by column into a plain array.
label_encoder = preprocessing.LabelEncoder().fit(df_tags_unique)
y_encoded = df_tags.apply(label_encoder.transform).to_numpy()

In [151]:
# Variant 1 of 3: 80/20 split against the full tag indicator matrix.
# This cell and the next two cells all assign X_train/X_test/y_train/y_test,
# so whichever of them ran last decides what the later cells see.
X_train, X_test, y_train, y_test = train_test_split(df, y_multilabel, test_size=0.2, random_state=42)

In [115]:
# Variant 2 of 3: 80/20 split against the indicator matrix restricted to the
# 1000 most frequent tags (yn_multilabel). Rebinds the same four names as the
# neighbouring split cells.
X_train, X_test, y_train, y_test = train_test_split(df, yn_multilabel, test_size=0.2, random_state=42)

In [87]:
# Variant 3 of 3: 80/20 split against the label-encoded tag table (y_encoded).
# Rebinds the same four names as the two preceding split cells; on a
# top-to-bottom run this is the assignment that remains in effect.
X_train, X_test, y_train, y_test = train_test_split(df, y_encoded, test_size=0.2, random_state=42)

In [67]:
# Report how many rows ended up in each side of the split.
for caption, subset in (
    ("Number of data points in training data :", X_train),
    ("Number of data points in test data :", X_test),
):
    print(caption, subset.shape[0])

Number of data points in training data : 573
Number of data points in test data : 144


In [152]:
# TF-IDF over whitespace-split tokens, using 1- to 3-grams. The vocabulary is
# learned from the training texts only and then reused for the test texts.
vectorizer = TfidfVectorizer(
    tokenizer=str.split,
    ngram_range=(1, 3),
    max_df=0.5,
    min_df=0.0009,
    max_features=200000,
)
train_texts = X_train['text']
test_texts = X_test['text']
X_train_multilabel = vectorizer.fit_transform(train_texts)
X_test_multilabel = vectorizer.transform(test_texts)

In [153]:
# Show the feature/label matrix shapes for both sides of the split.
for x_caption, features, y_caption, labels in (
    ("Training data shape X : ", X_train_multilabel, "Y :", y_train),
    ("Test data shape X : ", X_test_multilabel, "Y:", y_test),
):
    print(x_caption, features.shape, y_caption, labels.shape)

Training data shape X :  (573, 200000) Y : (573, 2905)
Test data shape X :  (144, 200000) Y: (144, 2905)


In [90]:
from napkinxc.models import PLT
from napkinxc.measures import precision_at_k

In [97]:
# Train a napkinXC PLT model; "nlp-model" is the argument handed to the PLT
# constructor.
plt_xc = PLT("nlp-model")
# NOTE(review): .toarray() materialises the whole TF-IDF matrix as a dense
# array (the recorded shape elsewhere in this notebook is 573 x 200000) —
# check whether PLT.fit can take the sparse matrix directly.
plt_xc.fit(X_train_multilabel.toarray(), y_train)

In [98]:
# Predict the top 8 labels per test document and score them with precision@k.
plt_xc.load()
y_pred = plt_xc.predict(X_test_multilabel.toarray(), top_k=8)
# NOTE(review): the recorded output is 1.0 for every k, while the sklearn
# metrics later in this notebook are close to zero — verify that y_train and
# y_test are in the label format napkinXC expects before trusting this score.
precision_at_k(y_test, y_pred, k=8)

array([1., 1., 1., 1., 1., 1., 1., 1.])

In [27]:
# Reduce the TF-IDF features to 100 latent components. The projection is
# learned on the training matrix only and then applied to the test matrix.
# random_state is fixed (same seed as the train/test split) so that the
# randomized SVD solver yields the same components on every run.
tsvd = TruncatedSVD(n_components=100, random_state=42)
X_train_svd = tsvd.fit_transform(X_train_multilabel)
X_test_svd = tsvd.transform(X_test_multilabel)

In [28]:
# Show the SVD-reduced feature shapes next to the label shapes.
for x_caption, features, y_caption, labels in (
    ("Training data shape X : ", X_train_svd, "Y :", y_train),
    ("Test data shape X : ", X_test_svd, "Y:", y_test),
):
    print(x_caption, features.shape, y_caption, labels.shape)

Training data shape X :  (573, 100) Y : (573, 1000)
Test data shape X :  (144, 100) Y: (144, 1000)


In [48]:
# Candidate model: one L1-penalised logistic regression per output column.
# The saga solver shuffles the data, so random_state is fixed (same seed as
# the train/test split) to make the fit repeatable.
clf = MultiOutputClassifier(
    LogisticRegression(penalty='l1', solver='saga', random_state=42)
)
clf.fit(X_train_multilabel, y_train)

In [154]:
# Candidate model: one-vs-rest linear classifiers trained with SGD on the
# logistic loss and an L1 penalty. SGD shuffles the samples, so random_state
# is fixed (same seed as the train/test split) to make the fit repeatable.
clf = OneVsRestClassifier(
    SGDClassifier(loss='log_loss', alpha=0.0001, penalty='l1', random_state=42)
)
# Fits on the TF-IDF features; pass X_train_svd instead to train on the
# SVD-reduced features.
clf.fit(X_train_multilabel, y_train)

In [None]:
# Candidate model: one SGD-trained linear classifier (logistic loss, L1
# penalty) per output column. random_state is fixed (same seed as the
# train/test split) because SGD shuffles the samples.
clf = MultiOutputClassifier(
    SGDClassifier(loss='log_loss', alpha=0.0001, penalty='l1', random_state=42)
)
clf.fit(X_train_multilabel, y_train)

In [None]:
# Candidate model: a single decision tree fitted on all outputs at once.
# random_state is fixed (same seed as the train/test split) so the tree is
# built the same way on every run.
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train_multilabel, y_train)

In [34]:
# Candidate model: random forest of 300 trees using the log-loss criterion.
# Bootstrapping and feature sub-sampling are random, so random_state is fixed
# (same seed as the train/test split) to make the forest repeatable.
clf = RandomForestClassifier(n_estimators=300, criterion='log_loss', random_state=42)
clf.fit(X_train_multilabel, y_train)

In [60]:
# Candidate model: one-vs-rest L2-penalised logistic regression. The sag
# solver shuffles the data, so random_state is fixed (same seed as the
# train/test split) to make the fit repeatable.
clf = OneVsRestClassifier(
    LogisticRegression(penalty='l2', solver='sag', random_state=42)
)
clf.fit(X_train_multilabel, y_train)

In [186]:
# Candidate model: one-vs-rest multinomial naive Bayes with priors learned
# from the data.
nb_estimator = MultinomialNB(class_prior=None, fit_prior=True)
clf = OneVsRestClassifier(nb_estimator)
clf.fit(X_train_multilabel, y_train)

In [29]:
# Candidate model: one-vs-rest MLPs with two hidden layers (10 and 2 units),
# trained on the SVD-reduced features. Weight initialisation is random, so
# random_state is fixed (same seed as the train/test split).
clf = OneVsRestClassifier(
    MLPClassifier(hidden_layer_sizes=(10, 2), random_state=42)
)
clf.fit(X_train_svd, y_train)

In [155]:
# Predict with whichever `clf` was fitted last, using the TF-IDF test
# features. Only valid when that model was trained on X_train_multilabel.
y_pred = clf.predict(X_test_multilabel)

In [30]:
# Predict with whichever `clf` was fitted last, using the SVD-reduced test
# features. Only valid when that model was trained on X_train_svd. Overwrites
# the `y_pred` assigned by the TF-IDF prediction cell.
y_pred = clf.predict(X_test_svd)

In [50]:
def binarize_encoding(row):
    """Turn one row of label-encoded tags into a binary tag-indicator vector.

    The integer codes are decoded with ``label_encoder``, ``None`` entries are
    discarded, duplicates are removed, and the remaining tags are projected
    onto the vocabulary of ``vectorizer_binary``.

    Args:
        row: 1-D sequence of integer codes understood by ``label_encoder``.

    Returns:
        A 1-D array with one entry per ``vectorizer_binary`` vocabulary column.
        A row with no remaining tags yields an all-zero vector.
    """
    decoded = label_encoder.inverse_transform(row)
    # Explicit identity test: does not depend on how numpy compares the
    # decoded array's dtype against None.
    is_tag = np.array([tag is not None for tag in decoded], dtype=bool)
    tags = np.unique(decoded[is_tag])
    # Each tag is vectorised as its own one-tag document; summing the columns
    # merges them into a single indicator row. Unlike the built-in sum(),
    # which returns the int 0 for an empty input, the column sum also works
    # when no tag is left.
    indicator = vectorizer_binary.transform(tags).sum(axis=0)
    return np.asarray(indicator).ravel()

In [51]:
# Convert the label-encoded ground truth and predictions, row by row, into
# binary indicator matrices.
y_test_bin = np.array(list(map(binarize_encoding, y_test)))
y_pred_bin = np.array(list(map(binarize_encoding, y_pred)))

In [52]:
# Summary metrics on the binarised labels. Each score is computed and printed
# in turn, in the order listed.
for caption, scorer, options in (
    ("Accuracy :", metrics.accuracy_score, {}),
    ("Micro Precision score :", metrics.precision_score, {"average": 'micro'}),
    ("Micro Recall score :", metrics.recall_score, {"average": 'micro'}),
    ("Micro F1 score :", metrics.f1_score, {"average": 'micro'}),
    ("Macro F1 score :", metrics.f1_score, {"average": 'macro'}),
):
    print(caption, scorer(y_test_bin, y_pred_bin, **options))

Accuracy : 0.0
Micro Precision score : 0.013888888888888888
Micro Recall score : 0.007874015748031496
Micro F1 score : 0.010050251256281407
Macro F1 score : 0.0002527484455195298


In [None]:
# Per-document precision of the binarised predictions. The printed label now
# names the metric that is actually computed (precision_score).
for true_row, pred_row in zip(y_test_bin, y_pred_bin):
    print("Precision :", metrics.precision_score(true_row, pred_row))

In [156]:
# Summary metrics computed directly on y_test / y_pred. Each score is
# computed and printed in turn, in the order listed.
for caption, scorer, options in (
    ("Accuracy :", metrics.accuracy_score, {}),
    ("Micro Precision score :", metrics.precision_score, {"average": 'micro'}),
    ("Weighted Precision score :", metrics.precision_score, {"average": 'weighted'}),
    ("Micro Recall score :", metrics.recall_score, {"average": 'micro'}),
    ("Micro F1 score :", metrics.f1_score, {"average": 'micro'}),
    ("Macro F1 score :", metrics.f1_score, {"average": 'macro'}),
):
    print(caption, scorer(y_test, y_pred, **options))

Accuracy : 0.0
Micro Precision score : 0.42424242424242425
Weighted Precision score : 0.026465441819772527
Micro Recall score : 0.01837270341207349
Micro F1 score : 0.03522012578616352
Macro F1 score : 0.0036103598065732314


In [157]:
# Per-label precision/recall/F1/support table. In the recorded output the
# visible rows have a support of 0 or 1.
print(metrics.classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       0.00      0.00      0.00         0
           2       0.00      0.00      0.00         0
           3       0.00      0.00      0.00         0
           4       0.00      0.00      0.00         0
           5       0.00      0.00      0.00         1
           6       0.00      0.00      0.00         1
           7       0.00      0.00      0.00         0
           8       0.00      0.00      0.00         1
           9       0.00      0.00      0.00         1
          10       0.00      0.00      0.00         0
          11       0.00      0.00      0.00         1
          12       0.00      0.00      0.00         0
          13       0.00      0.00      0.00         0
          14       0.00      0.00      0.00         1
          15       0.00      0.00      0.00         0
          16       0.00      0.00      0.00         0
          17       0.00    

In [158]:
# Map each predicted indicator row back to its tag strings. Only meaningful
# when y_pred has one column per vectorizer_binary vocabulary entry (i.e. the
# model was trained against y_multilabel) — TODO confirm which split was
# active when this cell is run.
vectorizer_binary.inverse_transform(y_pred)

[array([], dtype='<U195'),
 array([], dtype='<U195'),
 array([], dtype='<U195'),
 array(['tahkim'], dtype='<U195'),
 array([], dtype='<U195'),
 array([], dtype='<U195'),
 array([], dtype='<U195'),
 array([], dtype='<U195'),
 array([], dtype='<U195'),
 array([], dtype='<U195'),
 array([], dtype='<U195'),
 array([], dtype='<U195'),
 array(['adil yargılanma hakkı'], dtype='<U195'),
 array([], dtype='<U195'),
 array([], dtype='<U195'),
 array([], dtype='<U195'),
 array([], dtype='<U195'),
 array([], dtype='<U195'),
 array([], dtype='<U195'),
 array([], dtype='<U195'),
 array(['işleten', 'nükleer enerji',
        'üçüncü kişilere karşı hukuki sorumluluk'], dtype='<U195'),
 array([], dtype='<U195'),
 array([], dtype='<U195'),
 array([], dtype='<U195'),
 array(['yoksulluk nafakası'], dtype='<U195'),
 array([], dtype='<U195'),
 array([], dtype='<U195'),
 array([], dtype='<U195'),
 array([], dtype='<U195'),
 array([], dtype='<U195'),
 array(['tüketici'], dtype='<U195'),
 array([], dtype='<U195'

In [44]:
# Dense copy of the predictions.
# NOTE(review): .toarray() is called on y_pred, so this presumably expects a
# scipy sparse matrix — confirm which model produced y_pred before running.
y_pred_np = y_pred.toarray()