In [19]:
import os
import json
import unicodedata
import pandas as pd
import numpy as np
import re
import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.model_selection import KFold
from sklearn import preprocessing
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier
from sklearn.decomposition import TruncatedSVD

In [20]:
json_data = []
for path, dirs, files in os.walk(f"./TrainingSet"):
    for file in files:
        if file.endswith(".json"):
            f = open(os.path.join(path, file), encoding="utf-8-sig")
            json_data.append(json.load(f))
            json_data[-1]["FileName"] = file
            f.close()

In [21]:
CHAR_CONVERSION = {9:32,10:32,33:32,35:32,38:32,40:32,41:32,42:32,43:32,47:45,58:32,59:32,60:-1,61:32,62:-1,63:32,64:32,91:-1,93:-1,94:39,95:45,96:39,123:-1,124:32,125:-1,160:32,167:32,170:-1,171:32,176:-1,180:39,182:-1,184:-1,187:32,223:-1,224:97,225:97,226:97,228:97,229:97,230:101,232:101,233:101,234:101,235:101,236:105,237:105,238:105,239:105,241:110,242:111,243:111,244:111,245:111,248:111,249:117,250:117,251:117,257:97,261:97,263:99,269:99,279:101,299:105,322:108,324:110,353:115,363:117,369:252,382:122,523:105,537:351,601:101,699:39,700:39,703:39,706:-1,714:39,727:-1,774:{103: 287},775:{105: 105},8201:32,8208:45,8211:45,8212:45,8216:39,8217:39,8220:-1,8221:-1,8232:32, }
STRING_CONVERSION = {"$": "dolar","…": " ","ﬀ": "ff","ﬁ": "fi","ﬂ ": "fl","ﬂ": "fl","ﬄ": "ffl","n°": "no","N°": "no","Anahtar Kelimeler:": "","I": "ı","İ": "i","⅔": "%67","⅕": "%20","⅘": "%80","⅚": "%83","¼": "%25","½": "%50","¾": "%75",}
REGEX_CONVERSION = {r"cov[ı,i]d-*\s*19": "covid19",r"(\smd?\.?\s{0,2})((\d)+)": r" madde \2",r"<([a-z]+)(?![^>]*\/>)[^>]*>": "",r"\.": " ",r"-": " ",r"'": " ",r"(\s(%)(\s){0,2})((\d)*)": r" yüzde \4", r"%": "", r"hükumet": "hükümet", r"ışletme": "işletme", r"prepayment": "önödeme", r"arabulucui": "arabulucu"}
ABBREVIATIONS = {r"(^|\s)(smk)($|\s|\.)":r"\1sınai mülkiyet kanunu\3", r"(^|\s)(t?mk)($|\s|\.)": r"\1türk medeni kanunu\3", r"(^|\s)(t?ck)($|\s|\.)": r"\1türk ceza kanunu\3", r"(^|\s)(t?bk)($|\s|\.)": r"\1türk borçlar kanunu\3", r"(^|\s)(t?tk)($|\s|\.)": r"\1türk ticaret kanunu\3", r"(^|\s)(t?vk)($|\s|\.)": r"\1türk vergi kanunu\3", r"(^|\s)(a(b|t)ad)($|\s|\.)": r"\1avrupa birliği adalet divanı\4", r"(^|\s)(abd)($|\s|\.)": r"\1amerika birleşik devletleri\3", r"(^|\s)(ab)($|\s|\.)": r"\1avrupa birliği\3", r"(^|\s)(bm)($|\s|\.)": r"\1birleşmiş milletler\3", r"(^|\s)(a(i|ı)hm)($|\s|\.)": r"\1avrupa insan hakları mahkemesi\4", r"(^|\s)(a(i|ı)hs)($|\s|\.)": r"\1avrupa insan hakları sözleşmesi\4", r"(^|\s)(möhuk)($|\s|\.)": r"\1milletlerarası özel hukuk ve usul hukuku\3", r"(^|\s)(fsek)($|\s|\.)": r"\1fikir ve sanat eserleri kanunu\3", r"(^|\s)(kktc)($|\s|\.)": r"\1kuzey kıbrıs türk cumhuriyeti\3", r"(^|\s)([ey]?tkhk)($|\s|\.)": r"\1tüketicinin korunması hakkında kanun\3", r"(^|\s)(khk)($|\s|\.)": r"\1kanun hükmünde kararname\3"}

In [23]:
sw_file = open('./stopwords2.txt')
stop_words = sw_file.read()
stop_words = stop_words.split("\n")
sw_file.close()

In [24]:
def clean_text(string, kw=False):
    for key in STRING_CONVERSION.keys():
        string = string.replace(key, STRING_CONVERSION[key])
    string = string.lower()
    result = []
    for char in string:
        if unicodedata.combining(char):
            try:
                result[-1] = chr(CHAR_CONVERSION[ord(char)][ord(result[-1])])
            except:
                continue
        elif ord(char) in CHAR_CONVERSION.keys():
            if CHAR_CONVERSION[ord(char)] != -1:
                result.append(chr(CHAR_CONVERSION[ord(char)]))
        elif ord(char) >= 942:
            continue
        else:
            result.append(char)
    result_str = ''.join(result)
    result_str = ' '.join([w for w in result_str.split(" ") if w != ""])
    for key in ABBREVIATIONS.keys():
        result_str = re.sub(key, ABBREVIATIONS[key], result_str)
    if kw:
        for key in REGEX_CONVERSION.keys():
            result_str = re.sub(key, REGEX_CONVERSION[key], result_str)
    return((result_str).strip())

In [25]:
def clean_list(li, kw=False):
    clean_list = []
    for item in li:
        if len(item.split(",")) > 1:
            clean_list.extend([it for it in item.split(",") if it != ""])
        elif len(item.split(";")) > 1:
            clean_list.extend([it for it in item.split(";") if it != ""])
        else:
            clean_list.append(item)
    clean_list = [clean_text(text, kw) for text in clean_list if text != ""]
    if not kw:
        clean_str = ' '.join(clean_list)
        for key in REGEX_CONVERSION.keys():
            clean_str = re.sub(key, REGEX_CONVERSION[key], clean_str)
        clean_list = [part for part in clean_str.split(" ") if part != ""]
        clean_list = [text for text in clean_list if text not in stop_words]
    if kw:
        clean_list = list(dict.fromkeys(clean_list))
    clean_list = [text for text in clean_list if text != ""]
    return clean_list

In [26]:
for data_obj in json_data:
    data_obj["Metin"] = re.sub(r"[-\w\.]+@([-\w]+\.)+[-\w]{2,4}", "", data_obj["Metin"])
    data_obj["Metin"] = re.sub(r"https?:\/\/(?:www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b(?:[-a-zA-Z0-9()@:%_\+.~#?&\/=]*)", "", data_obj["Metin"])
    data_obj["Metin"] = re.sub(r"<([a-z]+)(?![^>]*\/>)[^>]*>", "", data_obj["Metin"])

In [27]:
words = [(data_obj["FileName"], clean_list(data_obj["Metin"].split(" "))) for data_obj in json_data]
words = [(wtup[0], len(wtup[1]), wtup[1]) for wtup in words]

In [28]:
keywords = [(data_obj["FileName"], clean_list(data_obj["Anahtar Kelimeler"], True)) for data_obj in json_data]
keywords = [(kwtup[0], len(kwtup[1]), kwtup[1]) for kwtup in keywords]

In [69]:
X = [wtup[2] for wtup in words]
y = [kwtup[2] for kwtup in keywords]
data = [[' '.join(words[i][2]), ','.join(keywords[i][2])] for i,_ in enumerate(words)]

In [30]:
df = pd.DataFrame(data, columns = ['text', 'tags'])
df.head()

Unnamed: 0,text,tags
0,anonim ortaklıkların amacı kime hizmet ettiği ...,"menfaat sahipliği teorisi,insan sermayesi,takı..."
1,uluslararası kamu hukuku devletlerin kimi zama...,"ısrarlı itirazcı,itiraz,teamül hukuku,uluslara..."
2,vesayet altındaki kişinin malvarlığının yöneti...,"vesayet,vasi,vesayet makamı,vesayet altındaki ..."
3,dolandırıcılık doğru davranma iyiniyet kuralla...,"türk ceza kanunu,suç,ceza,hile,dolandırıcılık,..."
4,türkiye 2019 yılının ilk altı ayında yaklaşık ...,"gümrük,döviz,mülkiyet hakkı,idari para cezası,..."


In [31]:
vectorizer_binary = CountVectorizer(tokenizer = lambda x: x.split(","), binary=True)
y_multilabel = vectorizer_binary.fit_transform(df['tags'])

In [32]:
tags_sum = y_multilabel.sum(axis=0).tolist()[0]
sorted_tags_i = sorted(range(len(tags_sum)), key=lambda i: tags_sum[i], reverse=True)
yn_multilabel = y_multilabel[:, sorted_tags_i[:1000]]

In [135]:
df['tag_count_1000'] = [sum(yn_multilabel[i].toarray()[0]) for i in range(yn_multilabel.shape[0])]
df1 = df[df['tag_count_1000'] != 0]
df1.shape

(655, 4)

In [35]:
vectorizer = TfidfVectorizer(min_df=0.00009, max_features=200000, tokenizer = lambda x: x.split(), ngram_range=(1, 3))
clf = OneVsRestClassifier(SGDClassifier(loss='log_loss', alpha=0.0001, penalty='l2'))

In [36]:
k = 5
kf = KFold(n_splits=k, shuffle=True)

scores = np.zeros((k, 5))
k_idx = 0
for train_index, test_index in kf.split(df, yn_multilabel):
    X_train, X_test = df.iloc[train_index, :], df.iloc[test_index, :]
    y_train, y_test = yn_multilabel[train_index], yn_multilabel[test_index]
    
    X_train_tfidf = vectorizer.fit_transform(X_train['text'])
    X_test_tfidf = vectorizer.transform(X_test['text'])
    print("Training:", X_train_tfidf.shape, y_train.shape, "Test:", X_test_tfidf.shape, y_test.shape)
    clf.fit(X_train_tfidf, y_train)
    y_pred = clf.predict(X_test_tfidf)
     
    acc = metrics.accuracy_score(y_pred, y_test)
    prec = metrics.precision_score(y_test, y_pred, average = 'micro')
    recl = metrics.recall_score(y_test, y_pred, average = 'micro')
    mif1 = metrics.f1_score(y_test, y_pred, average = 'micro')
    maf1 = metrics.f1_score(y_test, y_pred, average = 'macro')
    scores[k_idx,:] = np.array([acc, prec, recl, mif1, maf1])
    k_idx += 1
avg_scores = scores.mean(axis=0)
print('Average accuracy : {}'.format(avg_scores[0]))
print('Average precision : {}'.format(avg_scores[0]))
print('Average recall : {}'.format(avg_scores[0]))
print('Average accuracy : {}'.format(avg_scores[0]))
print('Average accuracy : {}'.format(avg_scores[0]))

Accuracy of each fold - [0.10416666666666667, 0.10416666666666667, 0.06993006993006994, 0.0979020979020979, 0.07692307692307693]
Average accuracy : 0.09061771561771562


In [119]:
X_train, X_test, y_train, y_test = train_test_split(df, yn_multilabel, test_size=0.2, random_state=42)
print("Number of data points in training data :", X_train.shape[0])
print("Number of data points in test data :", X_test.shape[0])

Number of data points in training data : 573
Number of data points in test data : 144


In [120]:
vectorizer = TfidfVectorizer(min_df=0.00009, max_features=200000, tokenizer = lambda x: x.split(), ngram_range=(1, 3))
X_train_multilabel = vectorizer.fit_transform(X_train['text'])
X_test_multilabel = vectorizer.transform(X_test['text'])

In [121]:
print("Training data shape X : ", X_train_multilabel.shape, "Y :", y_train.shape)
print("Test data shape X : ", X_test_multilabel.shape,"Y:", y_test.shape)

Training data shape X :  (573, 200000) Y : (573, 1000)
Test data shape X :  (144, 200000) Y: (144, 1000)


In [122]:
clf = OneVsRestClassifier(SGDClassifier(loss='log_loss', alpha=0.0001, penalty='l1'))
clf.fit(X_train_multilabel, y_train)
y_pred = clf.predict(X_test_multilabel)

In [127]:
from sklearn.multiclass import OneVsOneClassifier

In [128]:
clf = OneVsOneClassifier(SGDClassifier(loss='log_loss', alpha=0.0001, penalty='l1'))
clf.fit(X_train_multilabel, y_train)
y_pred = clf.predict(X_test_multilabel)

TypeError: A sparse matrix was passed, but dense data is required. Use X.toarray() to convert to a dense numpy array.

In [48]:
from sklearn.linear_model import LogisticRegression
import gensim

In [106]:
VEC_SIZE = 500

In [108]:
word2vec_model = gensim.models.Word2Vec(X, min_count = 1, vector_size = VEC_SIZE, window = 10)

In [109]:
def avg_pooling(ds):
    pooled = np.zeros((ds.shape[0], VEC_SIZE))
    for i, row in enumerate(ds):
        for word in row.split():
            pooled[i] += word2vec_model.wv[word]
        pooled[i] /= len(row)
    return pooled

In [110]:
X_train_wv = avg_pooling(X_train['text'])
X_test_wv = avg_pooling(X_test['text'])

In [111]:
print("Training data shape X : ", X_train_wv.shape, "Y :", y_train.shape)
print("Test data shape X : ", X_test_wv.shape,"Y:", y_test.shape)

Training data shape X :  (573, 500) Y : (573, 1000)
Test data shape X :  (144, 500) Y: (144, 1000)


model = LogisticRegression(multi_class="multinomial")
model.fit(X_train_wv, y_train)
y_pred = clf.predict(X_test_wv)

In [115]:
# model = OneVsRestClassifier(LogisticRegression(multi_class='multinomial', solver='lbfgs'))
model = OneVsRestClassifier(SGDClassifier(loss='log_loss', alpha=0.0001))
model.fit(X_train_wv, y_train)
y_pred = model.predict(X_test_wv)

In [129]:
print("Accuracy :", metrics.accuracy_score(y_test,y_pred))
print("Micro Precision score :", metrics.precision_score(y_test, y_pred, average = 'micro'))
print("Macro Precision score :", metrics.precision_score(y_test, y_pred, average = 'macro'))
print("Micro F1 score :", metrics.f1_score(y_test, y_pred, average = 'micro'))
print("Macro F1 score :", metrics.f1_score(y_test, y_pred, average = 'macro'))

Accuracy : 0.10416666666666667
Micro Precision score : 0.34545454545454546
Macro Precision score : 0.016
Micro F1 score : 0.095
Macro F1 score : 0.015285714285714284


In [132]:
print(y_test.shape, y_pred.shape)
y_test[0].shape

(144, 1000) (144, 1000)


(1, 1000)

In [124]:
vectorizer_binary.inverse_transform(y_pred)

[array([], dtype='<U195'),
 array([], dtype='<U195'),
 array([], dtype='<U195'),
 array(['2017 anayasa değişiklikleri', 'alt işveren'], dtype='<U195'),
 array([], dtype='<U195'),
 array([], dtype='<U195'),
 array([], dtype='<U195'),
 array([], dtype='<U195'),
 array([], dtype='<U195'),
 array([], dtype='<U195'),
 array([], dtype='<U195'),
 array([], dtype='<U195'),
 array([], dtype='<U195'),
 array(['anonim ortaklık', 'bağışlayan', 'hakimin karar vermesi'],
       dtype='<U195'),
 array([], dtype='<U195'),
 array(['6271 sayılı kanun'], dtype='<U195'),
 array([], dtype='<U195'),
 array(['1992 hukuki sorumluluk sözleşmesi', 'af',
        'ailenin korunması hakkı'], dtype='<U195'),
 array([], dtype='<U195'),
 array([], dtype='<U195'),
 array(['belge ibrazı'], dtype='<U195'),
 array([], dtype='<U195'),
 array([], dtype='<U195'),
 array([], dtype='<U195'),
 array(['akde aykırılık'], dtype='<U195'),
 array([], dtype='<U195'),
 array([], dtype='<U195'),
 array([], dtype='<U195'),
 array([], d