In [142]:
import sys,re, os, glob, json, string, pprint
import pandas as pd
import numpy as np 
from snowballstemmer import TurkishStemmer 
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize 
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from difflib import SequenceMatcher
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

## Global Variables

In [154]:
word_size_threshold = 100
similarity_ratio = 0.70
path = "datasets/*.json"
#path = "C://Users/onkol/Desktop/2021-01-20220322T055600Z-001/2021-01/*.json" 
create_data_json = False
enable_preprocessing = True

## Read Datasets

In [152]:
files = glob.glob(path)

json_arr = []
for file in files:
    f = open(file, "r", encoding='utf-8')
    jsonData = json.loads(f.read())
    json_arr.append(jsonData)


27851


## Create labels from dataset

In [153]:
if(not create_data_json):
    raise Exception("NOT ERROR - SKIPPINIG")

def has_key(dict, key): 
    if key in dict.keys(): 
        return 1 
    return 0

labels = {} 

def cosine_sim(vec1, vec2):
    vec1 = vec1.reshape(1, -1)
    vec2 = vec2.reshape(1, -1)
    return  cosine_similarity(vec1,vec2)[0][0]

def similarity(a, b):
    words_in_a = a.replace(',', ' ').split()
    words_in_b = b.replace(',', ' ').split()
    if(len(words_in_a) ==  1 and words_in_a[0] in words_in_b):
        return 1
    if(len(words_in_b) ==  1 and words_in_b[0] in words_in_a):
        return -1
    
    total_ratio = 0
    for word_a in words_in_a:
        a_ratio = 0
        for word_b in words_in_b:
            b_ratio = SequenceMatcher(None, a, b).ratio()
            if(b_ratio > a_ratio):
                a_ratio = b_ratio
        total_ratio += a_ratio
    
    return total_ratio / len(words_in_a)

def readLabels(jsonData): 
    key = jsonData["Suç"].strip().lower() if jsonData["Suç"] != '' else "undefined"
    if (not has_key(labels, key)):
        labels[key] = 1    
    else :
        labels[key] = labels[key] + 1

for file in json_arr: 
    readLabels(file)

items = list(map(lambda x: list(x), labels.items()))
labels_array = sorted(items, key=lambda x: x[1], reverse=True)

vectorize_array = list(map(lambda x: x[0], labels_array))
vectorizer = CountVectorizer().fit_transform(vectorize_array)
vectors = vectorizer.toarray()

len_labels_array = len(labels_array)
for i in range(0, len_labels_array):
    
    for j in range(i, len_labels_array):
        if labels_array[i][0] == labels_array[j][0] or labels_array[i][1] <= 1 or labels_array[j][1] <= 1 :
            continue
        
        i_search = re.search('([0-9].*) sayılı', labels_array[i][0])
        j_search = re.search('([0-9].*) sayılı', labels_array[j][0])
        if(j_search != None and i_search != None):
            if(i_search[0] == j_search[0]):
                labels_array[i][1] += labels_array[j][1]
                labels_array[j][1] = 0
                continue
            else:
                continue
        
        cos_similarity = cosine_sim(vectors[i], vectors[j])
        if cos_similarity > similarity_ratio:
            labels_array[i][1] += labels_array[j][1]
            labels_array[j][1] = 0
            continue
        
        similarity_result = similarity(labels_array[i][0], labels_array[j][0])
        if similarity_result == -1:
            labels_array[j][1] += labels_array[i][1]
            labels_array[i][1] = 0
        elif similarity_result > similarity_ratio:
            labels_array[i][1] += labels_array[j][1]
            labels_array[j][1] = 0



array_final = (list(filter(lambda x: x[1] > 1 , labels_array)))

    
array_final = sorted(array_final, key=lambda x: x[1], reverse=True)

    
jsonString = json.dumps(array_final, indent = 4, ensure_ascii=False)
jsonFile = open("data.json", "w", encoding='utf-8')
jsonFile.write(jsonString)
jsonFile.close()


## Read data.json

In [155]:
if(not enable_preprocessing):
    raise Exception("NOT ERROR - SKIPPINIG")
    
ictihats = []
suc_array = []

def similar(a, topTenCrimes):
    for crime in topTenCrimes:
        if SequenceMatcher(None, a, crime).ratio() > similarity_ratio:
            return crime
    return None
                
top_ten_crimes = []
with open("data.json", encoding='utf-8') as jsonFile:
    data = json.load(jsonFile)
    top_ten_crimes = list(map(lambda x: x[0], data[:10]))



for jsonData in json_arr:
    key = jsonData["Suç"] if jsonData["Suç"] != '' else "undefined"
    similar_suc = similar(key, top_ten_crimes)
    
    key = similar_suc if similar_suc is not None else key
    suc = key if key in top_ten_crimes or key == 'undefined' else "other"

    new_ictihat = jsonData['ictihat'].strip()
    ictihats.append(new_ictihat)
    new_ictihat_ctr = new_ictihat.split()
    num_of_words = len(new_ictihat_ctr)
    suc_array.append(suc.strip())


## Preprocessing

In [156]:
if(not enable_preprocessing):
    raise Exception("NOT ERROR - SKIPPINIG")

ictihats_lower = []
for i in ictihats:
    i = i.lower()
    ictihats_lower.append(i)
    

ictihats_punctuation = []

file = open("stopwords.txt", "r", newline='', encoding='utf-8')
result = file.read()
stopwords = word_tokenize(result)

for ictihat in ictihats_lower:
    ictihat = ictihat.translate(str.maketrans("", "", string.punctuation))
    #print(ictihat)
    tokenized_words = ([word for word in ictihat.split() if word not in stopwords and len(word) > 1 and re.match("^[A-Z0-9a-zğüşöçİĞÜŞÖÇ]*$", word)])
    #print(tokenized_words)
    ictihats_punctuation.append(' '.join(tokenized_words[:word_size_threshold]))
    



# print(ictihats_punctuation[:10])   
# preprocessed_ictihats = []
# for i in ictihats_punctuation:
#     preprocessed_ictihats.append(list(filter(lambda x: x != "",i.split(" "))))

# print(preprocessed_ictihats)
turkStem = TurkishStemmer()
for index, ictihat in enumerate(ictihats_punctuation,start=0):
    #print([turkStem.stemWord(word) for word in ictihat])
    ictihats_punctuation[index] = ' '.join([turkStem.stemWord(word) for word in ictihat.split()])
    
#print(ictihats_punctuation)

## write data into csv file

In [157]:
if(not enable_preprocessing):
    raise Exception("NOT ERROR - SKIPPINIG")

df = pd.DataFrame(columns = ['ictihats', 'sucs'])
df['ictihats'] = ictihats_punctuation
df['sucs'] = suc_array
csv_file_name = 'train_set_' + str(word_size_threshold) + '.csv'

df.to_csv(csv_file_name, encoding='utf-8', index=False)


## Read train set from CSV

In [158]:
csv_file_name = 'train_set_' + str(word_size_threshold) + '.csv'
my_csv = pd.read_csv(csv_file_name, dtype=str, na_filter=False)
ictihats_punctuation = my_csv['ictihats'].tolist()
suc_array = my_csv['sucs'].tolist()

for item in ictihats_punctuation:
    if(type(item).__name__ != "str"):
        print(item)



## Split Dataset

In [159]:
x_train, x_test, y_train, y_test = train_test_split(ictihats_punctuation, 
                                                    suc_array, 
                                                    test_size=0.20,random_state=36)

## Vectorize

In [160]:
tfidfvectorizer = TfidfVectorizer(analyzer='word')
training_data = tfidfvectorizer.fit_transform(x_train)
testing_data = tfidfvectorizer.transform(x_test)


## Support Vector Machines (specifically linear SVM)

In [161]:
svclassifier = SVC(kernel='linear')
svclassifier.fit(training_data, y_train)
y_pred = svclassifier.predict(testing_data)

print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))

[[ 147    0    0    0    0    0   10    1    0    1    0]
 [   0   34    0    0    0    0   18    0    0    1    0]
 [   0    0   64    0    0    0   19    0    6    0    0]
 [   0    1    0   78    0    0   67    1    0    0    0]
 [   0    0    2    0  374    0   19    0    2    0    0]
 [   0    0    0    0    0   60    4    0    0    0    3]
 [  32    4    9   26   31    1 1622    5   12    5   15]
 [   0    0    0    0    0    0   22   23    0    0    0]
 [   0    0    7    0    0    0   48    0   48    0    0]
 [   0    2    0    1    6    1   21    1    0 2580    2]
 [   0    0    0    0    0    2    0    0    0    0  133]]
                                            precision    recall  f1-score   support

              5607 sayılı kanuna muhalefet       0.82      0.92      0.87       159
                            dolandırıcılık       0.83      0.64      0.72        53
                                   hakaret       0.78      0.72      0.75        89
                        

## Multinomial Naive Bayes

In [162]:
naive_bayes = MultinomialNB()
naive_bayes.fit(training_data, y_train)

predictions = naive_bayes.predict(testing_data)

# print('Accuracy score: ', format(accuracy_score(y_test, predictions)))
# print('Precision score: ', format(precision_score(y_test, predictions, average='weighted')))
# print('Recall score: ', format(recall_score(y_test, predictions ,average='weighted')))
# print('F1 score: ', format(f1_score(y_test, predictions ,average='weighted')))

print(confusion_matrix(y_test,predictions))
print(classification_report(y_test,predictions))


[[   9    0    0    0    0    0  149    0    0    1    0]
 [   0    0    0    0    0    0   50    0    0    3    0]
 [   0    0    0    0    0    0   89    0    0    0    0]
 [   0    0    0    0    0    0  144    0    0    3    0]
 [   0    0    0    0   84    0  305    0    0    8    0]
 [   0    0    0    0    0    0   49    0    0   18    0]
 [   3    0    0    0    0    0 1710    0    0   49    0]
 [   0    0    0    0    0    0   41    0    0    4    0]
 [   0    0    0    0    0    0  102    0    0    1    0]
 [   1    0    0    0    2    0  237    0    0 2374    0]
 [   0    0    0    0    0    0   89    0    0   46    0]]
                                            precision    recall  f1-score   support

              5607 sayılı kanuna muhalefet       0.69      0.06      0.10       159
                            dolandırıcılık       0.00      0.00      0.00        53
                                   hakaret       0.00      0.00      0.00        89
                        

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Logistic Regression

In [163]:
log_regres_classifier = LogisticRegression(random_state = 36)
log_regres_classifier.fit(training_data, y_train)
y_pred = log_regres_classifier.predict(testing_data)
cm = confusion_matrix(y_test, y_pred)


# print('Accuracy score: ', (accuracy_score(y_test, y_pred)))
# print('Precision score: ', (precision_score(y_test, y_pred, average='weighted')))
# print('Recall score: ', (recall_score(y_test, y_pred ,average='weighted')))
# print('F1 score: ', (f1_score(y_test, y_pred ,average='weighted')))
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[[ 141    0    0    0    0    0   14    0    0    4    0]
 [   0   30    0    0    0    0   20    0    0    3    0]
 [   0    0   45    0    0    0   39    0    5    0    0]
 [   0    0    0   67    0    0   78    0    0    2    0]
 [   0    0    2    0  364    0   26    0    2    3    0]
 [   0    0    0    0    0   54    5    0    0    3    5]
 [  30    2    6   18   29    1 1637    4    6   15   14]
 [   0    0    0    0    0    0   28   16    0    1    0]
 [   0    0    7    0    0    0   51    0   45    0    0]
 [   0    1    0    1    6    1   40    1    0 2561    3]
 [   0    0    0    0    0    2    3    0    0    0  130]]
                                            precision    recall  f1-score   support

              5607 sayılı kanuna muhalefet       0.82      0.89      0.85       159
                            dolandırıcılık       0.91      0.57      0.70        53
                                   hakaret       0.75      0.51      0.60        89
                        

In [11]:
#print(len(suc_array))
#print(ictihats_punctuation)
# index_list = []
# for i in range(0,100):
#     index_list.append(str(i))

# preprocessed_ictihats = []
# for i in ictihats_punctuation:
#        preprocessed_ictihats.append(list(filter(lambda x: x != "",i.split(" "))))

# #print(preprocessed_ictihats)
# turkStem = TurkishStemmer()
# for index, ictihat in enumerate(preprocessed_ictihats,start=0):
#     preprocessed_ictihats[index] = [turkStem.stemWord(word) for word in ictihat]
    
#print(preprocessed_ictihats)

#tfidf_wm_array
#tfidf_wm_array.shape
# frequency_matrix = pd.DataFrame(tfidf_wm_array, columns=tfidfvectorizer.get_feature_names())
# from sklearn.model_selection import train_test_split
# y = np.array([0,1,2,3,4,5,6,7,8,9,10]) 


# naive_bayes = MultinomialNB()
# naive_bayes.fit(training_data, y_train)

# predictions = naive_bayes.predict(testing_data)

# from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
# print('Accuracy score: ', format(accuracy_score(y_test, predictions)))
# print('Precision score: ', format(precision_score(y_test, predictions,
#                                            average='macro')))
# print('Recall score: ', format(recall_score(y_test, predictions
#                                            ,average='macro')))
# print('F1 score: ', format(f1_score(y_test, predictions
#                                            ,average='macro')))

# print(confusion_matrix(y_test,predictions))


# from sklearn.svm import SVC
# svclassifier = SVC(kernel='linear')
# svclassifier.fit(training_data, y_train)
# y_pred = svclassifier.predict(testing_data)

# from sklearn.metrics import classification_report, confusion_matrix
# print(confusion_matrix(y_test,y_pred))
# print(classification_report(y_test,y_pred))



# frequency_matrix
# print(tfidfvectorizer.vocabulary_)
# print(tfidf_wm.toarray())
# tfidf_tokens = tfidfvectorizer.get_feature_names()
# df_tfidfvect = pd.DataFrame(data = tfidf_wm.toarray(),index = index_list,columns = tfidf_tokens)
# print("\nTD-IDF Vectorizer\n")
# print(df_tfidfvect)
# vectorizer.get_feature_names_out()
# ictihat_freq = []


# for i in preprocessed_ictihats:
#     count_freq = Counter(i)
#     ictihat_freq.append(count_freq)
    
# pprint.pprint(ictihat_freq)

