In [None]:
import pandas as pd
import numpy as np
import os
import re
import transformers
from transformers import AutoTokenizer , DataCollatorWithPadding,AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq,Seq2SeqTrainingArguments, Seq2SeqTrainer
import datasets as dt
import torch
from transformers import AutoModelForSequenceClassification , TrainingArguments , Trainer
from evaluate import load
import string

In [None]:
# Load the raw spreadsheet, discard the exported index column ('Unnamed: 0')
# and remove the row whose index label is 187.
# NOTE(review): the reason row 187 is excluded is not visible here — presumably a malformed record; confirm.
df = (
    pd.read_excel('./NitelNicel.xlsx')
    .drop(columns=['Unnamed: 0'])
    .drop(index=187)
)
df

In [None]:
def data_cleaning(x):
    """Strip digits and surrounding dash/whitespace decoration from a text value.

    Every digit character is removed from the string first. Afterwards any mix
    of whitespace and '-' characters is trimmed from both ends, so numbering
    prefixes such as "1-", "1 -" or "12 - " disappear completely. Dashes and
    whitespace in the interior of the text are left untouched.

    Parameters
    ----------
    x : str
        Raw cell content.

    Returns
    -------
    str
        The cleaned text (possibly empty).
    """
    without_digits = ''.join(ch for ch in x if not ch.isdigit())
    # Trim whitespace and dashes together: stripping '-' before whitespace
    # leaves a dash behind whenever it is preceded by a space (" - text").
    return re.sub(r'^[\s-]+|[\s-]+$', '', without_digits)

In [None]:
# data_cleaning is applied to each of the three text columns separately here.
# If the columns are instead merged into a single column, it can be applied once to that merged column.

for text_column in ("Nitel_Soru", "Nitel_Cevap", "Nicel_Soru"):
    df[text_column] = df[text_column].apply(data_cleaning)
df


In [None]:
# The "Nicel_Cevap_KaanAla" column is not used by the rest of the notebook, so it is removed here.
df = df.drop(columns=["Nicel_Cevap_KaanAla"])


In [None]:
# Turkish stop words are removed from the data to reduce the input size
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize, sent_tokenize
nltk.download('stopwords')
from nltk.corpus import stopwords
stopWords = set(stopwords.words('turkish'))
def remove_stopwords(text):
    """Return ``text`` without the tokens that appear in ``stopWords``.

    The text is split with NLTK's ``word_tokenize``; each token is compared
    to the stop-word set exactly as written (no case normalisation), and the
    remaining tokens are joined back together with single spaces.
    """
    kept_tokens = [token for token in word_tokenize(text) if token not in stopWords]
    return " ".join(kept_tokens)

In [None]:
# Punctuation tokens are removed to reduce the input size
punct_list = list(string.punctuation)
def remove_punct(text):
    """Return ``text`` without stand-alone punctuation tokens.

    Only tokens that are exactly one of the characters in
    ``string.punctuation`` are discarded; punctuation embedded inside a
    longer token is kept. Surviving tokens are re-joined with single spaces.
    """
    non_punct_tokens = [token for token in word_tokenize(text) if token not in punct_list]
    return " ".join(non_punct_tokens)

In [None]:
# A statistical keyphrase extractor (YAKE) is applied to the data because the dataset is Turkish
import string
import pke
def keyphrase(text):
    """Return the best-ranked YAKE keyphrase of ``text``.

    The document is loaded into a ``pke.unsupervised.YAKE`` extractor with
    punctuation as the stop list, candidates of up to 5 tokens are selected
    and weighted, and the top entry of ``get_n_best(n=3, threshold=0.8)`` is
    returned.

    Parameters
    ----------
    text : str
        Text to summarise into a single keyphrase.

    Returns
    -------
    str
        The highest-ranked keyphrase, or ``text`` unchanged when the
        extractor produces no candidate at all (e.g. empty or very short
        input). Without that fallback one such row would raise IndexError
        and abort the whole ``DataFrame.apply`` call.
    """
    stoplist = list(string.punctuation)
    extractor = pke.unsupervised.YAKE()
    # NOTE(review): the text is Turkish but the document is loaded with language='en' —
    # presumably because no Turkish pipeline is available to pke; confirm.
    extractor.load_document(input=text,
                            language='en',
                            normalization=None,
                            stoplist=stoplist)
    extractor.candidate_selection(n=5)
    window = 2
    use_stems = False
    extractor.candidate_weighting(window=window,
                                  use_stems=use_stems)
    threshold = 0.8
    keyphrases = extractor.get_n_best(n=3, threshold=threshold)
    if not keyphrases:
        # Nothing was extracted: keep the original text instead of crashing.
        return text
    # Each entry is indexed as (phrase, score); only the phrase of the best one is used.
    return keyphrases[0][0]

In [None]:
# Lemmatization is applied to the data to simplify the inputs
import zeyrek
analyzer = zeyrek.MorphAnalyzer()
def lemmatizer(text):
    """Replace every token of ``text`` with a lemma from the zeyrek analyzer.

    The text is tokenized with ``word_tokenize``. For each token the lemma of
    the first entry of the first result of ``analyzer.analyze`` is taken
    (presumably the analyzer's preferred parse — confirm), and the lemmas are
    joined with single spaces.
    """
    lemmas = [analyzer.analyze(token)[0][0].lemma for token in word_tokenize(text)]
    return " ".join(lemmas)
    

In [None]:
# The BERT classifier expects zero-based class labels, so the score column
# is shifted down by one into a new "labels" column and then removed.
df = (
    df.assign(labels=df['Nicel_Puan'] - 1)
      .drop(columns=["Nicel_Puan"])
)
df

In [None]:
# The number of label values is reduced from 5 to 2 for the two-label approach.
def label_reduction(label):
    """Collapse a label into one of two classes.

    Labels equal to 0 or 1 map to 0; every other value maps to 1.
    """
    if label != 0 and label != 1:
        return 1
    return 0

# Rows labelled 2 are removed first (presumably the neutral midpoint of the scale — confirm),
# then the remaining labels are collapsed into two classes with label_reduction.
# The two statements must be on separate lines; .copy() makes the filtered frame
# independent so the column assignment below does not write into a slice.
df = df[df['labels'] != 2].copy()
df["labels"] = df["labels"].apply(label_reduction)

In [None]:
# Function that cleans up the irregularities introduced while cleaning the data.
# NOTE(review): "Unk" is presumably the placeholder produced for unknown words by the
# lemmatizer and "mu"/"mi" leftover question particles — confirm.
banned_words=["Unk","mu","mi"]
def remove_unk(text):
    """Remove the tokens listed in ``banned_words`` from ``text``.

    Only whole tokens are removed. A plain substring replacement would also
    cut the banned strings out of longer words (e.g. "mu" inside "mutlu").

    Parameters
    ----------
    text : str
        Text to clean.

    Returns
    -------
    str
        ``text`` stripped of surrounding whitespace when it contains no banned
        token; otherwise the remaining tokens joined with single spaces.
    """
    words = word_tokenize(text)
    if not any(word in banned_words for word in words):
        # Nothing to remove: leave the text as it is apart from outer whitespace.
        return text.strip()
    kept_words = [word for word in words if word not in banned_words]
    return " ".join(kept_words).strip()

In [None]:
# The user can apply whichever functions fit the chosen approach.
# This part is for the multi-column approaches.

text_columns = ("Nitel_Soru", "Nitel_Cevap", "Nicel_Soru")
# The order matters: each step consumes the output of the previous one.
processing_steps = (remove_stopwords, remove_punct, keyphrase, lemmatizer, remove_unk)

for processing_step in processing_steps:
    for text_column in text_columns:
        df[text_column] = df[text_column].apply(processing_step)

df


In [None]:
# Column merging. Two layouts are available:
#   * multi-column approaches: the three texts are joined with explicit [CLS]/[SEP] markers
#   * single-column approaches: the three texts are joined with plain spaces
# Previously both assignments ran back to back, so the first one was always overwritten.
# The switch below makes the choice explicit; its default (False) keeps the layout
# the cell actually produced before.
USE_SEPARATOR_TOKENS = False

if USE_SEPARATOR_TOKENS:
    df["input"]="[CLS]"+" "+df["Nitel_Soru"]+" "+"[SEP]"+" "+df["Nitel_Cevap"]+" "+"[SEP]"+" "+df["Nicel_Soru"]+" "+"[SEP]"
else:
    df["input"]=df["Nitel_Soru"]+" "+df["Nitel_Cevap"]+" "+df["Nicel_Soru"]
# The source columns are no longer needed once the merged "input" column exists.
df=df.drop(["Nitel_Soru","Nitel_Cevap","Nicel_Soru"],axis=1)


In [None]:
# The functions are applied to the merged "input" column.
# This part is for the single-column approaches.

for processing_step in (keyphrase, lemmatizer, remove_unk):
    df["input"] = df["input"].apply(processing_step)


In [None]:
# Fixed seed so the shuffle and the train/validation split are reproducible across runs.
SPLIT_SEED = 42

# Every row after the first 500 is held out as the final test set;
# the first 500 rows are used for training and validation.
df_test=df.iloc[500:]
df=df.iloc[:500]
df.to_csv("classification_task_csv", index=False , encoding='utf-8'  )
df_test.to_csv("classification_task_test_csv", index=False , encoding='utf-8'  )
# Loading a CSV through `datasets` yields a DatasetDict whose only split is "train".
Tapaco_dataset=dt.load_dataset("csv",data_files="classification_task_csv")
Tapaco_dataset=Tapaco_dataset["train"]
# Dataset.shuffle returns a new dataset instead of shuffling in place,
# so the result has to be assigned back (the bare call had no effect).
Tapaco_dataset=Tapaco_dataset.shuffle(seed=SPLIT_SEED)
# 50 examples are set aside as the validation ("test") split used during training.
Tapaco_dataset=Tapaco_dataset.train_test_split(test_size=50, seed=SPLIT_SEED)

In [None]:
model_checkpoint = "dbmdz/bert-base-turkish-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
# The maximum allowed length varies between 512 and 1024; 512 is used to be on the safe side.
max_input_length = 512

def tokenizer_function(example):
    """Tokenize the "input" field, truncating to ``max_input_length`` tokens."""
    return tokenizer(
        example["input"],
        truncation=True,
        max_length=max_input_length,
    )

# batched=True hands the mapping function a batch of examples per call.
tokenized_dataset = Tapaco_dataset.map(tokenizer_function, batched=True)
tokenized_dataset

In [None]:
batch_size = 3
model_name = "BertTürk_Classification"
model_dir = f"./{model_name}"

# Evaluation, logging and checkpointing all happen once per epoch; the save and
# evaluation schedules match, as load_best_model_at_end needs a checkpoint for
# every evaluated state.
epoch_schedule = {
    "evaluation_strategy": "epoch",
    "logging_strategy": "epoch",
    "save_strategy": "epoch",
}

args = TrainingArguments(
    output_dir=model_dir,
    num_train_epochs=10,
    learning_rate=5e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    save_total_limit=3,
    # After training, reload the checkpoint with the best accuracy.
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    **epoch_schedule,
)

In [None]:
from transformers import BertForSequenceClassification
# Pads each batch to the length of its longest sequence using the tokenizer's padding settings.
data_collator=DataCollatorWithPadding(tokenizer=tokenizer)
accuracy= load("accuracy")

# Number of output classes of the classification head. The labels are reduced
# to the two values 0 and 1 earlier in the notebook, so the head needs two outputs;
# the previous hard-coded 5 left three outputs that no label could ever match.
# Set this back to 5 only if the label-reduction cell is skipped.
NUM_LABELS = 2

def metrics_display(eval_pred):
    """Compute accuracy for an evaluation pass.

    ``eval_pred`` unpacks into ``(predictions, labels)``; the predicted class
    is the argmax over axis 1 of ``predictions`` (assumed to be per-class
    scores of shape (n_samples, n_classes) — confirm). Returns the result of
    ``accuracy.compute``.
    """
    predictions , labels = eval_pred
    predictions=np.argmax(predictions,axis=1)
    return accuracy.compute(predictions=predictions,references=labels)

def model_init():
    """Create a fresh classifier from ``model_checkpoint`` with ``NUM_LABELS`` outputs."""
    return BertForSequenceClassification.from_pretrained(model_checkpoint,num_labels=NUM_LABELS)

In [None]:
# All Trainer inputs are collected in one mapping. The model is supplied through
# model_init (a factory) rather than as an already-built instance.
trainer_options = {
    "model_init": model_init,
    "args": args,
    "train_dataset": tokenized_dataset["train"],
    "eval_dataset": tokenized_dataset["test"],
    "data_collator": data_collator,
    "tokenizer": tokenizer,
    "compute_metrics": metrics_display,
}
trainer = Trainer(**trainer_options)

In [None]:
# Run the training loop of the `trainer` object built in the previous cell.
trainer.train()