In [18]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments

from sklearn.model_selection import train_test_split
import pandas as pd
import torch

# Load the data
#data = pd.read_csv('binary_labeled_text_data.csv')
data_list = []
with open('train_chatgpt_rewrites_in_Turkish.txt', 'r') as file:
    data = file.read().rstrip()

In [19]:
split_ai = data.split("---------- o ----------")

In [20]:
df_ai = pd.DataFrame(split_ai, columns =['text'])


In [21]:
import re

with open("trnews-64/trnews-64.train.raw") as fi:
    articles = re.split("\n\n", fi.read()) 

In [22]:
articles = articles[0:1000]

In [23]:
truncated = []
for article in articles:
    splitted = article.split(" ")
    truncated.append(splitted[0:200])


In [24]:
truncated_text_human = []
for trunc in truncated:
    truncated_text_human.append(" ".join(trunc))

In [25]:
df_human = pd.DataFrame(truncated_text_human, columns = ['text'])

In [26]:
df_human['label'] = 0

In [27]:
df_ai['label'] = 1

In [28]:
#HUMAN SET 817, AI SETTE YOK.

In [29]:
df_human = df_human.drop(817)


In [30]:
df_ai = df_ai.drop(999)

In [31]:
data = pd.concat([df_ai, df_human])


In [32]:
X_train, X_test, y_train, y_test = train_test_split(data['text'], data['label'], test_size=0.33, random_state=42)

In [33]:
y_val = y_test[0:330]
y_test = y_test[330:]
X_val = X_test[0:330]
X_test = X_test[330:]

In [34]:
tokenizer = BertTokenizer.from_pretrained('dbmdz/bert-base-turkish-128k-uncased')
train_encodings = tokenizer(X_train.tolist(), truncation=True, padding=True)
val_encodings = tokenizer(X_val.tolist(), truncation=True, padding=True)

train_labels = y_train.tolist()
val_labels = y_val.tolist()


In [35]:
train_dataset = torch.utils.data.TensorDataset(
        torch.tensor(train_encodings['input_ids']),
        torch.tensor(train_encodings['attention_mask']),
        torch.tensor(train_labels))
val_dataset = torch.utils.data.TensorDataset(
        torch.tensor(val_encodings['input_ids']),
        torch.tensor(val_encodings['attention_mask']),
        torch.tensor(val_labels))

In [36]:
model = AutoModelForSequenceClassification.from_pretrained('dbmdz/bert-base-turkish-128k-cased', num_labels=2)
training_args= TrainingArguments(
    output_dir=f'./results_bert-base-turkish-128k-cased/fold',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir=f'./logs_bert-base-turkish-128k-cased/fold',
    logging_steps=100)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=lambda data: {'input_ids': torch.stack([item[0] for item in data]),
                                'attention_mask': torch.stack([item[1] for item in data]),
                                'labels': torch.stack([item[2] for item in data])})
trainer.train()

Some weights of the model checkpoint at dbmdz/bert-base-turkish-128k-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification 

Step,Training Loss
100,0.5847
200,0.4201




Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=252, training_loss=0.4769162753271678, metrics={'train_runtime': 3364.5854, 'train_samples_per_second': 1.193, 'train_steps_per_second': 0.075, 'total_flos': 957115797194880.0, 'train_loss': 0.4769162753271678, 'epoch': 3.0})

In [37]:
import torch
torch.save(model, 'model_1.sav')

In [41]:
# Make predictions on the test data
test_encodings = tokenizer(X_test.tolist(), truncation=True, padding=True)
num_test_samples = len(test_encodings['input_ids'])
test_labels = [0] * num_test_samples

test_dataset = torch.utils.data.TensorDataset(
    torch.tensor(test_encodings['input_ids']),
    torch.tensor(test_encodings['attention_mask']),
    torch.tensor(test_labels))

predictions = trainer.predict(test_dataset)
preds = predictions.predictions.argmax(-1)





***** Running Prediction *****
  Num examples = 330
  Batch size = 64


In [44]:
from sklearn.metrics import classification_report
print("Classification report:")
print(classification_report(y_test, preds))



Classification report:
              precision    recall  f1-score   support

           0       1.00      0.65      0.79       170
           1       0.73      1.00      0.84       160

    accuracy                           0.82       330
   macro avg       0.87      0.83      0.82       330
weighted avg       0.87      0.82      0.82       330

