In [None]:

# Standard library
import os
import re

# Third-party: numerics, plotting, progress reporting
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns  # the conventional `sns` alias belongs to the seaborn package
from tqdm import tqdm

# Third-party: modelling and evaluation
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder
from transformers import AutoTokenizer, AutoModel

# === BERT MODEL & TOKENIZER ===
# Hugging Face checkpoint id shared by the tokenizer and the encoder.
model_name = "dbmdz/bert-base-turkish-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# nn.Module.eval() returns the module itself, so loading the weights and
# switching to inference mode (dropout disabled) can be chained.
model = AutoModel.from_pretrained(model_name).eval()

# === CLEAN TEXT FUNCTION ===
import unicodedata

# str.lower() is locale-independent: it turns "I" into "i" (Turkish needs "ı")
# and "İ" into "i" + U+0307. Translating both capitals explicitly before
# lower-casing gives the correct Turkish result.
_TR_CAPITAL_I_MAP = str.maketrans({"I": "ı", "İ": "i"})
# Everything except ASCII letters, Turkish letters and whitespace is dropped.
_DISALLOWED_CHARS_RE = re.compile(r"[^a-zA-ZçÇğĞıöÖşŞüÜ\s]")
_WHITESPACE_RUN_RE = re.compile(r"\s+")


def clean_text(text):
    """Normalize raw Turkish text into lower-case letters separated by single spaces.

    Steps:
      1. NFC-normalize so letters stored as base character + combining mark
         (e.g. "I" + U+0307) become their single-code-point form.
      2. Map the Turkish capitals "I"/"İ" to "ı"/"i".
      3. Remove every character that is not an ASCII letter, a Turkish letter
         or whitespace (this also removes digits and punctuation).
      4. Lower-case the remainder.
      5. Collapse whitespace runs to one space and trim both ends.

    Args:
        text: The raw document as a ``str``.

    Returns:
        The cleaned string; empty if nothing but removed characters was present.
    """
    text = unicodedata.normalize("NFC", text)
    text = text.translate(_TR_CAPITAL_I_MAP)
    text = _DISALLOWED_CHARS_RE.sub("", text)
    text = text.lower()
    # Whitespace is collapsed last: removing characters in step 3 can leave
    # adjacent spaces behind (e.g. "a - b" -> "a  b").
    return _WHITESPACE_RUN_RE.sub(" ", text).strip()

# === MEAN POOLING FUNCTION ===
def mean_pooling(model_output, attention_mask):
    """Average token embeddings over the sequence axis, ignoring masked positions.

    Args:
        model_output: Indexable model result whose element 0 holds the
            per-token embeddings (assumed shape (batch, seq_len, hidden) —
            confirm against the model in use).
        attention_mask: Tensor with one entry per token (one dimension fewer
            than the embeddings); non-zero marks tokens to include.

    Returns:
        A float tensor with the sequence axis (dim 1) reduced away. Rows whose
        mask is entirely zero yield a zero vector instead of NaN.
    """
    token_embeddings = model_output[0]
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    masked_sum = (token_embeddings * input_mask_expanded).sum(1)
    # Clamp the token count so an all-padding row divides by a tiny positive
    # number rather than zero (0 / 0 would produce NaN embeddings).
    token_count = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
    return masked_sum / token_count

# === LOAD DATA ===
texts = []
labels = []
# Corpus root: one sub-folder per author, each holding that author's .txt files.
# The AUTHOR_DATA_DIR environment variable overrides the default location so the
# notebook can run on other machines without editing this cell.
main_folder = os.environ.get("AUTHOR_DATA_DIR", r"C:\Users\YUCE037\Downloads\AAydintasbas")

# os.listdir returns entries in arbitrary order; sorting fixes the row order so
# the seeded train/test split further down is reproducible across runs.
for author_folder in sorted(os.listdir(main_folder)):
    author_path = os.path.join(main_folder, author_folder)
    if not os.path.isdir(author_path):
        continue
    for txt_file in sorted(os.listdir(author_path)):
        file_path = os.path.join(author_path, txt_file)
        if not file_path.endswith(".txt"):
            continue
        with open(file_path, "r", encoding="utf-8") as f:
            texts.append(f.read())
        # The folder name is the class label for every document inside it.
        labels.append(author_folder)

# Fail here with a clear message instead of much later inside the embedding loop.
if not texts:
    raise FileNotFoundError(f"No .txt files found in author sub-folders of {main_folder!r}")

df = pd.DataFrame({"text": texts, "author": labels})
df["clean_text"] = df["text"].apply(clean_text)


In [None]:

# === EXTRACT EMBEDDINGS ===
# Run the encoder on a GPU when one is available; otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

embeddings = []
batch_size = 8
for i in tqdm(range(0, len(df), batch_size)):
    batch_texts = df["clean_text"].iloc[i:i+batch_size].tolist()
    # Documents longer than 512 tokens are truncated; shorter ones are padded
    # to the longest sequence in the batch.
    encoded_input = tokenizer(batch_texts, padding=True, truncation=True, return_tensors='pt', max_length=512)
    encoded_input = encoded_input.to(device)
    with torch.no_grad():
        model_output = model(**encoded_input)
        batch_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
    # Move each batch off the accelerator immediately so device memory does
    # not grow with the size of the corpus.
    embeddings.append(batch_embeddings.cpu())

# One row of pooled features per document, in the same order as df.
X_bert = torch.cat(embeddings).numpy()
y = df["author"].values

# === LABEL ENCODE ===
# Learn the author-name -> integer id mapping, then apply it. `le` is kept so
# predictions can be mapped back to author names when reporting.
le = LabelEncoder().fit(y)
y_encoded = le.transform(y)

# === SPLIT ===
# Hold out 20% of the documents for testing; the split is seeded and
# stratified on the encoded author label.
X_train, X_test, y_train, y_test = train_test_split(
    X_bert,
    y_encoded,
    test_size=0.2,
    random_state=42,
    stratify=y_encoded,
)


In [None]:

from sklearn.ensemble import RandomForestClassifier

# Seeded forest of 200 trees; fit() returns the estimator, so construction and
# training are chained.
model_rf = RandomForestClassifier(n_estimators=200, random_state=42).fit(X_train, y_train)
y_pred = model_rf.predict(X_test)

# Decode the integer ids back to author names so the report rows are readable.
rf_true_names = le.inverse_transform(y_test)
rf_pred_names = le.inverse_transform(y_pred)

print("Random Forest Results:")
print(classification_report(rf_true_names, rf_pred_names))


In [None]:

from sklearn.svm import SVC

# Linear-kernel SVM. probability=True makes scikit-learn calibrate class
# probabilities with an internal, shuffled cross-validation; random_state pins
# that shuffling so the fitted model is reproducible like the other estimators
# in this notebook (all seeded with 42).
model_svm = SVC(kernel='linear', probability=True, random_state=42)
model_svm.fit(X_train, y_train)
y_pred = model_svm.predict(X_test)

# Report per-author metrics using author names rather than encoded ids.
print("SVM (Linear) Results:")
print(classification_report(le.inverse_transform(y_test), le.inverse_transform(y_pred)))


In [None]:

from sklearn.naive_bayes import GaussianNB

# Gaussian Naive Bayes with default settings; fit() returns the estimator.
model_nb = GaussianNB().fit(X_train, y_train)
y_pred = model_nb.predict(X_test)

# Decode the integer ids back to author names for the report.
nb_true_names = le.inverse_transform(y_test)
nb_pred_names = le.inverse_transform(y_pred)

print("Naive Bayes Results:")
print(classification_report(nb_true_names, nb_pred_names))


In [None]:

from sklearn.neural_network import MLPClassifier

# Seeded multilayer perceptron with a single hidden layer of 100 units,
# trained for at most 300 iterations.
model_mlp = MLPClassifier(
    hidden_layer_sizes=(100,),
    max_iter=300,
    random_state=42,
).fit(X_train, y_train)
y_pred = model_mlp.predict(X_test)

# Decode the integer ids back to author names for the report.
mlp_true_names = le.inverse_transform(y_test)
mlp_pred_names = le.inverse_transform(y_pred)

print("MLP Results:")
print(classification_report(mlp_true_names, mlp_pred_names))


In [None]:

from sklearn.tree import DecisionTreeClassifier

# Seeded decision tree with default settings; fit() returns the estimator.
# `model_tree` is reused by the overfitting check later in the notebook.
model_tree = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
y_pred = model_tree.predict(X_test)

# Decode the integer ids back to author names for the report.
tree_true_names = le.inverse_transform(y_test)
tree_pred_names = le.inverse_transform(y_pred)

print("Decision Tree Results:")
print(classification_report(tree_true_names, tree_pred_names))


In [None]:

from xgboost import XGBClassifier

# Seeded gradient-boosted trees: 200 rounds, depth 5, learning rate 0.1,
# evaluated with multi-class log loss.
# NOTE(review): `use_label_encoder` is reported as deprecated/unused by newer
# xgboost releases — confirm against the installed version before removing it.
model_xgb = XGBClassifier(
    use_label_encoder=False,
    eval_metric='mlogloss',
    n_estimators=200,
    max_depth=5,
    learning_rate=0.1,
    random_state=42,
).fit(X_train, y_train)
y_pred = model_xgb.predict(X_test)

# Decode the integer ids back to author names for the report.
xgb_true_names = le.inverse_transform(y_test)
xgb_pred_names = le.inverse_transform(y_pred)

print("XGBoost Results:")
print(classification_report(xgb_true_names, xgb_pred_names))


In [None]:

# === OVERFITTING CONTROL FOR DECISION TREE ===
from sklearn.metrics import accuracy_score

# Predict on both partitions with the already-fitted tree, then compare
# accuracies: a training score far above the test score signals overfitting.
train_preds, test_preds = model_tree.predict(X_train), model_tree.predict(X_test)
train_acc, test_acc = (
    accuracy_score(y_train, train_preds),
    accuracy_score(y_test, test_preds),
)

print("Overfitting Evaluation (Decision Tree):")
print(f"Training Accuracy: {train_acc:.2%}")
print(f"Test Accuracy: {test_acc:.2%}")
