In [8]:
import os
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder

# === 1. Veri yükleme ===
main_folder = r'C:\Users\YUCE037\Downloads\AAydintasbas'

texts = []
labels = []

for author_folder in os.listdir(main_folder):
    author_path = os.path.join(main_folder, author_folder)
    if os.path.isdir(author_path):
        for txt_file in os.listdir(author_path):
            file_path = os.path.join(author_path, txt_file)
            if file_path.endswith(".txt"):
                with open(file_path, "r", encoding="utf-8") as f:
                    texts.append(f.read())
                    labels.append(author_folder)

# === 2. Label encode ===
encoder = LabelEncoder()
labels_encoded = encoder.fit_transform(labels)

# === 3. TF-IDF ===
vectorizer = TfidfVectorizer(ngram_range=(1,2), max_features=5000, stop_words='english')
X = vectorizer.fit_transform(texts)

# === 4. Eğitim/Test bölme ===
X_train, X_test, y_train, y_test = train_test_split(X, labels_encoded, test_size=0.2, random_state=42)

# === 5. Hiperparametre ayarları ===
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

rf = RandomForestClassifier(random_state=42)

grid_search = GridSearchCV(estimator=rf, param_grid=param_grid,
                           cv=3, n_jobs=-1, verbose=2, scoring='accuracy')

# === 6. Modeli eğit ===
grid_search.fit(X_train, y_train)

# === 7. Sonuçları yazdır ===
print("En iyi parametreler:", grid_search.best_params_)
print("Doğrulama setinde en iyi doğruluk: {:.2f}%".format(grid_search.best_score_ * 100))

# === 8. Test seti üzerinde tahmin ===
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

report = classification_report(y_test, y_pred, target_names=encoder.classes_)
print(report)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2%}")


Fitting 3 folds for each of 24 candidates, totalling 72 fits
En iyi parametreler: {'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 200}
Doğrulama setinde en iyi doğruluk: 74.33%
              precision    recall  f1-score   support

      AAltan       0.70      1.00      0.82         7
AAydintasbas       0.88      0.54      0.67        13
      AHakan       0.80      0.89      0.84         9
 ATuranAlkan       1.00      0.43      0.60         7
    AYArslan       0.67      1.00      0.80         6
     BCoskun       0.44      0.67      0.53         6
     CCandar       0.75      0.86      0.80         7
    COzdemir       1.00      0.73      0.84        11
  DCundioglu       0.82      0.69      0.75        13
  DUAribogan       0.88      0.78      0.82         9
      EArdic       0.82      1.00      0.90         9
      ECakir       0.67      0.86      0.75         7
    GGokturk       0.67      0.75      0.71         8
   HBabaoglu       0.91      0.