In [None]:
import os
import string
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import classification_report,accuracy_score

# === Custom preprocessor (text cleaning) ===
class TextCleaner(BaseEstimator, TransformerMixin):
    """Stateless transformer that normalises raw documents before vectorisation.

    Every document is lower-cased, has its digit runs removed, loses the
    ASCII punctuation characters listed in ``string.punctuation`` and is
    trimmed of leading/trailing whitespace.
    """

    def fit(self, X, y=None):
        """Return ``self`` unchanged; the cleaner learns nothing from the data."""
        return self

    def transform(self, X):
        """Return a list with the cleaned version of each document in ``X``."""
        # Deletion table for ASCII punctuation, shared by all documents.
        punctuation_table = str.maketrans('', '', string.punctuation)

        def _clean(document):
            # NOTE(review): str.lower() is locale-independent; if the corpus
            # is Turkish, "I"/"İ" are not mapped to "ı"/"i" — confirm whether
            # that matters for the features.
            lowered = document.lower()
            without_digits = re.sub(r'\d+', '', lowered)
            return without_digits.translate(punctuation_table).strip()

        return [_clean(document) for document in X]

# Root directory of the corpus: every sub-folder is treated as one author and
# every .txt file inside it as one document written by that author.
# NOTE(review): the path ends in what looks like a single author's name —
# confirm it really is the corpus root and not one author's folder.
main_folder = r'C:\Users\YUCE037\Downloads\AAydintasbas'
texts = []
labels = []

# Read every document and label it with the name of its folder.
# Listings are sorted because os.listdir() returns entries in arbitrary order;
# a fixed document order keeps the seeded train/test split reproducible
# across machines and runs.
for author_folder in sorted(os.listdir(main_folder)):
    author_path = os.path.join(main_folder, author_folder)
    if not os.path.isdir(author_path):
        continue  # stray files next to the author folders are ignored
    for txt_file in sorted(os.listdir(author_path)):
        if not txt_file.endswith(".txt"):
            continue
        file_path = os.path.join(author_path, txt_file)
        with open(file_path, "r", encoding="utf-8") as f:
            texts.append(f.read())
        labels.append(author_folder)

# Hold out 20% of the documents for testing. The split is stratified by label
# so that each author keeps the same share of documents in the train and test
# parts (stratification needs at least two documents per author).
X_train, X_test, y_train, y_test = train_test_split(
    texts, labels, test_size=0.2, random_state=42, stratify=labels
)

# === Pipeline: text cleaner -> TF-IDF (unigrams) -> decision tree ===
# NOTE(review): stop_words='english' removes English stop words only; if the
# documents are not English this filter has little effect — confirm.
pipeline = Pipeline(steps=[
    ('cleaner', TextCleaner()),
    ('tfidf', TfidfVectorizer(stop_words='english', ngram_range=(1, 1))),
    ('tree', DecisionTreeClassifier(random_state=42)),
])

# Hyper-parameter candidates: 2 * 5 * 2 * 1 = 20 combinations.
param_grid = dict(
    tfidf__max_features=[1000, 3000],
    tree__max_depth=[3, 5, 10, 15, None],
    tree__min_samples_split=[2, 5],
    tree__criterion=['gini'],
)

# Exhaustive search over the grid, scored by accuracy with 3-fold
# cross-validation on the training split only.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train, y_train)


# Report the hyper-parameters selected by the cross-validated search.
print("En iyi parametreler:", grid_search.best_params_)

# Predict each split exactly once with the refitted best estimator.
# Both names are kept for the test predictions because either may be
# referenced later in the notebook.
y_pred = grid_search.predict(X_test)
test_pred = y_pred
train_pred = grid_search.predict(X_train)

# Per-label precision / recall / F1 on the held-out documents.
print(classification_report(y_test, y_pred))

# A train accuracy far above the test accuracy indicates overfitting.
print("Train Accuracy:", accuracy_score(y_train, train_pred))
print("Test Accuracy:", accuracy_score(y_test, test_pred))

Fitting 3 folds for each of 20 candidates, totalling 60 fits
En iyi parametreler: {'tfidf__max_features': 3000, 'tree__criterion': 'gini', 'tree__max_depth': None, 'tree__min_samples_split': 5}
              precision    recall  f1-score   support

      AAltan       0.36      0.57      0.44         7
AAydintasbas       0.56      0.38      0.45        13
      AHakan       0.45      0.56      0.50         9
 ATuranAlkan       0.06      0.14      0.08         7
    AYArslan       0.50      0.50      0.50         6
     BCoskun       0.23      0.50      0.32         6
     CCandar       0.38      0.43      0.40         7
    COzdemir       0.64      0.64      0.64        11
  DCundioglu       0.53      0.62      0.57        13
  DUAribogan       0.33      0.22      0.27         9
      EArdic       0.50      0.44      0.47         9
      ECakir       0.07      0.14      0.09         7
    GGokturk       0.50      0.62      0.56         8
   HBabaoglu       0.25      0.07      0.11      

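Conclusion: the model is overfitted, since its test accuracy is lower than its train accuracy. The selected tree is unpruned (max_depth=None), which is consistent with this.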