In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import torch
import numpy as np
import matplotlib.pyplot as plt
from datasets import Dataset
import evaluate 
import seaborn as sns

In [2]:
accuracy_metric = evaluate.load("accuracy")

In [3]:
def load_and_preprocess_data(filepath):
    df = pd.read_csv(filepath)
    df.fillna('', inplace=True)

    # 'severity' değerlerine göre 'aksiyon' sütunu ekleniyor
    df['aksiyon'] = df['severity'].apply(lambda x: 1 if x in [1, 2] else 0)

    # 'text' sütununun veri tipini string'e çevirme
    df['text'] = df['text'].astype(str)

    # Belirli sütunları string türüne çevirip, 'text' sütununa göre karşılaştırma yapıyoruz
    for column in ['bilet', 'musteri_hizmetleri', 'odeme', 'uygulama', 'passolig', 'passolig kart', 'diger']:
        df[column] = df[column].astype(str)
        df[column] = df.apply(lambda row: 1 if row[column].lower() in row['text'].lower() else 0, axis=1)

    return df

In [4]:
def check_labels(df, column_name):
    unique_labels = df[column_name].unique()
    print(f"Sütun: {column_name}")
    print(f"Benzersiz etiketler: {unique_labels}")
    print(f"Etiket sayısı: {len(unique_labels)}")
    print("-" * 40)

In [5]:
def preprocess_data(df, column_name):
    tokenizer = BertTokenizer.from_pretrained('dbmdz/bert-base-turkish-cased')
    
    def tokenize_function(examples):
        return tokenizer(examples[column_name], padding="max_length", truncation=True)

    dataset = Dataset.from_pandas(df)
    tokenized_dataset = dataset.map(tokenize_function, batched=True)
    
    return tokenized_dataset


In [6]:
def compute_metrics(pred):
    predictions = np.argmax(pred.predictions, axis=1)
    accuracy = accuracy_metric.compute(predictions=predictions, references=pred.label_ids)
    
    return accuracy

In [7]:
def train_model(column_name, num_labels):
    # Veriyi yükleyip ön işleme tabi tutuyoruz
    df = load_and_preprocess_data('C:/Users/Ali Riza Ercan/Documents/GitHub/PassoAssist/data/processed/merged_df.csv')
    tokenized_dataset = preprocess_data(df, column_name)
    
    # Eğitim ve validasyon seti ayrımı
    train_size = int(0.8 * len(tokenized_dataset))
    train_dataset = tokenized_dataset.select(range(train_size))
    eval_dataset = tokenized_dataset.select(range(train_size, len(tokenized_dataset)))
    
    # BERT modeli
    model = BertForSequenceClassification.from_pretrained('dbmdz/bert-base-turkish-cased', num_labels=num_labels)
    
    # Eğitim argümanları
    training_args = TrainingArguments(
        output_dir=f'./results/{column_name}',
        evaluation_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=3,
        weight_decay=0.01,
    )
    
    # Verilere etiketleri (labels) ekleyin
    def add_labels(examples):
        examples["labels"] = examples[column_name]
        return examples

    # Hem eğitim hem de değerlendirme setine etiketleri ekleyin
    train_dataset = train_dataset.map(add_labels)
    eval_dataset = eval_dataset.map(add_labels)

    # Trainer oluşturma
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics,
    )
    
    # Model eğitme
    trainer.train()
    model.save_pretrained(f'data/models/{column_name}_model')
    
    # Modeli validasyon seti üzerinde değerlendirme
    eval_results = trainer.evaluate(eval_dataset)
    return eval_results

In [8]:
def plot_results(results, title):
    columns = list(results.keys())
    accuracies = [results[col]['eval_accuracy'] for col in columns]

    plt.figure(figsize=(10, 6))
    sns.barplot(x=columns, y=accuracies)
    plt.title(title)
    plt.xlabel('Column Name')
    plt.ylabel('Accuracy')
    plt.xticks(rotation=45)
    plt.show()

In [9]:
results = {}

# Sentiment (3 sınıf: olumlu, olumsuz, nötr)
results['sentiment'] = train_model('sentiment', 3)

# Entity (16 sınıf)
results['entity'] = train_model('entity', 16)  # Burada doğru etiket sayısını kullanıyoruz

# Konu (11 sınıf)
results['konu'] = train_model('konu', 11)

# Severity (3 sınıf: 2, 1, 0)
results['severity'] = train_model('severity', 3)

# Diğer ikili sınıflar
results['bilet'] = train_model('bilet', 2)
results['musteri_hizmetleri'] = train_model('musteri_hizmetleri', 2)
results['odeme'] = train_model('odeme', 2)
results['uygulama'] = train_model('uygulama', 2)
results['passolig'] = train_model('passolig', 2)
results['passolig kart'] = train_model('passolig kart', 2)
results['diger'] = train_model('diger', 2)

# Sonuçları görselleştirme
plot_results(results, 'Model Training Results - Accuracy by Column')



Map:   0%|          | 0/3184 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dbmdz/bert-base-turkish-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/2547 [00:00<?, ? examples/s]

Map:   0%|          | 0/637 [00:00<?, ? examples/s]

  0%|          | 0/957 [00:00<?, ?it/s]

IndexError: Target 9 is out of bounds.