In [16]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from datasets import load_dataset
import sqlite3
import pandas as pd

# Cihazı belirle
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Veriyi yükle ve hazırla
dataset = load_dataset("go_emotions")
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

# Geri bildirim veritabanı işlemleri
DB_NAME = 'feedback.db'

def load_feedback_data():
    conn = sqlite3.connect(DB_NAME)
    cursor = conn.cursor()
    cursor.execute("SELECT text, actual_emotion FROM feedback")
    rows = cursor.fetchall()
    conn.close()
    return rows

# Veriyi yükle
feedback_data = load_feedback_data()
feedback_texts = [row[0] for row in feedback_data]
feedback_labels = [row[1] for row in feedback_data]

# Orijinal veri kümesini al
original_texts = dataset['train']['text'] + dataset['validation']['text']
original_labels = dataset['train']['labels'] + dataset['validation']['labels']

# Tüm veriyi birleştir
merged_texts = original_texts + feedback_texts
merged_labels = original_labels + feedback_labels

# Veri bütünlüğünü kontrol et
print(f"Original dataset size: {len(original_texts)}")
print(f"Feedback dataset size: {len(feedback_texts)}")
print(f"Merged dataset size: {len(merged_texts)}")

if len(merged_texts) != len(merged_labels):
    print("Data length mismatch detected. Fixing...")
    min_len = min(len(merged_texts), len(merged_labels))
    merged_texts = merged_texts[:min_len]
    merged_labels = merged_labels[:min_len]

# Label mapping
label_mapping = {
    "admiration": 0, "amusement": 1, "anger": 2, "annoyance": 3, "approval": 4, "caring": 5,
    "confusion": 6, "curiosity": 7, "desire": 8, "disappointment": 9, "disapproval": 10,
    "disgust": 11, "embarrassment": 12, "excitement": 13, "fear": 14, "gratitude": 15,
    "grief": 16, "joy": 17, "love": 18, "nervousness": 19, "optimism": 20, "pride": 21,
    "realization": 22, "relief": 23, "remorse": 24, "sadness": 25, "surprise": 26, "neutral": 27
}

# Geri bildirim verisindeki etiketleri sayısal kodlara çevirme
feedback_labels_numerical = [label_mapping.get(label, 27) for label in feedback_labels]

# Tekrardan veri birleştirme (sayısal etiketlerle)
merged_texts = original_texts + feedback_texts
merged_labels = original_labels + feedback_labels_numerical

# Veriyi CSV olarak kaydet
df = pd.DataFrame({'text': merged_texts, 'emotion': merged_labels})
df.to_csv("merged_emotions_dataset.csv", index=False)
print("Merged dataset saved as 'merged_emotions_dataset.csv'.")

# Modeli eğitmek için veriyi işleme
def preprocess_data(texts, labels):
    encodings = tokenizer(list(texts), truncation=True, padding=True, max_length=512, return_tensors="pt")
    label_tensor = torch.tensor(labels, dtype=torch.long)
    return encodings, label_tensor

train_encodings, train_labels = preprocess_data(merged_texts, merged_labels)

# Veriyi DataLoader'a çevir
train_dataset = TensorDataset(train_encodings['input_ids'], train_encodings['attention_mask'], train_labels)
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)

# Modeli yükle ve ayarla
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=len(label_mapping)
).to(device)

optimizer = optim.AdamW(model.parameters(), lr=5e-5)
criterion = nn.CrossEntropyLoss()

# Model eğitimi
def train_model(model, train_loader, optimizer, criterion, epochs=3):
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for batch in train_loader:
            input_ids, attention_mask, labels = [b.to(device) for b in batch]
            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask=attention_mask)
            loss = criterion(outputs.logits, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch + 1}, Loss: {total_loss / len(train_loader)}")

train_model(model, train_loader, optimizer, criterion)

# Modeli kaydet
torch.save(model.state_dict(), "enhanced_emotion_recognition_model.pth")
print("Model training and saving complete.")


Original dataset size: 48836
Feedback dataset size: 14
Merged dataset size: 48850
Merged dataset saved as 'merged_emotions_dataset.csv'.


ValueError: expected sequence of length 1 at dim 1 (got 2)