In [2]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import DataLoader, TensorDataset, RandomSampler, SequentialSampler
from sklearn.metrics import classification_report, confusion_matrix, f1_score

# Excel dosyasını yükleyin
file_path = '/content/sample_data/duygular_stemmed.xlsx'  # Excel dosyanızın yolu
data = pd.read_excel(file_path)

# Veri seti sütun adlarını kontrol edin
print(data.head())

# Varsayılan olarak 'comments' ve 'label' sütunları olduğunu varsayıyorum; gerekirse bunları değiştirin
data.columns = ['comments', 'label']  # Sütun adlarını veri setinize göre ayarlayın

# NaN değerleri kontrol edin ve metni 'string' türüne dönüştürün
data['comments'] = data['comments'].fillna('').astype(str)

# Etiketleri sayısal değerlere dönüştürme (önceki kodun etiketleri 'string' olabilir)
data['label'] = pd.Categorical(data['label']).codes  # Etiketleri sayısal formata dönüştürün

# Metin ve etiketleri ayırma
texts = data['comments'].tolist()
labels = data['label'].tolist()

# Tokenizer ve model seçimi
tokenizer = AutoTokenizer.from_pretrained('dbmdz/bert-base-turkish-cased')
model = AutoModelForSequenceClassification.from_pretrained('dbmdz/bert-base-turkish-cased', num_labels=6)

# Tokenizasyon, özel tokenların eklenmesi ve attention maskelerinin oluşturulması
def preprocess_data(texts, tokenizer, max_length=128):
    inputs = tokenizer(
        texts,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors='pt',
        return_attention_mask=True,
    )
    return inputs['input_ids'], inputs['attention_mask']

# Veriyi ön işleyin
input_ids, attention_masks = preprocess_data(texts, tokenizer)

# Etiketleri tensora dönüştürün
labels = torch.tensor(labels, dtype=torch.long)

# Veriyi eğitim ve test setlerine ayırın
train_inputs, test_inputs, train_labels, test_labels = train_test_split(
    input_ids, labels, test_size=0.2, random_state=42
)
train_masks, test_masks, _, _ = train_test_split(
    attention_masks, attention_masks, test_size=0.2, random_state=42
)

# Veriyi DataLoader'a dönüştürün
batch_size = 16

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

test_data = TensorDataset(test_inputs, test_masks, test_labels)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

# Eğitim parametreleri
from transformers import AdamW

optimizer = AdamW(model.parameters(), lr=2e-5)

# Eğitim döngüsü
epochs = 3
for epoch in range(epochs):
    model.train()
    for step, batch in enumerate(train_dataloader):
        b_input_ids, b_input_mask, b_labels = batch
        optimizer.zero_grad()
        outputs = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch+1}/{epochs} - Loss: {loss.item()}")

# Modeli değerlendirme
model.eval()
predictions, true_labels = [], []
for batch in test_dataloader:
    b_input_ids, b_input_mask, b_labels = batch
    with torch.no_grad():
        outputs = model(b_input_ids, attention_mask=b_input_mask)
    logits = outputs.logits
    predictions.append(logits.argmax(dim=-1).cpu().numpy())
    true_labels.append(b_labels.cpu().numpy())

# Performans değerlendirme
predictions = [item for sublist in predictions for item in sublist]
true_labels = [item for sublist in true_labels for item in sublist]

print(classification_report(true_labels, predictions))
print(confusion_matrix(true_labels, predictions))

F1_Score = f1_score(true_labels, predictions, average='weighted')
print('Test Verisi Üzerindeki Model Doğruluğu:', round(F1_Score, 2))


                                            comments     label
0  gerçek siyaset böyl yapılma sinirlenme üslup s...  mutluluk
1  ömr bin yıl ols bıkma duraklama izler barış be...  mutluluk
2      çıkan konuk ara keyif sıkılma izledik böl olt  mutluluk
3  bütün mam seri begendig hat bilgisayar indirdi...  mutluluk
4  bur çıkamıyor defa izliyor hal doya iç samim s...  mutluluk


config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/251k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/445M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dbmdz/bert-base-turkish-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3 - Loss: 0.24005280435085297
Epoch 2/3 - Loss: 0.2867029309272766
Epoch 3/3 - Loss: 0.09456425160169601
              precision    recall  f1-score   support

           0       0.97      0.91      0.94       626
           1       0.83      0.70      0.76       588
           2       0.85      0.90      0.88       599
           3       0.77      0.84      0.80       601
           4       0.81      0.84      0.82       615
           5       0.93      0.96      0.94       614

    accuracy                           0.86      3643
   macro avg       0.86      0.86      0.86      3643
weighted avg       0.86      0.86      0.86      3643

[[572   6   3  16  13  16]
 [  5 411  52  70  38  12]
 [  1  21 541   9  22   5]
 [  3  26  15 503  44  10]
 [  6  20  23  47 515   4]
 [  2  10   1   5   6 590]]
Test Verisi Üzerindeki Model Doğruluğu: 0.86


In [9]:
print(labels[0:10]) #mutluluk
print(labels[4000:4010]) #üzünüt
print(labels[7000:7010]) #korku
print(labels[10000:10010]) #ofke
print(labels[13000:13010]) #saskinlik
print(labels[16000:16010]) #kücümseme

tensor([2, 2, 2, 2, 2, 2, 2, 2, 2, 2])
tensor([4, 4, 4, 4, 4, 4, 4, 4, 4, 4])
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
tensor([3, 3, 3, 3, 3, 3, 3, 3, 3, 3])
tensor([5, 5, 5, 5, 5, 5, 5, 5, 5, 5])
tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1])


In [None]:
pip install transformers datasets pandas torch scikit-learn


Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting requests (from transformers)
  Downloading requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.5.0,>=2023.1.0 (from fsspec[http]<=2024.5.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.5.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda