In [1]:
!pip install scikit-learn




In [2]:
# Upgrade pip and setuptools
!pip install --upgrade pip setuptools

# Install packages with specific versions
!pip install transformers torch scikit-learn openpyxl

Collecting pip
  Using cached pip-24.2-py3-none-any.whl.metadata (3.6 kB)
Collecting setuptools
  Using cached setuptools-72.1.0-py3-none-any.whl.metadata (6.6 kB)
Using cached pip-24.2-py3-none-any.whl (1.8 MB)
Using cached setuptools-72.1.0-py3-none-any.whl (2.3 MB)
Installing collected packages: setuptools, pip
  Attempting uninstall: setuptools
    Found existing installation: setuptools 71.0.4
    Uninstalling setuptools-71.0.4:
      Successfully uninstalled setuptools-71.0.4
  Attempting uninstall: pip
    Found existing installation: pip 24.1.2
    Uninstalling pip-24.1.2:
      Successfully uninstalled pip-24.1.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
ipython 7.34.0 requires jedi>=0.16, which is not installed.[0m[31m
[0mSuccessfully installed pip-24.2 setuptools-72.1.0


Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.2.106 (from torch)
  Using cached nvidia_curand_cu12-10.3.2.106-py3-

In [4]:
import pandas as pd
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import DataLoader, TensorDataset, RandomSampler, SequentialSampler
import re
import numpy as np

# Excel dosyasını yükleyin
file_path = '/content/sample_data/duygular_stemmed.xlsx'  # Excel dosyanızın yolu
data = pd.read_excel(file_path)

# Veri seti sütun adlarını kontrol edin
print(data.head())

# Varsayılan olarak 'comments' ve 'label' sütunları olduğunu varsayıyorum; gerekirse bunları değiştirin
data.columns = ['comments', 'label']  # Sütun adlarını veri setinize göre ayarlayın

# NaN değerleri kontrol edin ve metni 'string' türüne dönüştürün
data['comments'] = data['comments'].fillna('').astype(str)

# Etiketleri sayısal değerlere dönüştürme (önceki kodun etiketleri 'string' olabilir)
data['label'] = pd.Categorical(data['label']).codes  # Etiketleri sayısal formata dönüştürün

# Metinleri küçük harfe dönüştürme
data['comments'] = data['comments'].str.lower()

# Gereksiz karakterleri kaldırma
data['comments'] = data['comments'].apply(lambda x: re.sub(r'[^a-zA-Zçğıöşü\s]', '', x))

# Boşlukları düzeltme
data['comments'] = data['comments'].apply(lambda x: re.sub(r'\s+', ' ', x).strip())

# Metin ve etiketleri ayırma
texts = data['comments'].tolist()
labels = data['label'].tolist()

# XLM-RoBERTa tokenizer'ını yükleyin
tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')

# Tokenizasyon, özel tokenların eklenmesi ve attention maskelerinin oluşturulması
def preprocess_data(texts, tokenizer, max_length=128):
    inputs = tokenizer(
        texts,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors='pt',
        return_attention_mask=True,
    )
    return inputs['input_ids'], inputs['attention_mask']

# Veriyi ön işleyin
input_ids, attention_masks = preprocess_data(texts, tokenizer)

# Etiketleri tensora dönüştürün
labels = torch.tensor(labels, dtype=torch.long)

# Veriyi eğitim ve test setlerine ayırın
train_inputs, test_inputs, train_labels, test_labels = train_test_split(
    input_ids, labels, test_size=0.2, random_state=42
)
train_masks, test_masks, _, _ = train_test_split(
    attention_masks, attention_masks, test_size=0.2, random_state=42
)

# Veriyi DataLoader'a dönüştürün
batch_size = 16

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

test_data = TensorDataset(test_inputs, test_masks, test_labels)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

# XLM-RoBERTa modelini yükleyin
model = XLMRobertaForSequenceClassification.from_pretrained('xlm-roberta-base', num_labels=6)

# Eğitim parametreleri
from transformers import AdamW

optimizer = AdamW(model.parameters(), lr=2e-5)

# Eğitim döngüsü
epochs = 4
for epoch in range(epochs):
    model.train()
    for step, batch in enumerate(train_dataloader):
        b_input_ids, b_input_mask, b_labels = batch
        optimizer.zero_grad()
        outputs = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch+1}/{epochs} - Loss: {loss.item()}")

# Modeli değerlendirme
model.eval()
predictions, true_labels = [], []
for batch in test_dataloader:
    b_input_ids, b_input_mask, b_labels = batch
    with torch.no_grad():
        outputs = model(b_input_ids, attention_mask=b_input_mask)
    logits = outputs.logits
    predictions.append(logits.argmax(dim=-1).cpu().numpy())
    true_labels.append(b_labels.cpu().numpy())

# Performans değerlendirme
from sklearn.metrics import classification_report, confusion_matrix, f1_score

predictions = [item for sublist in predictions for item in sublist]
true_labels = [item for sublist in true_labels for item in sublist]

print(classification_report(true_labels, predictions))
print(confusion_matrix(true_labels, predictions))

F1_Score = f1_score(true_labels, predictions, average='weighted')
print('Test Verisi Üzerindeki Model Doğruluğu:', round(F1_Score, 2))

                                            comments     label
0  gerçek siyaset böyl yapılma sinirlenme üslup s...  mutluluk
1  ömr bin yıl ols bıkma duraklama izler barış be...  mutluluk
2      çıkan konuk ara keyif sıkılma izledik böl olt  mutluluk
3  bütün mam seri begendig hat bilgisayar indirdi...  mutluluk
4  bur çıkamıyor defa izliyor hal doya iç samim s...  mutluluk


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/4 - Loss: 0.604236900806427
Epoch 2/4 - Loss: 0.43327081203460693
Epoch 3/4 - Loss: 0.08253686130046844
Epoch 4/4 - Loss: 0.7478938698768616
              precision    recall  f1-score   support

           0       0.94      0.93      0.94       626
           1       0.81      0.65      0.72       588
           2       0.81      0.88      0.84       599
           3       0.80      0.78      0.79       601
           4       0.77      0.85      0.81       615
           5       0.91      0.95      0.93       614

    accuracy                           0.84      3643
   macro avg       0.84      0.84      0.84      3643
weighted avg       0.84      0.84      0.84      3643

[[585   6   2  10  14   9]
 [  9 381  70  54  49  25]
 [  6  16 528  14  27   8]
 [  7  39  22 468  56   9]
 [ 11  20  26  31 520   7]
 [  7   7   3   8   6 583]]
Test Verisi Üzerindeki Model Doğruluğu: 0.84
