In [1]:
%pip install transformers
%pip install datasets
%pip install evaluate
%pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.27.4-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m49.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m43.6 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.4-py3-none-any.whl (200 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.1/200.1 KB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.4 tokenizers-0.13.3 transformers-4.27.4
Looking in indexes: https://pypi.org/simple, http

In [2]:
from torch.utils.data import DataLoader
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_scheduler
from datasets import Dataset, DatasetDict
from torch.optim import AdamW
import torch
from tqdm.auto import tqdm
import evaluate

In [3]:
import os
print(os.getcwd())
torch.cuda.is_available()
print(os.listdir())

/content
['.config', 'sample_data']


In [112]:
df = pd.read_csv('dataset_balanced.csv')
df_generated = pd.read_csv('generated_dataset.csv')

#shuffling dataset
df = df.sample(frac=1).reset_index(drop=True)

#taking small part to reduce cost and time
df = df[:400]
df.label = df.label.apply(lambda x: 1 if x == "positive" else 0)

#df = pd.concat([df, df_generated], axis=1)
split = int(len(df.text) * 0.8)
df_train = df[:split]
df_test = df[split:]

dataset = DatasetDict({
    "train": Dataset.from_pandas(df_train),
    "test": Dataset.from_pandas(df_test)
    })

In [20]:
#get dataloaders
def dataloaders(dataset, path, batch):
  tokenizer = AutoTokenizer.from_pretrained(path)

  def preprocess_function(examples):
      return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=256)

  tokenized_datasets = dataset.map(preprocess_function, batched=True)
  tokenized_datasets = tokenized_datasets.remove_columns(["text"])
  tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
  tokenized_datasets.set_format("torch")

  train_dataloader = DataLoader(tokenized_datasets['train'], shuffle=True, batch_size=batch)
  eval_dataloader = DataLoader(tokenized_datasets['test'], batch_size=batch)
  return train_dataloader, eval_dataloader

In [101]:
#Computing pre-trained model's accuracy on test dataset.
def eval_model(model, eval_dataloader, threelabels=False):
  model.eval()
  metric = evaluate.load("accuracy")
  for batch in eval_dataloader:
      batch = {k: v.to(device) for k, v in batch.items()}
      with torch.no_grad():
          outputs = model(**batch)

      logits = outputs.logits
      if threelabels:
        for i in logits:
          i[1] = i[2]
          i[2] = 0
      predictions = torch.argmax(logits, dim=-1)
      metric.add_batch(predictions=predictions, references=batch["labels"])
  return metric.compute()

In [123]:
def train_model(model, train_dataloader, threelabels=False):
  if threelabels:
    dataset_df = train_dataloader.dataset.to_pandas()
    dataset_df.labels = dataset_df.labels.apply(lambda x: 2 if x==1 else 0)

    dataset_modified = Dataset.from_pandas(dataset_df)
    dataset_modified.set_format("torch")
    train_dataloader = DataLoader(dataset_modified, batch_size=8)
  optimizer = AdamW(model.parameters(), lr=5e-5)

  num_epochs = 3
  num_training_steps = num_epochs * len(train_dataloader)

  lr_scheduler = get_scheduler(
      name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
  )

  model.train()
  progress_bar = tqdm(range(num_training_steps))
  for epoch in range(num_epochs):
      for batch in train_dataloader:
          batch = {k: v.to(device) for k, v in batch.items()}
          outputs = model(**batch)
          loss = outputs.loss
          loss.backward()

          optimizer.step()
          lr_scheduler.step()
          optimizer.zero_grad()
          progress_bar.update(1)

In [13]:
model = AutoModelForSequenceClassification.from_pretrained("savasy/bert-base-turkish-sentiment-cased", num_labels=2)


Downloading (…)lve/main/config.json:   0%|          | 0.00/596 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/443M [00:00<?, ?B/s]

In [108]:
model = AutoModelForSequenceClassification.from_pretrained("cardiffnlp/twitter-xlm-roberta-base-sentiment", num_labels=3)


In [None]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

In [None]:
train_dataloader, eval_dataloader = dataloaders(dataset, "cardiffnlp/twitter-xlm-roberta-base-sentiment", 8)

In [121]:
accuracy_eval = eval_model(model, eval_dataloader, threelabels=True)

In [122]:
accuracy_eval

{'accuracy': 0.8625}

In [120]:
train_model(model, train_dataloader, threelabels=True)

  0%|          | 0/120 [00:00<?, ?it/s]

ValueError: ignored

In [86]:
dataset_df = train_dataloader.dataset.to_pandas()
dataset_df.labels = dataset_df.labels.apply(lambda x: 2 if x==1 else 0)

In [92]:
dataset_modified = Dataset.from_pandas(dataset_df)
dataset_modified.set_format("torch")
train_dataloader = DataLoader(dataset_modified, batch_size=8)

In [93]:
train_dataloader.dataset

Dataset({
    features: ['labels', 'input_ids', 'attention_mask'],
    num_rows: 3200
})

In [None]:
from transformers import pipeline
model_path = "cardiffnlp/twitter-xlm-roberta-base-sentiment"
sentiment_task = pipeline("sentiment-analysis", model=model_path, tokenizer=model_path)


[{'label': 'positive', 'score': 0.9343641400337219}]

In [None]:
sentiment_task("ben seni hiç sevmem")

[{'label': 'negative', 'score': 0.8421339392662048}]