In [None]:
# Install the third-party packages this notebook needs into the kernel's environment.
# NOTE(review): versions are unpinned, so behaviour may drift as packages update — consider pinning.
%pip install transformers
%pip install datasets
%pip install evaluate
%pip install sentencepiece

In [None]:
from torch.utils.data import DataLoader
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_scheduler, pipeline
from datasets import Dataset, DatasetDict
from torch.optim import AdamW
import torch
from tqdm.auto import tqdm
import evaluate

In [None]:
import os

# Show the working directory and its contents so the relative CSV paths
# read further down can be checked against what is actually on disk.
print(os.getcwd())
# Print the CUDA check explicitly: a bare expression in the middle of a cell
# is evaluated but never displayed.
print(torch.cuda.is_available())
print(os.listdir())

# Data Preprocessing

In [None]:
# Seed for the shuffle so the train/test split is identical on every run.
SEED = 42
# Fraction of the shuffled rows assigned to the training split.
TRAIN_FRACTION = 0.8

# Base dataset: map the string label to 1 for "positive" and 0 for anything else.
df = pd.read_csv('dataset_balanced.csv')
df["label"] = df["label"].apply(lambda x: 1 if x == "positive" else 0)

# The generated dataset is appended as-is.
# NOTE(review): its labels are not remapped here, so they are presumably already 0/1 — confirm.
df_generated = pd.read_csv('generated_dataset.csv')
df = pd.concat([df, df_generated], axis=0)
df = df.dropna()
# Missing values present at concat time upcast an integer column to float;
# restore integer class ids once the incomplete rows are gone.
df["label"] = df["label"].astype(int)

# Shuffle with a fixed seed, then renumber the rows.
df = df.sample(frac=1, random_state=SEED).reset_index(drop=True)

# Split into train/test by position.
split = int(len(df) * TRAIN_FRACTION)
df_train = df.iloc[:split]
df_test = df.iloc[split:]

dataset = DatasetDict({
    "train": Dataset.from_pandas(df_train),
    "test": Dataset.from_pandas(df_test)
    })

# Training the Model

In [None]:
def dataloaders(dataset, path, batch, max_length=256):
  """Tokenize the "train" and "test" splits and wrap them in PyTorch DataLoaders.

  Args:
    dataset: mapping of splits (here a DatasetDict) with "train" and "test"
      entries, each containing a "text" column and a "label" column.
    path: name or path handed to AutoTokenizer.from_pretrained.
    batch: batch size used for both loaders.
    max_length: length every example is padded/truncated to (default 256,
      the value that was previously hard-coded).

  Returns:
    (train_dataloader, eval_dataloader). Only the training loader shuffles.
  """
  tokenizer = AutoTokenizer.from_pretrained(path)

  def preprocess_function(examples):
    # Pad and truncate to a fixed length so every example has the same shape.
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=max_length)

  tokenized_datasets = dataset.map(preprocess_function, batched=True)
  # The raw text is no longer needed once it has been tokenized.
  tokenized_datasets = tokenized_datasets.remove_columns(["text"])
  # The training/eval loops pass each batch to the model as keyword arguments,
  # so the label column is renamed to "labels".
  tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
  tokenized_datasets.set_format("torch")

  train_dataloader = DataLoader(tokenized_datasets['train'], shuffle=True, batch_size=batch)
  eval_dataloader = DataLoader(tokenized_datasets['test'], batch_size=batch)
  return train_dataloader, eval_dataloader

In [None]:
def eval_model(model, eval_dataloader, threelabels=False):
  """Compute the accuracy of `model` over `eval_dataloader`.

  Args:
    model: model called as `model(**batch)` whose output exposes `.logits`.
    eval_dataloader: DataLoader yielding dict batches of tensors that include
      a "labels" entry.
    threelabels: set True when the model emits three logits per example and
      the reference labels are binary. Only logit columns 0 and 2 are then
      compared, so the prediction is 0 (column 0 wins) or 1 (column 2 wins).
      This mirrors train_model, which relabels 1 -> 2 in three-label mode.

  Returns:
    The result of the `evaluate` "accuracy" metric's compute().

  Relies on the module-level `device`.
  """
  model.eval()
  metric = evaluate.load("accuracy")
  for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
      logits = model(**batch).logits

    if threelabels:
      # Drop the middle column instead of overwriting it with 0: a literal 0
      # still takes part in argmax and wins whenever both remaining logits are
      # negative, yielding a class index (2) that can never match a 0/1 label.
      logits = logits[:, [0, 2]]
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])
  return metric.compute()

In [None]:
def train_model(model, train_dataloader, threelabels=False, num_epochs=3, lr=5e-5):
  """Fine-tune `model` in place with AdamW and a linearly decaying learning rate.

  Args:
    model: model called as `model(**batch)` whose output exposes `.loss`.
    train_dataloader: DataLoader yielding dict batches of tensors.
    threelabels: when True, the loader's dataset is rebuilt with label 1
      relabelled to 2 and every other label set to 0 before training.
    num_epochs: number of passes over the training data (default 3).
    lr: AdamW learning rate (default 5e-5).

  Returns:
    None. Relies on the module-level `device`.
  """
  if threelabels:
    dataset_df = train_dataloader.dataset.to_pandas()
    dataset_df["labels"] = dataset_df["labels"].apply(lambda x: 2 if x == 1 else 0)

    dataset_modified = Dataset.from_pandas(dataset_df)
    dataset_modified.set_format("torch")
    # Carry over the caller's batch size and shuffling instead of silently
    # falling back to an unshuffled loader with a fixed batch size of 8.
    train_dataloader = DataLoader(
        dataset_modified,
        batch_size=train_dataloader.batch_size or 8,
        shuffle=isinstance(train_dataloader.sampler, torch.utils.data.RandomSampler),
    )
  optimizer = AdamW(model.parameters(), lr=lr)

  num_training_steps = num_epochs * len(train_dataloader)

  lr_scheduler = get_scheduler(
      name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
  )

  model.train()
  # The context manager closes the progress bar even if training raises.
  with tqdm(total=num_training_steps) as progress_bar:
    for _ in range(num_epochs):
      for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

In [None]:
# Load the "savasy/bert-base-turkish-sentiment-cased" checkpoint with a 2-class head.
# NOTE: the next cell assigns `model` again, so when the notebook is run top to
# bottom this object is replaced before it is used.
model = AutoModelForSequenceClassification.from_pretrained("savasy/bert-base-turkish-sentiment-cased", num_labels=2)


In [None]:
# Load the "cardiffnlp/twitter-xlm-roberta-base-sentiment" checkpoint with a 3-class head.
# This rebinds `model`, replacing whatever an earlier cell assigned to it.
model = AutoModelForSequenceClassification.from_pretrained("cardiffnlp/twitter-xlm-roberta-base-sentiment", num_labels=3)


In [None]:
# Use the GPU when CUDA is available, otherwise stay on the CPU, and move the model there.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

In [None]:
# Tokenize with the "cardiffnlp/twitter-xlm-roberta-base-sentiment" tokenizer and batch in groups of 8.
# NOTE(review): the tokenizer path is hard-coded; it should match whichever checkpoint `model` was loaded from.
train_dataloader, eval_dataloader = dataloaders(dataset, "cardiffnlp/twitter-xlm-roberta-base-sentiment", 8)

In [None]:
# Baseline accuracy before any fine-tuning.
# The test labels are binary (0/1). A checkpoint with a 3-class head has to be
# scored in three-label mode (class 2 counted as label 1, matching the 1 -> 2
# relabelling in train_model); otherwise its class-2 predictions can never match.
# A 2-class checkpoint is compared directly.
accuracy_eval = eval_model(model, eval_dataloader, threelabels=(model.config.num_labels == 3))

In [None]:
# Display the accuracy computed in the previous cell.
accuracy_eval

In [None]:
# Fine-tune `model` in place on the training loader, using the 0/1 labels unchanged (threelabels=False).
train_model(model, train_dataloader, threelabels=False)

# Evaluating the Model

In [None]:
# Accuracy on the test split after fine-tuning; threelabels matches the value passed to train_model.
accuracy_eval = eval_model(model, eval_dataloader, threelabels=False)
accuracy_eval

In [None]:
# Pull the tokenized training data into a DataFrame and relabel it:
# label 1 becomes 2, every other label becomes 0.
# NOTE(review): this repeats the threelabels branch of train_model.
dataset_df = train_dataloader.dataset.to_pandas()
dataset_df["labels"] = dataset_df["labels"].map(lambda label: 2 if label == 1 else 0)

In [None]:
# Turn the relabelled DataFrame back into a torch-formatted Dataset and rebind `train_dataloader` to it.
# NOTE(review): unlike the loader built in dataloaders(), this one passes no shuffle
# argument and fixes the batch size at 8.
dataset_modified = Dataset.from_pandas(dataset_df)
dataset_modified.set_format("torch")
train_dataloader = DataLoader(dataset_modified, batch_size=8)

In [None]:
# Display the dataset behind the current training loader.
train_dataloader.dataset

# Experimenting

In [None]:
# Build a sentiment-analysis pipeline straight from the named checkpoint.
# NOTE: this loads the model and tokenizer by name; it does not use the `model` object trained above.
model_path = "savasy/bert-base-turkish-sentiment-cased"
sentiment_task = pipeline("sentiment-analysis", model=model_path, tokenizer=model_path)


In [None]:
# Try the pipeline on a sample sentence (Turkish for "I feel very good today.").
sentiment_task("Bugün çok iyi hissediyorum.")

In [None]:
# Serialize the whole `model` object to the file 'model_finetuned'.
# NOTE(review): torch.save on a full module pickles the object, so loading it needs the
# same class definitions to be importable. Saving a state_dict or using save_pretrained
# may be more portable — confirm what the consumers of this file expect.
torch.save(model, 'model_finetuned')