In [1]:
!pip install transformers
!pip install accelerate -U
!pip install evaluate
!pip install datasets

Collecting accelerate
  Downloading accelerate-0.30.1-py3-none-any.whl (302 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.6/302.6 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.w

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import nltk
import torch
import re
import evaluate

import numpy as np
import pandas as pd


from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding
from datasets import Dataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize




In [4]:
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [5]:
imdb_data_path = '/content/drive/MyDrive/sabanci_sunum/data/IMDB.csv'

In [6]:
df_train_imdb = pd.read_csv(imdb_data_path, on_bad_lines='skip')
df_train_imdb

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [7]:
# Step 1: Prepare data
label_encoder = LabelEncoder()

y_numeric = label_encoder.fit_transform(df_train_imdb['sentiment'])
X_train, X_test, y_train, y_test = train_test_split(df_train_imdb, y_numeric, test_size=0.25, random_state=42, stratify=y_numeric)

In [8]:
dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))

{'negative': 0, 'positive': 1}

In [9]:
category_list = [words for words in df_train_imdb['sentiment'].to_list()]
category_list = list(set(category_list))[::-1]
label2id = {item: index for index, item in enumerate(category_list)}
id2label = {index: item for index, item in enumerate(category_list)}

In [10]:
label2id

{'negative': 0, 'positive': 1}

In [11]:
X_train['labels'] = X_train['sentiment'].replace(label2id)
X_test['labels'] = X_test['sentiment'].replace(label2id)

In [13]:
english_stop_words = set(stopwords.words('english'))

def preprocess(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    tokens = word_tokenize(text)
    filtered_tokens = [word for word in tokens if word.lower() not in english_stop_words]
    return ' '.join(filtered_tokens)

X_train['review_preprocessed'] = X_train['review'].apply(preprocess)
X_test['review_preprocessed'] = X_test['review'].apply(preprocess)

In [14]:
x_train = X_train.drop(['review','sentiment'], axis=1).reset_index(drop=True)
x_test = X_test.drop(['review','sentiment'], axis=1).reset_index(drop=True)
x_train.head()

Unnamed: 0,labels,review_preprocessed
0,0,saw adam four sons first time thing struck bel...
1,1,one shamelessly enjoyed every episode pushing ...
2,1,movie journey mind screenwriter caught paradox...
3,1,absolutely one best movies ive seen br br exce...
4,0,oh geez many films want see got stuck nephew w...


In [24]:
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")




tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [25]:
def preprocess_function(data_frame):
    return tokenizer(data_frame["review_preprocessed"], truncation=True)

In [26]:
train_dataset = Dataset.from_pandas(x_train)
test_dataset = Dataset.from_pandas(x_test)

tokenized_train_df = train_dataset.map(preprocess_function, batched=True)
tokenized_test_df = test_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/37500 [00:00<?, ? examples/s]

Map:   0%|          | 0/12500 [00:00<?, ? examples/s]

In [27]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


In [29]:
accuracy = evaluate.load("accuracy")
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

In [30]:
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert/distilbert-base-uncased", num_labels=len(x_train['labels'].value_counts()), id2label=id2label, label2id=label2id
)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [35]:
training_args = TrainingArguments(
    output_dir= '/content/drive/MyDrive/sabanci_sunum/',
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_df,
    eval_dataset=tokenized_test_df,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [36]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.2616,0.246793,0.90488
2,0.1675,0.259606,0.91872


TrainOutput(global_step=4688, training_loss=0.23330393586142478, metrics={'train_runtime': 3098.2789, 'train_samples_per_second': 24.207, 'train_steps_per_second': 1.513, 'total_flos': 7788093675089088.0, 'train_loss': 0.23330393586142478, 'epoch': 2.0})