In [1]:
import torch
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss
from torch.optim import Adam
from sklearn.metrics import accuracy_score, f1_score
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm
import pandas as pd
import numpy as np



In [2]:
# Hub checkpoint id; the tokenizer is loaded from the same repository.
model_name = "mrm8488/camembert-base-finetuned-movie-review-sentiment-analysis"
tokenizer_name = "mrm8488/camembert-base-finetuned-movie-review-sentiment-analysis"

In [3]:
# Training hyperparameters.
num_epochs = 1      # number of passes over the training loader
bs = 16             # batch size used by both DataLoaders
lear = 1e-5         # learning rate handed to Adam
max_bert_len = 384  # max_length passed to the tokenizer

In [4]:
# Load the tokenizer published under `tokenizer_name`.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=tokenizer_name
)

In [5]:
# Read the train / validation splits from disk, then preview the training frame.
data_train, data_val = (
    pd.read_csv(csv_path) for csv_path in ("data/Train.csv", "data/Valid.csv")
)
data_train

Unnamed: 0,text,label
0,I grew up (b. 1965) watching and loving the Th...,0
1,"When I put this movie in my DVD player, and sa...",0
2,Why do people who do not know what a particula...,0
3,Even though I have great interest in Biblical ...,0
4,Im a die hard Dads Army fan and nothing will e...,1
...,...,...
39995,"""Western Union"" is something of a forgotten cl...",1
39996,This movie is an incredible piece of work. It ...,1
39997,My wife and I watched this movie because we pl...,0
39998,"When I first watched Flatliners, I was amazed....",1


In [6]:
# 90th percentile of review length, measured in characters (not tokens).
np.percentile(a=data_train["text"].map(len), q=90)

2583.0999999999985

In [7]:
# Full-range slices keep every row; presumably a hook for training on a subset.
data_train, data_val = data_train[:], data_val[:]

In [8]:
# Extract the text and label columns of each split as plain arrays.
train_texts, train_labels = (
    data_train[column].values for column in ("text", "label")
)
valid_texts, valid_labels = (
    data_val[column].values for column in ("text", "label")
)

In [9]:
class TextDataset(Dataset):
    """Map-style dataset that tokenizes a single text each time it is indexed.

    Args:
        texts: Indexable sequence of raw strings.
        labels: Indexable sequence of labels, aligned with ``texts``.
        tokenizer: Hugging Face style tokenizer; it is called as
            ``tokenizer(text, ...)`` and must return a mapping holding
            ``"input_ids"`` and ``"attention_mask"`` tensors with a leading
            batch axis of size 1.
        max_length: Length every sequence is padded or truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize ``texts[idx]`` and return model-ready tensors.

        Returns:
            dict with ``"input_ids"`` and ``"attention_mask"`` (batch axis
            removed) and ``"labels"`` (a tensor built from ``labels[idx]``).
        """
        text = self.texts[idx]
        # `pad_to_max_length` is deprecated and truncation used to be implicit
        # (hence the runtime warning); request both behaviours explicitly.
        inputs = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        return {
            # squeeze(0) removes only the batch axis; a bare squeeze() would
            # also drop the sequence axis when max_length == 1.
            "input_ids": inputs["input_ids"].squeeze(0),
            "attention_mask": inputs["attention_mask"].squeeze(0),
            "labels": torch.tensor(self.labels[idx]),
        }

In [10]:
# Wrap each split in a TextDataset and batch it; only the training loader shuffles.
train_dataset = TextDataset(
    texts=train_texts,
    labels=train_labels,
    tokenizer=tokenizer,
    max_length=max_bert_len,
)
train_loader = DataLoader(dataset=train_dataset, batch_size=bs, shuffle=True)

val_dataset = TextDataset(
    texts=valid_texts,
    labels=valid_labels,
    tokenizer=tokenizer,
    max_length=max_bert_len,
)
val_loader = DataLoader(dataset=val_dataset, batch_size=bs, shuffle=False)

In [11]:
# Load the pretrained sequence classifier with a two-class head.
model = AutoModelForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path=model_name,
    num_labels=2,
)

In [12]:
# Adam over every model parameter, paired with a cross-entropy objective.
optimizer = Adam(params=model.parameters(), lr=lear)
loss_fn = CrossEntropyLoss()

In [13]:
# TensorBoard writer; log_dir=None keeps the library's default run directory.
writer = SummaryWriter(log_dir=None)

In [14]:
# Fine-tune for `num_epochs` epochs, log metrics to TensorBoard and keep the
# weights with the best validation F1 in "best_model.pth".
#
# Changes from the previous version of this cell:
#   * The TensorBoard step used to be `epoch * len(train_dataset) + i`, mixing
#     a sample count with a batch counter and leaving gaps on the x axis
#     between epochs. A single running `global_step` is used instead.
#   * `best_f1` starts at -inf so the first epoch always writes a checkpoint,
#     even when the validation F1 is exactly 0.
#   * `writer.close()` sits in `finally`, so logged events are flushed even
#     when the (multi-hour) run is interrupted.
#   * Batches are moved to the device the model lives on (a no-op on CPU).
best_f1 = float("-inf")
train_dataset_len = len(train_dataset)
device = next(model.parameters()).device
global_step = 0
try:
    for epoch in range(num_epochs):
        model.train()
        for batch in tqdm(train_loader):
            batch = {key: value.to(device) for key, value in batch.items()}
            optimizer.zero_grad()
            outputs = model(**batch)
            loss = loss_fn(outputs.logits, batch['labels'].long())
            loss.backward()
            optimizer.step()
            writer.add_scalar("Loss/train", loss.item(), global_step)
            global_step += 1

        model.eval()
        val_preds = []
        val_labels = []
        # No gradients are needed for evaluation; disable them for the whole pass.
        with torch.no_grad():
            for batch in tqdm(val_loader):
                batch = {key: value.to(device) for key, value in batch.items()}
                outputs = model(**batch)
                val_preds.extend(outputs.logits.argmax(dim=1).long().tolist())
                val_labels.extend(batch['labels'].long().tolist())

        val_accuracy = accuracy_score(val_labels, val_preds)
        val_f1 = f1_score(val_labels, val_preds)
        writer.add_scalar("Accuracy/val", val_accuracy, epoch)
        writer.add_scalar("F1/val", val_f1, epoch)

        if val_f1 > best_f1:
            best_f1 = val_f1
            torch.save(model.state_dict(), "best_model.pth")
finally:
    writer.close()

  0%|          | 0/2500 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
100%|██████████| 2500/2500 [4:47:32<00:00,  6.90s/it]  
100%|██████████| 313/313 [11:00<00:00,  2.11s/it]


In [15]:
import torch
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss
from torch.optim import Adam
from sklearn.metrics import accuracy_score, f1_score
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm
import pandas as pd

In [16]:
# Same Hub checkpoint for the model weights and for the tokenizer.
model_name = "mrm8488/camembert-base-finetuned-movie-review-sentiment-analysis"
tokenizer_name = "mrm8488/camembert-base-finetuned-movie-review-sentiment-analysis"

In [17]:
# Hyperparameters for the second run: only the learning rate differs (1e-6 vs 1e-5).
num_epochs = 1      # number of passes over the training loader
bs = 16             # batch size used by both DataLoaders
lear = 1e-6         # learning rate handed to Adam
max_bert_len = 384  # max_length passed to the tokenizer

In [18]:
# Reload the tokenizer published under `tokenizer_name`.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=tokenizer_name
)

In [19]:
# Re-read both splits from disk and preview the training frame.
data_train, data_val = (
    pd.read_csv(csv_path) for csv_path in ("data/Train.csv", "data/Valid.csv")
)
data_train

Unnamed: 0,text,label
0,I grew up (b. 1965) watching and loving the Th...,0
1,"When I put this movie in my DVD player, and sa...",0
2,Why do people who do not know what a particula...,0
3,Even though I have great interest in Biblical ...,0
4,Im a die hard Dads Army fan and nothing will e...,1
...,...,...
39995,"""Western Union"" is something of a forgotten cl...",1
39996,This movie is an incredible piece of work. It ...,1
39997,My wife and I watched this movie because we pl...,0
39998,"When I first watched Flatliners, I was amazed....",1


In [20]:
# Full-range slices keep every row; presumably a hook for training on a subset.
data_train, data_val = data_train[:], data_val[:]

In [21]:
# Extract the text and label columns of each split as plain arrays.
train_texts, train_labels = (
    data_train[column].values for column in ("text", "label")
)
valid_texts, valid_labels = (
    data_val[column].values for column in ("text", "label")
)

In [22]:
class TextDataset(Dataset):
    """Map-style dataset that tokenizes a single text each time it is indexed.

    Args:
        texts: Indexable sequence of raw strings.
        labels: Indexable sequence of labels, aligned with ``texts``.
        tokenizer: Hugging Face style tokenizer; it is called as
            ``tokenizer(text, ...)`` and must return a mapping holding
            ``"input_ids"`` and ``"attention_mask"`` tensors with a leading
            batch axis of size 1.
        max_length: Length every sequence is padded or truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize ``texts[idx]`` and return model-ready tensors.

        Returns:
            dict with ``"input_ids"`` and ``"attention_mask"`` (batch axis
            removed) and ``"labels"`` (a tensor built from ``labels[idx]``).
        """
        text = self.texts[idx]
        # `pad_to_max_length` is deprecated and truncation used to be implicit
        # (hence the runtime warning); request both behaviours explicitly.
        inputs = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        return {
            # squeeze(0) removes only the batch axis; a bare squeeze() would
            # also drop the sequence axis when max_length == 1.
            "input_ids": inputs["input_ids"].squeeze(0),
            "attention_mask": inputs["attention_mask"].squeeze(0),
            "labels": torch.tensor(self.labels[idx]),
        }

In [23]:
# Wrap each split in a TextDataset and batch it; only the training loader shuffles.
train_dataset = TextDataset(
    texts=train_texts,
    labels=train_labels,
    tokenizer=tokenizer,
    max_length=max_bert_len,
)
train_loader = DataLoader(dataset=train_dataset, batch_size=bs, shuffle=True)

val_dataset = TextDataset(
    texts=valid_texts,
    labels=valid_labels,
    tokenizer=tokenizer,
    max_length=max_bert_len,
)
val_loader = DataLoader(dataset=val_dataset, batch_size=bs, shuffle=False)

In [24]:
# Reload the pretrained sequence classifier (two-class head) for a fresh run.
model = AutoModelForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path=model_name,
    num_labels=2,
)

In [25]:
# Adam over every model parameter, paired with a cross-entropy objective.
optimizer = Adam(params=model.parameters(), lr=lear)
loss_fn = CrossEntropyLoss()

In [26]:
# New TensorBoard writer; log_dir=None keeps the library's default run directory.
writer = SummaryWriter(log_dir=None)

In [27]:
# Fine-tune for `num_epochs` epochs, log metrics to TensorBoard and keep the
# weights with the best validation F1 in "best_model.pth".
#
# Changes from the previous version of this cell:
#   * The TensorBoard step used to be `epoch * len(train_dataset) + i`, mixing
#     a sample count with a batch counter and leaving gaps on the x axis
#     between epochs. A single running `global_step` is used instead.
#   * `best_f1` starts at -inf so the first epoch always writes a checkpoint,
#     even when the validation F1 is exactly 0.
#   * `writer.close()` sits in `finally`, so logged events are flushed even
#     when the (multi-hour) run is interrupted.
#   * Batches are moved to the device the model lives on (a no-op on CPU).
best_f1 = float("-inf")
train_dataset_len = len(train_dataset)
device = next(model.parameters()).device
global_step = 0
try:
    for epoch in range(num_epochs):
        model.train()
        for batch in tqdm(train_loader):
            batch = {key: value.to(device) for key, value in batch.items()}
            optimizer.zero_grad()
            outputs = model(**batch)
            loss = loss_fn(outputs.logits, batch['labels'].long())
            loss.backward()
            optimizer.step()
            writer.add_scalar("Loss/train", loss.item(), global_step)
            global_step += 1

        model.eval()
        val_preds = []
        val_labels = []
        # No gradients are needed for evaluation; disable them for the whole pass.
        with torch.no_grad():
            for batch in tqdm(val_loader):
                batch = {key: value.to(device) for key, value in batch.items()}
                outputs = model(**batch)
                val_preds.extend(outputs.logits.argmax(dim=1).long().tolist())
                val_labels.extend(batch['labels'].long().tolist())

        val_accuracy = accuracy_score(val_labels, val_preds)
        val_f1 = f1_score(val_labels, val_preds)
        writer.add_scalar("Accuracy/val", val_accuracy, epoch)
        writer.add_scalar("F1/val", val_f1, epoch)

        if val_f1 > best_f1:
            best_f1 = val_f1
            torch.save(model.state_dict(), "best_model.pth")
finally:
    writer.close()

  0%|          | 0/2500 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
100%|██████████| 2500/2500 [4:48:07<00:00,  6.92s/it]  
100%|██████████| 313/313 [11:31<00:00,  2.21s/it]
