In [1]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: evaluate
Successfully installed evaluate-0.4.0
[0m

In [20]:
import gc
import torch
import evaluate
import pandas as pd
from tqdm.auto import tqdm
from datasets import Dataset
from torch.optim import AdamW
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.metrics import accuracy_score

In [3]:
# Global configuration: one seed shared by every stochastic step, plus the compute device.
SEED = 42
# Seed PyTorch's RNG too, so DataLoader shuffling and dropout start from a fixed state.
torch.manual_seed(SEED)
device = "cuda" if torch.cuda.is_available() else "cpu"

In [4]:
# Load only the review text and its sentiment label from the reviews CSV.
data = pd.read_csv(
    '/kaggle/input/movie-reviews/en-reviews.csv',
    usecols=['review_text', 'review_type'],
)

In [5]:
data

Unnamed: 0,review_type,review_text
0,NEUTRAL,"The death of the main character is this fact, ..."
1,NEUTRAL,Naive films with a happy end is useful to watc...
2,NEUTRAL,"I watched the film in the original, because th..."
3,NEGATIVE,"From this low -budget thriller, quite a tolera..."
4,POSITIVE,To figure out anything-you should think.But th...
...,...,...
792985,POSITIVE,The creators of the series “This England” plea...
792986,NEUTRAL,I liked the movie.Of course he is not super at...
792987,POSITIVE,Hitokiri (literally translates as 'the one who...
792988,POSITIVE,The Americans have just forced Japan to start ...


In [6]:
data['review_type'].value_counts()

POSITIVE    525664
NEUTRAL     146646
NEGATIVE    120680
Name: review_type, dtype: int64

In [19]:
# Balance the two classes by down-sampling POSITIVE to the NEGATIVE row count.
# Rows with any other review_type (e.g. NEUTRAL) are dropped by the concat below.
neg = data[data['review_type'] == 'NEGATIVE']
# Sample size is derived from the data instead of a hard-coded row count;
# sample() raises if there are fewer POSITIVE rows than NEGATIVE ones.
pos = data[data['review_type'] == 'POSITIVE'].sample(n=len(neg), random_state=SEED)
data = pd.concat([pos, neg])

# Release the intermediate frames.
del pos, neg
gc.collect()

In [24]:
# Encode the sentiment strings as integer class ids (NEGATIVE -> 0, POSITIVE -> 1).
type2label = dict(NEGATIVE=0, POSITIVE=1)
data['review_type'] = data['review_type'].map(type2label)

In [4]:
def tokenize_function(examples, text_col, tokenizer, max_length=64):
    """Tokenize the text stored under `text_col` to a fixed length.

    Args:
        examples: Mapping holding the text under `text_col` (a single example
            or a batch, whichever `tokenizer` accepts).
        text_col: Key of the text field in `examples`.
        tokenizer: Callable invoked as
            `tokenizer(text, padding=..., truncation=..., max_length=...)`.
        max_length: Length every sequence is padded/truncated to. Defaults to
            64, the value that used to be hard-coded here.

    Returns:
        Whatever `tokenizer` returns for the given text.
    """
    return tokenizer(
        examples[text_col],
        padding='max_length',
        truncation=True,
        max_length=max_length,
    )

def ds_preproc(ds, text_col, target_col):
    """Tokenize a dataset and shape it for a PyTorch training loop.

    Relies on the notebook-level `tokenizer` being defined at call time.

    Args:
        ds: Dataset exposing `map`, `column_names`, `remove_columns`,
            `rename_column` and `set_format`.
        text_col: Name of the column holding the raw text; removed after
            tokenization.
        target_col: Name of the label column; renamed to 'labels'.

    Returns:
        The tokenized dataset with torch formatting enabled.
    """
    # batched=True hands the tokenizer lists of texts instead of one row per call.
    ds = ds.map(
        tokenize_function,
        batched=True,
        fn_kwargs={'text_col': text_col, 'tokenizer': tokenizer},
    )
    # '__index_level_0__' is only present for some inputs (a pandas index carried
    # over by Dataset.from_pandas), so drop it only when it exists.
    drop_cols = [text_col]
    if '__index_level_0__' in ds.column_names:
        drop_cols.append('__index_level_0__')
    ds = ds.remove_columns(drop_cols)
    ds = ds.rename_column(target_col, 'labels')
    ds.set_format('torch')
    return ds

In [None]:
def train_model(model, train_dataloader, val_dataloader, optimizer, num_epochs, savepath='./pt_save_pretrained'):
    """Train `model` and print validation accuracy after every epoch.

    Batches are moved to the notebook-level `device`; the model itself is
    expected to already be on that device.

    Args:
        model: Model called as `model(**batch)` whose output exposes `.loss`
            and `.logits`, and which provides `save_pretrained`.
        train_dataloader: Iterable of dict batches (including 'labels') used
            for optimisation.
        val_dataloader: Iterable of dict batches (including 'labels') used
            for the accuracy report.
        optimizer: Optimizer stepped once per training batch.
        num_epochs: Number of passes over `train_dataloader`.
        savepath: Directory passed to `model.save_pretrained` once all epochs
            have finished.
    """
    # Load the metric before any training so that a failed metric download
    # aborts immediately instead of after a full training epoch.
    metric = evaluate.load('accuracy')

    for epoch in range(num_epochs):
        model.train()
        print('Start training')
        for batch in tqdm(train_dataloader):
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

        print('Start validation')
        model.eval()
        # No gradients are needed anywhere in the validation pass.
        with torch.no_grad():
            for batch in tqdm(val_dataloader):
                batch = {k: v.to(device) for k, v in batch.items()}
                outputs = model(**batch)
                predictions = torch.argmax(outputs.logits, dim=-1)
                metric.add_batch(predictions=predictions, references=batch['labels'])
        # compute() consumes the accumulated batches, so the same metric object
        # starts empty again for the next epoch.
        print(f'epoch {epoch} -', metric.compute())
    model.save_pretrained(savepath)

## Domain adaptation

In [27]:
# 80/20 train/validation split of the review frame, wrapped as Datasets.
train, val = (
    Dataset.from_pandas(frame)
    for frame in train_test_split(data, test_size=0.2, random_state=SEED)
)

# ds_preproc reads this notebook-level tokenizer.
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased-finetuned-sst-2-english')
tokenized_train = ds_preproc(train, 'review_text', 'review_type')
tokenized_val = ds_preproc(val, 'review_text', 'review_type')

# Only the training loader shuffles; validation keeps dataset order.
train_dataloader = DataLoader(tokenized_train, batch_size=16, shuffle=True)
val_dataloader = DataLoader(tokenized_val, batch_size=16)

# Drop the notebook-level names; the loaders were built from the tokenized datasets above.
del data, train, val, tokenized_train, tokenized_val
gc.collect()

Downloading (…)okenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

  0%|          | 0/193088 [00:00<?, ?ex/s]

  0%|          | 0/48272 [00:00<?, ?ex/s]

59

In [28]:
# Training schedule for this run.
num_epochs = 5
num_training_steps = len(train_dataloader) * num_epochs

# Two-label sequence classifier initialised from the SST-2 sentiment checkpoint.
model = AutoModelForSequenceClassification.from_pretrained(
    'distilbert-base-uncased-finetuned-sst-2-english',
    ignore_mismatched_sizes=True,
    num_labels=2,
)
optimizer = AdamW(params=model.parameters(), lr=1e-6)

# Last expression of the cell: moves the model to `device` and displays its repr.
model.to(device)

Downloading pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [None]:
# Train on the review data; the weights go to train_model's default savepath.
train_model(model, train_dataloader, val_dataloader, optimizer, num_epochs)

Start training


  0%|          | 0/12068 [00:00<?, ?it/s]

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Start validation


  0%|          | 0/3017 [00:00<?, ?it/s]

epoch 0 - {'accuracy': 0.7146171693735499}
Start training


  0%|          | 0/12068 [00:00<?, ?it/s]

## Fine-tuning without and with domain adaptation

In [59]:
# IMDB movie-review dataset from Kaggle (columns: review, sentiment).
data_kaggle = pd.read_csv('/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv')

In [60]:
data_kaggle

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [61]:
# Encode the lowercase IMDB sentiment strings as integer class ids.
type2label = dict(negative=0, positive=1)
data_kaggle['sentiment'] = data_kaggle['sentiment'].map(type2label)

In [62]:
# Hold out 30% for testing, then 20% of the remainder for validation.
# Both splits are stratified on the sentiment label.
train_df, test_df = train_test_split(
    data_kaggle,
    test_size=0.3,
    random_state=SEED,
    stratify=data_kaggle['sentiment'],
)
train_df, val_df = train_test_split(
    train_df,
    test_size=0.2,
    random_state=SEED,
    stratify=train_df['sentiment'],
)

In [63]:
# Wrap the train/validation frames as Datasets.
train, val = (Dataset.from_pandas(frame) for frame in (train_df, val_df))

# ds_preproc reads this notebook-level tokenizer.
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased-finetuned-sst-2-english')
tokenized_train = ds_preproc(train, 'review', 'sentiment')
tokenized_val = ds_preproc(val, 'review', 'sentiment')

# Only the training loader shuffles; validation keeps dataset order.
train_dataloader = DataLoader(tokenized_train, batch_size=16, shuffle=True)
val_dataloader = DataLoader(tokenized_val, batch_size=16)

# train_df / val_df / test_df are kept; only these names are dropped.
del data_kaggle, train, val, tokenized_train, tokenized_val
gc.collect()

  0%|          | 0/28000 [00:00<?, ?ex/s]

  0%|          | 0/7000 [00:00<?, ?ex/s]

11463

In [64]:
# Baseline model: initialised straight from the SST-2 sentiment checkpoint.
model_woda = AutoModelForSequenceClassification.from_pretrained(
    'distilbert-base-uncased-finetuned-sst-2-english',
    ignore_mismatched_sizes=True,
    num_labels=2,
)

# Freeze every parameter whose name does not contain 'classifier'.
# NOTE(review): this is a substring match, so any parameter whose name merely
# contains 'classifier' (e.g. a 'pre_classifier' layer, if present) also stays
# trainable - confirm that is intended.
for name, param in model_woda.named_parameters():
    if 'classifier' in name:
        continue
    param.requires_grad = False

num_epochs = 5
num_training_steps = len(train_dataloader) * num_epochs
optimizer = AdamW(params=model_woda.parameters(), lr=5e-6)

# Last expression of the cell: moves the model to `device` and displays its repr.
model_woda.to(device)

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [65]:
# Train the baseline model and save it under ./model_woda.
train_model(model_woda, train_dataloader, val_dataloader, optimizer, num_epochs, savepath='./model_woda')

Start training


  0%|          | 0/1750 [00:00<?, ?it/s]

Start validation


  0%|          | 0/438 [00:00<?, ?it/s]

epoch 0 - {'accuracy': 0.784}
Start training


  0%|          | 0/1750 [00:00<?, ?it/s]

Start validation


  0%|          | 0/438 [00:00<?, ?it/s]

epoch 1 - {'accuracy': 0.7864285714285715}
Start training


  0%|          | 0/1750 [00:00<?, ?it/s]

Start validation


  0%|          | 0/438 [00:00<?, ?it/s]

epoch 2 - {'accuracy': 0.7888571428571428}
Start training


  0%|          | 0/1750 [00:00<?, ?it/s]

Start validation


  0%|          | 0/438 [00:00<?, ?it/s]

epoch 3 - {'accuracy': 0.7887142857142857}
Start training


  0%|          | 0/1750 [00:00<?, ?it/s]

Start validation


  0%|          | 0/438 [00:00<?, ?it/s]

epoch 4 - {'accuracy': 0.7897142857142857}


In [66]:
# Model initialised from the locally stored 'domain_model_v4' checkpoint
# (local_files_only=True, so nothing is downloaded).
model_wda = AutoModelForSequenceClassification.from_pretrained(
    '/kaggle/input/saved-models/domain_model_v4/domain_model_v4',
    ignore_mismatched_sizes=True,
    num_labels=2,
    local_files_only=True,
)

# Freeze every parameter whose name does not contain 'classifier'.
# NOTE(review): this is a substring match, so any parameter whose name merely
# contains 'classifier' (e.g. a 'pre_classifier' layer, if present) also stays
# trainable - confirm that is intended.
for name, param in model_wda.named_parameters():
    if 'classifier' in name:
        continue
    param.requires_grad = False

num_epochs = 5
num_training_steps = len(train_dataloader) * num_epochs
optimizer = AdamW(params=model_wda.parameters(), lr=5e-6)

# Last expression of the cell: moves the model to `device` and displays its repr.
model_wda.to(device)

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [67]:
# Train the model loaded from the domain checkpoint and save it under ./model_wda.
train_model(model_wda, train_dataloader, val_dataloader, optimizer, num_epochs, savepath='./model_wda')

Start training


  0%|          | 0/1750 [00:00<?, ?it/s]

Start validation


  0%|          | 0/438 [00:00<?, ?it/s]

epoch 0 - {'accuracy': 0.8197142857142857}
Start training


  0%|          | 0/1750 [00:00<?, ?it/s]

Start validation


  0%|          | 0/438 [00:00<?, ?it/s]

epoch 1 - {'accuracy': 0.8211428571428572}
Start training


  0%|          | 0/1750 [00:00<?, ?it/s]

Start validation


  0%|          | 0/438 [00:00<?, ?it/s]

epoch 2 - {'accuracy': 0.8232857142857143}
Start training


  0%|          | 0/1750 [00:00<?, ?it/s]

Start validation


  0%|          | 0/438 [00:00<?, ?it/s]

epoch 3 - {'accuracy': 0.8231428571428572}
Start training


  0%|          | 0/1750 [00:00<?, ?it/s]

Start validation


  0%|          | 0/438 [00:00<?, ?it/s]

epoch 4 - {'accuracy': 0.8222857142857143}


## Getting test metrics

In [68]:
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased-finetuned-sst-2-english')

# Labels are left out on purpose: the test loader carries model inputs only,
# and accuracy is computed later against test_df['sentiment'].
test = (
    Dataset.from_pandas(test_df[['review']])
    .map(tokenize_function, fn_kwargs={'text_col': 'review', 'tokenizer': tokenizer})
    .remove_columns(['review', '__index_level_0__'])
)
test.set_format('torch')

# shuffle=False keeps predictions aligned with the row order of test_df.
test_dataloader = DataLoader(test, batch_size=16, shuffle=False)

  0%|          | 0/15000 [00:00<?, ?ex/s]

In [69]:
def predict(model, dataloader):
    """Return the argmax class prediction for every example in `dataloader`.

    The model is moved to the notebook-level `device` and switched to eval
    mode. Each batch must be a dict of tensors accepted by `model(**batch)`.

    Returns:
        A flat list of NumPy integer scalars, in dataloader order.
    """
    model.to(device)
    model.eval()
    collected = []
    with torch.no_grad():
        for raw_batch in tqdm(dataloader):
            inputs = {key: tensor.to(device) for key, tensor in raw_batch.items()}
            class_ids = model(**inputs).logits.argmax(dim=-1)
            collected.extend(class_ids.cpu().numpy())
    return collected

In [70]:
# Test-set accuracy of model_wda (the model loaded from the domain checkpoint).
preds_wda = predict(model_wda, test_dataloader)
accuracy_score(y_true=test_df['sentiment'].values, y_pred=preds_wda)

  0%|          | 0/938 [00:00<?, ?it/s]

0.8207333333333333

In [71]:
# Test-set accuracy of model_woda (the baseline initialised from the SST-2 checkpoint).
preds_woda = predict(model_woda, test_dataloader)
accuracy_score(y_true=test_df['sentiment'].values, y_pred=preds_woda)

  0%|          | 0/938 [00:00<?, ?it/s]

0.7851333333333333