In [None]:
# Install the notebook's dependencies. `%pip` (rather than `!pip`) installs
# into the environment of the running kernel.
%pip install tqdm
%pip install torch
%pip install scikit_learn
%pip install nltk
%pip install transformers
%pip install datasets
%pip install accelerate
%pip install evaluate

Collecting transformers
  Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m48.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.2-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.5/268.5 kB[0m [31m28.8 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m83.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m77.5 MB/s[0m eta [36m0:00:0

In [None]:
import glob
import html
import os
from google.colab import drive

# Mount Google Drive inside the Colab runtime so the datasets are reachable
# through the local filesystem.
drive.mount('/content/drive')
# Root of the mounted drive and the folder the loaders below read from.
BASE_PATH = '/content/drive/MyDrive'
DATA_PATH = os.path.join(BASE_PATH, "datasets")

# Column delimiter used by parse_file when splitting a line.
SEPARATOR = "\t"


def clean_text(text):
    """
    Normalise one raw text field: undo doubled/escaped quotes, decode HTML
    entities and collapse whitespace.

    Args:
        text (str): the raw string

    Returns: (str): the cleaned string, with single spaces between tokens
        and no leading or trailing whitespace

    """
    escaped_quote = '\\""'
    doubled_quote = '""'

    cleaned = text.rstrip()

    if doubled_quote in cleaned:
        # a field wrapped in quotes loses the wrapping pair first
        if cleaned.startswith('"') and cleaned.endswith('"'):
            cleaned = cleaned[1:-1]
        cleaned = cleaned.replace(escaped_quote, '"').replace(doubled_quote, '"')

    # collapsing doubled quotes can expose a new escaped pair, so run once more
    cleaned = cleaned.replace(escaped_quote, '"')

    decoded = html.unescape(cleaned)
    return ' '.join(decoded.split())


def parse_file(file):
    """
    Read a file and return a dictionary of the data, in the format:
    tweet_id:{sentiment, text}

    Each non-blank line is split on SEPARATOR: the first column is the id,
    the second the sentiment, and all remaining columns are joined with a
    space and passed through clean_text. A later line with the same id
    replaces an earlier one.

    Args:
        file (str): path of the UTF-8 text file to read

    Returns: (dict): maps tweet_id to a (sentiment, text) tuple

    Raises:
        IndexError: if a non-blank line has fewer than two columns
    """

    data = {}
    # the context manager closes the handle even when a line fails to parse
    with open(file, "r", encoding="utf-8") as handle:
        for line in handle:
            line = line.rstrip()
            if not line:
                # blank lines (e.g. a trailing newline) carry no record
                continue
            columns = line.split(SEPARATOR)
            tweet_id = columns[0]
            sentiment = columns[1]
            text = clean_text(" ".join(columns[2:]))
            data[tweet_id] = (sentiment, text)
    return data


def load_from_dir(path):
    """
    Parse every .tsv and .txt file found anywhere under `path` and return
    the combined records.

    Args:
        path (str): directory searched recursively

    Returns: (list): the values produced by parse_file for every file; when
        the same id appears more than once only the last one parsed is kept
    """
    patterns = (path + "/**/*.tsv", path + "/**/*.txt")
    matched = [
        found
        for pattern in patterns
        for found in glob.glob(pattern, recursive=True)
    ]

    # keyed by id so that a repeated tweet is stored only once
    merged = {}
    for found in matched:
        merged.update(parse_file(found))
    return list(merged.values())


def load_Semeval2017A():
    """
    Load the Semeval2017A train and test splits from DATA_PATH.

    Returns: (tuple): X_train, y_train, X_test, y_test, where the X lists
        hold the texts and the y lists hold the matching sentiment strings
    """
    train_pairs = load_from_dir(os.path.join(DATA_PATH, "Semeval2017A/train_dev"))
    test_pairs = load_from_dir(os.path.join(DATA_PATH, "Semeval2017A/gold"))

    # every record is a (sentiment, text) pair
    X_train, y_train = [], []
    for sentiment, text in train_pairs:
        X_train.append(text)
        y_train.append(sentiment)

    X_test, y_test = [], []
    for sentiment, text in test_pairs:
        X_test.append(text)
        y_test.append(sentiment)

    return X_train, y_train, X_test, y_test


def load_MR():
    """
    Load the MR polarity files from DATA_PATH and split them into train
    and test sets.

    The first 5000 lines of each polarity file go to the training set and
    the remaining lines to the test set; positives come before negatives
    in both sets.

    Returns: (tuple): X_train, y_train, X_test, y_test, where the X lists
        hold stripped lines and the y lists hold "positive" / "negative"
    """

    def _read_lines(relative_path):
        # the context manager closes the file once its lines are collected
        with open(os.path.join(DATA_PATH, relative_path)) as handle:
            return [line.strip() for line in handle]

    pos = _read_lines("MR/rt-polarity.pos")
    neg = _read_lines("MR/rt-polarity.neg")

    pos_labels = ["positive"] * len(pos)
    neg_labels = ["negative"] * len(neg)

    split = 5000

    X_train = pos[:split] + neg[:split]
    y_train = pos_labels[:split] + neg_labels[:split]

    X_test = pos[split:] + neg[split:]
    y_test = pos_labels[split:] + neg_labels[split:]

    return X_train, y_train, X_test, y_test

Mounted at /content/drive


In [None]:
import os

# `!export ...` only changes a throw-away subshell, so the kernel process
# (and PyTorch running in it) never saw the variable. Set it in this process.
# NOTE(review): it needs to be set before the first CUDA allocation - confirm
# this cell runs before any model is moved to the GPU.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:32"

In [None]:
import numpy as np
import evaluate
from datasets import Dataset
from transformers import TrainingArguments, Trainer, AutoTokenizer, AutoModelForSequenceClassification
from sklearn.preprocessing import LabelEncoder
import torch
# Ask PyTorch to release cached CUDA memory before this run starts.
torch.cuda.empty_cache()
# Candidate checkpoint names per dataset. Nothing in this cell reads these
# lists; PRETRAINED_MODEL is set by hand further down.
PRETRAINED_MODEL_MR = ['bert-base-cased', 'bert-base-uncased', 'distilbert-base-uncased', 'distilroberta-base']
PRETRAINED_MODEL_SEM = ['distilbert-base-cased', 'distilbert-base-uncased', 'albert-base-v2', 'bert-base-uncased']

# Accuracy metric object used by compute_metrics.
metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """
    Score one evaluation pass with the module-level `metric`.

    Args:
        eval_pred: an object that unpacks into (logits, labels)

    Returns: whatever `metric.compute` returns for the arg-max predictions
    """
    scores, references = eval_pred
    # the predicted class is the index of the largest score on the last axis
    predicted = np.argmax(scores, axis=-1)
    return metric.compute(predictions=predicted, references=references)


def tokenize_function(examples):
    """
    Tokenize the "text" field of `examples` with the global `tokenizer`,
    using padding="max_length" and truncation.
    """
    texts = examples["text"]
    return tokenizer(texts, padding="max_length", truncation=True)


def prepare_dataset(X, y):
    """
    Pair texts with labels and wrap them in a Dataset.

    Args:
        X: iterable of texts
        y: iterable of labels, aligned with X

    Returns: a Dataset with a 'text' and a 'label' column; pairing stops at
        the shorter of the two inputs
    """
    pairs = list(zip(X, y))
    return Dataset.from_dict({
        'text': [text for text, _ in pairs],
        'label': [label for _, label in pairs],
    })


if __name__ == '__main__':
    # Fine-tune one pre-trained checkpoint on one sentiment dataset and
    # evaluate it after every epoch.

    # load the raw data
    DATASET = 'MR'
    PRETRAINED_MODEL = 'bert-base-cased'
    if DATASET == "Semeval2017A":
        X_train, y_train, X_test, y_test = load_Semeval2017A()
    elif DATASET == "MR":
        X_train, y_train, X_test, y_test = load_MR()
    else:
        raise ValueError("Invalid dataset")

    # encode labels: the encoder is fitted on the training labels only
    le = LabelEncoder()
    le.fit(list(set(y_train)))
    y_train = le.transform(y_train)
    y_test = le.transform(y_test)
    n_classes = len(list(le.classes_))

    # prepare datasets
    train_set = prepare_dataset(X_train, y_train)
    test_set = prepare_dataset(X_test, y_test)

    # define model and tokenizer; tokenize_function reads `tokenizer` as a
    # global, so it has to exist before the map() calls below
    tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL)
    model = AutoModelForSequenceClassification.from_pretrained(
        PRETRAINED_MODEL, num_labels=n_classes)

    # tokenize datasets; batched=True passes lists of texts to the tokenizer
    # instead of calling it once per example
    tokenized_train_set = train_set.map(tokenize_function, batched=True)
    tokenized_test_set = test_set.map(tokenize_function, batched=True)

    # TODO: Main-lab-Q7 - remove this section once you are ready to execute on a GPU
    #  create a smaller subset of the dataset
    n_samples = 400
    # clamp so select() never asks for more rows than a split holds
    small_train_dataset = tokenized_train_set.shuffle(seed=42).select(
        range(min(n_samples, len(tokenized_train_set))))
    small_eval_dataset = tokenized_test_set.shuffle(seed=42).select(
        range(min(n_samples, len(tokenized_test_set))))

    # TODO: Main-lab-Q7 - customize hyperparameters once you are ready to execute on a GPU
    # training setup
    print(f'\nDataset: {DATASET}\nPre-Trained model: {PRETRAINED_MODEL}\n')

    # NOTE(review): recent transformers releases spell `evaluation_strategy`
    # as `eval_strategy` - confirm against the installed version.
    args = TrainingArguments(
        output_dir="output",
        evaluation_strategy="epoch",
        num_train_epochs=10,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
    )
    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=small_train_dataset,
        eval_dataset=small_eval_dataset,
        compute_metrics=compute_metrics,
    )

    # train
    # NOTE(review): trainer.train() returns a training summary rather than
    # the model itself; the name is kept because other cells may read it.
    trained_model = trainer.train()

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initi

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/662 [00:00<?, ? examples/s]


Dataset: MR
Pre-Trained model: bert-base-cased





Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.516899,0.7525
2,No log,1.266807,0.775
3,No log,1.36854,0.785
4,No log,1.356325,0.8125
5,0.233400,1.531102,0.8025
6,0.233400,1.533484,0.8025
7,0.233400,1.563491,0.8
8,0.233400,1.604665,0.8075
9,0.233400,1.621222,0.8075
10,0.000900,1.628371,0.8075


In [None]:
import numpy as np
import evaluate
from datasets import Dataset
from transformers import TrainingArguments, Trainer, AutoTokenizer, AutoModelForSequenceClassification
from sklearn.preprocessing import LabelEncoder
import torch
# Ask PyTorch to release cached CUDA memory before this run starts.
torch.cuda.empty_cache()

##############
# Candidate checkpoint names per dataset. Nothing in this cell reads these
# lists; PRETRAINED_MODEL is set by hand further down.
PRETRAINED_MODEL_MR = ['bert-base-cased', 'bert-base-uncased', 'distilbert-base-uncased', 'distilroberta-base']
PRETRAINED_MODEL_SEM = ['distilbert-base-cased', 'distilbert-base-uncased', 'albert-base-v2', 'bert-base-uncased']

# Accuracy metric object used by compute_metrics.
metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """
    Score one evaluation pass with the module-level `metric`.

    Args:
        eval_pred: an object that unpacks into (logits, labels)

    Returns: whatever `metric.compute` returns for the arg-max predictions
    """
    scores, references = eval_pred
    # the predicted class is the index of the largest score on the last axis
    predicted = np.argmax(scores, axis=-1)
    return metric.compute(predictions=predicted, references=references)


def tokenize_function(examples):
    """
    Tokenize the "text" field of `examples` with the global `tokenizer`,
    using padding="max_length" and truncation.
    """
    texts = examples["text"]
    return tokenizer(texts, padding="max_length", truncation=True)


def prepare_dataset(X, y):
    """
    Pair texts with labels and wrap them in a Dataset.

    Args:
        X: iterable of texts
        y: iterable of labels, aligned with X

    Returns: a Dataset with a 'text' and a 'label' column; pairing stops at
        the shorter of the two inputs
    """
    pairs = list(zip(X, y))
    return Dataset.from_dict({
        'text': [text for text, _ in pairs],
        'label': [label for _, label in pairs],
    })


if __name__ == '__main__':
    # Fine-tune one pre-trained checkpoint on one sentiment dataset and
    # evaluate it after every epoch.

    # load the raw data
    DATASET = 'MR'
    PRETRAINED_MODEL = 'bert-base-uncased'
    if DATASET == "Semeval2017A":
        X_train, y_train, X_test, y_test = load_Semeval2017A()
    elif DATASET == "MR":
        X_train, y_train, X_test, y_test = load_MR()
    else:
        raise ValueError("Invalid dataset")

    # encode labels: the encoder is fitted on the training labels only
    le = LabelEncoder()
    le.fit(list(set(y_train)))
    y_train = le.transform(y_train)
    y_test = le.transform(y_test)
    n_classes = len(list(le.classes_))

    # prepare datasets
    train_set = prepare_dataset(X_train, y_train)
    test_set = prepare_dataset(X_test, y_test)

    # define model and tokenizer; tokenize_function reads `tokenizer` as a
    # global, so it has to exist before the map() calls below
    tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL)
    model = AutoModelForSequenceClassification.from_pretrained(
        PRETRAINED_MODEL, num_labels=n_classes)

    # tokenize datasets; batched=True passes lists of texts to the tokenizer
    # instead of calling it once per example
    tokenized_train_set = train_set.map(tokenize_function, batched=True)
    tokenized_test_set = test_set.map(tokenize_function, batched=True)

    # TODO: Main-lab-Q7 - remove this section once you are ready to execute on a GPU
    #  create a smaller subset of the dataset
    n_samples = 400
    # clamp so select() never asks for more rows than a split holds
    small_train_dataset = tokenized_train_set.shuffle(seed=42).select(
        range(min(n_samples, len(tokenized_train_set))))
    small_eval_dataset = tokenized_test_set.shuffle(seed=42).select(
        range(min(n_samples, len(tokenized_test_set))))

    # TODO: Main-lab-Q7 - customize hyperparameters once you are ready to execute on a GPU
    # training setup
    print(f'\nDataset: {DATASET}\nPre-Trained model: {PRETRAINED_MODEL}\n')

    # NOTE(review): recent transformers releases spell `evaluation_strategy`
    # as `eval_strategy` - confirm against the installed version.
    args = TrainingArguments(
        output_dir="output",
        evaluation_strategy="epoch",
        num_train_epochs=10,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
    )
    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=small_train_dataset,
        eval_dataset=small_eval_dataset,
        compute_metrics=compute_metrics,
    )

    # train
    # NOTE(review): trainer.train() returns a training summary rather than
    # the model itself; the name is kept because other cells may read it.
    trained_model = trainer.train()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly i

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/662 [00:00<?, ? examples/s]


Dataset: MR
Pre-Trained model: bert-base-uncased





Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.802864,0.7675
2,No log,1.370706,0.7725
3,No log,1.089064,0.815
4,No log,1.146359,0.8375
5,0.212400,1.19174,0.8425
6,0.212400,1.230707,0.845
7,0.212400,1.257243,0.8425
8,0.212400,1.274726,0.845
9,0.212400,1.285037,0.845
10,0.000100,1.288708,0.845


In [None]:
import numpy as np
import evaluate
from datasets import Dataset
from transformers import TrainingArguments, Trainer, AutoTokenizer, AutoModelForSequenceClassification
from sklearn.preprocessing import LabelEncoder
import torch
# Ask PyTorch to release cached CUDA memory before this run starts.
torch.cuda.empty_cache()
# Candidate checkpoint names per dataset. Nothing in this cell reads these
# lists; PRETRAINED_MODEL is set by hand further down.
PRETRAINED_MODEL_MR = ['bert-base-cased', 'bert-base-uncased', 'distilbert-base-uncased', 'distilroberta-base']
PRETRAINED_MODEL_SEM = ['distilbert-base-cased', 'distilbert-base-uncased', 'albert-base-v2', 'bert-base-uncased']

# Accuracy metric object used by compute_metrics.
metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """
    Score one evaluation pass with the module-level `metric`.

    Args:
        eval_pred: an object that unpacks into (logits, labels)

    Returns: whatever `metric.compute` returns for the arg-max predictions
    """
    scores, references = eval_pred
    # the predicted class is the index of the largest score on the last axis
    predicted = np.argmax(scores, axis=-1)
    return metric.compute(predictions=predicted, references=references)


def tokenize_function(examples):
    """
    Tokenize the "text" field of `examples` with the global `tokenizer`,
    using padding="max_length" and truncation.
    """
    texts = examples["text"]
    return tokenizer(texts, padding="max_length", truncation=True)


def prepare_dataset(X, y):
    """
    Pair texts with labels and wrap them in a Dataset.

    Args:
        X: iterable of texts
        y: iterable of labels, aligned with X

    Returns: a Dataset with a 'text' and a 'label' column; pairing stops at
        the shorter of the two inputs
    """
    pairs = list(zip(X, y))
    return Dataset.from_dict({
        'text': [text for text, _ in pairs],
        'label': [label for _, label in pairs],
    })


if __name__ == '__main__':
    # Fine-tune one pre-trained checkpoint on one sentiment dataset and
    # evaluate it after every epoch.

    # load the raw data
    DATASET = 'MR'
    PRETRAINED_MODEL = 'distilbert-base-uncased'
    if DATASET == "Semeval2017A":
        X_train, y_train, X_test, y_test = load_Semeval2017A()
    elif DATASET == "MR":
        X_train, y_train, X_test, y_test = load_MR()
    else:
        raise ValueError("Invalid dataset")

    # encode labels: the encoder is fitted on the training labels only
    le = LabelEncoder()
    le.fit(list(set(y_train)))
    y_train = le.transform(y_train)
    y_test = le.transform(y_test)
    n_classes = len(list(le.classes_))

    # prepare datasets
    train_set = prepare_dataset(X_train, y_train)
    test_set = prepare_dataset(X_test, y_test)

    # define model and tokenizer; tokenize_function reads `tokenizer` as a
    # global, so it has to exist before the map() calls below
    tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL)
    model = AutoModelForSequenceClassification.from_pretrained(
        PRETRAINED_MODEL, num_labels=n_classes)

    # tokenize datasets; batched=True passes lists of texts to the tokenizer
    # instead of calling it once per example
    tokenized_train_set = train_set.map(tokenize_function, batched=True)
    tokenized_test_set = test_set.map(tokenize_function, batched=True)

    # TODO: Main-lab-Q7 - remove this section once you are ready to execute on a GPU
    #  create a smaller subset of the dataset
    n_samples = 400
    # clamp so select() never asks for more rows than a split holds
    small_train_dataset = tokenized_train_set.shuffle(seed=42).select(
        range(min(n_samples, len(tokenized_train_set))))
    small_eval_dataset = tokenized_test_set.shuffle(seed=42).select(
        range(min(n_samples, len(tokenized_test_set))))

    # TODO: Main-lab-Q7 - customize hyperparameters once you are ready to execute on a GPU
    # training setup
    print(f'\nDataset: {DATASET}\nPre-Trained model: {PRETRAINED_MODEL}\n')

    # NOTE(review): recent transformers releases spell `evaluation_strategy`
    # as `eval_strategy` - confirm against the installed version.
    args = TrainingArguments(
        output_dir="output",
        evaluation_strategy="epoch",
        num_train_epochs=10,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
    )
    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=small_train_dataset,
        eval_dataset=small_eval_dataset,
        compute_metrics=compute_metrics,
    )

    # train
    # NOTE(review): trainer.train() returns a training summary rather than
    # the model itself; the name is kept because other cells may read it.
    trained_model = trainer.train()

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.bias', 'pre_classifier.weight', 'classifier.we

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/662 [00:00<?, ? examples/s]


Dataset: MR
Pre-Trained model: distilbert-base-uncased





Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.601335,0.72
2,No log,1.172449,0.735
3,No log,1.169616,0.775
4,No log,1.233487,0.8075
5,0.225600,1.372808,0.7975
6,0.225600,1.453992,0.7925
7,0.225600,1.514231,0.79
8,0.225600,1.544617,0.79
9,0.225600,1.543927,0.785
10,0.000200,1.552416,0.785


In [None]:
import numpy as np
import evaluate
from datasets import Dataset
from transformers import TrainingArguments, Trainer, AutoTokenizer, AutoModelForSequenceClassification
from sklearn.preprocessing import LabelEncoder
import torch
# Ask PyTorch to release cached CUDA memory before this run starts.
torch.cuda.empty_cache()
# Candidate checkpoint names per dataset. Nothing in this cell reads these
# lists; PRETRAINED_MODEL is set by hand further down.
PRETRAINED_MODEL_MR = ['bert-base-cased', 'bert-base-uncased', 'distilbert-base-uncased', 'distilroberta-base']
PRETRAINED_MODEL_SEM = ['distilbert-base-cased', 'distilbert-base-uncased', 'albert-base-v2', 'bert-base-uncased']

# Accuracy metric object used by compute_metrics.
metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """
    Score one evaluation pass with the module-level `metric`.

    Args:
        eval_pred: an object that unpacks into (logits, labels)

    Returns: whatever `metric.compute` returns for the arg-max predictions
    """
    scores, references = eval_pred
    # the predicted class is the index of the largest score on the last axis
    predicted = np.argmax(scores, axis=-1)
    return metric.compute(predictions=predicted, references=references)


def tokenize_function(examples):
    """
    Tokenize the "text" field of `examples` with the global `tokenizer`,
    using padding="max_length" and truncation.
    """
    texts = examples["text"]
    return tokenizer(texts, padding="max_length", truncation=True)


def prepare_dataset(X, y):
    """
    Pair texts with labels and wrap them in a Dataset.

    Args:
        X: iterable of texts
        y: iterable of labels, aligned with X

    Returns: a Dataset with a 'text' and a 'label' column; pairing stops at
        the shorter of the two inputs
    """
    pairs = list(zip(X, y))
    return Dataset.from_dict({
        'text': [text for text, _ in pairs],
        'label': [label for _, label in pairs],
    })


if __name__ == '__main__':
    # Fine-tune one pre-trained checkpoint on one sentiment dataset and
    # evaluate it after every epoch.

    # load the raw data
    DATASET = 'MR'
    PRETRAINED_MODEL = 'distilroberta-base'
    if DATASET == "Semeval2017A":
        X_train, y_train, X_test, y_test = load_Semeval2017A()
    elif DATASET == "MR":
        X_train, y_train, X_test, y_test = load_MR()
    else:
        raise ValueError("Invalid dataset")

    # encode labels: the encoder is fitted on the training labels only
    le = LabelEncoder()
    le.fit(list(set(y_train)))
    y_train = le.transform(y_train)
    y_test = le.transform(y_test)
    n_classes = len(list(le.classes_))

    # prepare datasets
    train_set = prepare_dataset(X_train, y_train)
    test_set = prepare_dataset(X_test, y_test)

    # define model and tokenizer; tokenize_function reads `tokenizer` as a
    # global, so it has to exist before the map() calls below
    tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL)
    model = AutoModelForSequenceClassification.from_pretrained(
        PRETRAINED_MODEL, num_labels=n_classes)

    # tokenize datasets; batched=True passes lists of texts to the tokenizer
    # instead of calling it once per example
    tokenized_train_set = train_set.map(tokenize_function, batched=True)
    tokenized_test_set = test_set.map(tokenize_function, batched=True)

    # TODO: Main-lab-Q7 - remove this section once you are ready to execute on a GPU
    #  create a smaller subset of the dataset
    n_samples = 400
    # clamp so select() never asks for more rows than a split holds
    small_train_dataset = tokenized_train_set.shuffle(seed=42).select(
        range(min(n_samples, len(tokenized_train_set))))
    small_eval_dataset = tokenized_test_set.shuffle(seed=42).select(
        range(min(n_samples, len(tokenized_test_set))))

    # TODO: Main-lab-Q7 - customize hyperparameters once you are ready to execute on a GPU
    # training setup
    print(f'\nDataset: {DATASET}\nPre-Trained model: {PRETRAINED_MODEL}\n')

    # NOTE(review): recent transformers releases spell `evaluation_strategy`
    # as `eval_strategy` - confirm against the installed version.
    args = TrainingArguments(
        output_dir="output",
        evaluation_strategy="epoch",
        num_train_epochs=10,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
    )
    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=small_train_dataset,
        eval_dataset=small_eval_dataset,
        compute_metrics=compute_metrics,
    )

    # train
    # NOTE(review): trainer.train() returns a training summary rather than
    # the model itself; the name is kept because other cells may read it.
    trained_model = trainer.train()

Downloading (…)lve/main/config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/331M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.bia

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/662 [00:00<?, ? examples/s]


Dataset: MR
Pre-Trained model: distilroberta-base





Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.633308,0.675
2,No log,0.870643,0.7625
3,No log,1.325891,0.755
4,No log,1.291804,0.7775
5,0.342200,1.625464,0.7725
6,0.342200,1.512927,0.7875
7,0.342200,1.546722,0.8
8,0.342200,1.626568,0.795
9,0.342200,1.629778,0.7975
10,0.010200,1.637603,0.8


In [None]:
import numpy as np
import evaluate
from datasets import Dataset
from transformers import TrainingArguments, Trainer, AutoTokenizer, AutoModelForSequenceClassification
from sklearn.preprocessing import LabelEncoder
import torch
# Ask PyTorch to release cached CUDA memory before this run starts.
torch.cuda.empty_cache()
# Candidate checkpoint names per dataset. Nothing in this cell reads these
# lists; PRETRAINED_MODEL is set by hand further down.
PRETRAINED_MODEL_MR = ['bert-base-cased', 'bert-base-uncased', 'distilbert-base-uncased', 'distilroberta-base']
PRETRAINED_MODEL_SEM = ['distilbert-base-cased', 'distilbert-base-uncased', 'albert-base-v2', 'bert-base-uncased']

# Accuracy metric object used by compute_metrics.
metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """
    Score one evaluation pass with the module-level `metric`.

    Args:
        eval_pred: an object that unpacks into (logits, labels)

    Returns: whatever `metric.compute` returns for the arg-max predictions
    """
    scores, references = eval_pred
    # the predicted class is the index of the largest score on the last axis
    predicted = np.argmax(scores, axis=-1)
    return metric.compute(predictions=predicted, references=references)


def tokenize_function(examples):
    """
    Tokenize the "text" field of `examples` with the global `tokenizer`,
    using padding="max_length" and truncation.
    """
    texts = examples["text"]
    return tokenizer(texts, padding="max_length", truncation=True)


def prepare_dataset(X, y):
    """
    Pair texts with labels and wrap them in a Dataset.

    Args:
        X: iterable of texts
        y: iterable of labels, aligned with X

    Returns: a Dataset with a 'text' and a 'label' column; pairing stops at
        the shorter of the two inputs
    """
    pairs = list(zip(X, y))
    return Dataset.from_dict({
        'text': [text for text, _ in pairs],
        'label': [label for _, label in pairs],
    })


if __name__ == '__main__':
    # Fine-tune one pre-trained checkpoint on one sentiment dataset and
    # evaluate it after every epoch.

    # load the raw data
    DATASET = 'Semeval2017A'
    PRETRAINED_MODEL = 'distilbert-base-cased'
    if DATASET == "Semeval2017A":
        X_train, y_train, X_test, y_test = load_Semeval2017A()
    elif DATASET == "MR":
        X_train, y_train, X_test, y_test = load_MR()
    else:
        raise ValueError("Invalid dataset")

    # encode labels: the encoder is fitted on the training labels only
    le = LabelEncoder()
    le.fit(list(set(y_train)))
    y_train = le.transform(y_train)
    y_test = le.transform(y_test)
    n_classes = len(list(le.classes_))

    # prepare datasets
    train_set = prepare_dataset(X_train, y_train)
    test_set = prepare_dataset(X_test, y_test)

    # define model and tokenizer; tokenize_function reads `tokenizer` as a
    # global, so it has to exist before the map() calls below
    tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL)
    model = AutoModelForSequenceClassification.from_pretrained(
        PRETRAINED_MODEL, num_labels=n_classes)

    # tokenize datasets; batched=True passes lists of texts to the tokenizer
    # instead of calling it once per example
    tokenized_train_set = train_set.map(tokenize_function, batched=True)
    tokenized_test_set = test_set.map(tokenize_function, batched=True)

    # TODO: Main-lab-Q7 - remove this section once you are ready to execute on a GPU
    #  create a smaller subset of the dataset
    n_samples = 400
    # clamp so select() never asks for more rows than a split holds
    small_train_dataset = tokenized_train_set.shuffle(seed=42).select(
        range(min(n_samples, len(tokenized_train_set))))
    small_eval_dataset = tokenized_test_set.shuffle(seed=42).select(
        range(min(n_samples, len(tokenized_test_set))))

    # TODO: Main-lab-Q7 - customize hyperparameters once you are ready to execute on a GPU
    # training setup
    print(f'\nDataset: {DATASET}\nPre-Trained model: {PRETRAINED_MODEL}\n')

    # NOTE(review): recent transformers releases spell `evaluation_strategy`
    # as `eval_strategy` - confirm against the installed version.
    args = TrainingArguments(
        output_dir="output",
        evaluation_strategy="epoch",
        num_train_epochs=10,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
    )
    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=small_train_dataset,
        eval_dataset=small_eval_dataset,
        compute_metrics=compute_metrics,
    )

    # train
    # NOTE(review): trainer.train() returns a training summary rather than
    # the model itself; the name is kept because other cells may read it.
    trained_model = trainer.train()

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/263M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-cased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['pre_classifier.bias', 'classifier.bias', 'pre_classifier.w

Map:   0%|          | 0/49570 [00:00<?, ? examples/s]

Map:   0%|          | 0/12284 [00:00<?, ? examples/s]


Dataset: Semeval2017A
Pre-Trained model: distilbert-base-cased





Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.994051,0.465
2,No log,0.961171,0.57
3,No log,1.65669,0.585
4,No log,1.891026,0.5725
5,0.505900,2.287052,0.58
6,0.505900,2.376569,0.58
7,0.505900,2.573236,0.5775
8,0.505900,2.751299,0.5725
9,0.505900,2.771646,0.5725
10,0.001100,2.778908,0.575


In [None]:
import numpy as np
import evaluate
from datasets import Dataset
from transformers import TrainingArguments, Trainer, AutoTokenizer, AutoModelForSequenceClassification
from sklearn.preprocessing import LabelEncoder
import torch
# Ask PyTorch to release cached CUDA memory before this run starts.
torch.cuda.empty_cache()
# Candidate checkpoint names per dataset. Nothing in this cell reads these
# lists; PRETRAINED_MODEL is set by hand further down.
PRETRAINED_MODEL_MR = ['bert-base-cased', 'bert-base-uncased', 'distilbert-base-uncased', 'distilroberta-base']
PRETRAINED_MODEL_SEM = ['distilbert-base-cased', 'distilbert-base-uncased', 'albert-base-v2', 'bert-base-uncased']


# Accuracy metric object used by compute_metrics.
metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """
    Score one evaluation pass with the module-level `metric`.

    Args:
        eval_pred: an object that unpacks into (logits, labels)

    Returns: whatever `metric.compute` returns for the arg-max predictions
    """
    scores, references = eval_pred
    # the predicted class is the index of the largest score on the last axis
    predicted = np.argmax(scores, axis=-1)
    return metric.compute(predictions=predicted, references=references)


def tokenize_function(examples):
    """
    Tokenize the "text" field of `examples` with the global `tokenizer`,
    using padding="max_length" and truncation.
    """
    texts = examples["text"]
    return tokenizer(texts, padding="max_length", truncation=True)


def prepare_dataset(X, y):
    """
    Pair texts with labels and wrap them in a Dataset.

    Args:
        X: iterable of texts
        y: iterable of labels, aligned with X

    Returns: a Dataset with a 'text' and a 'label' column; pairing stops at
        the shorter of the two inputs
    """
    pairs = list(zip(X, y))
    return Dataset.from_dict({
        'text': [text for text, _ in pairs],
        'label': [label for _, label in pairs],
    })


if __name__ == '__main__':
    # Fine-tune one pre-trained checkpoint on one sentiment dataset and
    # evaluate it after every epoch.

    # load the raw data
    DATASET = 'Semeval2017A'
    PRETRAINED_MODEL = 'distilbert-base-uncased'
    if DATASET == "Semeval2017A":
        X_train, y_train, X_test, y_test = load_Semeval2017A()
    elif DATASET == "MR":
        X_train, y_train, X_test, y_test = load_MR()
    else:
        raise ValueError("Invalid dataset")

    # encode labels: the encoder is fitted on the training labels only
    le = LabelEncoder()
    le.fit(list(set(y_train)))
    y_train = le.transform(y_train)
    y_test = le.transform(y_test)
    n_classes = len(list(le.classes_))

    # prepare datasets
    train_set = prepare_dataset(X_train, y_train)
    test_set = prepare_dataset(X_test, y_test)

    # define model and tokenizer; tokenize_function reads `tokenizer` as a
    # global, so it has to exist before the map() calls below
    tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL)
    model = AutoModelForSequenceClassification.from_pretrained(
        PRETRAINED_MODEL, num_labels=n_classes)

    # tokenize datasets; batched=True passes lists of texts to the tokenizer
    # instead of calling it once per example
    tokenized_train_set = train_set.map(tokenize_function, batched=True)
    tokenized_test_set = test_set.map(tokenize_function, batched=True)

    # TODO: Main-lab-Q7 - remove this section once you are ready to execute on a GPU
    #  create a smaller subset of the dataset
    n_samples = 400
    # clamp so select() never asks for more rows than a split holds
    small_train_dataset = tokenized_train_set.shuffle(seed=42).select(
        range(min(n_samples, len(tokenized_train_set))))
    small_eval_dataset = tokenized_test_set.shuffle(seed=42).select(
        range(min(n_samples, len(tokenized_test_set))))

    # TODO: Main-lab-Q7 - customize hyperparameters once you are ready to execute on a GPU
    # training setup
    print(f'\nDataset: {DATASET}\nPre-Trained model: {PRETRAINED_MODEL}\n')

    # NOTE(review): recent transformers releases spell `evaluation_strategy`
    # as `eval_strategy` - confirm against the installed version.
    args = TrainingArguments(
        output_dir="output",
        evaluation_strategy="epoch",
        num_train_epochs=10,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
    )
    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=small_train_dataset,
        eval_dataset=small_eval_dataset,
        compute_metrics=compute_metrics,
    )

    # train
    # NOTE(review): trainer.train() returns a training summary rather than
    # the model itself; the name is kept because other cells may read it.
    trained_model = trainer.train()

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.bias', 'pre_classifier.weight', 'classifier.we

Map:   0%|          | 0/49570 [00:00<?, ? examples/s]

Map:   0%|          | 0/12284 [00:00<?, ? examples/s]


Dataset: Semeval2017A
Pre-Trained model: distilbert-base-uncased





Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.965626,0.465
2,No log,0.852479,0.6075
3,No log,1.57278,0.6075
4,No log,2.018121,0.615
5,0.442300,1.984584,0.63
6,0.442300,2.287934,0.595
7,0.442300,2.465963,0.5975
8,0.442300,2.395487,0.5925
9,0.442300,2.399767,0.605
10,0.000900,2.411487,0.605


In [None]:
import numpy as np
import evaluate
from datasets import Dataset
from transformers import TrainingArguments, Trainer, AutoTokenizer, AutoModelForSequenceClassification
from sklearn.preprocessing import LabelEncoder
import torch
# Ask PyTorch's CUDA caching allocator to release memory it holds but is not using.
torch.cuda.empty_cache()

# Checkpoint names grouped by dataset -- presumably the candidates tried on
# MR and on Semeval2017A respectively (TODO confirm). Nothing in this cell
# reads these lists; the checkpoint actually used is PRETRAINED_MODEL below.
PRETRAINED_MODEL_MR = [
    'bert-base-cased',
    'bert-base-uncased',
    'distilbert-base-uncased',
    'distilroberta-base',
]
PRETRAINED_MODEL_SEM = [
    'distilbert-base-cased',
    'distilbert-base-uncased',
    'albert-base-v2',
    'bert-base-uncased',
]

# Accuracy metric object shared by compute_metrics().
metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """
    Score one evaluation pass with the module-level ``metric``.

    Args:
        eval_pred: a pair that unpacks into ``(logits, labels)``

    Returns: whatever ``metric.compute`` returns for the arg-max predictions
        measured against the labels

    """
    scores, gold = eval_pred
    # the index of the largest score along the last axis is the predicted class
    guesses = np.argmax(scores, axis=-1)
    return metric.compute(references=gold, predictions=guesses)


def tokenize_function(examples):
    """
    Run the module-level ``tokenizer`` over the "text" entry of ``examples``.

    ``tokenizer`` is not a parameter: it is looked up as a global when this
    function runs, so it has to be assigned before the function is used.

    Args:
        examples: a mapping with a "text" entry, which is handed to the
            tokenizer unchanged

    Returns: the tokenizer's output, requested with padding to "max_length"
        and with truncation enabled

    """
    encode_options = {"padding": "max_length", "truncation": True}
    return tokenizer(examples["text"], **encode_options)


def prepare_dataset(X, y):
    """
    Bundle parallel texts and labels into a ``Dataset``.

    Args:
        X: iterable of texts
        y: iterable of labels, aligned with ``X``

    Returns: the result of ``Dataset.from_dict`` with a 'text' and a 'label'
        column. Items are paired with ``zip``, so anything beyond the length
        of the shorter input is dropped.

    """
    pairs = list(zip(X, y))
    columns = {
        'text': [text for text, _ in pairs],
        'label': [label for _, label in pairs],
    }
    return Dataset.from_dict(columns)


if __name__ == '__main__':
    # Fine-tune one pre-trained checkpoint for sequence classification on a
    # small fixed-seed subset of the chosen dataset, evaluating every epoch.

    # load the raw data
    DATASET = 'Semeval2017A'
    PRETRAINED_MODEL = 'albert-base-v2'
    if DATASET == "Semeval2017A":
        X_train, y_train, X_test, y_test = load_Semeval2017A()
    elif DATASET == "MR":
        X_train, y_train, X_test, y_test = load_MR()
    else:
        raise ValueError("Invalid dataset")

    # encode labels
    # The encoder is fitted on the training labels only and then applied to
    # both splits, so train and test share one label -> integer mapping.
    le = LabelEncoder()
    le.fit(list(set(y_train)))
    y_train = le.transform(y_train)
    y_test = le.transform(y_test)
    n_classes = len(list(le.classes_))

    # prepare datasets
    train_set = prepare_dataset(X_train, y_train)
    test_set = prepare_dataset(X_test, y_test)

    # define model and tokenizer
    # `tokenizer` has to live at module level: tokenize_function() reads it
    # as a global instead of receiving it as an argument.
    tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL)
    model = AutoModelForSequenceClassification.from_pretrained(
        PRETRAINED_MODEL, num_labels=n_classes)

    # tokenize datasets
    # batched=True passes many texts to the tokenizer per call instead of one
    # example at a time; the resulting columns are the same.
    tokenized_train_set = train_set.map(tokenize_function, batched=True)
    tokenized_test_set = test_set.map(tokenize_function, batched=True)

    # TODO: Main-lab-Q7 - remove this section once you are ready to execute on a GPU
    #  create a smaller subset of the dataset
    n_samples = 400
    # Clamp to the split size: select() rejects indices past the end of a
    # split, which would otherwise break on a dataset smaller than n_samples.
    small_train_dataset = tokenized_train_set.shuffle(seed=42).select(
        range(min(n_samples, len(tokenized_train_set))))
    small_eval_dataset = tokenized_test_set.shuffle(seed=42).select(
        range(min(n_samples, len(tokenized_test_set))))

    # TODO: Main-lab-Q7 - customize hyperparameters once you are ready to execute on a GPU
    # training setup
    print(f'\nDataset: {DATASET}\nPre-Trained model: {PRETRAINED_MODEL}\n')

    args = TrainingArguments(
        output_dir="output",
        evaluation_strategy="epoch",
        # Log the training loss once per epoch too. With the default
        # step-based logging the per-epoch table showed "No log" for the
        # first epochs and then repeated a stale value, because an epoch on
        # this subset is far shorter than the default logging interval.
        logging_strategy="epoch",
        num_train_epochs=10,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
    )
    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=small_train_dataset,
        eval_dataset=small_eval_dataset,
        compute_metrics=compute_metrics,
    )

    # train
    # NOTE: despite its name, `trained_model` holds the TrainOutput summary
    # returned by trainer.train(); the fine-tuned network itself is `model`.
    # The name is kept so other cells that refer to it keep working.
    trained_model = trainer.train()

Some weights of the model checkpoint at albert-base-v2 were not used when initializing AlbertForSequenceClassification: ['predictions.bias', 'predictions.dense.bias', 'predictions.dense.weight', 'predictions.LayerNorm.bias', 'predictions.decoder.bias', 'predictions.LayerNorm.weight']
- This IS expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model 

Map:   0%|          | 0/49570 [00:00<?, ? examples/s]

Map:   0%|          | 0/12284 [00:00<?, ? examples/s]


Dataset: Semeval2017A
Pre-Trained model: albert-base-v2





Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.056818,0.48
2,No log,1.180015,0.46
3,No log,1.154089,0.46
4,No log,1.121303,0.4325
5,1.095800,1.171339,0.46
6,1.095800,1.091835,0.46
7,1.095800,1.160875,0.46
8,1.095800,1.100862,0.46
9,1.095800,1.124585,0.46
10,1.081600,1.130775,0.46


In [None]:
import numpy as np
import evaluate
from datasets import Dataset
from transformers import TrainingArguments, Trainer, AutoTokenizer, AutoModelForSequenceClassification
from sklearn.preprocessing import LabelEncoder
import torch
# Ask PyTorch's CUDA caching allocator to release memory it holds but is not using.
torch.cuda.empty_cache()

# Checkpoint names grouped by dataset -- presumably the candidates tried on
# MR and on Semeval2017A respectively (TODO confirm). Nothing in this cell
# reads these lists; the checkpoint actually used is PRETRAINED_MODEL below.
PRETRAINED_MODEL_MR = [
    'bert-base-cased',
    'bert-base-uncased',
    'distilbert-base-uncased',
    'distilroberta-base',
]
PRETRAINED_MODEL_SEM = [
    'distilbert-base-cased',
    'distilbert-base-uncased',
    'albert-base-v2',
    'bert-base-uncased',
]

# Accuracy metric object shared by compute_metrics().
metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """
    Score one evaluation pass with the module-level ``metric``.

    Args:
        eval_pred: a pair that unpacks into ``(logits, labels)``

    Returns: whatever ``metric.compute`` returns for the arg-max predictions
        measured against the labels

    """
    scores, gold = eval_pred
    # the index of the largest score along the last axis is the predicted class
    guesses = np.argmax(scores, axis=-1)
    return metric.compute(references=gold, predictions=guesses)


def tokenize_function(examples):
    """
    Run the module-level ``tokenizer`` over the "text" entry of ``examples``.

    ``tokenizer`` is not a parameter: it is looked up as a global when this
    function runs, so it has to be assigned before the function is used.

    Args:
        examples: a mapping with a "text" entry, which is handed to the
            tokenizer unchanged

    Returns: the tokenizer's output, requested with padding to "max_length"
        and with truncation enabled

    """
    encode_options = {"padding": "max_length", "truncation": True}
    return tokenizer(examples["text"], **encode_options)


def prepare_dataset(X, y):
    """
    Bundle parallel texts and labels into a ``Dataset``.

    Args:
        X: iterable of texts
        y: iterable of labels, aligned with ``X``

    Returns: the result of ``Dataset.from_dict`` with a 'text' and a 'label'
        column. Items are paired with ``zip``, so anything beyond the length
        of the shorter input is dropped.

    """
    pairs = list(zip(X, y))
    columns = {
        'text': [text for text, _ in pairs],
        'label': [label for _, label in pairs],
    }
    return Dataset.from_dict(columns)


if __name__ == '__main__':
    # Fine-tune one pre-trained checkpoint for sequence classification on a
    # small fixed-seed subset of the chosen dataset, evaluating every epoch.

    # load the raw data
    DATASET = 'Semeval2017A'
    PRETRAINED_MODEL = 'bert-base-uncased'
    if DATASET == "Semeval2017A":
        X_train, y_train, X_test, y_test = load_Semeval2017A()
    elif DATASET == "MR":
        X_train, y_train, X_test, y_test = load_MR()
    else:
        raise ValueError("Invalid dataset")

    # encode labels
    # The encoder is fitted on the training labels only and then applied to
    # both splits, so train and test share one label -> integer mapping.
    le = LabelEncoder()
    le.fit(list(set(y_train)))
    y_train = le.transform(y_train)
    y_test = le.transform(y_test)
    n_classes = len(list(le.classes_))

    # prepare datasets
    train_set = prepare_dataset(X_train, y_train)
    test_set = prepare_dataset(X_test, y_test)

    # define model and tokenizer
    # `tokenizer` has to live at module level: tokenize_function() reads it
    # as a global instead of receiving it as an argument.
    tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL)
    model = AutoModelForSequenceClassification.from_pretrained(
        PRETRAINED_MODEL, num_labels=n_classes)

    # tokenize datasets
    # batched=True passes many texts to the tokenizer per call instead of one
    # example at a time; the resulting columns are the same.
    tokenized_train_set = train_set.map(tokenize_function, batched=True)
    tokenized_test_set = test_set.map(tokenize_function, batched=True)

    # TODO: Main-lab-Q7 - remove this section once you are ready to execute on a GPU
    #  create a smaller subset of the dataset
    n_samples = 400
    # Clamp to the split size: select() rejects indices past the end of a
    # split, which would otherwise break on a dataset smaller than n_samples.
    small_train_dataset = tokenized_train_set.shuffle(seed=42).select(
        range(min(n_samples, len(tokenized_train_set))))
    small_eval_dataset = tokenized_test_set.shuffle(seed=42).select(
        range(min(n_samples, len(tokenized_test_set))))

    # TODO: Main-lab-Q7 - customize hyperparameters once you are ready to execute on a GPU
    # training setup
    print(f'\nDataset: {DATASET}\nPre-Trained model: {PRETRAINED_MODEL}\n')

    args = TrainingArguments(
        output_dir="output",
        evaluation_strategy="epoch",
        # Log the training loss once per epoch too. With the default
        # step-based logging the per-epoch table showed "No log" for the
        # first epochs and then repeated a stale value, because an epoch on
        # this subset is far shorter than the default logging interval.
        logging_strategy="epoch",
        num_train_epochs=10,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
    )
    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=small_train_dataset,
        eval_dataset=small_eval_dataset,
        compute_metrics=compute_metrics,
    )

    # train
    # NOTE: despite its name, `trained_model` holds the TrainOutput summary
    # returned by trainer.train(); the fine-tuned network itself is `model`.
    # The name is kept so other cells that refer to it keep working.
    trained_model = trainer.train()

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly i

Map:   0%|          | 0/49570 [00:00<?, ? examples/s]

Map:   0%|          | 0/12284 [00:00<?, ? examples/s]


Dataset: Semeval2017A
Pre-Trained model: bert-base-uncased





Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.056626,0.46
2,No log,0.906902,0.595
3,No log,1.037395,0.6225
4,No log,1.890174,0.5825
5,0.714000,1.730373,0.6225
6,0.714000,1.922395,0.62
7,0.714000,2.241256,0.6
8,0.714000,2.357258,0.6125
9,0.714000,2.380605,0.6025
10,0.066400,2.403538,0.6025
