In [24]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.svm import SVR

from datasets import Dataset
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding
from torch.utils.data import DataLoader
import torch
import tqdm
from transformers import TrainingArguments, Trainer

In [2]:
def compute_metrics_for_regression_sklearn(y_pred, y_true):
    mse = mean_squared_error(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    single_squared_errors = ((y_pred - y_true)**2).tolist()

    # Compute accuracy
    # Based on the fact that the rounded score = true score only if |single_squared_errors| < 0.5
    accuracy = sum([1 for e in single_squared_errors if e < 0.25]) / len(single_squared_errors)

    return {"mse": mse, "mae": mae, "r2": r2, "accuracy": accuracy}


def compute_metrics_for_regression(eval_pred):
    logits, labels = eval_pred
    labels = labels.reshape(-1, 1)

    mse = mean_squared_error(labels, logits)
    mae = mean_absolute_error(labels, logits)
    r2 = r2_score(labels, logits)
    single_squared_errors = ((logits - labels).flatten()**2).tolist()

    # Compute accuracy
    # Based on the fact that the rounded score = true score only if |single_squared_errors| < 0.5
    accuracy = sum([1 for e in single_squared_errors if e < 0.25]) / len(single_squared_errors)

    return {"mse": mse, "mae": mae, "r2": r2, "accuracy": accuracy}

## Load Data and Manipulate it

In [3]:
lens = 0

base_path = "semeval-2017-tweets_Subtask-A/downloaded/"
base_df = pd.DataFrame()
colnames=['id', 'sentiment', 'tweet']
for df_path in os.listdir(base_path):
    path = os.path.join(base_path, df_path)
    print(df_path)
    df = pd.read_csv(path, sep="\t", header=None)
    if df.shape[1] > 3:
        df = df.iloc[:, 0:3]
    df.columns = colnames
    print(df["sentiment"].value_counts())
    print("================")
    base_df = pd.concat([base_df, df], ignore_index=True)
    lens += len(df)

twitter-2016dev-A.tsv
sentiment
positive    829
neutral     746
negative    391
Name: count, dtype: int64
twitter-2016test-A.tsv
sentiment
neutral     10342
positive     7059
negative     3231
Name: count, dtype: int64
twitter-2013train-A.tsv
sentiment
neutral     4586
positive    3640
negative    1458
Name: count, dtype: int64
twitter-2016train-A.tsv
sentiment
positive    3017
neutral     2001
negative     850
Name: count, dtype: int64
twitter-2015test-A.tsv
sentiment
positive    1038
neutral      987
negative     365
Name: count, dtype: int64
twitter-2015train-A.tsv
sentiment
neutral     253
positive    170
negative     66
Name: count, dtype: int64
twitter-2013dev-A.tsv
sentiment
neutral     739
positive    575
negative    340
Name: count, dtype: int64
twitter-2016devtest-A.tsv
sentiment
positive    994
neutral     681
negative    325
Name: count, dtype: int64
twitter-2013test-A.tsv
sentiment
neutral     1513
positive    1475
negative     559
Name: count, dtype: int64
twitter-2014sar

In [4]:
base_df

Unnamed: 0,id,sentiment,tweet
0,638060586258038784,neutral,05 Beat it - Michael Jackson - Thriller (25th ...
1,638061181823922176,positive,Jay Z joins Instagram with nostalgic tribute t...
2,638083821364244480,neutral,Michael Jackson: Bad 25th Anniversary Edition ...
3,638091450132078593,positive,I liked a @YouTube video http://t.co/AaR3pjp2P...
4,638125563790557184,positive,18th anniv of Princess Diana's death. I still ...
...,...,...,...
50127,210378118865756160,neutral,It's a Wednesday girls night out as '90's band...
50128,245177521304399872,positive,"night college course sorted, just have to enro..."
50129,259280987089932288,positive,For the 1st time in 30 years. For your splendi...
50130,201113950211940352,positive,NURSES DAY - 12 MAY 2012. Nursing: The heart b...


In [5]:
base_df.drop_duplicates(inplace=True)

In [6]:
base_df

Unnamed: 0,id,sentiment,tweet
0,638060586258038784,neutral,05 Beat it - Michael Jackson - Thriller (25th ...
1,638061181823922176,positive,Jay Z joins Instagram with nostalgic tribute t...
2,638083821364244480,neutral,Michael Jackson: Bad 25th Anniversary Edition ...
3,638091450132078593,positive,I liked a @YouTube video http://t.co/AaR3pjp2P...
4,638125563790557184,positive,18th anniv of Princess Diana's death. I still ...
...,...,...,...
50127,210378118865756160,neutral,It's a Wednesday girls night out as '90's band...
50128,245177521304399872,positive,"night college course sorted, just have to enro..."
50129,259280987089932288,positive,For the 1st time in 30 years. For your splendi...
50130,201113950211940352,positive,NURSES DAY - 12 MAY 2012. Nursing: The heart b...


In [7]:
X = base_df["tweet"]
y = base_df["sentiment"]

In [8]:
y.value_counts()

sentiment
neutral     22182
positive    19572
negative     7713
Name: count, dtype: int64

In [9]:
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42, shuffle=True)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.2, stratify=y_train_val, random_state=42, shuffle=True)

In [10]:
len(X_train), len(X_val), len(X_test)

(31658, 7915, 9894)

In [11]:
y_train = y_train.map({"negative": 0, "neutral": 1, "positive":2})
y_val = y_val.map({"negative": 0, "neutral": 1, "positive":2})
y_test = y_test.map({"negative": 0, "neutral": 1, "positive":2})

## Baseline Model

In [12]:
count_vect = CountVectorizer(stop_words="english", lowercase=True, ngram_range=(1, 3), analyzer="word")
X_train_counts = count_vect.fit_transform(X_train)
X_val_counts = count_vect.transform(X_val)
X_test_counts = count_vect.transform(X_test)

In [13]:
tfidf_transformer = TfidfTransformer().fit(X_train_counts)
X_train_tfidf = tfidf_transformer.transform(X_train_counts)
X_val_tfidf = tfidf_transformer.transform(X_val_counts)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)

In [None]:
svr = SVR()
svr.fit(X_train_tfidf, y_train)

In [None]:
y_val_pred_svr = svr.predict(X_val_tfidf)
r2_score(y_val, y_val_pred_svr)

In [None]:
compute_metrics_for_regression_sklearn(y_val_pred_svr, y_val)

## Fine-tuning Standard Bert

In [12]:
# train_ds = Dataset.from_pandas(pd.DataFrame({"text": X_train, "label": y_train}))
# val_ds = Dataset.from_pandas(pd.DataFrame({"text": X_val, "label": y_val}))
# test_ds = Dataset.from_pandas(pd.DataFrame({"text": X_test, "label": y_test}))
class SemevalDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [13]:
BASE_MODEL = "distilbert-base-uncased"
LEARNING_RATE = 2e-5
MAX_LENGTH = 256
BATCH_SIZE = 4
EPOCHS = 20

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
model = AutoModelForSequenceClassification.from_pretrained(BASE_MODEL, num_labels=1)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
model.train()

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [15]:
train_encodings = tokenizer(X_train.values.tolist(), truncation=True, padding=True)
val_encodings = tokenizer(X_val.values.tolist(), truncation=True, padding=True)
test_encodings = tokenizer(X_test.values.tolist(), truncation=True, padding=True)

In [16]:
train_dataset = SemevalDataset(train_encodings, y_train.values.tolist())
val_dataset = SemevalDataset(val_encodings, y_val.values.tolist())
test_dataset = SemevalDataset(test_encodings, y_test.values.tolist())

In [17]:
# ds = {"train": train_ds, "validation": val_ds, "test": test_ds}

# def preprocess_function(examples):
#     examples = tokenizer(examples["text"], truncation=True, padding="max_length", max_length=MAX_LENGTH)
#     # examples["label"] = float(examples["label"])

#     return examples

# for split in ds:
#     ds[split] = ds[split].map(preprocess_function, remove_columns=["__index_level_0__", "text"])

In [26]:
torch.device("mps")

device(type='mps')

In [27]:
from torch.optim import AdamW
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
device = torch.device('mps')
model.to(device)
optim = AdamW(model.parameters(), lr=5e-5)

for epoch in tqdm.tqdm(range(3)):
    for batch in tqdm.tqdm(train_loader):
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        labels = labels.to(torch.float32)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs[0]
        loss.backward()
        optim.step()
        print("prca")

model.eval()

  0%|                                                                                                                                                                                           | 0/3 [00:00<?, ?it/s]
  0%|                                                                                                                                                                                        | 0/1979 [00:00<?, ?it/s][A
  0%|                                                                                                                                                                              | 1/1979 [00:05<2:51:09,  5.19s/it][A

prca



  0%|▏                                                                                                                                                                             | 2/1979 [00:06<1:35:31,  2.90s/it][A

prca



  0%|▎                                                                                                                                                                             | 3/1979 [00:07<1:08:55,  2.09s/it][A

prca



  0%|▎                                                                                                                                                                               | 4/1979 [00:08<56:29,  1.72s/it][A

prca



  0%|▍                                                                                                                                                                               | 5/1979 [00:09<49:18,  1.50s/it][A

prca


  0%|▍                                                                                                                                                                             | 5/1979 [00:10<1:09:17,  2.11s/it]
  0%|                                                                                                                                                                                           | 0/3 [00:10<?, ?it/s]

KeyboardInterrupt



In [19]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
training_args = TrainingArguments(
    output_dir="./models/bert-base-uncased",          # output directory
    num_train_epochs=EPOCHS,              # total number of training epochs
    per_device_train_batch_size=4,  # batch size per device during training
    per_device_eval_batch_size=BATCH_SIZE,   # batch size for evaluation
#     warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
#     logging_steps=10,
)

# model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset,             # evaluation dataset
    compute_metrics=compute_metrics_for_regression,
)

trainer.train()

In [17]:
ds

{'train': Dataset({
     features: ['label', 'input_ids', 'attention_mask'],
     num_rows: 31658
 }),
 'validation': Dataset({
     features: ['label', 'input_ids', 'attention_mask'],
     num_rows: 7915
 }),
 'test': Dataset({
     features: ['label', 'input_ids', 'attention_mask'],
     num_rows: 9894
 })}

In [18]:
# training_args = TrainingArguments(
#     output_dir="./models/bert-base-uncased",
#     learning_rate=LEARNING_RATE,
#     per_device_train_batch_size=BATCH_SIZE,
#     per_device_eval_batch_size=BATCH_SIZE,
#     num_train_epochs=EPOCHS,
#     evaluation_strategy="epoch",
#     save_strategy="epoch",
#     save_total_limit=2,
#     metric_for_best_model="accuracy",
#     load_best_model_at_end=True,
#     weight_decay=0.01,
# )

training_args = TrainingArguments(
   output_dir="./models/bert-base-uncased",
   learning_rate=2e-5,
   per_device_train_batch_size=16,
   per_device_eval_batch_size=16,
   num_train_epochs=2,
   weight_decay=0.01,
   save_strategy="epoch",
#    push_to_hub=True,
)

In [19]:
class RegressionTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        print(len(inputs))
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        print("hima ste em")
        logits = outputs[0][:, 0]
        loss = torch.nn.functional.mse_loss(logits, labels)
        print("hasel em ste")
        return (loss, outputs) if return_outputs else loss

In [None]:
# trainer = RegressionTrainer(
#     model=model,
#     args=training_args,
#     train_dataset=ds["train"],
#     eval_dataset=ds["validation"],
#     compute_metrics=compute_metrics_for_regression,
#     data_collator=data_collator,
#     tokenizer=tokenizer
# )
trainer = Trainer(
   model=model,
   args=training_args,
   train_dataset=ds["train"],
   eval_dataset=ds["validation"],
   tokenizer=tokenizer,
   data_collator=data_collator,
   compute_metrics=compute_metrics_for_regression,
)

trainer.train()

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
