In [None]:
!pip install datasets transformers huggingface_hub
!apt-get install git-lfs
!pip install git+https://github.com/huggingface/accelerate
!pip install vaderSentiment


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
git-lfs is already the newest version (3.0.2-1ubuntu0.2).
0 upgraded, 0 newly installed, 0 to remove and 45 not upgraded.
Collecting git+https://github.com/huggingface/accelerate
  Cloning https://github.com/huggingface/accelerate to /tmp/pip-req-build-361jxuw9
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/accelerate /tmp/pip-req-build-361jxuw9
  Resolved https://github.com/huggingface/accelerate to commit abce3604f0de71cb947d6cf653c51b3e19f65162
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [None]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Both the tokenizer and the sequence-classification model come from the same Hub checkpoint.
hub_checkpoint = "joshu710/CMSC473"
tokenizer = AutoTokenizer.from_pretrained(hub_checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(hub_checkpoint)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
from datasets import load_dataset, concatenate_datasets

# Load the four language configurations of the same Hub dataset, in the
# order English, Japanese, Spanish, Chinese.
dataset_en, dataset_jp, dataset_sp, dataset_cn = (
    load_dataset("tyqiangz/multilingual-sentiments", language)
    for language in ("english", "japanese", "spanish", "chinese")
)


In [None]:
# One seed for every shuffle so the row order (and therefore the subsets
# taken below) is reproducible across runs.
SHUFFLE_SEED = 42
# Japanese and Chinese are cut down to the first N rows of the shuffled
# split; English and Spanish are used in full.
TRAIN_SUBSET_SIZE = 3000
TEST_SUBSET_SIZE = 1000

en_train_dataset = dataset_en["train"].shuffle(seed=SHUFFLE_SEED)
en_test_dataset = dataset_en["test"].shuffle(seed=SHUFFLE_SEED)

# `select` accepts a range directly; no need to materialise index lists.
jp_train_dataset = dataset_jp["train"].shuffle(seed=SHUFFLE_SEED).select(range(TRAIN_SUBSET_SIZE))
jp_test_dataset = dataset_jp["test"].shuffle(seed=SHUFFLE_SEED).select(range(TEST_SUBSET_SIZE))

sp_train_dataset = dataset_sp["train"].shuffle(seed=SHUFFLE_SEED)
sp_test_dataset = dataset_sp["test"].shuffle(seed=SHUFFLE_SEED)

cn_train_dataset = dataset_cn["train"].shuffle(seed=SHUFFLE_SEED).select(range(TRAIN_SUBSET_SIZE))
cn_test_dataset = dataset_cn["test"].shuffle(seed=SHUFFLE_SEED).select(range(TEST_SUBSET_SIZE))

In [None]:
def preprocess_function(examples):
    """Tokenize the ``text`` field of ``examples`` with truncation enabled.

    Uses the notebook-level ``tokenizer`` and returns the result of that
    tokenizer call unchanged.
    """
    batch_text = examples["text"]
    encoded = tokenizer(batch_text, truncation=True)
    return encoded

# Run preprocess_function over each train/test pair in batched mode.
# Languages are processed in the order English, Japanese, Spanish, Chinese,
# and within each language the train split is mapped before the test split.
en_tokenized_train, en_tokenized_test = (
    split.map(preprocess_function, batched=True)
    for split in (en_train_dataset, en_test_dataset)
)

jp_tokenized_train, jp_tokenized_test = (
    split.map(preprocess_function, batched=True)
    for split in (jp_train_dataset, jp_test_dataset)
)

sp_tokenized_train, sp_tokenized_test = (
    split.map(preprocess_function, batched=True)
    for split in (sp_train_dataset, sp_test_dataset)
)

cn_tokenized_train, cn_tokenized_test = (
    split.map(preprocess_function, batched=True)
    for split in (cn_train_dataset, cn_test_dataset)
)

In [None]:
from transformers import DataCollatorWithPadding
# Batch collator built around the notebook's tokenizer; handed to every Trainer below.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


In [None]:
import numpy as np
from datasets import load_metric

def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged F1 for a ``(logits, labels)`` pair.

    The metrics are computed directly with NumPy instead of calling the
    deprecated ``load_metric`` on every evaluation, which reloaded both
    metric scripts each time and produced the ``trust_remote_code``
    warnings seen in the cell outputs.

    Args:
        eval_pred: A two-item sequence ``(logits, labels)``. ``logits`` is
            reduced with ``argmax`` over its last axis to obtain the
            predicted class ids; ``labels`` holds the reference class ids.

    Returns:
        dict with two Python floats:
            ``"accuracy"`` - fraction of predictions equal to the label.
            ``"f1"`` - unweighted mean of the per-class F1 scores, taken
            over every class id that occurs in the labels or predictions.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    labels = np.asarray(labels)

    accuracy = float(np.mean(predictions == labels))

    per_class_f1 = []
    # Every class in the union occurs at least once in labels or predictions,
    # so the denominator below is never zero.
    for class_id in np.union1d(labels, predictions):
        predicted_as_class = predictions == class_id
        labelled_as_class = labels == class_id
        true_positives = np.sum(predicted_as_class & labelled_as_class)
        # F1 = 2*TP / (2*TP + FP + FN) = 2*TP / (#predicted + #labelled)
        denominator = np.sum(predicted_as_class) + np.sum(labelled_as_class)
        per_class_f1.append(2.0 * true_positives / denominator)
    f1 = float(np.mean(per_class_f1))

    return {"accuracy": accuracy, "f1": f1}

In [None]:
from transformers import TrainingArguments, Trainer
from collections import Counter
from sklearn.metrics import f1_score, accuracy_score
repo_name = "CMSC473"

# Settings object passed to each Trainer built below.
training_args = TrainingArguments(
    output_dir=repo_name,
    # optimisation
    num_train_epochs=2,
    learning_rate=2e-5,
    weight_decay=0.01,
    # batching
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # evaluation / checkpoint cadence
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

In [None]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer


# English: build a Trainer around the loaded model and score it on the test split.
# The train split is supplied, but this cell only calls evaluate().
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=en_tokenized_train,
    eval_dataset=en_tokenized_test,
)
trainer.evaluate()








  load_accuracy = load_metric("accuracy")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


{'eval_loss': 0.7144548296928406,
 'eval_accuracy': 0.6770114942528735,
 'eval_f1': 0.6695419855265502,
 'eval_runtime': 5.6114,
 'eval_samples_per_second': 155.042,
 'eval_steps_per_second': 9.802}

In [None]:
# Lexicon baseline: label each English test sentence with VADER and score the
# result against the dataset labels.
# NOTE(review): the 0/1/2 mapping below assumes the dataset encodes
# 0=positive, 1=neutral, 2=negative -- confirm against the dataset card.
sid_obj = SentimentIntensityAnalyzer()  # construct once instead of once per sentence

predictions = []
for sent in en_tokenized_test['text']:
  sentiment_dict = sid_obj.polarity_scores(sent)
  if sentiment_dict['compound'] >= 0.05:
    predictions.append(0)
  elif sentiment_dict['compound'] <= - 0.05:
    predictions.append(2)
  else:
    predictions.append(1)

print(predictions)
correct = en_tokenized_test['label']
print(correct)
accuracy = accuracy_score(correct, predictions)
f1_macro = f1_score(correct, predictions, average='macro')
f1_micro = f1_score(correct, predictions, average='micro')

print("Accuracy:", accuracy)
print("Macro F1:", f1_macro)
# The micro-averaged line must report f1_micro, not f1_macro.
print("Micro F1:", f1_micro)





[0, 1, 2, 0, 2, 2, 0, 2, 1, 1, 0, 0, 1, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 2, 1, 2, 1, 0, 0, 1, 0, 2, 2, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 1, 2, 1, 1, 0, 2, 0, 2, 1, 2, 2, 2, 0, 0, 1, 2, 1, 0, 0, 0, 2, 2, 0, 2, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 2, 2, 1, 2, 0, 0, 0, 1, 1, 0, 2, 0, 0, 0, 0, 0, 1, 1, 1, 0, 2, 1, 1, 2, 1, 2, 2, 0, 1, 1, 0, 0, 1, 1, 2, 0, 2, 0, 2, 0, 0, 2, 2, 0, 2, 0, 0, 1, 1, 0, 0, 0, 1, 2, 0, 1, 2, 1, 1, 0, 0, 0, 1, 1, 2, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 2, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 2, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 2, 2, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 2, 2, 2, 0, 0, 0, 2, 1, 2, 0, 0, 0, 0, 2, 2, 0, 0, 1, 1, 0, 0, 2, 1, 2, 0, 0, 2, 0, 0, 1, 0, 1, 0, 2, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 2, 0, 2, 1, 2, 2, 2, 1, 0, 1, 0, 0, 0, 2, 0, 2, 1, 0, 1, 0, 0, 2, 2, 1, 2, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 2, 2, 0, 1, 0, 1, 0, 2, 2, 0, 2, 0, 0, 0, 1, 0, 2, 1, 0, 0, 1, 0, 2, 0, 1, 2, 2, 2, 0, 0, 0, 1, 2, 0, 1, 1, 1, 2, 1, 0, 0, 1, 0, 2, 2, 0, 0, 1, 2, 0, 0, 0, 2, 2, 1, 0, 0, 0, 0, 1, 1, 

In [None]:
# Spanish: rebind `trainer` to the Spanish splits and score the model on the test split.
# The train split is supplied, but this cell only calls evaluate().
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=sp_tokenized_train,
    eval_dataset=sp_tokenized_test,
)
trainer.evaluate()

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


{'eval_loss': 0.8078656792640686,
 'eval_accuracy': 0.6459770114942529,
 'eval_f1': 0.6400475915414066,
 'eval_runtime': 3.7183,
 'eval_samples_per_second': 233.976,
 'eval_steps_per_second': 14.792}

In [None]:
# Japanese: rebind `trainer` to the Japanese subsets and score the model on the test subset.
# The train subset is supplied, but this cell only calls evaluate().
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=jp_tokenized_train,
    eval_dataset=jp_tokenized_test,
)
trainer.evaluate()

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


{'eval_loss': 0.5997752547264099,
 'eval_accuracy': 0.735,
 'eval_f1': 0.7277418772316994,
 'eval_runtime': 11.5201,
 'eval_samples_per_second': 86.805,
 'eval_steps_per_second': 5.469}

In [None]:
# Chinese: rebind `trainer` to the Chinese subsets and score the model on the test subset.
# The train subset is supplied, but this cell only calls evaluate().
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=cn_tokenized_train,
    eval_dataset=cn_tokenized_test,
)
trainer.evaluate()

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


{'eval_loss': 0.5439351797103882,
 'eval_accuracy': 0.784,
 'eval_f1': 0.7854031134380772,
 'eval_runtime': 8.3188,
 'eval_samples_per_second': 120.21,
 'eval_steps_per_second': 7.573}

In [None]:
# Print the full 'label' column of the English test split (the same list the VADER cell scores against).
print(en_tokenized_test['label'])

[0, 1, 2, 0, 2, 2, 2, 1, 2, 1, 2, 2, 2, 1, 1, 2, 0, 1, 1, 0, 1, 2, 1, 2, 0, 1, 1, 2, 0, 2, 1, 1, 2, 0, 0, 2, 1, 2, 2, 1, 0, 2, 2, 1, 2, 1, 2, 1, 1, 2, 1, 1, 2, 0, 2, 0, 1, 0, 0, 2, 2, 2, 1, 1, 2, 1, 0, 0, 0, 2, 0, 0, 2, 0, 1, 1, 1, 0, 1, 0, 1, 2, 0, 2, 0, 1, 2, 0, 0, 0, 1, 0, 0, 2, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 2, 1, 0, 2, 0, 0, 0, 2, 0, 1, 1, 0, 0, 2, 1, 0, 0, 0, 0, 1, 0, 2, 1, 1, 0, 0, 0, 1, 2, 1, 2, 0, 1, 1, 0, 1, 2, 0, 0, 2, 1, 2, 0, 1, 1, 1, 0, 1, 2, 1, 2, 1, 2, 1, 2, 2, 0, 1, 0, 2, 1, 0, 0, 2, 2, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 2, 1, 0, 2, 0, 0, 1, 2, 1, 1, 1, 0, 1, 2, 0, 0, 0, 0, 2, 2, 1, 0, 0, 0, 1, 2, 2, 0, 1, 2, 1, 0, 0, 2, 0, 1, 0, 0, 2, 0, 0, 1, 2, 2, 0, 1, 1, 2, 1, 1, 0, 0, 0, 0, 2, 2, 1, 0, 2, 0, 1, 0, 2, 0, 0, 2, 0, 2, 1, 2, 2, 2, 1, 1, 1, 0, 0, 0, 2, 1, 0, 2, 1, 0, 2, 2, 2, 2, 1, 1, 0, 2, 2, 0, 1, 0, 1, 1, 2, 2, 0, 2, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 2, 2, 1, 1, 1, 2, 1, 0, 0, 2, 1, 2, 2, 1, 2, 1, 1, 0, 1, 0, 1, 1, 2, 2, 0, 1, 2, 2, 0, 2, 1, 0, 2, 1, 2, 2, 2, 1, 1, 1, 