In [None]:
!pip install transformers torch scikit-learn accelerate



In [7]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [8]:
!cp "/content/drive/MyDrive/machineLearning/twitter-datasets.zip" "/content"

!unzip -q twitter-datasets.zip
print("Data ready")

Data ready


In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
import numpy as np
import csv
from os import path

Useful methods

In [3]:
def load_tweets(filename):
  with open(filename, 'r', encoding='utf-8') as f:
    return [line.strip() for line in f]

In [4]:
def create_csv_submission(ids, y_pred, file_name):
    """
    This function creates a csv file named `file_name` in the format required for a submission in Kaggle or AIcrowd.
    The file will contain two columns the first with `ids` and the second with `y_pred`.
    y_pred must be a list or np.array of 1 and -1 otherwise the function will raise a ValueError.

    Args:
        ids (list,np.array): indices
        y_pred (list,np.array): predictions on data correspondent to indices
        name (str): name of the file to be created
    """
    # Check that y_pred only contains -1 and 1
    if not all(i in [-1, 1] for i in y_pred):
        raise ValueError("y_pred can only contain values -1, 1")

    with open(file_name, "w") as csv_file:
        fieldnames = ["Id", "Prediction"]
        writer = csv.DictWriter(csv_file, delimiter = ",", fieldnames=fieldnames)
        writer.writeheader()
        for i in range(len(ids)):
            writer.writerow({"Id": int(ids[i]), "Prediction": int(y_pred[i])})

Data Loading

In [10]:
tweet_pos = load_tweets("twitter-datasets/train_pos.txt")
tweet_neg = load_tweets("twitter-datasets/train_neg.txt")
tweet_test = load_tweets("twitter-datasets/test_data.txt")

tweets = tweet_pos + tweet_neg
labels = [1]*len(tweet_pos) + [0]*len(tweet_neg)

Preparing the data

In [11]:
class TwitterDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = dict(encodings) # Conversion en dict pour éviter le KeyError
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_tweets, test_tweets, train_labels, test_labels = train_test_split(tweets, labels, test_size=0.2, random_state=42, stratify=labels)

model_name = "vinai/bertweet-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenizing train data
train_encodings = tokenizer(train_tweets, truncation=True, padding=True, max_length=128)

# Tokenizing test data
test_encodings = tokenizer(test_tweets, truncation=True, padding=True, max_length=128)

train_dataset = TwitterDataset(train_encodings, train_labels)
test_dataset = TwitterDataset(test_encodings, test_labels)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/558 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

bpe.codes: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

emoji is not installed, thus not converting emoticons or emojis into text. Install emoji: pip3 install emoji==0.6.0


Setting up the model

In [16]:
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

def compute_metrics(pred):
    labels = pred.label_ids
    acc = accuracy_score(labels, pred.predictions.argmax(-1))
    f1 = f1_score(labels, pred.predictions.argmax(-1))
    return {"accuracy": acc, "f1": f1}

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    fp16=True,
    load_best_model_at_end=True,
    metric_for_best_model="loss",
    greater_is_better=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training the model

In [13]:
trainer.train()

  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: (1) Create a W&B account
[34m[1mwandb[0m: (2) Use an existing W&B account
[34m[1mwandb[0m: (3) Don't visualize my results
[34m[1mwandb[0m: Enter your choice:

 3


[34m[1mwandb[0m: You chose "Don't visualize my results"


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.3267,0.233382,0.90255,0.904582
2,0.1651,0.327743,0.904725,0.905036
3,0.1228,0.367516,0.90395,0.90457


TrainOutput(global_step=30000, training_loss=0.20041949814856053, metrics={'train_runtime': 2971.6158, 'train_samples_per_second': 161.528, 'train_steps_per_second': 10.096, 'total_flos': 1.80066628512e+16, 'train_loss': 0.20041949814856053, 'epoch': 3.0})

Doing the final prediction

In [14]:
final_encoding = tokenizer(tweet_test, truncation=True, padding=True, max_length=128)
final_dataset = TwitterDataset(final_encoding, [0]*len(tweet_test))

predictions = trainer.predict(final_dataset)
final_proba_predictions = predictions.predictions.argmax(-1)

final_predictions = [1 if i == 1 else -1 for i in final_proba_predictions]

create_csv_submission(range(1, len(final_predictions)+1), final_predictions, "submission.csv")

Downloading the submission file

In [15]:
from google.colab import files
files.download('submission.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>