In [None]:
!pip install transformers datasets

Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.1-py3-none-any.whl (471 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m3.9 MB/s[0m eta [36m0:00:0

In [None]:
# Colab-only step: mount Google Drive at /content/drive so that files stored
# on Drive can be read through the local filesystem.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Locations of the train / test splits on the mounted Google Drive.
train_file_path, test_file_path = (
    '/content/drive/My Drive/sentiment_analysis/train_150k.txt',
    '/content/drive/My Drive/sentiment_analysis/test_62k.txt',
)

In [None]:
import csv

import pandas as pd


def _read_split(path):
    """Read one tab-separated ``label<TAB>text`` file into a DataFrame.

    Args:
        path: Location of a headerless TSV file with two fields per line.

    Returns:
        A DataFrame with the columns ``label`` and ``text``.
    """
    return pd.read_csv(
        path,
        sep='\t',
        header=None,
        names=['label', 'text'],
        # The text field is free-form tweet text, not CSV-quoted data. With
        # the default quoting, a tweet containing an unbalanced '"' makes the
        # parser treat the following lines as part of one quoted field
        # (silently merging rows) and strips the quote characters.
        # NOTE(review): the recorded run tokenized 149,985 / 61,998 rows from
        # files named "150k" / "62k", which is consistent with merged rows --
        # confirm against the raw line counts.
        quoting=csv.QUOTE_NONE,
    )


# Load train and test datasets
train_df = _read_split(train_file_path)
test_df = _read_split(test_file_path)

# Check the first few rows of each file
print("Training data:\n", train_df.head())
print("Testing data:\n", test_df.head())


Training data:
    label                                               text
0      0  Starting  back at work today   Looks like it'l...
1      1  Sugar levels dropping... munchies setting in. ...
2      1     @karineb22 yeah!!! have a great summer break! 
3      1  hannah montana was very good.  now going to re...
4      1  @Mayra326 aww, have fun!  I just had my 3D las...
Testing data:
    label                                               text
0      1  @justineville ...yeahhh. ) i'm 39 tweets from ...
1      0  @ApplesnFeathers aww. Poor baby! On your only ...
2      0  @joeymcintyre With my refunded $225 (Australia...
3      0  It's fine. Today sucks just because me those t...
4      0  Im just chilling on psp and stuff, but sitting...


In [None]:
def clean_text(text):
    """Normalize a single tweet by lower-casing it.

    Args:
        text: The raw tweet. Usually a ``str``; may be ``None`` or float NaN
            when the text field of a row is missing.

    Returns:
        The lower-cased text. Missing values (``None`` / NaN) become an empty
        string; any other non-string value is converted with ``str`` first.
    """
    # NaN is the only float that is not equal to itself; checking it this way
    # avoids needing numpy/pandas inside the helper.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    return str(text).lower()

# Add a 'clean_text' column (the cleaned 'text' column) to both splits
for _frame in (train_df, test_df):
    _frame['clean_text'] = _frame['text'].apply(clean_text)


In [None]:

from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import Dataset
from transformers import TrainingArguments, Trainer

# Load the tokenizer and model (DistilBERT for binary classification)
# `num_labels=2` requests a two-class sequence-classification model. The
# load warning in the recorded output reports that the classifier and
# pre_classifier weights are newly initialized, so the model needs
# fine-tuning before its predictions are meaningful.
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Tokenize the data with padding and truncation
def tokenize(batch):
    """Tokenize the ``clean_text`` entries of a batch.

    Every sequence is truncated and padded to exactly 160 tokens.

    Args:
        batch: Mapping that provides the texts under the ``'clean_text'`` key.

    Returns:
        Whatever the module-level ``tokenizer`` returns for these texts.
    """
    encode_options = {
        'padding': 'max_length',
        'truncation': True,
        'max_length': 160,
    }
    texts = batch['clean_text']
    return tokenizer(texts, **encode_options)

def _to_model_inputs(frame):
    """Turn a DataFrame split into a tokenized, torch-formatted Dataset.

    Steps: convert the frame to a Hugging Face Dataset, tokenize it in
    batches, drop the raw text columns, and switch the output format to
    PyTorch tensors.
    """
    encoded = Dataset.from_pandas(frame).map(tokenize, batched=True)
    # Drop the raw text columns once the tokenizer output has been added
    encoded = encoded.remove_columns(['text', 'clean_text'])
    encoded.set_format('torch')
    return encoded


# Build the train split first, then the test split
train_dataset = _to_model_inputs(train_df)
test_dataset = _to_model_inputs(test_df)

# Set training arguments
# `eval_strategy` is the current name of this option; the older
# `evaluation_strategy` spelling is deprecated and is rejected by newer
# transformers releases. The install cell does not pin transformers, so a
# fresh environment can pick up such a release.
# NOTE(review): in the recorded run the validation loss rises after epoch 1
# (0.371 -> 0.397 -> 0.587) while the training loss keeps falling, so three
# epochs appear to overfit -- consider fewer epochs or keeping the best
# checkpoint.
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",  # run evaluation at the end of every epoch
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    logging_dir="./logs",
)

# Initialize Trainer
# The test split is passed as the evaluation set, so the per-epoch
# "Validation Loss" in the training log is measured on the test data.
# No `compute_metrics` callback is supplied; the recorded evaluation output
# contains the loss plus runtime/throughput figures only.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Start training
# Fine-tunes `model` on `train_dataset` using the settings in `training_args`.
trainer.train()

# Evaluate the model
# Runs one more evaluation pass over the trainer's eval dataset (the test
# split) and prints the resulting metrics dictionary.
eval_results = trainer.evaluate()
print(f"Evaluation results: {eval_results}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/149985 [00:00<?, ? examples/s]

Map:   0%|          | 0/61998 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss
1,0.3752,0.370708
2,0.2833,0.396732
3,0.1867,0.586655


Evaluation results: {'eval_loss': 0.5866552591323853, 'eval_runtime': 293.879, 'eval_samples_per_second': 210.964, 'eval_steps_per_second': 13.186, 'epoch': 3.0}


In [None]:
!pip install evaluate


Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [None]:
!pip install evaluate rouge_score


Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=762fcd4ba7db4baab3dbed89af128faeca5bafee6e26de92b829ebe179fbb36c
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [None]:
import evaluate

# Make predictions on the test set
predictions = trainer.predict(test_dataset)
# Highest-scoring class index per example
predicted_labels = predictions.predictions.argmax(-1)

# Keep the predicted class next to the gold label in the test frame
test_df['predicted'] = predicted_labels

# Load metrics using the evaluate library
accuracy_metric = evaluate.load('accuracy')
rouge_metric = evaluate.load('rouge')
bleu_metric = evaluate.load('bleu')

# Convert predictions and references to lists of strings for ROUGE and BLEU
predicted_texts = [str(pred) for pred in test_df['predicted']]
reference_texts = [str(label) for label in test_df['label']]

# Calculate accuracy
accuracy_score = accuracy_metric.compute(predictions=predicted_labels, references=test_df['label'])
print(f"Accuracy: {accuracy_score['accuracy']}")

# Calculate ROUGE score
# ROUGE requires a list of lists for references, so we can convert reference_texts accordingly
# NOTE(review): every "text" here is a single-token class label ("0" or "1"),
# so unigram overlap only restates accuracy (recorded run: rouge1 0.8368 vs
# accuracy 0.8368) and rouge2 is always 0. Precision/recall/F1 would be more
# informative for a classifier.
rouge_score = rouge_metric.compute(predictions=predicted_texts, references=[[ref] for ref in reference_texts])
print(f"ROUGE score: {rouge_score}")

# Prepare data for BLEU score calculation using simple whitespace tokenization
predicted_tokens = [pred.split() for pred in predicted_texts]
reference_tokens = [[label.split()] for label in reference_texts]  # one reference list per prediction

# BLEU is intentionally disabled. It is kept as real comments rather than a
# bare triple-quoted string: as the last expression of the cell, that string
# was echoed back as the cell's output.
# NOTE(review): the `evaluate` BLEU metric appears to expect raw strings, not
# pre-split token lists -- confirm before re-enabling.
# bleu_score = bleu_metric.compute(predictions=predicted_tokens, references=reference_tokens)
# print(f"BLEU score: {bleu_score['bleu']}")

Accuracy: 0.8368011871350689
ROUGE score: {'rouge1': 0.8367931223587858, 'rouge2': 0.0, 'rougeL': 0.8368334462402013, 'rougeLsum': 0.8367850575825027}


'\n# Calculate BLEU score\nbleu_score = bleu_metric.compute(predictions=predicted_tokens, references=reference_tokens)\nprint(f"BLEU score: {bleu_score[\'bleu\']}")\n'