In [1]:
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)
from datasets import Dataset
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
import re
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split
from keras.preprocessing.sequence import pad_sequences

  from .autonotebook import tqdm as notebook_tqdm
2025-09-25 22:56:44.905361: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

Using device: cuda


In [3]:
df = pd.read_csv('../data/training.1600000.processed.noemoticon.csv', encoding='latin', header=None)

df = df.sample(frac=1, random_state=42).reset_index(drop=True).head(int(len(df) * .10))

df.columns = ['label', 'id', 'date', 'query', 'user_id', 'text']
df = df.drop(['id', 'date', 'query', 'user_id'], axis=1)

lab_to_sentiment = {0:0, 4:1}

def label_decoder(label):
  return lab_to_sentiment[label]

df['label'] = df['label'].apply(lambda x: label_decoder(x))

stop_words = stopwords.words('english')
stemmer = SnowballStemmer('english')

text_cleaning_re = "@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+"

def preprocess(text, stem=False):
  text = re.sub(text_cleaning_re, ' ', str(text).lower()).strip()
  tokens = []
  for token in text.split():
    if token not in stop_words:
      if stem:
        tokens.append(stemmer.stem(token))
      else:
        tokens.append(token)
  return " ".join(tokens)

df['text'] = df['text'].apply(lambda x: preprocess(x))

df

Unnamed: 0,label,text
0,0,ahhh hope ok
1,0,cool tweet apps razr 2
2,0,know family drama lame hey next time u hang ki...
3,0,school email open geography stuff revise stupi...
4,0,upper airways problem
...,...,...
159995,1,thank
159996,0,aaaaaaaah wrong
159997,1,see tomorrow gamerdna tweetup
159998,0,forgot xbox live would today


In [None]:
TRAIN_SIZE = 0.8
MAX_NB_WORDS = 100000
MAX_SEQUENCE_LENGTH = 100
LR = 1e-3
BATCH_SIZE = 64
EPOCHS = 3
MODEL_NAME = "albert/albert-base-v2"

In [5]:
train_df, test_df = train_test_split(
    df, 
    test_size=1 - TRAIN_SIZE, 
    random_state=42, 
    stratify=df['label']
)

print("Train Data size:", len(train_df))
print("Test Data size", len(test_df))

Train Data size: 128000
Test Data size 32000


In [6]:
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
test_dataset = Dataset.from_pandas(test_df, preserve_index=False)\

train_dataset.set_format('torch')
test_dataset.set_format('torch')

In [7]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=2,
    id2label={0: "NEGATIVE", 1: "POSITIVE"},
    label2id={"NEGATIVE": 0, "POSITIVE": 1}
)

model.to(device)

Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert/albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


AlbertForSequenceClassification(
  (albert): AlbertModel(
    (embeddings): AlbertEmbeddings(
      (word_embeddings): Embedding(30000, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0, inplace=False)
    )
    (encoder): AlbertTransformer(
      (embedding_hidden_mapping_in): Linear(in_features=128, out_features=768, bias=True)
      (albert_layer_groups): ModuleList(
        (0): AlbertLayerGroup(
          (albert_layers): ModuleList(
            (0): AlbertLayer(
              (full_layer_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
              (attention): AlbertSdpaAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=

In [8]:
def tokenize_function(examples):
    return tokenizer(
        examples['text'],
        padding='max_length',
        truncation=True,
        max_length=MAX_SEQUENCE_LENGTH
    )

train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

Map: 100%|██████████| 128000/128000 [00:05<00:00, 21895.45 examples/s]
Map: 100%|██████████| 32000/32000 [00:01<00:00, 22037.58 examples/s]


In [9]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    
    accuracy = accuracy_score(labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predictions, average='binary'
    )
    
    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

In [None]:
training_args = TrainingArguments(
    output_dir="models",
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,         
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=LR,
    load_best_model_at_end=True,
    logging_steps=100,
    fp16=True
)

In [11]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

  trainer = Trainer(


In [12]:
train_result = trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.463,0.460097,0.781531,0.7575,0.829717,0.791965
2,0.4198,0.445086,0.790594,0.7791,0.81257,0.795483
3,0.3727,0.457258,0.791406,0.784884,0.804215,0.794432


In [13]:
eval_results = trainer.evaluate()

In [14]:
print("\nTest Results:")
for key, value in eval_results.items():
    print(f"{key}: {value:.4f}")


Test Results:
eval_loss: 0.4451
eval_accuracy: 0.7906
eval_precision: 0.7791
eval_recall: 0.8126
eval_f1: 0.7955
eval_runtime: 59.5629
eval_samples_per_second: 537.2470
eval_steps_per_second: 8.3940
epoch: 3.0000
