In [50]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [51]:
import sys
!{sys.executable} -m pip uninstall transformers accelerate torch datasets -y
!{sys.executable} -m pip install transformers==4.38.2 accelerate==0.28.0 torch==2.2.1 datasets==2.18.0

Found existing installation: transformers 4.38.2
Uninstalling transformers-4.38.2:
  Successfully uninstalled transformers-4.38.2
Found existing installation: accelerate 0.28.0
Uninstalling accelerate-0.28.0:
  Successfully uninstalled accelerate-0.28.0
Found existing installation: torch 2.2.1
Uninstalling torch-2.2.1:
  Successfully uninstalled torch-2.2.1
Found existing installation: datasets 2.18.0
Uninstalling datasets-2.18.0:
  Successfully uninstalled datasets-2.18.0
Collecting transformers==4.38.2
  Obtaining dependency information for transformers==4.38.2 from https://files.pythonhosted.org/packages/b6/4d/fbe6d89fde59d8107f0a02816c4ac4542a8f9a85559fdf33c68282affcc1/transformers-4.38.2-py3-none-any.whl.metadata
  Using cached transformers-4.38.2-py3-none-any.whl.metadata (130 kB)
Collecting accelerate==0.28.0
  Obtaining dependency information for accelerate==0.28.0 from https://files.pythonhosted.org/packages/a0/11/9bfcf765e71a2c84bbf715719ba520aeacb2ad84113f14803ff1947ddf69/ac

In [1]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import Dataset
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv('Twitter_Data.csv')
df['category'] = df['category'] + 1  # Shift labels from -1,0,1 to 0,1,2
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

In [4]:
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
def tokenize(batch):
    texts = [str(t) if t is not None else "" for t in batch['clean_text']]  # Changed to 'clean_text'
    return tokenizer(texts, padding="max_length", truncation=True, max_length=128)

train_dataset = train_dataset.map(tokenize, batched=True)
test_dataset = test_dataset.map(tokenize, batched=True)

train_dataset = train_dataset.rename_column("category", "labels")
test_dataset = test_dataset.rename_column("category", "labels")

train_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])
test_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])

Map:   0%|          | 0/130384 [00:00<?, ? examples/s]

Map:   0%|          | 0/32596 [00:00<?, ? examples/s]

In [8]:
model = AutoModelForSequenceClassification.from_pretrained(
    'distilbert-base-uncased', 
    num_labels=3, 
    problem_type="single_label_classification"  # Fix for single-label task
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=1,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)

In [10]:
train_subset = train_dataset.select(range(10000))
from transformers import Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_subset,
    eval_dataset=test_dataset,
)
trainer.train()

Step,Training Loss
10,1.1053
20,1.0942
30,1.0871
40,1.0768
50,1.0785
60,1.0657
70,1.0699
80,1.0775
90,1.0569
100,1.0119


TrainOutput(global_step=1250, training_loss=0.6157991268157958, metrics={'train_runtime': 937.159, 'train_samples_per_second': 10.671, 'train_steps_per_second': 1.334, 'total_flos': 331174402560000.0, 'train_loss': 0.6157991268157958, 'epoch': 1.0})

In [11]:
eval_results = trainer.evaluate()
print(eval_results)

{'eval_loss': 0.37972041964530945, 'eval_runtime': 760.6107, 'eval_samples_per_second': 42.855, 'eval_steps_per_second': 5.358, 'epoch': 1.0}


In [12]:
trainer.save_model("./my_distilbert_model")
tokenizer.save_pretrained("./my_distilbert_model")

('./my_distilbert_model/tokenizer_config.json',
 './my_distilbert_model/special_tokens_map.json',
 './my_distilbert_model/vocab.txt',
 './my_distilbert_model/added_tokens.json',
 './my_distilbert_model/tokenizer.json')