In [12]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
import torch
from torch.utils.data import Dataset
import pandas as pd
from sklearn.model_selection import train_test_split

In [26]:
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.weight', 'classifier.bias', 'pre_classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [46]:
# data
class CustomDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item
    def __len__(self):
        return len(self.labels)

def tokenize_data(df, tokenizer):
    return tokenizer(df["Text"].tolist(), padding=True, truncation=True, max_length=512, return_tensors='pt')

In [65]:
file_path = "./../../data/binary_class_sentences.csv"
df = pd.read_csv(file_path, header=None, names=['Text', 'Label'])
df['Label'] = df['Label'].astype(int)
train_dataset_df, eval_dataset_df = train_test_split(df, shuffle=True)
# Grab labels
train_labels = train_dataset_df["Label"].tolist()
eval_labels = eval_dataset_df["Label"].tolist()

# Tokenize data
tokenized_train_dataset = tokenize_data(train_dataset_df, tokenizer)
tokenized_eval_dataset = tokenize_data(eval_dataset_df, tokenizer)

# Apply labels to tokenized data
train_dataset = CustomDataset(tokenized_train_dataset, train_labels)
eval_dataset = CustomDataset(tokenized_eval_dataset, eval_labels)

In [67]:
example = train_dataset[0]
print("Shape of input_ids for one example:", example['input_ids'].shape)
print("Shape of attention_mask for one example:", example['attention_mask'].shape)
print("Label for one example:", example['labels'])

Shape of input_ids for one example: torch.Size([84])
Shape of attention_mask for one example: torch.Size([84])
Label for one example: tensor(0)


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


In [68]:
print(len(train_dataset))
print(len(eval_dataset))

2987
996


In [51]:
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_strategy="epoch"
)

# Initialize trainer
trainer=Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset
)

In [2]:
trainer.train()

NameError: name 'trainer' is not defined

In [1]:
trainer.save_model("./results")

NameError: name 'trainer' is not defined

In [56]:
results = trainer.evaluate(eval_dataset)
print(results)

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


{'eval_loss': 0.12166711688041687, 'eval_runtime': 105.4452, 'eval_samples_per_second': 9.446, 'eval_steps_per_second': 0.152, 'epoch': 3.0}


In [58]:
prediction_output = trainer.predict(eval_dataset)
print(prediction_output.predictions)  # Raw model predictions
print(prediction_output.label_ids)    # True labels

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


[[-2.704887   2.886109 ]
 [ 2.2272937 -2.7609262]
 [ 2.0492885 -2.5689569]
 ...
 [ 2.3261707 -2.7602344]
 [ 2.353644  -2.8311214]
 [ 2.4253397 -2.9407072]]
[1 0 0 0 0 1 1 0 1 1 1 1 1 1 1 1 1 0 1 1 0 1 1 1 1 0 0 1 0 0 0 0 1 0 0 1 1
 0 1 1 0 1 0 0 0 1 1 0 1 0 1 1 0 0 0 1 1 0 0 1 0 0 0 0 0 1 0 1 1 1 0 1 1 1
 0 1 0 1 0 1 1 0 0 1 1 1 0 1 1 1 1 0 0 1 1 1 1 0 1 0 0 0 0 1 1 1 1 0 1 1 0
 1 0 0 0 0 0 0 0 0 1 1 0 1 0 0 0 1 0 0 0 0 0 1 0 0 1 1 0 0 0 1 1 1 1 0 1 0
 0 0 0 1 0 0 0 1 1 0 1 0 1 0 0 1 1 0 1 1 1 0 1 0 1 1 0 0 1 0 1 1 1 0 0 0 1
 1 1 1 0 0 0 1 0 1 1 1 0 1 0 0 1 0 0 0 0 1 0 0 0 0 1 1 1 1 0 0 0 0 0 0 0 0
 0 0 1 0 1 0 0 0 0 0 1 0 0 0 0 1 0 1 0 1 1 0 1 1 0 1 1 1 1 0 1 1 0 1 1 1 0
 1 1 1 1 0 0 1 1 1 1 0 0 0 0 1 0 1 0 1 0 0 0 0 1 0 0 1 0 1 0 0 0 1 1 1 0 0
 1 1 1 0 0 0 0 0 0 1 0 1 0 0 1 1 1 1 1 1 0 1 0 0 1 0 0 1 1 1 1 0 1 0 1 0 0
 1 1 0 1 0 1 1 0 0 0 1 1 1 1 0 0 1 0 0 1 0 1 1 0 0 0 1 1 0 0 1 0 0 0 1 1 1
 1 1 0 0 0 0 1 1 1 0 1 1 0 1 0 0 0 1 1 0 1 1 1 0 1 0 0 1 1 0 0 0 1 1 0 0 0
 0 1 0 1 0 0 1 1 1 

In [60]:
import torch
import torch.nn.functional as F

# Assuming 'predictions' is the array of logits from your model
logits = torch.tensor(prediction_output.predictions)
probabilities = F.softmax(logits, dim=1).numpy()

# Now 'probabilities' contains a probability for each class (column)

In [62]:
import numpy as np
predicted_classes = np.argmax(probabilities, axis=1)

In [4]:
for i, row in test_df.iterrows():
    sentence = row['Text']  # Replace 'Text' with the name of your column containing the sentences
    print(f"Sentence: {sentence}")
    print(f"Predicted Class: {predicted_classes[i]}")
    print(f"Probability of Class 0: {probabilities[i][0]}")
    print(f"Probability of Class 1: {probabilities[i][1]}")

NameError: name 'test_df' is not defined