<a href="https://colab.research.google.com/github/amenehmn/Text_Mining/blob/main/text_classification_by_ParsBert_Pytorch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the Hugging Face `transformers` package, which provides the
# AutoTokenizer / AutoModelForSequenceClassification classes imported below.
!pip install transformers

In [None]:
# Import libraries
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW
from torch.utils.data import DataLoader, Dataset, TensorDataset, RandomSampler, SequentialSampler
import torch.nn as nn
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
import random
import numpy as np

# Select the compute device: the GPU when CUDA is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the ParsBERT tokenizer and a sequence-classification model from the same
# checkpoint. The checkpoint's classifier head has 2 outputs (binary
# sentiment); `ignore_mismatched_sizes=True` lets it be replaced by a freshly
# initialised head with `num_labels=8` outputs, so the model has to be
# fine-tuned before its predictions are meaningful.
MODEL_NAME = "HooshvareLab/bert-fa-base-uncased-sentiment-deepsentipers-binary"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=8,
    ignore_mismatched_sizes=True,
).to(device)

# Load data: read every line of the UTF-8 text file. The context manager
# guarantees the file handle is closed once the content has been read.
with open(r"Topic-Classification.txt", mode='r', encoding='utf-8') as data_file:
    Textlist = data_file.read().splitlines()

# Draw a reproducible random subset of the lines. The sample size is capped at
# the number of available lines so a file shorter than 600 lines does not make
# random.sample raise ValueError.
SEED = 402
random.seed(SEED)
Textlist = random.sample(Textlist, min(600, len(Textlist)))

# Split every sampled line of the form "<text>~<label>" into parallel lists:
# texts[k] is the document and labels[k] its string label.
LabeledText = []  # not used by the code visible here; kept unchanged
texts = []
labels = []
for line in Textlist:
    fields = line.split("~")
    if len(fields) < 2:
        # Blank or malformed line without a "~" separator: there is no label
        # to read, so skip it instead of failing with IndexError.
        continue
    texts.append(fields[0])
    labels.append(fields[1])

# Map every distinct string label to an integer id, numbering the labels
# 0, 1, 2, ... in the order in which they first appear in `labels`.
label_dict = {}
for name in labels:
    # len(label_dict) is the next free id; setdefault leaves known labels alone.
    label_dict.setdefault(name, len(label_dict))
counter = len(label_dict)  # total number of distinct labels

# Translate the string labels into their integer ids, keeping the order.
int_labels = [label_dict[name] for name in labels]

# Split the data into train (80%) and test (20%) sets. Passing
# random_state=SEED makes the shuffle, and therefore the split, identical on
# every run; without it each run evaluates on a different test set.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    texts, int_labels, test_size=0.2, random_state=SEED
)


# Tokenize both splits with the same settings: sequences longer than 512
# tokens are truncated and the sequences of each call are padded to a common
# length.
tokenize_kwargs = {"truncation": True, "padding": True, "max_length": 512}
train_encodings = tokenizer(train_texts, **tokenize_kwargs)
test_encodings = tokenizer(test_texts, **tokenize_kwargs)

# Turn the token ids, attention masks and integer labels of each split into
# torch tensors.
train_inputs, train_masks = (
    torch.tensor(train_encodings[field]) for field in ("input_ids", "attention_mask")
)
train_labels = torch.tensor(train_labels)

test_inputs, test_masks = (
    torch.tensor(test_encodings[field]) for field in ("input_ids", "attention_mask")
)
test_labels = torch.tensor(test_labels)

# Number of examples per mini-batch, shared by both loaders.
batch_size = 4

# Bundle each split's (input ids, attention mask, label) tensors into a dataset.
train_data = TensorDataset(train_inputs, train_masks, train_labels)
test_data = TensorDataset(test_inputs, test_masks, test_labels)

# Training batches are drawn in random order; test batches keep dataset order.
train_sampler = RandomSampler(train_data)
test_sampler = SequentialSampler(test_data)

train_dataloader = DataLoader(train_data, batch_size=batch_size, sampler=train_sampler)
test_dataloader = DataLoader(test_data, batch_size=batch_size, sampler=test_sampler)

# Define optimizer and loss function.
# Use PyTorch's own AdamW: the AdamW class exported by `transformers` is
# deprecated and is not shipped by recent releases. `eps` and `weight_decay`
# are set explicitly to the defaults of the transformers implementation so the
# optimisation hyper-parameters stay what they were.
# NOTE(review): `AdamW` is still named in the `from transformers import ...`
# line at the top of the cell; drop it there to run on new transformers versions.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)

# The training loop uses the loss the model returns when it is given `labels`,
# so this criterion is not called there.
loss_fn = nn.CrossEntropyLoss()

# Fine-tune the model: `epochs` full passes over the training loader.
epochs = 3
model.train()  # switch the model to training mode
for epoch in range(epochs):
    for batch in train_dataloader:
        # A batch is (input ids, attention mask, labels); move all to the device.
        b_input_ids, b_input_mask, b_labels = (t.to(device) for t in batch)

        # Clear the gradients accumulated by the previous step.
        optimizer.zero_grad()

        # Forward pass: because `labels` is supplied, the model output carries
        # the training loss.
        outputs = model(
            b_input_ids,
            token_type_ids=None,
            attention_mask=b_input_mask,
            labels=b_labels,
        )
        loss = outputs.loss

        # Backward pass and parameter update.
        loss.backward()
        optimizer.step()

# Evaluate the model on the test loader, collecting the raw logits and the
# reference labels of every batch.
model.eval()  # switch the model to evaluation mode
predictions = []
true_labels = []
with torch.no_grad():  # gradients are not needed for evaluation
    for batch in test_dataloader:
        # A batch is (input ids, attention mask, labels); move all to the device.
        b_input_ids, b_input_mask, b_labels = (t.to(device) for t in batch)

        # Forward pass without labels: the first output is the logits.
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)

        # Store this batch's logits and labels as NumPy arrays on the CPU.
        predictions.append(outputs[0].detach().cpu().numpy())
        true_labels.append(b_labels.cpu().numpy())

# Join the per-batch arrays along the first axis into one array each.
predictions = np.concatenate(predictions, axis=0)
true_labels = np.concatenate(true_labels, axis=0)

# The predicted class of each example is the index of its largest logit.
pred_labels = predictions.argmax(axis=1)

# Report the overall accuracy, then the per-class classification report.
acc_score = accuracy_score(true_labels, pred_labels)
print(f"Accuracy score: {acc_score}")
print("Classification report:")
print(classification_report(true_labels, pred_labels))

# A test example: classify one unseen headline with the fine-tuned model.
PreText = ["سوپرماریو رکوردهای فروش فیلم انیمیشن را شکست، اما دل منتقدان را راضی نکرد"]

# Encode the text with the same length limit used for the training data, so an
# over-long input is truncated instead of exceeding the 512-token limit.
preText_inputs = tokenizer(
    PreText, return_tensors="pt", truncation=True, max_length=512
).to(device)

# Inference only: make sure the model is in evaluation mode and skip gradient
# tracking.
model.eval()
with torch.no_grad():
    outputs = model(**preText_inputs)

# Get the predicted label: the class index with the largest logit.
predicted_label = outputs.logits.argmax(-1).item()
print(f"The predicted label for the text is: {predicted_label}")

# Translate the integer id back to its string label through an inverted copy
# of label_dict (the ids in label_dict are unique).
id_to_label = {idx: name for name, idx in label_dict.items()}
if predicted_label in id_to_label:
    print("The predicted label for the text is: %s" % id_to_label[predicted_label])

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at HooshvareLab/bert-fa-base-uncased-sentiment-deepsentipers-binary and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([2, 768]) in the checkpoint and torch.Size([8, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([2]) in the checkpoint and torch.Size([8]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Accuracy score: 0.9583333333333334
Classification report:
              precision    recall  f1-score   support

           0       1.00      0.95      0.97        20
           1       1.00      0.80      0.89        15
           2       0.89      1.00      0.94        16
           3       0.95      0.95      0.95        19
           4       1.00      1.00      1.00        15
           5       0.91      1.00      0.95        10
           6       0.92      1.00      0.96        12
           7       1.00      1.00      1.00        13

    accuracy                           0.96       120
   macro avg       0.96      0.96      0.96       120
weighted avg       0.96      0.96      0.96       120

