In [13]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import KFold
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv("/content/X_data_stemmed.csv", encoding = 'utf-8-sig')
data

Unnamed: 0.1,Unnamed: 0,0
0,0,آرتمیس دازده ساله با کمک محافظ شخص فق ماهر بات...
1,1,کرده بد در مسیر ماجخط ساحل تاالب میراند جاییکه...
2,2,به رابطه دستان که آدامسن دا شر میکرد دامن نزد ...
3,3,بد زن پنجاه پنج کیل اگر چیز را پنجبار لمس می...
4,4,مال چند سال پیشه فل با تیک صب یک از سم را زمین...
...,...,...
1376,1376,نارنیا پاسخ داده خاهد شد انتقا تیسراک بس سخ...
1377,1377,خندیدن کردند البته نم تانستند جل خد را بگیرند ...
1378,1378,تجیز کرد فراه ساز همه گنه سایل راحت که ض کنن ا...
1379,1379,اسـبپـسـرکا زندگ پیشتاز خدا خدا م کردند دباره ...


In [3]:
y_data = pd.read_csv("/content/y_data.csv", encoding = 'utf-8-sig')
y_data

Unnamed: 0.1,Unnamed: 0,Author,Author_ID
0,0,Artemis Fowl,1
1,1,Artemis Fowl,1
2,2,Artemis Fowl,1
3,3,Artemis Fowl,1
4,4,Artemis Fowl,1
...,...,...,...
1376,1376,c.s.lewis,10
1377,1377,c.s.lewis,10
1378,1378,c.s.lewis,10
1379,1379,c.s.lewis,10


In [5]:
texts = list(data['0'].values)  # List of input texts
labels = list((y_data['Author_ID']-1).values)  # List of corresponding labels

In [6]:
# Step 2: Load pre-trained BERT model and tokenizer
model_name = 'HooshvareLab/bert-fa-base-uncased'
# HooshvareLab/bert-fa-base-uncased
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=10)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vocab.txt:   0%|          | 0.00/1.20M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/440 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/654M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at HooshvareLab/bert-fa-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
# Step 3: Tokenize your data
max_length = 128  # Maximum sequence length for BERT

inputs = tokenizer(texts, padding=True, truncation=True, max_length=max_length, return_tensors='pt')
labels = torch.tensor(labels)

In [15]:
# Step 1.1: Split the data into training, validation, and testing sets
train_texts, test_texts, train_labels, test_labels = train_test_split(texts, labels, test_size=0.2, stratify=y_data['Author_ID'], random_state=42)

# Number of splits
n_splits = 5

# Initialize KFold
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

# train_texts, val_texts, train_labels, val_labels = train_test_split(train_texts, train_labels,stratify= train_labels, test_size=0.2, random_state=42)

model_no = 0

for train_index, val_index in kf.split(train_texts):
    model_no += 1
    train_texts, val_texts = [texts[i] for i in train_index], [texts[i] for i in val_index]
    train_labels, val_labels = [labels[i] for i in train_index], [labels[i] for i in val_index]

    # Step 3.1: Tokenize your data for training, validation, and testing sets
    train_inputs = tokenizer(train_texts, padding=True, truncation=True, max_length=max_length, return_tensors='pt')
    val_inputs = tokenizer(val_texts, padding=True, truncation=True, max_length=max_length, return_tensors='pt')
    test_inputs = tokenizer(test_texts, padding=True, truncation=True, max_length=max_length, return_tensors='pt')

    # Convert labels to tensors
    train_labels = torch.tensor(train_labels)
    val_labels = torch.tensor(val_labels)
    test_labels = torch.tensor(test_labels)


    # Step 4: Fine-tune BERT for classification
    optimizer = AdamW(model.parameters(), lr=1e-5)
    train_dataset = TensorDataset(train_inputs['input_ids'], train_inputs['attention_mask'], train_labels)
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

    epochs = 3
    for epoch in range(epochs):
        print('epoch:' , epoch)
        model.train()
        for batch in train_loader:
            print('batch' , batch)
            input_ids, attention_mask, batch_labels = batch
            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask=attention_mask, labels=batch_labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()

    # Step 5: Evaluate your model
    model.eval()
    val_dataset = TensorDataset(val_inputs['input_ids'], val_inputs['attention_mask'], val_labels)
    val_loader = DataLoader(val_dataset, batch_size=32)
    predictions = []
    true_labels = []

    for batch in val_loader:
        input_ids, attention_mask, batch_labels = batch
        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        _, predicted_labels = torch.max(logits, dim=1)
        predictions.extend(predicted_labels.cpu().numpy())
        true_labels.extend(batch_labels.cpu().numpy())

    # Model no
    print(f"Model no {model_no}")

    # Calculate evaluation metrics
    accuracy = accuracy_score(true_labels, predictions)
    print("Validation Accuracy:", accuracy)
    print("Classification Report:")
    print(classification_report(true_labels, predictions))

    # Step 6: Inference
    # Make predictions on new, unseen data
    test_dataset = TensorDataset(test_inputs['input_ids'], test_inputs['attention_mask'], test_labels)
    test_loader = DataLoader(test_dataset, batch_size=32)
    predictions = []
    true_labels = []

    for batch in test_loader:
        input_ids, attention_mask, batch_labels = batch
        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        _, predicted_labels = torch.max(logits, dim=1)
        predictions.extend(predicted_labels.cpu().numpy())
        true_labels.extend(batch_labels.cpu().numpy())

    # Calculate evaluation metrics for test set
    accuracy = accuracy_score(true_labels, predictions)
    print("Test Accuracy:", accuracy)
    print("Classification Report for Test Set:")
    print(classification_report(true_labels, predictions))


  test_labels = torch.tensor(test_labels)


epoch: 0
batch [tensor([[    2, 26621,  2872,  ...,  4628,  2803,     4],
        [    2,  1360, 11468,  ...,  6077,  2803,     4],
        [    2,  3371,  2875,  ...,  2911,  2803,     4],
        ...,
        [    2,  2808,  2789,  ...,  1376,  4445,     4],
        [    2, 24347,  4056,  ..., 23802,  2800,     4],
        [    2,  3484,  3360,  ...,  5980,  3197,     4]]), tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1]]), tensor([2, 2, 2, 2, 5, 5, 1, 5, 2, 2, 5, 4, 3, 3, 2, 3, 2, 0, 3, 2, 3, 5, 6, 0,
        5, 3, 2, 2, 2, 2, 5, 0])]


KeyboardInterrupt: 