In [1]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification 

In [3]:
# WordPiece tokenizer matching the 'bert-base-uncased' checkpoint; reused by the
# dataset class and by predict_labels() further down.
# NOTE(review): the sample texts shown later in this notebook are Persian, while
# 'bert-base-uncased' is an English checkpoint — confirm this is the intended
# tokenizer/encoder.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# This `model` binding is replaced later by `model = BERTMultiLabelClassifier()`,
# so this instance is not the one that gets trained below.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=5)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [23]:
# Load the training set. Later cells read the 'intent', 'id' and 'comment'
# columns from this frame.
df = pd.read_csv("data_train.csv")

In [24]:
# One indicator column name per intent id (1 through 5), in ascending order.
label_cols = [f'label_{label_id}' for label_id in range(1, 6)]

In [26]:
# Expand the comma-separated 'intent' string into one 0/1 indicator column per
# entry of `label_cols`.
#
# Each intent string is parsed once into a set of stripped tokens, then every
# label column is filled in a single pass. This replaces a per-row
# `iterrows()` / `df.at` loop, which builds a Series for every row and writes
# cells one at a time.
intent_tokens = df['intent'].map(
    lambda raw: {token.strip() for token in raw.split(',')}
)

for col in label_cols:
    label_id = col[len('label_'):]  # 'label_3' -> '3'
    # Tokens with no matching entry in `label_cols` are ignored instead of
    # silently creating an extra, partially filled column.
    df[col] = intent_tokens.map(lambda tokens: label_id in tokens).astype('int64')

# The raw string column is no longer needed once the indicators exist.
df = df.drop('intent', axis=1)

In [27]:
# Drop the identifier column; the dataset class only reads the text and label
# columns. Reassigning (instead of `inplace=True`) together with
# `errors='ignore'` keeps this cell safe to re-run after 'id' is already gone.
df = df.drop(columns=['id'], errors='ignore')

In [28]:
class MultiLabelDataset(Dataset):
    """Map-style dataset yielding tokenized text plus a multi-hot label vector.

    Args:
        dataframe: Frame holding the text column and one 0/1 column per label.
        tokenizer: Tokenizer invoked as ``tokenizer(text, ...)``; it must return
            a mapping with ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_length: Length every sequence is padded or truncated to.
        label_cols: Names of the label columns, in output order. Defaults to
            ``label_1`` .. ``label_5``.
        text_col: Name of the column holding the raw text.
    """

    DEFAULT_LABEL_COLS = ('label_1', 'label_2', 'label_3', 'label_4', 'label_5')

    def __init__(self, dataframe, tokenizer, max_length=128,
                 label_cols=None, text_col='comment'):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length
        # Stored as a list so it can be used directly for label-based selection.
        self.label_cols = list(self.DEFAULT_LABEL_COLS if label_cols is None else label_cols)
        self.text_col = text_col

    def __len__(self):
        """Number of rows in the wrapped frame."""
        return len(self.data)

    def __getitem__(self, index):
        """Return ``input_ids``, ``attention_mask`` and float32 ``labels`` for one row.

        ``index`` is positional (``iloc``), so the frame's own index labels do
        not matter.
        """
        row = self.data.iloc[index]
        text = str(row[self.text_col])

        # Convert through NumPy explicitly: handing a label-indexed (and often
        # object-dtype) pandas Series straight to torch.tensor relies on
        # positional fallback indexing of the Series.
        label_values = row[self.label_cols].to_numpy(dtype='float32')
        labels = torch.tensor(label_values, dtype=torch.float32)

        # `padding='max_length'` + `truncation=True` is the explicit form of the
        # deprecated `pad_to_max_length=True`; every item then has exactly
        # `max_length` tokens, which default batch collation requires.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_token_type_ids=False,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            # The tokenizer returns a leading batch axis of size 1; drop it.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': labels,
        }

In [29]:
# Hold out the first 5000 rows for validation; train on everything after them.
val_dataset = MultiLabelDataset(dataframe=df.iloc[:5000], tokenizer=tokenizer)
train_dataset = MultiLabelDataset(dataframe=df.iloc[5000:], tokenizer=tokenizer)

# Only the training loader shuffles; validation keeps the frame's row order.
val_loader = DataLoader(dataset=val_dataset, batch_size=16, shuffle=False)
train_loader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)

In [30]:
import torch.nn as nn
from transformers import BertModel

class BERTMultiLabelClassifier(nn.Module):
    """BERT encoder followed by dropout and a linear head, one logit per label.

    The forward pass returns raw logits; no sigmoid is applied here.
    """

    def __init__(self, num_labels=5):
        super().__init__()
        encoder = BertModel.from_pretrained('bert-base-uncased')
        self.bert = encoder
        self.dropout = nn.Dropout(p=0.1)
        # Head width follows the encoder's hidden size.
        self.classifier = nn.Linear(
            in_features=encoder.config.hidden_size,
            out_features=num_labels,
        )

    def forward(self, input_ids, attention_mask):
        """Encode the batch and project its pooled output to label logits."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        return self.classifier(self.dropout(encoded.pooler_output))

# Rebinds `model`: from here on it refers to this custom classifier, not the
# BertForSequenceClassification instance created in the earlier cell.
model = BERTMultiLabelClassifier()

In [43]:
# Fine-tune the classifier and report the mean train / validation loss per epoch.
device = torch.device('cuda' if torch.cuda.is_available() else "cpu")

model.to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
# The targets are multi-hot float vectors and the model emits raw logits, so the
# loss applies the sigmoid itself and scores every label independently.
criterion = nn.BCEWithLogitsLoss()
num_epoch = 3

for epoch in range(num_epoch):
    # ---- Training pass ----
    model.train()
    total_loss = 0
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask)
        loss = criterion(outputs, labels)
        total_loss += loss.item()
        loss.backward()
        optimizer.step()

    avg_train_loss = total_loss / len(train_loader)

    # ---- Validation pass ----
    model.eval()
    val_losses = []
    # No gradients are needed for validation; one context around the whole loop.
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask)
            val_loss = criterion(outputs, labels)
            val_losses.append(val_loss.item())

    avg_val_loss = sum(val_losses) / len(val_losses)
    # The epoch count variable is `num_epoch`; the previous `{num_epochs}`
    # referenced an undefined name and raised NameError after the first epoch.
    print(f"Epoch {epoch + 1}/{num_epoch}, Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}")

In [41]:
# Persist only the parameters (state_dict), not the module object itself.
torch.save(model.state_dict(), 'bert_sentiment_model.pth')

In [176]:
import numpy as np
def predict_with_threshold(probs, threshold=0.95):
    """Turn per-label probabilities into a comma-separated string of label ids.

    Label ids are 1-based column positions. Every label whose probability is
    strictly greater than ``threshold`` is selected; if none qualifies, the
    single most probable label is used so the result is never empty for
    non-empty input.

    Args:
        probs: Array-like of probabilities, either a 1-D vector or a 2-D array
            with one row per sample (the notebook passes shape ``(1, n)``).
        threshold: Exclusive lower bound for selecting a label.

    Returns:
        str: Selected label ids joined by ``','`` (e.g. ``'1,3'``), or ``''``
        when ``probs`` has no elements.
    """
    # Accept a plain vector as well as the (1, n) array produced by the model.
    scores = np.atleast_2d(np.asarray(probs))
    if scores.size == 0:
        return ''

    selected = scores > threshold
    if not selected.any():
        # argmax works on the flattened array; map it back to (row, column) so
        # the fallback also lands on the right cell when there are several rows.
        best_row, best_col = np.unravel_index(np.argmax(scores), scores.shape)
        selected[best_row, best_col] = True

    label_ids = np.nonzero(selected)[1] + 1  # column positions -> 1-based ids
    return ','.join(str(label_id) for label_id in label_ids)

In [69]:
def predict_labels(text):
    """Return the sigmoid probability of every label for ``text``.

    Uses the notebook-level ``tokenizer``, ``model`` and ``device``.

    Args:
        text: Input string passed to the tokenizer.

    Returns:
        numpy.ndarray: Sigmoid of the model's logits, moved to the CPU.
    """
    inputs = tokenizer(text, padding=True, truncation=True, max_length=128, return_tensors='pt')
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Inference must not depend on whichever cell ran last: switch dropout off
    # explicitly and skip building an autograd graph for every call.
    model.eval()
    with torch.no_grad():
        logits = model(inputs['input_ids'], inputs['attention_mask'])

    return torch.sigmoid(logits).cpu().numpy()

# Example usage: score one sample comment and show the labels selected for it.
text_sample = "خوب وخوش طعم بود"
probs = predict_labels(text_sample)
# Note: despite the "Predicted Probabilities" label, what gets printed is the
# comma-separated label-id string returned by predict_with_threshold().
print("Predicted Probabilities:", predict_with_threshold(probs))

Predicted Probabilities: 1,5


In [177]:
# Load the test set to be labelled; the inference loop below reads its
# 'comment' column.
df_test = pd.read_csv("data_test_users.csv")

In [178]:
# Placeholder column for the predicted label-id strings; it is filled in by the
# inference loop further down.
df_test['intent'] = None

In [179]:
# Sanity check: column names, dtypes and non-null counts before predicting.
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9000 entries, 0 to 8999
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   id       9000 non-null   int64 
 1   comment  9000 non-null   object
 2   intent   0 non-null      object
dtypes: int64(1), object(2)
memory usage: 211.1+ KB


In [180]:
# Predict the label-id string for every test comment, one comment at a time.
# Iterating the 'comment' column directly avoids `iterrows()`, which builds a
# full Series for each row just to read a single field.
for ind, comment in df_test['comment'].items():
    probs = predict_labels(comment)
    df_test.at[ind, 'intent'] = predict_with_threshold(probs)

In [181]:
# Display the test frame with the filled-in 'intent' column.
df_test

Unnamed: 0,id,comment,intent
0,15336956,خیلی خوبه عالیه,1
1,15336959,زیبا بود,1
2,15336960,به علت شکیتگی مرجوع کردم,1
3,15336961,هم جعبه ماوس باز شده بود و هم ماوس شکسته بود,1
4,15336964,چراغ قوه اش خوب بود,1
...,...,...,...
8995,14426574,من سایز ۱۸ تا ۲۴ ماه سفارش دادم اما وقتی رسید ...,4
8996,14426637,ولی این بار برای من اشتباه ارسال شده بود,4
8997,14426701,متاسفاته من مهتابی سفارش داده بودم ولی برام آف...,4
8998,14426742,دقیقاهمین چیزی که داخل عکسه ر,4


In [182]:
# Write the labelled test frame to disk.
# NOTE(review): with the default arguments the frame's index is written as an
# extra unnamed first column — pass index=False if the consumer of result.csv
# expects only id/comment/intent; confirm against the expected file format.
df_test.to_csv("result.csv")

In [183]:
# Distribution of the predicted label-id strings across the test set.
df_test['intent'].value_counts()

intent
4      2072
5      1734
1      1646
2      1577
3      1389
1,3     255
1,2     211
1,5      77
1,4      39
Name: count, dtype: int64