In [1]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
import json

# Load saved tokenizer, model, and label classes
model_dir = "saved_model_new"

tokenizer = BertTokenizer.from_pretrained(model_dir)
model = BertForSequenceClassification.from_pretrained(model_dir)
model.eval()

with open(f"{model_dir}/label_classes.json", "r") as f:
    label_classes = json.load(f)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


  from .autonotebook import tqdm as notebook_tqdm


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [2]:
def predict_from_series(series, threshold=0.5, batch_size=16):
    model.eval()
    predictions = []

    for i in range(0, len(series), batch_size):
        batch_texts = series.iloc[i:i+batch_size].tolist()
        inputs = tokenizer(batch_texts, return_tensors="pt", truncation=True, padding=True, max_length=256)
        inputs = {k: v.to(device) for k, v in inputs.items()}

        with torch.no_grad():
            logits = model(**inputs).logits
            probs = torch.sigmoid(logits)

        for prob_vector in probs:
            predicted_indices = (prob_vector >= threshold).nonzero(as_tuple=True)[0].tolist()
            if predicted_indices:
                predicted_labels = [label_classes[i] for i in predicted_indices]
            else:
                predicted_labels = ["Miscellaneous"]
            predictions.append(predicted_labels)

    return pd.Series(predictions, index=series.index)


In [3]:
# df['predicted_categories'] = predict_from_series(df['reviews'])

In [4]:
import pandas as pd

In [5]:
df = pd.read_csv('reviews.csv')

In [6]:
df['category'] = predict_from_series(df['reviews'])

In [7]:
df

Unnamed: 0,reviews,category
0,pretty decent airline,"[Boarding Process, Cabin Environment]"
1,not a good airline,"[Boarding Process, Cabin Environment]"
2,flight was fortunately short,[Miscellaneous]
3,i will never fly again with adria,[Miscellaneous]
4,it ruined our last days of holidays,[Miscellaneous]
...,...,...
4224,trip was pleasant in general,[Miscellaneous]
4225,unaccommodating staff,[Staff Behaviour]
4226,you get what you paid for,[Miscellaneous]
4227,very bad customer service,"[Customer Service, In-flight Experience, Staff..."


In [8]:
pd.set_option('display.max_rows', None)


In [9]:
df

Unnamed: 0,reviews,category
0,pretty decent airline,"[Boarding Process, Cabin Environment]"
1,not a good airline,"[Boarding Process, Cabin Environment]"
2,flight was fortunately short,[Miscellaneous]
3,i will never fly again with adria,[Miscellaneous]
4,it ruined our last days of holidays,[Miscellaneous]
5,had very bad experience,[In-flight Experience]
6,worse than the budget airlines,"[Boarding Process, Cabin Environment, Value fo..."
7,book another company,[Miscellaneous]
8,combined two flights,[Miscellaneous]
9,the crew was nice,"[In-flight Experience, Staff Behaviour]"


In [18]:
df[df['category']=='Miscellaneous']

Unnamed: 0,reviews,category
