In [1]:
pip install transformers

Collecting transformers
  Downloading transformers-4.32.0-py3-none-any.whl (7.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.5/7.5 MB[0m [31m43.4 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m27.3 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m94.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m68.7 MB/s[0m eta [36m0:00:0

In [2]:
import numpy as np
import pandas as pd
import transformers
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F

# Fixed seed so any torch-side randomness is repeatable between runs.
RANDOM_SEED = 42
torch.manual_seed(RANDOM_SEED)

# Use the first CUDA device when one is available, otherwise fall back to CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

In [3]:
# Colab-only step: mount Google Drive at /content/drive. The model and
# tokenizer paths configured in this notebook live under /content/drive/MyDrive,
# so this must run before anything is loaded from them.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Tokenization / batching settings handed to create_data_loader: MAX_LEN is
# the fixed token length every text is padded or truncated to, BATCH_SIZE is
# the DataLoader batch size.
MAX_LEN = 100
BATCH_SIZE = 16

# Output width of each classifier head (passed to TextClassifier as n_classes).
no_of_classes_sentiment = 3
no_of_classes_topic = 8

# Location handed to BertModel.from_pretrained inside TextClassifier, and the
# tokenizer loaded once for the whole notebook.
# NOTE(review): both paths end in `.bin`; confirm that from_pretrained accepts
# these locations as-is (it usually expects a model directory or hub id).
PRE_TRAINED_MODEL_NAME = '/content/drive/MyDrive/fmc/huggingface_bert_model.bin'
tokenizer = BertTokenizer.from_pretrained("/content/drive/MyDrive/fmc/tsel_cx_tnps_sentiment_token.bin")

# Paths of the fine-tuned state dicts read with torch.load when the models are
# built. They are empty here and appear to be placeholders: fill them in before
# running the model-loading cell.
FINE_TUNED_MODEL_NAME_SENTIMENT = ''
FINE_TUNED_MODEL_NAME_TOPIC = ''

# Class maps handed to pairing_function, which orders the keys by their value
# and pairs them with the predicted probabilities.
# Presumably {class_name: class_index} -- TODO confirm. While these stay empty,
# every per-row probability dict produced downstream is empty as well.
class_params_senti = {}
class_params_topic = {}

In [5]:
class TselDataset(Dataset):
  """Dataset that tokenizes one text per item and pairs it with its label.

  Items are dicts with the raw text (``review_text``), the flattened token ids
  (``input_ids``), the flattened attention mask (``attention_mask``) and the
  label as a ``torch.long`` tensor (``targets``).
  """

  def __init__(self, texts, targets, tokenizer, max_len):
    # Only references are stored here; tokenization happens lazily per item.
    self.tokenizer = tokenizer
    self.max_len = max_len
    self.texts = texts
    self.targets = targets

  def __len__(self):
    """Number of samples, measured on the text collection."""
    return len(self.texts)

  def __getitem__(self, item):
    """Tokenize the text at position ``item`` and bundle it with its label."""
    # str() so non-string entries are still handed to the tokenizer as text.
    text = str(self.texts[item])
    label = self.targets[item]

    # Every text is padded or truncated to exactly ``max_len`` tokens.
    encoded = self.tokenizer.encode_plus(
      text,
      add_special_tokens=True,
      max_length=self.max_len,
      return_token_type_ids=False,
      padding='max_length',
      truncation=True,
      return_attention_mask=True,
      return_tensors='pt',
    )

    # flatten() collapses each returned tensor to 1-D so that batching can
    # stack the samples along a new leading dimension.
    token_ids = encoding_ids = encoded['input_ids'].flatten()
    mask = encoded['attention_mask'].flatten()

    sample = {}
    sample['review_text'] = text
    sample['input_ids'] = token_ids
    sample['attention_mask'] = mask
    sample['targets'] = torch.tensor(label, dtype=torch.long)
    return sample


def create_data_loader(df, tokenizer, max_len, batch_size):
  """Wrap a DataFrame's ``text`` and ``label`` columns in a DataLoader.

  Args:
    df: frame exposing ``text`` and ``label`` columns.
    tokenizer: tokenizer forwarded to TselDataset.
    max_len: fixed token length forwarded to TselDataset.
    batch_size: number of samples per batch.

  Returns:
    A DataLoader over a TselDataset, using two worker processes and the
    default (unshuffled) ordering.
  """
  # Positional numpy arrays, so the frame's index labels do not matter here.
  text_array = df.text.to_numpy()
  label_array = df.label.to_numpy()

  dataset = TselDataset(text_array, label_array, tokenizer, max_len)
  return DataLoader(dataset, batch_size=batch_size, num_workers=2)

class TextClassifier(nn.Module):
  """BERT encoder followed by dropout and one linear classification layer."""

  def __init__(self, n_classes):
    super().__init__()
    # The encoder is loaded from the module-level PRE_TRAINED_MODEL_NAME.
    # Attribute names (bert / drop / out) define the state_dict keys, so they
    # must stay in sync with the saved fine-tuned weights.
    self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
    self.drop = nn.Dropout(p=0.2)
    hidden_size = self.bert.config.hidden_size
    self.out = nn.Linear(hidden_size, n_classes)

  def forward(self, input_ids, attention_mask):
    """Return the linear layer's per-class scores for a batch of token ids."""
    encoder_outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
    # Element 1 of the encoder output is used as the sequence representation
    # (presumably BERT's pooled output -- confirm against the BertModel docs).
    sequence_repr = encoder_outputs[1]
    regularised = self.drop(sequence_repr)
    return self.out(regularised)


def get_predictions(model, data_loader, device=None):
  """Run ``model`` over every batch of ``data_loader`` and collect the results.

  Args:
    model: module called as ``model(input_ids=..., attention_mask=...)`` that
      returns per-class scores with the classes along dim 1.
    data_loader: iterable of batches; each batch is a dict with the keys
      ``"review_text"``, ``"input_ids"``, ``"attention_mask"`` and
      ``"targets"``.
    device: device the batch tensors are moved to. When omitted, the device of
      the model's first parameter is used (CPU if the model has none), so the
      inputs always land where the model lives.

  Returns:
    A tuple ``(review_texts, predictions, prediction_probs, real_values)``:
    the list of raw texts, the index of the highest score per sample, the
    softmax probabilities per sample, and the targets from the loader. All
    tensors are on CPU. An empty loader yields an empty list and empty tensors
    instead of raising.
  """
  model = model.eval()

  if device is None:
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

  review_texts = []
  pred_batches = []
  prob_batches = []
  target_batches = []

  with torch.no_grad():
    for batch in data_loader:
      input_ids = batch["input_ids"].to(device)
      attention_mask = batch["attention_mask"].to(device)
      targets = batch["targets"].to(device)

      outputs = model(
        input_ids=input_ids,
        attention_mask=attention_mask
      )
      _, preds = torch.max(outputs, dim=1)

      review_texts.extend(batch["review_text"])
      # Keep whole batch tensors; they are concatenated once after the loop
      # rather than being split into one tensor per sample.
      pred_batches.append(preds)
      prob_batches.append(F.softmax(outputs, dim=1))
      target_batches.append(targets)

  if not pred_batches:
    # Nothing was scored: the class count is unknown, so the probability
    # tensor is returned with shape (0, 0).
    return (
      review_texts,
      torch.empty(0, dtype=torch.long),
      torch.empty((0, 0)),
      torch.empty(0, dtype=torch.long),
    )

  predictions = torch.cat(pred_batches).cpu()
  prediction_probs = torch.cat(prob_batches).cpu()
  real_values = torch.cat(target_batches).cpu()
  return review_texts, predictions, prediction_probs, real_values

In [6]:
# Topic classifier: build the architecture on the target device, then restore
# the fine-tuned weights from disk.
model_topic = TextClassifier(no_of_classes_topic).to(device)
state_dict_topic = torch.load(FINE_TUNED_MODEL_NAME_TOPIC, map_location=device)
model_topic.load_state_dict(state_dict_topic)

# Sentiment classifier: same procedure with its own head size and weights.
# The load_state_dict call stays last so the cell displays its key-matching
# report.
model_sentiment = TextClassifier(no_of_classes_sentiment).to(device)
state_dict_sentiment = torch.load(FINE_TUNED_MODEL_NAME_SENTIMENT, map_location=device)
model_sentiment.load_state_dict(state_dict_sentiment)

<All keys matched successfully>

In [None]:
# Six datamart parts are scored; each output file is its input name with an
# 'output_' prefix, kept in the same order so the two lists line up by index.
input_dataframe_names = [f'tnps_datamart_part-{part}.csv' for part in range(6)]

output_dataframe_names = [f'output_{name}' for name in input_dataframe_names]

def pairing_function(self, x, y=None):
    """Pair class keys with values, ordering the keys by their mapped value.

    The function is defined at module level, yet its signature carries a
    leading ``self``. Both call shapes are accepted so that a two-argument
    call no longer fails with a missing-argument ``TypeError``:

    * ``pairing_function(values, class_map)``
    * ``pairing_function(anything, values, class_map)`` (first argument ignored)

    Args:
        self: ignored in the three-argument form; holds ``values`` in the
            two-argument form.
        x: sequence of values (three-argument form) or the class map
            (two-argument form).
        y: mapping whose keys are sorted by their values before pairing;
            omitted in the two-argument form.

    Returns:
        dict mapping each key of the class map, taken in ascending order of
        its value, to the element of the sequence at the same position.
        Pairing stops at the shorter of the two, so an empty map gives ``{}``.
    """
    if y is None:
        # Two-argument call: shift the arguments into their intended slots.
        x, y = self, x

    # Stable sort of the keys by mapped value; ties keep insertion order.
    ordered_keys = sorted(y, key=y.get)
    return dict(zip(ordered_keys, x))

In [None]:
# Score every input CSV with both classifiers and write one enriched CSV per
# input file (same rows as the input, plus four prediction columns).
for i, csv_name in enumerate(input_dataframe_names):
    df = pd.read_csv(csv_name)

    # Working frame for the data loader. The loader reads a `label` column, so
    # a constant 0 is supplied as a dummy target for inference.
    dfp = pd.DataFrame({
        'text': df['sentiment_text'],
        'label': [0]*df.shape[0]
    })

    # PRE-PROCESS
    # Replace digits, non-word/non-space characters, the literal NULL and '~'
    # with a space. Raw string: '\d' in a plain literal is an invalid escape
    # sequence and triggers a warning on current Python versions.
    dfp['text'] = dfp['text'].replace(r'\d+|[^\w\s]|NULL|~', ' ', regex=True)
    dfp['text'] = dfp['text'].str.strip()

    # Keep only rows that still have text. .copy() makes this an independent
    # frame, so the column assignments below do not write into a slice of dfp.
    # The original index is preserved and is used for the join further down.
    dfp_final = dfp[(dfp['text'] != '') & (dfp['text'].notnull())].copy()

    if dfp_final.empty:
        # Nothing to score: skip inference and emit empty prediction columns
        # so every output file has the same schema.
        for column in ('y_pred_topic', 'y_pred_proba_topic',
                       'y_pred_sentiment', 'y_pred_proba_sentiment'):
            dfp_final[column] = pd.Series(dtype=object)
    else:
        predicting_data_loader = create_data_loader(dfp_final, tokenizer, MAX_LEN, BATCH_SIZE)

        # MAIN-PROCESS
        # Topic
        y_review_texts_topic, y_pred_topic, y_pred_probs_topic, y_test_topic = get_predictions(
            model_topic,
            predicting_data_loader
        )

        # Sentiment
        y_review_texts_sent, y_pred_sent, y_pred_probs_sent, y_test_sent = get_predictions(
            model_sentiment,
            predicting_data_loader
        )

        # POST-PROCESS
        # One dict per row pairing class keys with probabilities. A plain
        # comprehension replaces np.vectorize, which rejects size-0 input and
        # called pairing_function with one argument too few. The leading None
        # fills pairing_function's unused first parameter.
        y_prediction_probs_topic = [
            pairing_function(None, row_probs, class_params_topic)
            for row_probs in y_pred_probs_topic.tolist()
        ]

        y_prediction_probs_sentiment = [
            pairing_function(None, row_probs, class_params_senti)
            for row_probs in y_pred_probs_sent.tolist()
        ]

        # Post-process (assign to raw dataframe)
        # Convert the CPU tensors to numpy so the columns hold plain integers
        # rather than tensor objects.
        dfp_final['y_pred_topic'] = y_pred_topic.numpy()
        dfp_final['y_pred_proba_topic'] = y_prediction_probs_topic
        dfp_final['y_pred_sentiment'] = y_pred_sent.numpy()
        dfp_final['y_pred_proba_sentiment'] = y_prediction_probs_sentiment

    # Joining data
    # iloc[:, 2:] drops the helper `text` and `label` columns. The join is on
    # the original row index, so rows filtered out above keep empty predictions.
    df_new = df.merge(dfp_final.iloc[:,2:], left_index=True, right_index=True, how='outer')

    df_new.to_csv(output_dataframe_names[i], index=False)