In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import transformers
from transformers import BertConfig, BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

# Seed PyTorch's random number generator with a fixed value.
RANDOM_SEED = 42
torch.manual_seed(RANDOM_SEED)
# Use the first CUDA GPU when one is available; otherwise fall back to the CPU.
# Later cells move the model and the input tensors onto this device.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [None]:
# Mount Google Drive at /content/drive (Google Colab only). The tokenizer and
# model files referenced in the following cell are read from this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Tokenizer loaded from a file on the mounted Drive.
# NOTE(review): the tokenizer file name mentions "sentiment" while the
# fine-tuned weights below are the "topic" model -- confirm this is the
# tokenizer the topic model was trained with.
tokenizer = BertTokenizer.from_pretrained("/content/drive/MyDrive/fmc/tsel_cx_tnps_sentiment_tokenv2.bin")
# Path handed to BertModel.from_pretrained when TextClassifier is constructed.
PRE_TRAINED_MODEL_NAME = '/content/drive/MyDrive/fmc/huggingface_bert_modelv2.bin'
# Path of the saved state dict that is loaded into the classifier with torch.load.
FINE_TUNED_MODEL_NAME = '/content/drive/MyDrive/fmc/tsel_cx_tnps_topic_modelv2.bin'

In [None]:
# Token length every text is padded/truncated to when it is tokenised.
MAX_LEN = 100
# Number of samples per DataLoader batch.
BATCH_SIZE = 32
# Topic labels; the position of a name in this list is the class index the
# classifier predicts (the prediction cell looks names up by that index).
class_names = ['network', 'pricing', 'product', 'service', 'halo', 'mytsel', 'fintech', 'others', 'migrasi']
# Derived from the label list so the size of the classification head can never
# drift out of sync with the labels.
no_of_classes = len(class_names)

In [None]:
class TselDataset(Dataset):
  """Map-style dataset that tokenises one text per item.

  Args:
    texts: indexable collection of raw texts (each item is passed through ``str``).
    targets: indexable collection of integer labels, aligned with ``texts``.
    tokenizer: object exposing ``encode_plus`` (the notebook passes a BertTokenizer).
    max_len: length every encoded sequence is padded/truncated to.
  """

  def __init__(self, texts, targets, tokenizer, max_len):
    self.texts, self.targets = texts, targets
    self.tokenizer, self.max_len = tokenizer, max_len

  def __len__(self):
    """Number of samples, taken from the length of ``texts``."""
    return len(self.texts)

  def __getitem__(self, item):
    """Return the raw text, its encoded tensors and its label for index ``item``."""
    text = str(self.texts[item])
    label = self.targets[item]

    # Fixed-length encoding: pad up to / truncate down to max_len.
    encode_kwargs = dict(
      add_special_tokens=True,
      max_length=self.max_len,
      return_token_type_ids=False,
      padding='max_length',
      truncation=True,
      return_attention_mask=True,
      return_tensors='pt',
    )
    encoded = self.tokenizer.encode_plus(text, **encode_kwargs)

    # The tokenizer output carries a leading batch axis; collapse it to 1-D so
    # the DataLoader can stack the samples into a batch.
    sample = {'review_text': text}
    sample['input_ids'] = encoded['input_ids'].reshape(-1)
    sample['attention_mask'] = encoded['attention_mask'].reshape(-1)
    sample['targets'] = torch.tensor(label, dtype=torch.long)
    return sample

def create_data_loader(df, tokenizer, max_len, batch_size, num_workers=2):
  """Wrap a DataFrame in a TselDataset and return a DataLoader over it.

  Args:
    df: DataFrame with a ``text`` column (raw texts) and a ``label`` column
      (integer class ids).
    tokenizer: tokenizer forwarded to TselDataset.
    max_len: padded/truncated sequence length forwarded to TselDataset.
    batch_size: number of samples per batch.
    num_workers: number of DataLoader worker processes. Defaults to 2, the
      value that used to be hard-coded; pass 0 to load in the main process
      (useful for tiny inputs such as a single sentence).

  Returns:
    A DataLoader that yields batches in the original row order (no shuffling).
  """
  ds = TselDataset(
    texts=df.text.to_numpy(),
    targets=df.label.to_numpy(),
    tokenizer=tokenizer,
    max_len=max_len
  )

  return DataLoader(
    ds,
    batch_size=batch_size,
    num_workers=num_workers
  )



In [None]:
class TextClassifier(nn.Module):
  """BERT encoder followed by dropout and a linear classification head.

  Args:
    n_classes: number of output classes (width of the linear head).
  """

  def __init__(self, n_classes):
    super().__init__()
    self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
    self.drop = nn.Dropout(p=0.2)
    self.out = nn.Linear(self.bert.config.hidden_size, n_classes)

  def forward(self, input_ids, attention_mask):
    """Return ``(logits, token_states)`` for a batch.

    ``logits`` is the linear head applied to element 1 of the BERT output
    (after dropout); ``token_states`` is element 0 of the BERT output, handed
    back unchanged. Per the Hugging Face BertModel documentation these are
    the pooled output and the last hidden state respectively.
    """
    bert_outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
    token_states = bert_outputs[0]
    pooled = bert_outputs[1]
    logits = self.out(self.drop(pooled))
    return logits, token_states

In [None]:
# Build the classifier and move it to the selected device before restoring the
# fine-tuned weights.
model = TextClassifier(no_of_classes)
model = model.to(device)
# map_location places the saved tensors on `device` while loading.
# NOTE(review): torch.load unpickles the file -- only load checkpoints that come
# from a trusted source.
state_dict = torch.load(FINE_TUNED_MODEL_NAME, map_location=device)
# Kept as the last expression of the cell so its result (the key-matching
# report) is shown as the cell output.
model.load_state_dict(state_dict)

<All keys matched successfully>

In [None]:
def get_predictions(model, data_loader):
  """Run `model` in eval mode over every batch of `data_loader`.

  Args:
    model: module called as ``model(input_ids=..., attention_mask=...)`` and
      returning ``(logits, hidden_states)``.
    data_loader: iterable of batches; each batch is a dict with the keys
      ``review_text``, ``input_ids``, ``attention_mask`` and ``targets``.

  Returns:
    A 6-tuple ``(review_texts, predictions, prediction_probs, real_values,
    lhs, attention_mask)``. ``review_texts`` is a list; the rest are CPU
    tensors whose first dimension covers every sample of the loader, in
    iteration order. ``lhs`` holds the hidden states returned by the model
    and ``attention_mask`` the matching masks.

  Raises:
    ValueError: if `data_loader` yields no batches.

  Notes:
    Previously ``lhs`` and ``attention_mask`` came from the *last* batch only
    and stayed on the compute device, so they did not line up with the other
    outputs. They are now accumulated for all batches, which assumes every
    batch is padded to the same sequence length (TselDataset pads to
    ``max_length``) and costs memory proportional to the dataset size.
  """
  model = model.eval()

  review_texts = []
  pred_chunks = []
  prob_chunks = []
  target_chunks = []
  hidden_chunks = []
  mask_chunks = []

  with torch.no_grad():
    for d in data_loader:
      input_ids = d["input_ids"].to(device)
      batch_mask = d["attention_mask"].to(device)

      outputs, batch_hidden = model(
        input_ids=input_ids,
        attention_mask=batch_mask
      )

      _, preds = torch.max(outputs, dim=1)
      probs = F.softmax(outputs, dim=1)

      review_texts.extend(d["review_text"])
      # Move results to the CPU per batch so device memory is released as we go.
      pred_chunks.append(preds.cpu())
      prob_chunks.append(probs.cpu())
      target_chunks.append(d["targets"].cpu())
      hidden_chunks.append(batch_hidden.cpu())
      mask_chunks.append(batch_mask.cpu())

  if not pred_chunks:
    raise ValueError("data_loader yielded no batches")

  predictions = torch.cat(pred_chunks)
  prediction_probs = torch.cat(prob_chunks)
  real_values = torch.cat(target_chunks)
  lhs = torch.cat(hidden_chunks)
  attention_mask = torch.cat(mask_chunks)
  return review_texts, predictions, prediction_probs, real_values, lhs, attention_mask

In [None]:
# Classify a single sentence by wrapping it in a one-row DataFrame and sending
# it through create_data_loader / get_predictions.
predict_text='kualitas jaringan dan pelayanan pelanggan yang baik'
# The dataset needs a `label` column; 0 is presumably just a placeholder here
# since the sentence has no known target -- TODO confirm.
data = {'text':[predict_text], 'label':[0]}
dfp = pd.DataFrame(data)
predicting_data_loader = create_data_loader(dfp, tokenizer, MAX_LEN, BATCH_SIZE)
# `lhs` and `attention_mask` are reused by the mean-pooling cell further down.
y_review_texts, y_pred, y_pred_probs, y_test, lhs, attention_mask = get_predictions(
  model,
  predicting_data_loader
)

In [None]:
# Map the predicted class index back to its label. `.item()` only works when
# y_pred holds exactly one element, i.e. a single sentence was predicted.
print(class_names[y_pred.item()])

network


In [None]:
# Mean-pool the hidden states of the predicted sentence into one vector per
# sample, counting only the positions where the attention mask is 1.
embeddings = lhs
# Broadcast the mask to the embeddings' shape; aligning it to the embeddings'
# device keeps the multiplication valid wherever the two tensors live.
mask = attention_mask.to(embeddings.device).unsqueeze(-1).expand(embeddings.size()).float()
masked_embeddings = embeddings * mask
summed = torch.sum(masked_embeddings, 1)
# Clamp so an all-zero mask cannot cause a division by zero.
summed_mask = torch.clamp(mask.sum(1), min=1e-9)
mean_pooled = summed / summed_mask
# .cpu() is required before .numpy(): converting a CUDA tensor directly raises
# a TypeError when the notebook runs on a GPU.
mean_pooled = mean_pooled.detach().cpu().numpy()


In [None]:
# Candidate reference phrases (they appear to be network-related keywords).
# NOTE(review): `sentences` is reassigned in the following cells before it is
# read, so this cell has no effect when the notebook is run top to bottom; the
# same list is defined again in the cell that computes the reference embeddings.
sentences = [
    "browsing internet",
    "jangkauan lokasi lemot cepat lambat kencang kenceng hilang sinyal hilang kuat",
    "sms voice telpon panggilan"
]

In [None]:
# Alternative set of reference phrases (they appear to be data-package names).
# NOTE(review): the next cell reassigns `sentences` before it is read, so this
# list is not used when the notebook is run top to bottom.
sentences = [
    "paket internet, paket kuota",
    "paket internet combo sakti, paket kuota combo sakti, paket combo sakti",
    "paket internet internet sakti, paket kuota internet sakti, paket internet sakti",
    "paket internet darurat, paket kuota darurat, paket darurat"
]

In [None]:
# Reference phrases whose embeddings the predicted sentence is compared against.
sentences = [
    "browsing internet",
    "jangkauan lokasi lemot cepat lambat kencang kenceng hilang sinyal hilang kuat",
    "sms voice telpon panggilan"
]
tokens = {'input_ids': [], 'attention_mask': []}

for sentence in sentences:
    # Use the shared MAX_LEN (instead of a literal 100) so the references are
    # padded/truncated exactly like the predicted sentence.
    new_tokens = tokenizer.encode_plus(sentence, max_length=MAX_LEN,
                                       truncation=True, padding='max_length',
                                       return_tensors='pt')

    # Drop the leading batch axis of each single-sentence encoding.
    tokens['input_ids'].append(new_tokens['input_ids'][0])
    tokens['attention_mask'].append(new_tokens['attention_mask'][0])

# Stack into (n_sentences, MAX_LEN) batches and move them to the model's device;
# feeding CPU tensors to a model that lives on the GPU raises a device-mismatch error.
tokens['input_ids'] = torch.stack(tokens['input_ids']).to(device)
tokens['attention_mask'] = torch.stack(tokens['attention_mask']).to(device)

# Inference only: disable dropout explicitly and skip gradient tracking rather
# than relying on an earlier cell having switched the model to eval mode.
model.eval()
with torch.no_grad():
    outputs, lhs_ref = model(**tokens)

# Mean-pool the hidden states over the positions where the attention mask is 1.
embeddings_ref = lhs_ref
attention_mask_ref = tokens['attention_mask']
mask_ref = attention_mask_ref.unsqueeze(-1).expand(embeddings_ref.size()).float()
masked_embeddings_ref = embeddings_ref * mask_ref
summed_ref = torch.sum(masked_embeddings_ref, 1)
# Clamp so an all-zero mask cannot cause a division by zero.
summed_mask_ref = torch.clamp(mask_ref.sum(1), min=1e-9)
mean_pooled_ref = summed_ref / summed_mask_ref
# .cpu() is required before .numpy() when the tensors live on a GPU.
mean_pooled_ref = mean_pooled_ref.detach().cpu().numpy()

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Cosine similarity between the predicted sentence's pooled embedding (rows)
# and each reference phrase's pooled embedding (columns). Left as the final
# expression so the similarity matrix is shown as the cell output.
cosine_similarity(mean_pooled, mean_pooled_ref)

array([[0.85500836, 0.88252974, 0.75214326]], dtype=float32)