<a href="https://colab.research.google.com/github/asyrofist/extractreq/blob/main/transformerIndo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers

In [7]:
import re, string
from torch import clamp
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity

class TokenSimilarity:

    def load_pretrained(self, from_pretrained:str="indobenchmark/indobert-base-p1"):
        self.tokenizer = AutoTokenizer.from_pretrained(from_pretrained)
        self.model = AutoModel.from_pretrained(from_pretrained)
        
    def __cleaning(self, text:str):
        # clear punctuations
        text = text.translate(str.maketrans('', '', string.punctuation))

        # clear multiple spaces
        text = re.sub(r'/s+', ' ', text).strip()

        return text
        
    def __process(self, first_token:str, second_token:str):
        inputs = self.tokenizer([first_token, second_token],
                                max_length=self.max_length,
                                truncation=self.truncation,
                                padding=self.padding,
                                return_tensors='pt')

        attention = inputs.attention_mask

        outputs = self.model(**inputs)

        # get the weights from the last layer as embeddings
        embeddings = outputs[0] # when used in older transformers version
        # embeddings = outputs.last_hidden_state # when used in newer one

        # add more dimension then expand tensor
        # to match embeddings shape by duplicating its values by rows
        mask = attention.unsqueeze(-1).expand(embeddings.shape).float()

        masked_embeddings = embeddings * mask
        
        # MEAN POOLING FOR 2ND DIMENSION
        # first, get sums by 2nd dimension
        # second, get counts of 2nd dimension
        # third, calculate the mean, i.e. sums/counts
        summed = masked_embeddings.sum(1)
        counts = clamp(mask.sum(1), min=1e-9)
        mean_pooled = summed/counts

        # return mean pooling as numpy array
        return mean_pooled.detach().numpy()
        
    def predict(self, first_token:str, second_token:str,
                return_as_embeddings:bool=False, max_length:int=16,
                truncation:bool=True, padding:str="max_length"):
        self.max_length = max_length
        self.truncation = truncation
        self.padding = padding

        first_token = self.__cleaning(first_token)
        second_token = self.__cleaning(second_token)

        mean_pooled_arr = self.__process(first_token, second_token)
        if return_as_embeddings:
            return mean_pooled_arr

        # calculate similarity
        similarity = cosine_similarity([mean_pooled_arr[0]], [mean_pooled_arr[1]])

        return similarity

In [8]:
model = TokenSimilarity()
model.load_pretrained('indobenchmark/indobert-base-p2')

Downloading:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.50k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/224k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/475M [00:00<?, ?B/s]

In [9]:
token1 = 'bisa'
token2 = 'dapat'
token3 = 'mampu'

print("kalimat1:\t{}".format(token1))
print("kalimat2:\t{}".format(token2))
print("kalimat3:\t{}".format(token3))

varToken1 = (model.predict(token1, token2))
varToken2 = (model.predict(token1, token3))
print("\nPerbandingan kalimat 1 dan kalimat 2 sebagai berikut:")
print("Kalimat 1\t: {}".format(token1))
print("Kalimat 2\t: {}".format(token2))
print("adalah\t\t: {}".format(varToken1))

print("\nPerbandingan kalimat 2 dan kalimat 3 sebagai berikut:")
print("Kalimat 2\t: {}".format(token2))
print("Kalimat 3\t: {}".format(token3))
print("adalah\t\t: {}".format(varToken2))

kalimat1:	bisa
kalimat2:	dapat
kalimat3:	mampu

Perbandingan kalimat 1 dan kalimat 2 sebagai berikut:
Kalimat 1	: bisa
Kalimat 2	: dapat
adalah		: [[0.7891123]]

Perbandingan kalimat 2 dan kalimat 3 sebagai berikut:
Kalimat 2	: dapat
Kalimat 3	: mampu
adalah		: [[0.7988268]]


In [10]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [19]:
import pandas as pd
file_param = "/content/drive/MyDrive/dataset/indo/dataSinta.xlsx"

class ekstraksiExcel: #template class partOf
  def __init__(self, inputData  = file_param): 
      self.__data = inputData # data inisiliasi file parameter
  def preprocessing(self): # function melihat struktur dataset di excel
      xl = pd.ExcelFile(self.__data)
      for sh in xl.sheet_names:
        df = xl.parse(sh)
        print('Processing: [{}] ...'.format(sh))
        print(df.head())
  def fulldataset(self, inputSRS): # function membuat dataset
      xl = pd.ExcelFile(self.__data)
      dfs = {sh:xl.parse(sh) for sh in xl.sheet_names}[inputSRS]
      return dfs

In [23]:
# ekstraksiExcel().preprocessing()
ekstraksiExcel().fulldataset('sinta1')['judul'][0]

0    ﻿Ketika Plagiarisme adalah Suatu Permasalahan ...
1    ﻿ANALISIS PENAMAAN KEDAI KOPI DI SURABAYA: KAJ...
Name: judul, dtype: object

In [24]:
token1 = ekstraksiExcel().fulldataset('sinta1')['judul'][0]
token2 = ekstraksiExcel().fulldataset('sinta1')['judul'][1]
# token3 = 'mampu'

print("kalimat1:\t{}".format(token1))
print("kalimat2:\t{}".format(token2))
# print("kalimat3:\t{}".format(token3))

varToken1 = (model.predict(token1, token2))
# varToken2 = (model.predict(token1, token3))
print("\nPerbandingan kalimat 1 dan kalimat 2 sebagai berikut:")
print("Kalimat 1\t: {}".format(token1))
print("Kalimat 2\t: {}".format(token2))
print("adalah\t\t: {}".format(varToken1))

# print("\nPerbandingan kalimat 2 dan kalimat 3 sebagai berikut:")
# print("Kalimat 2\t: {}".format(token2))
# print("Kalimat 3\t: {}".format(token3))
# print("adalah\t\t: {}".format(varToken2))

kalimat1:	﻿Ketika Plagiarisme adalah Suatu Permasalahan Etika
kalimat2:	﻿ANALISIS PENAMAAN KEDAI KOPI DI SURABAYA: KAJIAN ETNOLINGUISTIK

Perbandingan kalimat 1 dan kalimat 2 sebagai berikut:
Kalimat 1	: ﻿Ketika Plagiarisme adalah Suatu Permasalahan Etika
Kalimat 2	: ﻿ANALISIS PENAMAAN KEDAI KOPI DI SURABAYA: KAJIAN ETNOLINGUISTIK
adalah		: [[0.4612782]]
