In [49]:
!pip install -U datasets -qq

In [50]:
import numpy as np
import pandas as pd
from datasets import Dataset, load_from_disk
from torch.utils.data import DataLoader
import torch
from transformers import AutoTokenizer, AutoModel

In [51]:
data = pd.read_csv("clean_uas.csv")["teks"]
len(data)

10105

In [52]:
Dataset.from_dict({"data": data}).save_to_disk("nlp")
ds = load_from_disk("nlp").with_format("torch")
dataloader = DataLoader(ds, batch_size=32, num_workers=2)

Saving the dataset (0/1 shards):   0%|          | 0/10105 [00:00<?, ? examples/s]

In [53]:
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/paraphrase-multilingual-mpnet-base-v2')
ds = ds.map(lambda e: tokenizer(e['data'], truncation=True, padding='max_length'), batched=True)

In [54]:
ds.set_format(type='torch', columns=['data', 'input_ids', 'attention_mask'])
dataloader = torch.utils.data.DataLoader(ds, batch_size=128)

In [55]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device

device(type='cuda')

In [56]:
model = AutoModel.from_pretrained('sentence-transformers/paraphrase-multilingual-mpnet-base-v2').to(device)

In [73]:
# Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0] #First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

In [74]:
from tqdm import tqdm

In [59]:
for idx, batch in tqdm(enumerate(dataloader), total=len(dataloader)):
  # Compute token embeddings
  with torch.no_grad():
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    # labels = batch['labels'].to(device)
    outputs = model(input_ids, attention_mask=attention_mask)

  # Perform pooling. In this case, max pooling.
  sentence_embeddings = mean_pooling(outputs, attention_mask).to('cpu')
  np.save(f'/content/corpus_embed/sentence_embeddings_{idx}.npy', sentence_embeddings)

  0%|          | 0/79 [00:04<?, ?it/s]


FileNotFoundError: ignored

In [60]:
from google.colab import drive
drive.mount('/content/drive')

MessageError: ignored

In [68]:
import os
list_file = []
for file in os.listdir("/content"):
    if file.startswith("sentence_embedd"):
        list_file.append(file)
list_file

[]

In [75]:
from google.colab import files
for i in list_file: files.download(i)

In [76]:
!pip install natsort -qq

In [77]:
from natsort import natsorted
import glob

In [78]:
corpus = []
for e in natsorted(glob.glob("/content/corpus_embed/*.npy")):
    print(e)
    corpus.append(np.load(e))
    mean_pooled = np.vstack(corpus)
    print('Success corpus append')
np.save('corpus_dense_embeddings_all_data_ordered.npy', mean_pooled)

NameError: ignored

In [79]:
from sklearn.metrics.pairwise import cosine_similarity

In [21]:
!pip install sentence_transformers

Collecting sentence_transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sentencepiece (from sentence_transformers)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: sentence_transformers
  Building wheel for sentence_transformers (setup.py) ... [?25l[?25hdone
  Created wheel for sentence_transformers: filename=sentence_transformers-2.2.2-py3-none-any.whl size=125923 sha256=904bf87f285c1db82b2924f4116e58699f5576deee19c6b9598223fc4a4e777f
  Stored in directory: /root/.cache/pip/wheels/62/f2/10/1e606fd5f02395388f74e7462910fe851042f97238cbbd902f
Successfully built sentence_tr

In [61]:
from sentence_transformers import SentenceTransformer
modelmpnet = SentenceTransformer('sentence-transformers/paraphrase-multilingual-mpnet-base-v2')

In [62]:
import pandas as pd
master = pd.read_csv("UASDataset_3bulan.csv")
master["clean_final"] = data
master

Unnamed: 0.1,Unnamed: 0,Tanggal,Kategori Umum,Kategori Khusus,Judul,Teks,clean_final
0,0,2023-08-01,Market,Komoditas,Bursa Kripto CFX Janjikan Transaksi Kripto yan...,"Bisnis.com, JAKARTA - PT Bursa Komoditi Nusant...",bursa kripto cfx janji transaksi kripto transp...
1,1,2023-08-01,Finansial,Multifinance,Kabar Terbaru dari OJK soal Asing Jadi Strateg...,"Bisnis.com, JAKARTA — Otoritas Jasa Keuangan (...",kabar baru ojk asing strategic investor leasin...
2,2,2023-08-01,Finansial,Perbankan,"Viral Hilang Akses ke Layanan BCA, Manajemen I...","Bisnis.com, JAKARTA - Isu tentang hilangnya ak...",viral hilang akses layan bca manajemen ingat ...
3,3,2023-08-01,Finansial,Asuransi,"Laba Semester I/2023 Capai Rp7,61 Miliar, Asur...","Bisnis.com, JAKARTA— Perusahaan asuransi umum ...",laba semester i capai rp miliar asuransi cak...
4,4,2023-08-01,Market,Korporasi,Semen Indonesia (SMGR) Raih Pendapatan Rp17 Tr...,"Bisnis.com, JAKARTA – Emiten pelat merah PT Se...",semen indonesia smgr raih dapat rp triliun sem...
...,...,...,...,...,...,...,...
10100,10100,2023-10-31,Finansial,Asuransi,"Cairkan Dana Pensiun di Taspen, Ini Syarat dan...","Bisnis.com, JAKARTA –– Pemerintah menyelenggar...",cair dana pensiun taspen syarat bisnis com j...
10101,10101,2023-10-31,Market,Bursa & saham,"Indeks Bisnis-27 Ditutup Lesu, Saham INKP, UNT...","Bisnis.com, JAKARTA - Indeks Bisnis-27 terpant...",indeks bisnis tutup lesu saham inkp untr ak...
10102,10102,2023-10-31,Finansial,Perbankan,Bank Milik Taipan Tahir (MAYA) Raup Laba Bersi...,"Bisnis.com, JAKARTA - PT Bank Mayapada Interna...",bank milik taipan tahir maya raup laba bersih ...
10103,10103,2023-10-31,Finansial,Perbankan,"Laba Bersih Bank Permata (BNLI) Rp2,14 Triliun...","Bisnis.com, JAKARTA -- PT Bank Permata Tbk. (B...",laba bersih bank permata bnli rp triliun sept...


In [63]:
query = "rupiah melemah"
corpus_embeddings = np.load('/content/corpus_dense_embeddings_all_data_ordered.npy')
query_embeddings = modelmpnet.encode(query)

rank = cosine_similarity([query_embeddings], corpus_embeddings)
rank_dict = {i: rank[0, i] for i in range(len(rank[0]))}

dense_rank = dict(sorted(rank_dict.items(), key=lambda item: item[1], reverse=True))

corpus_id = list(dense_rank.keys())
result = []
for id in corpus_id[:30]:
    result.append(master['Judul'][id])

In [64]:
eval_query = pd.read_excel('/content/UAS_valuation_NLP.xlsx')
eval_query.reset_index(inplace=True)

In [65]:
eval_query

Unnamed: 0,index,no,query,id_query
0,0,1,RUU APBN,76776552241425
1,1,2,krisis pangan,"568,806,756,1144,1930,4144,2737,9577,9027,9282..."
2,2,3,ekspor impor pertanian,337712871282
3,3,4,Penerimaan pajak,14235234148628252916292553356024
4,4,5,Saham Goto naik,137047901026
5,5,6,Bisnis Ekonomi Syariah,"2971,4522,4539,5166,5979,6676,6826,6943,7750,9..."
6,6,7,investasi IKN,150041514225434861436143
7,7,8,rupiah melemah,"29,64,124,553,728,968,1163,1213,1222,1232,1269..."
8,8,9,tarif saldo lrt jakarta,310833022492980425117946
9,9,10,bank dki pembayaran non tunai,330234466231249226932819


In [80]:
all_evaluation = []
for currq in range(len(eval_query)):
  query = eval_query["query"][currq]
  list_relevant = eval_query['id_query'][currq].split(",")
  list_relevant = list(map(str.strip, list_relevant))

  corpus_embeddings = np.load('/content/corpus_dense_embeddings_all_data_ordered.npy')
  query_embeddings = modelmpnet.encode(query)

  rank = cosine_similarity([query_embeddings], corpus_embeddings)
  rank_dict = {i: rank[0, i] for i in range(len(rank[0]))}
  dense_rank = dict(sorted(rank_dict.items(), key=lambda item: item[1], reverse=True))
  corpus_id = list(dense_rank.keys())
  id_relevant = corpus[:30]

  result_evaluation = {
      "query" : eval_query["query"][currq]
  }

  true_doc = 0
  for id in id_relevant:
    if str(id) in list_relevant:
      true_doc +=1


In [81]:
def get_evaluation(df, df_query, list_panjang):
  id_top30 = []
  all_evaluation = []
  for i in range(len(df_query)):
      current_query = df_query["query"][i]
      corpus_embeddings = np.load('/content/corpus_dense_embeddings_all_data_ordered.npy')
      query_embeddings = modelmpnet.encode(current_query)

      rank = cosine_similarity([query_embeddings], corpus_embeddings)
      rank_dict = {i: rank[0, i] for i in range(len(rank[0]))}
      dense_rank = dict(sorted(rank_dict.items(), key=lambda item: item[1], reverse=True))
      corpus_id = list(dense_rank.keys())

      id_relevant = []
      for id in corpus_id[:max(list_panjang)]:
        id_relevant.append(df.loc[id]["Unnamed: 0"])

      id_top30.append(id_relevant)
      list_relevant1 = df_query["id_query"][i].split(",")
      list_relevant = list(map(str.strip, list_relevant1))

      result_evaluation = {
          "query" : df_query["query"][i]
      }

      print(id_relevant)
      print(list_relevant)

      for j in list_panjang :
          true_doc = 0

          for id in id_relevant[:j]:
            if str(id) in list_relevant:
              true_doc += 1

          p = true_doc/j
          recall = true_doc/len(list_relevant)
          result_evaluation[f"p{j}"] = p
          result_evaluation[f"r{j}"] = recall

      all_evaluation.append(result_evaluation)

      print(result_evaluation)
      print("======================================================================================")

  return all_evaluation

In [82]:
def get_average(evaluation, panjang, type_eval):

    if type_eval == "precision" :

        p_all = []
        for i in range(len(evaluation)):
            p_all.append(evaluation[i][f"p{panjang}"])

        rata_rata = sum(p_all) / len(p_all)

        # Menampilkan hasil
        return rata_rata

    elif type_eval == "recall" :
          recall_all = []
          for i in range(len(evaluation)):
              recall_all.append(evaluation[i][f"r{panjang}"])

          rata_rata_recall = sum(recall_all) / len(recall_all)

          # Menampilkan hasil
          return rata_rata_recall

In [83]:
evaluation = get_evaluation(master, eval_query, [30])

[5203, 7020, 5257, 5258, 6703, 1346, 9242, 7064, 1613, 7175, 4667, 5420, 1669, 1967, 1398, 1718, 3989, 4502, 6025, 6110, 1298, 6670, 1434, 2800, 5274, 1940, 7263, 5788, 5220, 3884]
['767', '765', '5224', '1425']
{'query': 'RUU APBN', 'p30': 0.0, 'r30': 0.0}
[1441, 7554, 5226, 1951, 9264, 6965, 7001, 1429, 806, 6465, 2161, 2725, 8986, 3975, 4929, 2165, 606, 7607, 8351, 4546, 5243, 5995, 1774, 7473, 5238, 5292, 7604, 8412, 6165, 2658]
['568', '806', '756', '1144', '1930', '4144', '2737', '9577', '9027', '9282', '2482']
{'query': 'krisis pangan', 'p30': 0.03333333333333333, 'r30': 0.09090909090909091}
[1296, 1299, 4564, 7001, 7604, 8882, 6965, 4658, 1344, 1298, 4485, 9213, 1287, 6002, 2658, 3934, 7865, 1318, 8233, 9642, 5995, 7039, 7746, 603, 4556, 7607, 2895, 9365, 7421, 8258]
['3377', '1287', '1282']
{'query': 'ekspor impor pertanian', 'p30': 0.03333333333333333, 'r30': 0.3333333333333333}
[6110, 5766, 5234, 6126, 8224, 6921, 9563, 6093, 1298, 2122, 2739, 1465, 9162, 6389, 8153, 1346, 9

In [84]:
p30 = get_average(evaluation, 30,  "precision" )
r30 = get_average(evaluation, 30,  "recall" )

print("Hasil Evaluasi :")
print(f"p30 : {p30}")
print(f"r30 : {r30}")

Hasil Evaluasi :
p30 : 0.05416666666666667
r30 : 0.19729888167388165


# ==========================================================================

In [85]:
import nltk
import numpy as np

In [27]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [86]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

tfidf = TfidfVectorizer(tokenizer=nltk.word_tokenize,
                        sublinear_tf=True, min_df=5, max_df=0.95)
X_tfidf = tfidf.fit_transform(master["clean_final"])



In [87]:
def get_evaluation_TFIDF(df, df_query, list_panjang):
  all_evaluation = []
  for i in range(len(df_query)):
      current_query = df_query["query"][i]
      corpus_embeddings = X_tfidf
      query_embeddings = tfidf.transform([current_query])

      rank = cosine_similarity(corpus_embeddings, query_embeddings).reshape((-1))
      rank_tfidf = master.copy()
      rank_tfidf['sim'] = rank
      rank_tfidf_sorted = rank_tfidf.sort_values(by=['sim'], ascending=False)
      corpus_id = rank_tfidf_sorted.index.to_list()

      id_relevant = []
      for id in corpus_id[:max(list_panjang)]:
        id_relevant.append(df.loc[id]["Unnamed: 0"])

      list_relevant1 = df_query["id_query"][i].split(",")
      list_relevant = list(map(str.strip, list_relevant1))

      result_evaluation = {
          "query" : df_query["query"][i]
      }

      print(id_relevant)
      print(list_relevant)

      for j in list_panjang :
          true_doc = 0

          for id in id_relevant[:j]:
            if str(id) in list_relevant:
              true_doc += 1

          p = true_doc/j
          recall = true_doc/len(list_relevant)
          result_evaluation[f"p{j}"] = p
          result_evaluation[f"r{j}"] = recall

      all_evaluation.append(result_evaluation)

      print(result_evaluation)
      print("======================================================================================")

  return all_evaluation

In [88]:
def get_average(evaluation, panjang, type_eval):

    if type_eval == "precision" :

        p_all = []
        for i in range(len(evaluation)):
            p_all.append(evaluation[i][f"p{panjang}"])

        rata_rata = sum(p_all) / len(p_all)

        # Menampilkan hasil
        return rata_rata

    elif type_eval == "recall" :
          recall_all = []
          for i in range(len(evaluation)):
              recall_all.append(evaluation[i][f"r{panjang}"])

          rata_rata_recall = sum(recall_all) / len(recall_all)

          # Menampilkan hasil
          return rata_rata_recall

In [89]:
evaluation = get_evaluation_TFIDF(master, eval_query, [30])

[5203, 1423, 3928, 3989, 2122, 590, 4587, 765, 6737, 4161, 4367, 5234, 5254, 4045, 2753, 6787, 4334, 4692, 1413, 9639, 5257, 3810, 3020, 4667, 9301, 5957, 928, 6700, 1374, 1290]
['767', '765', '5224', '1425']
{'query': 'RUU APBN', 'p30': 0.03333333333333333, 'r30': 0.25}
[8099, 2482, 3877, 6950, 3907, 1822, 5243, 9027, 5891, 1930, 806, 6465, 1759, 792, 2165, 5210, 5100, 3975, 7361, 5273, 9577, 6665, 1461, 4242, 3377, 8991, 791, 1441, 7185, 6816]
['568', '806', '756', '1144', '1930', '4144', '2737', '9577', '9027', '9282', '2482']
{'query': 'krisis pangan', 'p30': 0.16666666666666666, 'r30': 0.45454545454545453}
[8318, 4658, 7728, 5270, 8373, 1177, 1287, 4728, 599, 1318, 1289, 4755, 8163, 1109, 5268, 4597, 8154, 1309, 8632, 8218, 719, 4761, 1475, 7947, 6002, 994, 8699, 9013, 6495, 8378]
['3377', '1287', '1282']
{'query': 'ekspor impor pertanian', 'p30': 0.03333333333333333, 'r30': 0.3333333333333333}
[5766, 7424, 908, 1350, 1010, 5335, 9422, 8222, 6041, 6114, 6110, 1520, 6128, 6246, 613

In [90]:
p30 = get_average(evaluation, 30,  "precision" )
r30 = get_average(evaluation, 30,  "recall" )

print("Hasil Evaluasi :")
print(f"p30 : {p30}")
print(f"r30 : {r30}")

Hasil Evaluasi :
p30 : 0.11666666666666665
r30 : 0.42969877344877344


In [100]:
current_query = "saham goto naik"
corpus_embeddings = X_tfidf
query_embeddings = tfidf.transform([current_query])

rank = cosine_similarity(corpus_embeddings, query_embeddings).reshape((-1))
rank_tfidf = master.copy()
rank_tfidf['sim'] = rank
rank_tfidf_sorted = rank_tfidf.sort_values(by=['sim'], ascending=False)
corpus_id = rank_tfidf_sorted.index.to_list()



In [102]:
hasil = [master["clean_final"][indeks] for indeks in corpus_id[:30]]

In [104]:
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(hasil)
feature_names = vectorizer.get_feature_names_out()
avg_tfidf_scores = tfidf_matrix.mean(axis=0).tolist()[0]
top_words_indices = sorted(range(len(avg_tfidf_scores)), key=lambda i: avg_tfidf_scores[i], reverse=True)

extends_query = []
for word_index in top_words_indices[5:15]:
    word = feature_names[word_index]
    score = avg_tfidf_scores[word_index]
    extends_query.append(word)

In [105]:
extends_query

['triliun',
 'miliar',
 'harga',
 'level',
 'jual',
 'kuartal',
 'walujo',
 'beli',
 'sekuritas',
 'investor']

In [108]:
query2 = " ".join(extends_query)

query3 = current_query + " " + query2

In [109]:
query3

'saham goto naik triliun miliar harga level jual kuartal walujo beli sekuritas investor'

In [114]:
def query_expension_tf_idf(query):
  corpus_embeddings = X_tfidf
  query_embeddings = tfidf.transform([query])

  rank = cosine_similarity(corpus_embeddings, query_embeddings).reshape((-1))
  rank_tfidf = master.copy()
  rank_tfidf['sim'] = rank
  rank_tfidf_sorted = rank_tfidf.sort_values(by=['sim'], ascending=False)
  corpus_id = rank_tfidf_sorted.index.to_list()

  hasil = [master["clean_final"][indeks] for indeks in corpus_id[:30]]
  vectorizer = TfidfVectorizer()
  tfidf_matrix = vectorizer.fit_transform(hasil)
  feature_names = vectorizer.get_feature_names_out()
  avg_tfidf_scores = tfidf_matrix.mean(axis=0).tolist()[0]
  top_words_indices = sorted(range(len(avg_tfidf_scores)), key=lambda i: avg_tfidf_scores[i], reverse=True)

  extends_query = []
  for word_index in top_words_indices[5:8]:
      word = feature_names[word_index]
      score = avg_tfidf_scores[word_index]
      extends_query.append(word)

  query2 = " ".join(extends_query)

  query3 = query + " " + query2

  return query3

In [115]:
def get_evaluation_TFIDF_qe(df, df_query, list_panjang):
  all_evaluation = []
  for i in range(len(df_query)):
      current_query = df_query["query"][i]
      query_exp = query_expension_tf_idf(current_query )
      corpus_embeddings = X_tfidf
      query_embeddings = tfidf.transform([query_exp])

      rank = cosine_similarity(corpus_embeddings, query_embeddings).reshape((-1))
      rank_tfidf = master.copy()
      rank_tfidf['sim'] = rank
      rank_tfidf_sorted = rank_tfidf.sort_values(by=['sim'], ascending=False)
      corpus_id = rank_tfidf_sorted.index.to_list()



      id_relevant = []
      for id in corpus_id[:max(list_panjang)]:
        id_relevant.append(df.loc[id]["Unnamed: 0"])

      list_relevant1 = df_query["id_query"][i].split(",")
      list_relevant = list(map(str.strip, list_relevant1))

      result_evaluation = {
          "query" : query_exp
      }

      print(id_relevant)
      print(list_relevant)

      for j in list_panjang :
          true_doc = 0

          for id in id_relevant[:j]:
            if str(id) in list_relevant:
              true_doc += 1

          p = true_doc/j
          recall = true_doc/len(list_relevant)
          result_evaluation[f"p{j}"] = p
          result_evaluation[f"r{j}"] = recall

      all_evaluation.append(result_evaluation)

      print(result_evaluation)
      print("======================================================================================")

  return all_evaluation

In [116]:
evaluation = get_evaluation_TFIDF_qe(master, eval_query, [30])

[5334, 3989, 3928, 928, 765, 2753, 4587, 9301, 4161, 9315, 9358, 4334, 1427, 4692, 5321, 5511, 9777, 5339, 5335, 3984, 913, 5492, 5315, 10065, 4367, 990, 9422, 2890, 5278, 2122]
['767', '765', '5224', '1425']
{'query': 'RUU APBN mulyani sri perintah', 'p30': 0.03333333333333333, 'r30': 0.25}
[2482, 5243, 1822, 8099, 5273, 635, 9027, 3393, 1759, 5267, 3877, 3975, 3907, 6272, 7820, 1154, 8140, 5210, 3014, 6950, 5557, 7601, 7444, 4191, 2512, 3736, 10080, 6465, 1312, 5082]
['568', '806', '756', '1144', '1930', '4144', '2737', '9577', '9027', '9282', '2482']
{'query': 'krisis pangan rp persen turun', 'p30': 0.06666666666666667, 'r30': 0.18181818181818182}
[8318, 1177, 8373, 4658, 8163, 4597, 4858, 4728, 8477, 1318, 8154, 1324, 1309, 1109, 8648, 4644, 4735, 1289, 8215, 8211, 1345, 8378, 8568, 4759, 5268, 8218, 4740, 8233, 8220, 8317]
['3377', '1287', '1282']
{'query': 'ekspor impor pertanian dagang us miliar', 'p30': 0.0, 'r30': 0.0}
[908, 5335, 9422, 1010, 5766, 6024, 6110, 1423, 1486, 6389

In [117]:
p30 = get_average(evaluation, 30,  "precision" )
r30 = get_average(evaluation, 30,  "recall" )

print("Hasil Evaluasi :")
print(f"p30 : {p30}")
print(f"r30 : {r30}")

Hasil Evaluasi :
p30 : 0.07291666666666666
r30 : 0.2556592712842713
