In [1]:
import re
import string
import requests
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer

def _clean_document(text):
    """Normalize one raw article string before TF-IDF vectorization.

    Steps, in order: replace runs of non-ASCII characters with a space, drop
    ``@mention`` tokens, lowercase, replace ASCII punctuation with spaces,
    delete digits, and collapse runs of two or more whitespace characters
    into one space. A single leading or trailing space is left untouched.
    """
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)
    text = re.sub(r'@\w+', '', text)
    text = text.lower()
    text = re.sub(r'[%s]' % re.escape(string.punctuation), ' ', text)
    text = re.sub(r'[0-9]', '', text)
    return re.sub(r'\s{2,}', ' ', text)


def retrieve_docs_and_clean():
    """Scrape the popular-news articles linked from bola.kompas.com and clean them.

    The index page is fetched, every link inside its ``most__wrap`` block is
    followed, and the ``<p>`` texts found in each article's ``read__content``
    block are joined with spaces and normalized by ``_clean_document``.

    Returns:
        list[str]: one cleaned string per article page that contained a
        ``read__content`` block, in the order the links appear on the index.
        Pages without that block are skipped instead of aborting the run.

    Raises:
        requests.HTTPError: a page answered with an HTTP error status.
        RuntimeError: the index page has no ``most__wrap`` block, so there is
            nothing to scrape (the site layout probably changed).
    """
    # requests waits forever by default; bound every call so a stalled
    # connection cannot hang the notebook.
    timeout = 10  # seconds

    # Collect the links to the popular articles.
    response = requests.get('https://bola.kompas.com/', timeout=timeout)
    response.raise_for_status()
    index_page = BeautifulSoup(response.content, 'html.parser')

    popular = index_page.find('div', {'class': 'most__wrap'})
    if popular is None:
        raise RuntimeError(
            "no <div class='most__wrap'> found on https://bola.kompas.com/")

    links = []
    for anchor in popular.find_all('a'):
        href = anchor.get('href')
        if not href:
            # Anchor without a target: nothing to follow.
            continue
        # Presumably page=all asks for the whole article on one page; keep an
        # existing query string intact when adding it.
        separator = '&' if '?' in href else '?'
        links.append(href + separator + 'page=all')

    # Retrieve the paragraphs of every linked article.
    documents = []
    for url in links:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        article_page = BeautifulSoup(response.content, 'html.parser')

        content = article_page.find('div', {'class': 'read__content'})
        if content is None:
            # Not a regular article layout; skip it rather than crash.
            continue
        paragraphs = [paragraph.text for paragraph in content.find_all('p')]
        documents.append(' '.join(paragraphs))

    # Clean the paragraphs.
    return [_clean_document(document) for document in documents]

In [4]:
# Fetch and clean the article texts (this performs live HTTP requests).
docs = retrieve_docs_and_clean()

# Fit TF-IDF on the cleaned articles. fit_transform yields one row per document
# and one column per term; the term-by-document orientation is only obtained
# later, when X is transposed to build the DataFrame.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(docs)



In [6]:
# Build the term-document frame: X is transposed so that each row is a
# vocabulary term (labelled via get_feature_names_out) and each column is one
# article, then preview the first rows and the overall (terms, articles) shape.
df = pd.DataFrame(X.T.toarray(), index=vectorizer.get_feature_names_out())
print(df.head())
print(df.shape)

                 0         1         2         3         4    5         6  \
absen     0.000000  0.000000  0.000000  0.000000  0.000000  0.0  0.023482   
acara     0.053817  0.000000  0.000000  0.000000  0.000000  0.0  0.000000   
ada       0.000000  0.000000  0.034877  0.026203  0.000000  0.0  0.027888   
adalah    0.000000  0.023516  0.057260  0.021510  0.053979  0.0  0.022893   
adaptasi  0.000000  0.000000  0.000000  0.000000  0.000000  0.0  0.000000   

                 7         8         9  
absen     0.000000  0.000000  0.000000  
acara     0.000000  0.000000  0.030588  
ada       0.035480  0.000000  0.021367  
adalah    0.009708  0.000000  0.035080  
adaptasi  0.000000  0.056255  0.000000  
(982, 10)


In [8]:
# `docs`, `vectorizer`, `X` and `df` were already built by the cells above.
# Scraping and re-fitting here would only repeat the network requests and, since
# the popular-articles list is live, could silently swap in a different set of
# articles. Just display the existing term-document frame.
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
absen,0.0,0.0,0.0,0.0,0.0,0.0,0.023482,0.0,0.0,0.0
acara,0.053817,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.030588
ada,0.0,0.0,0.034877,0.026203,0.0,0.0,0.027888,0.03548,0.0,0.021367
adalah,0.0,0.023516,0.05726,0.02151,0.053979,0.0,0.022893,0.009708,0.0,0.03508
adaptasi,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.056255,0.0


In [16]:
def get_similar_articles(q, df):
    """Print the articles most similar to a query, ranked by cosine similarity.

    Relies on two module-level names: ``vectorizer`` (the fitted vectorizer
    used to turn the query into a term vector) and ``docs`` (the article
    texts, where ``docs[i]`` belongs to the i-th column of ``df``).

    Args:
        q: query string.
        df: term-document frame; one row per vocabulary term, in the same
            order as the vectorizer's features, and one column per article.

    Returns:
        None. For every article with a non-zero similarity, the score and the
        article text are printed, highest score first. Nothing beyond the two
        header lines is printed when the query shares no term with any article.
    """
    print("query:", q)
    print("Berikut artikel dengan nilai cosine similarity tertinggi: ")
    q_vec = vectorizer.transform([q]).toarray().reshape(df.shape[0],)
    q_norm = np.linalg.norm(q_vec)

    sim = {}
    # Iterate over however many articles the frame holds, by position, so the
    # index lines up with `docs` regardless of the corpus size.
    for i in range(df.shape[1]):
        doc_vec = df.iloc[:, i].values
        # cos(a, b) = a.b / (|a| * |b|); the product of the norms must be
        # parenthesised, otherwise the result is (a.b / |a|) * |b|.
        denom = np.linalg.norm(doc_vec) * q_norm
        # A zero vector (out-of-vocabulary query or empty article) has no
        # direction; report similarity 0 instead of dividing by zero.
        sim[i] = np.dot(doc_vec, q_vec) / denom if denom else 0.0

    sim_sorted = sorted(sim.items(), key=lambda item: item[1], reverse=True)

    for k, v in sim_sorted:
        if v != 0.0:
            print("Nilai Similaritas:", v)
            print(docs[k])
            print()


# Example queries, answered one after another.
q1, q2, q3 = 'barcelona', 'gareth bale', 'shin tae yong'

for position, query in enumerate((q1, q2, q3)):
    if position > 0:
        # Divider between the result listings of consecutive queries.
        print('-'*100)
    get_similar_articles(query, df)

query: barcelona
Berikut artikel dengan nilai cosine similarity tertinggi: 
Nilai Similaritas: 0.044126467433356555
 kompas com seremoni penghargaan the best fifa football awards telah digelar di paris perancis pada selasa dini hari wib lionel messi menjadi salah satu nama yang masuk dalam daftar pemenang the best fifa football awards dia masuk sebagai pemain terbaik di kategori putra sebelum resmi terpilih menjadi pemain terbaik fifa lionel messi bersaing dengan dua kandidat lain kedua kandidat yang meramaikan persaingan gelar pemain terbaik fifa bersama lionel messi adalah karim benzema dan kylian mbappe baca juga lionel messi pemain terbaik fifa ungguli mbappe dan benzema karim benzema dan kylian mbappe menjadi andalan di klubnya masing masing bahkan kylian mbappe juga menjadi pemain paling berkontribusi dalam perjalanan timnas perancis di piala dunia qatar namun sinar karim benzema dan kylian mbappe belum cukup untuk mengalahkan lionel messi lionel messi berhasil membawa timnas arg