# Library

In [2]:
import re
import string
import requests
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer

# Ekstrak & Clean Dokumen

In [3]:
def _clean_document(text):
  """Normalize one raw article string into lowercase ASCII words separated by single spaces."""
  text = re.sub(r'[^\x00-\x7F]+', ' ', text)  # runs of non-ASCII characters -> one space
  text = re.sub(r'@\w+', '', text)            # drop @mentions
  text = text.lower()
  text = re.sub(r'[%s]' % re.escape(string.punctuation), ' ', text)  # punctuation -> space
  text = re.sub(r'[0-9]', '', text)           # drop digits
  text = re.sub(r'\s{2,}', ' ', text)         # collapse repeated whitespace
  return text


def retrieve_docs_and_clean(timeout=10):
  """Scrape the popular articles from tekno.kompas.com and return their cleaned text.

  The front page is fetched, every link inside the ``most__wrap`` block is
  followed (with ``?page=all`` appended to the URL), and the ``<p>`` elements
  inside each article's ``read__content`` block are joined into one string
  per article. Each string is then normalized by ``_clean_document``.

  Args:
    timeout: Seconds passed to every ``requests.get`` call so a stalled
      connection cannot hang the notebook indefinitely.

  Returns:
    A list of cleaned article strings, one per article whose body could be
    found. Articles without a ``read__content`` block are skipped.

  Raises:
    requests.HTTPError: If any page responds with an error status.
    RuntimeError: If the front page has no ``most__wrap`` block.
  """
  # Collect the links to the popular articles listed on the front page
  response = requests.get('https://tekno.kompas.com/', timeout=timeout)
  response.raise_for_status()
  front_page = BeautifulSoup(response.content, 'html.parser')

  popular = front_page.find('div', {'class': 'most__wrap'})
  if popular is None:
    # find() returns None when the markup changes; fail with a clear message
    # instead of an AttributeError on None.
    raise RuntimeError("Could not find the 'most__wrap' block on https://tekno.kompas.com/")

  # href=True skips anchors that carry no link at all
  links = [a['href'] + '?page=all' for a in popular.find_all('a', href=True)]

  # Retrieve the paragraphs of every article
  documents = []
  for url in links:
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    article = BeautifulSoup(response.content, 'html.parser')

    content = article.find('div', {'class': 'read__content'})
    if content is None:
      # Page without the expected article body: nothing to index, skip it.
      continue
    documents.append(' '.join(p.text for p in content.find_all('p')))

  # Clean the paragraphs
  return [_clean_document(d) for d in documents]

# TF-IDF

In [5]:
# Scrape the articles and normalize their text
docs = retrieve_docs_and_clean()

# Fit the TF-IDF model; X is the sparse document-term matrix (one row per document)
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(docs)

# Transpose into a term-document frame: one row per vocabulary term,
# one column per document (default integer column labels)
df = pd.DataFrame(
    data=X.transpose().toarray(),
    index=vectorizer.get_feature_names_out(),
)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
above,0.0,0.0,0.0,0.034315,0.0,0.0,0.0,0.0,0.0,0.0
acara,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.023342,0.0,0.0
access,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02012,0.0
accounts,0.0,0.0,0.0,0.0,0.0,0.0,0.019155,0.0,0.0,0.0
ad,0.0,0.0,0.256347,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Menampilkan Dokumen

In [7]:
def get_similar_articles(q, df):
  """Print the documents most similar to a query, ranked by cosine similarity.

  The query is vectorized with the module-level fitted ``vectorizer`` and
  compared against every column of ``df``. Documents with a non-zero
  similarity are printed in descending order, each followed by its text
  taken from the module-level ``docs`` list.

  Args:
    q: Query string.
    df: Term-document DataFrame with one row per vocabulary term and one
      column per document. Column labels are used to index ``docs``.

  Returns:
    None. Results are written to stdout.
  """
  print("query:", q)
  print("Berikut artikel dengan nilai cosine similarity tertinggi: ")
  q_vec = vectorizer.transform([q]).toarray().reshape(df.shape[0],)
  q_norm = np.linalg.norm(q_vec)

  sim = {}
  # Iterate over the actual document columns instead of a hard-coded 10.
  for col in df.columns:
    doc_vec = df.loc[:, col].values
    # Cosine similarity divides by the PRODUCT of both norms; the parentheses
    # matter because `a / b * c` evaluates as `(a / b) * c`.
    denom = np.linalg.norm(doc_vec) * q_norm
    # A zero norm (query with no known terms, or an empty document) means
    # there is no overlap at all: report 0.0 rather than dividing by zero.
    sim[col] = np.dot(doc_vec, q_vec) / denom if denom > 0 else 0.0

  sim_sorted = sorted(sim.items(), key=lambda x: x[1], reverse=True)

  for k, v in sim_sorted:
    if v != 0.0:
      print("Nilai Similaritas:", v)
      print(docs[k])
      print()


# Example queries to run against the TF-IDF index
q1, q2, q3 = 'setting', 'iphone', 'postingan'

# Show the ranked results for each query, with a rule between consecutive result sets
for position, query in enumerate((q1, q2, q3)):
  if position > 0:
    print('-'*100)
  get_similar_articles(query, df)

query: setting
Berikut artikel dengan nilai cosine similarity tertinggi: 
Nilai Similaritas: 0.032567381806147065
kompas com kehabisan baterai ponsel memang menyebalkan apalagi kalau terjadi ketika sedang tidak memegang power bank atau charger sementara ada hal penting yang mesti dilakukan dengan perangkat tersebut untungnya sebagian besar smartphone modern yang beredar sekarang memiliki fitur power saving yang bisa diaktifkan agar baterai bisa tahan lebih lama di saat kritis baca juga cara mengatasi baterai tws yang eror tidak bisa mengecas di luar itu google mengatakan bahwa sebenarnya ada beberapa setting lain di smartphone android yang bisa anda atur untuk menghemat daya baterai berikut ini penjelasannya sebagaimana dihimpun kompastekno dari gizchina rabu kompas com oik yusuf pilihan untuk mengatur durasi waktu sebelum layar ponsel dimatikan otomatissmartphone secara otomatis akan mematikan layar setelah beberapa lama tidak dipakai jangka waktunya bisa diatur oleh pengguna mulai da