In [1]:
import re,string,requests
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer

In [7]:
def _clean_document(text):
    """Normalise one raw article string into lower-case ASCII words."""
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)   # runs of non-ASCII characters -> single space
    text = re.sub(r'@\w+', '', text)             # drop @mentions
    text = text.lower()
    text = re.sub(r'[%s]' % re.escape(string.punctuation), ' ', text)  # punctuation -> space
    text = re.sub(r'[0-9]', '', text)            # drop digits
    text = re.sub(r'\s{2,}', ' ', text)          # collapse whitespace runs left by the steps above
    return text


def _fetch_soup(url, timeout):
    """Download ``url`` and return its parsed HTML, raising on HTTP error statuses."""
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return BeautifulSoup(response.content, 'html.parser')


def retrieve_docs_and_clean(url='https://bola.kompas.com/', timeout=10):
    """Scrape the linked articles from a landing page and return cleaned texts.

    Every anchor inside the ``div.most__wrap`` element of ``url`` is followed
    (with ``?page=all`` appended to its href). For each article, the text of
    all ``<p>`` tags inside ``div.read__content`` is joined with spaces and
    then normalised: non-ASCII runs, @mentions, punctuation and digits are
    removed, the text is lower-cased and whitespace runs are collapsed.

    Progress information and the final cleaned list are printed.

    Parameters
    ----------
    url : str
        Landing page to read the article links from.
    timeout : float
        Per-request timeout in seconds, so an unresponsive server cannot hang
        the notebook indefinitely.

    Returns
    -------
    list of str
        One cleaned document per article that could be downloaded and
        contained at least one paragraph. Articles that fail are skipped
        with a printed notice.

    Raises
    ------
    requests.RequestException
        If the landing page itself cannot be downloaded.
    ValueError
        If the landing page has no ``div.most__wrap`` element.
    """
    landing = _fetch_soup(url, timeout)
    wrap = landing.find('div', {'class': 'most__wrap'})
    if wrap is None:
        raise ValueError(f"no <div class='most__wrap'> found at {url}")

    # Build the URLs without mutating the parsed tree; anchors lacking an
    # href are ignored instead of raising KeyError.
    links = []
    for anchor in wrap.find_all('a'):
        href = anchor.get('href')
        if href:
            links.append(href + '?page=all')

    print(f'Number of links is ({len(links)})')
    if len(links) > 1:  # guard: the page may expose fewer than two links
        print('second link is \n')
        print(links[1])
    print('===============================')

    documents = []
    for article_url in links:
        try:
            article = _fetch_soup(article_url, timeout)
        except requests.RequestException as exc:
            print(f'skipping {article_url}: request failed ({exc})')
            continue

        content = article.find('div', {'class': 'read__content'})
        sentences = [p.text for p in content.find_all('p')] if content is not None else []
        if not sentences:
            print(f'skipping {article_url}: no article text found')
            continue

        print(f'number of sentences is {len(sentences)} and first sentence is ({sentences[0]})')
        documents.append(' '.join(sentences))
    print('===============================')

    documents_clean = [_clean_document(document) for document in documents]

    print(documents_clean)
    return documents_clean


def get_similar_articles(q, df, fitted_vectorizer=None, documents=None):
    """Print every document whose cosine similarity to a query is non-zero.

    Parameters
    ----------
    q : str
        Query text; it is vectorised with ``fitted_vectorizer``.
    df : pandas.DataFrame
        Term-document weight matrix with one row per vocabulary term and one
        column per document.
    fitted_vectorizer : optional
        Object whose ``transform([q]).toarray()`` yields the query weights
        over the same vocabulary as the rows of ``df``. Defaults to the
        notebook-level ``vectorizer``.
    documents : sequence of str, optional
        Texts to print, positionally aligned with the columns of ``df``.
        Defaults to the notebook-level ``docs``.

    Returns
    -------
    None
        Results are printed, most similar document first.

    Raises
    ------
    ValueError
        If the query vector length differs from the number of rows of ``df``.
    """
    # Fall back to the notebook globals so existing two-argument calls keep working.
    if fitted_vectorizer is None:
        fitted_vectorizer = vectorizer
    if documents is None:
        documents = docs

    print("query:", q)
    print("Article with the highest cosine similarity value: ")

    q_vec = np.asarray(fitted_vectorizer.transform([q]).toarray(), dtype=float).reshape(-1)
    if q_vec.shape[0] != df.shape[0]:
        raise ValueError(
            f'query vector has {q_vec.shape[0]} terms but df has {df.shape[0]} rows'
        )
    print(f'QVec shape is ({q_vec.shape})')

    # Cosine similarity = dot / (|d| * |q|). The product of the norms must be
    # the divisor; `dot / |d| * |q|` would multiply by |q| instead and is only
    # right when every norm equals 1.
    doc_matrix = df.to_numpy(dtype=float)          # (n_terms, n_docs)
    dots = doc_matrix.T @ q_vec                    # one dot product per document
    denominators = np.linalg.norm(doc_matrix, axis=0) * np.linalg.norm(q_vec)
    # A zero-norm document or query has no direction: score it 0 instead of NaN,
    # which would otherwise slip through the `!= 0.0` filter below.
    scores = np.divide(dots, denominators, out=np.zeros_like(dots), where=denominators > 0)

    # Keys are column positions, so any number of documents works (not just 10).
    sim = {position: float(score) for position, score in enumerate(scores)}

    print(sim)
    sim_sorted = sorted(sim.items(), key=lambda item: item[1], reverse=True)

    for k, v in sim_sorted:
        if v != 0.0:
            print("Similarity Value:", v)
            print(documents[k])
            print()


In [3]:
# Scrape the articles once and keep the cleaned texts; later cells build the
# TF-IDF matrix from `docs` and print entries of it as search results.
docs = retrieve_docs_and_clean()

Number of links is (10)
second link is 

https://www.kompas.com/badminton/read/2021/07/26/13122808/rekap-hasil-badminton-olimpiade-tokyo-2-wakil-indonesia-tembus-perempat?page=all
number of sentences is 25 and first sentence is (KOMPAS.com - Indonesia sementara menjadi negara Asia Tenggara (ASEAN) dengan koleksi medali terbanyak hingga hari kedua Olimpiade Tokyo 2020, Minggu (25/7/2021) malam WIB.)
number of sentences is 23 and first sentence is (KOMPAS.com - Dua wakil Indonesia telah dipastikan menembus perempat final cabor bulu tangkis Olimpiade Tokyo 2020.)
number of sentences is 36 and first sentence is (TOKYO, KOMPAS.com - Total 12 atlet Indonesia akan bertanding pada hari ketiga Olimpiade Tokyo 2020, Senin (26/7/2021).)
number of sentences is 26 and first sentence is (KOMPAS.com - Perjalanan mulus menyertai langkah atlet-atlet Indonesia yang turun di cabang olahraga bulu tangkis Olimpiade Tokyo 2020.)
number of sentences is 29 and first sentence is (KOMPAS.com - Ada rasa bersalah

In [4]:
# Fit a TF-IDF model on the cleaned articles. `X` is a sparse
# (n_documents, n_terms) matrix; the fitted `vectorizer` is reused later to
# embed queries in the same vocabulary.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(docs)
X.shape

(10, 957)

In [5]:
# Term-document matrix: X is (documents x terms), so transpose it to get one
# row per vocabulary term and one column per document.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer `get_feature_names_out()` and fall back only on old installations.
feature_names = (
    vectorizer.get_feature_names_out()
    if hasattr(vectorizer, 'get_feature_names_out')
    else vectorizer.get_feature_names()
)
df = pd.DataFrame(X.T.toarray(), index=feature_names)
print(df.shape)
df.head(20)

(957, 10)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
aaron,0.0,0.0,0.061563,0.0,0.0,0.0,0.0,0.103538,0.0,0.0
ada,0.0,0.0,0.023943,0.0,0.017491,0.036119,0.0,0.0,0.0,0.03948
adalah,0.018,0.0,0.035301,0.020347,0.012894,0.026627,0.0,0.03958,0.02077,0.0
adapun,0.032784,0.076198,0.016074,0.018529,0.023485,0.036373,0.0,0.0,0.018915,0.013252
afp,0.0,0.0,0.030781,0.035483,0.0,0.0,0.0,0.0,0.0,0.0
afrika,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.029853
agar,0.0,0.057217,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
agatha,0.0,0.0,0.072419,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ahsan,0.0,0.0,0.134651,0.093131,0.0,0.0,0.0,0.120778,0.0,0.0
air,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04261,0.0


In [6]:
# Inspect the last rows of the term-document matrix (end of the vocabulary index).
df.tail(20)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
wang,0.0,0.114434,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
watanabe,0.0,0.075667,0.047886,0.0,0.0,0.0,0.0,0.026845,0.028175,0.0
wave,0.0,0.0,0.0,0.0,0.0,0.054625,0.0,0.0,0.0,0.0
wib,0.054001,0.083672,0.017651,0.0,0.012894,0.013314,0.01992,0.0,0.041541,0.0
wijaya,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.040599,0.0,0.0
wilson,0.0,0.0,0.0,0.0,0.0,0.027312,0.0,0.0,0.0,0.0
win,0.0,0.0,0.0,0.0,0.0,0.0,0.040865,0.0,0.0,0.0
windy,0.054927,0.0,0.0,0.0,0.039347,0.040626,0.0,0.0,0.0,0.0
wing,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.034513,0.036222,0.0
women,0.0,0.0,0.0,0.0,0.0,0.054625,0.0,0.0,0.0,0.0


In [9]:
# Single-term query; 'windy' is one of the vocabulary terms listed in the matrix index.
q = 'windy'
get_similar_articles(q, df)


query: windy
Article with the highest cosine similarity value: 
QVec shape is ((957,))
{0: 0.05492675654649779, 1: 0.0, 2: 0.0, 3: 0.0, 4: 0.03934680877143973, 5: 0.040626079737647036, 6: 0.0, 7: 0.0, 8: 0.0, 9: 0.0}
Similarity Value: 0.05492675654649779
kompas com indonesia sementara menjadi negara asia tenggara asean dengan koleksi medali terbanyak hingga hari kedua olimpiade tokyo minggu malam wib indonesia sejauh ini sudah mengoleksi dua medali dengan rincian satu perak dan satu perunggu medali kedua indonesia dipersembahkan oleh atlet angkat besi senior eko yuli irawan pada minggu sore wib turun di kelas kg putra eko yuli irawan sukses meraih medali perak dengan total angkatan kg rincian dari total angkatan terbaik eko yuli irawan adalah kg snatch dan kg clean jerk meski medali emas masih luput eko yuli irawan berhasil mengukir sejarah berkat keberhasilan mendapatkan perak di olimpiade tokyo baca juga daftar perolehan medali olimpiade tokyo posisi indonesia tak berubah dapatkan in

In [10]:
# Single-term query for a word that has non-zero weight in most documents.
q = 'adalah'
get_similar_articles(q, df)

query: adalah
Article with the highest cosine similarity value: 
QVec shape is ((957,))
{0: 0.018000187484142543, 1: 0.0, 2: 0.03530141158895425, 3: 0.020346883493442214, 4: 0.012894443060533826, 5: 0.02662735242356117, 6: 0.0, 7: 0.039580462674494124, 8: 0.020770431498533408, 9: 0.0}
Similarity Value: 0.039580462674494124
kompas com babak penyisihan grup bulu tangkis olimpiade tokyo berlanjut hari ini senin di musashino forest sport plaza empat wakil indonesia dijadwalkan turun gelanggang pada hari ini mereka tersebar di tiga nomor berbeda yakni ganda putra ganda putri dan ganda campuran keempatnya adalah marcus fernaldi gideon kevin sanjaya sukamuljo mohammad ahsan hendra setiawan greysia polii apriyani rahayu dan praveen jordan melati daeva oktavianti praveen melati yang paling pertama bermain mereka akan meladeni kekuatan wakil tuan rumah yuta watanabe arisa higashino pada laga ketiga grup c pada dua pertandingan sebelumnya praveen melati yang menempati unggulan keempat olimpiade t

In [12]:
# Single-term query for a word that has non-zero weight in only one document.
q = 'wang'
get_similar_articles(q, df)

query: wang
Article with the highest cosine similarity value: 
QVec shape is ((957,))
{0: 0.0, 1: 0.11443353827452567, 2: 0.0, 3: 0.0, 4: 0.0, 5: 0.0, 6: 0.0, 7: 0.0, 8: 0.0, 9: 0.0}
Similarity Value: 0.11443353827452567
kompas com dua wakil indonesia telah dipastikan menembus perempat final cabor bulu tangkis olimpiade tokyo adapun wakil indonesia yang menembus perempat final ialah praveen jordan melati daeva oktavianti dan marcus fernaldi gideon kevin sanjaya sukamuljo praveen melati tampil lebih dulu di nomor ganda campuran pada senin pagi wib kali ini praveen melati berhadapan dengan yuta watanabe arisa higashino jepang di laga terakhir grup c sayang pasangan nomor dunia itu kalah di laga pamungkas grup c setelah kalah dua gim langsung baca juga hasil badminton olimpiade tokyo praveen melati takluk dari wakil jepang bertanding di musashino forest sport plaza tokyo praveen melati kalah dengan skor dalam laga yang berlangsung selama menit dapatkan informasi inspirasi dan insight di e