In [1]:
import re,string,requests
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
def _fetch_soup(url, timeout):
    """Download ``url`` and return its parsed HTML tree.

    Raises ``requests.HTTPError`` on a 4xx/5xx answer so an error page is
    never parsed as if it were real content, and ``requests.Timeout`` when the
    server does not answer within ``timeout`` seconds.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return BeautifulSoup(response.content, 'html.parser')


def _clean_text(text):
    """Normalise one raw article text for vectorisation.

    Steps, in order: non-ASCII runs become a space, ``@word`` tokens are
    removed, the text is lowercased, ASCII punctuation becomes a space, digits
    are removed, and runs of two or more whitespace characters collapse to a
    single space.
    """
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)
    text = re.sub(r'@\w+', '', text)
    text = text.lower()
    text = re.sub(r'[%s]' % re.escape(string.punctuation), ' ', text)
    text = re.sub(r'[0-9]', '', text)
    text = re.sub(r'\s{2,}', ' ', text)
    return text


def retrieve_docs_and_clean(url='https://bola.kompas.com/', timeout=30):
    """Scrape the articles linked from a listing page and return their cleaned texts.

    Every ``<a href>`` inside the ``div.most__wrap`` container of ``url`` is
    followed (with ``?page=all`` appended), the ``<p>`` texts inside the
    article's ``div.read__content`` container are joined with spaces, and the
    result is normalised by ``_clean_text``. Progress is printed along the way.

    Parameters
    ----------
    url : str
        Listing page to start from.
    timeout : float
        Seconds to wait for each HTTP request before giving up.

    Returns
    -------
    list of str
        One cleaned text per article that had content. Articles without a
        ``div.read__content`` container or without paragraphs are skipped
        (a notice is printed), so the list may be shorter than the link count.

    Raises
    ------
    requests.RequestException
        If any page cannot be downloaded.
    ValueError
        If the listing page has no ``div.most__wrap`` container.
    """
    listing = _fetch_soup(url, timeout)
    link_box = listing.find('div', {'class': 'most__wrap'})
    if link_box is None:
        raise ValueError(f"no <div class='most__wrap'> found at {url}; the page layout may have changed")

    # href=True skips anchors without a target instead of raising KeyError.
    links = [anchor['href'] + '?page=all' for anchor in link_box.find_all('a', href=True)]

    print(f'Number of links is ({len(links)})')
    if len(links) > 1:
        print('second link is \n')
        print(links[1])
    print('===============================')

    documents = []
    for article_url in links:
        article = _fetch_soup(article_url, timeout)
        content = article.find('div', {'class': 'read__content'})
        paragraphs = [p.text for p in content.find_all('p')] if content is not None else []
        if not paragraphs:
            # Nothing to index; skipping keeps one odd page from aborting the whole scrape.
            print(f'skipping {article_url}: no article paragraphs found')
            continue
        print(f'number of sentences is {len(paragraphs)} and first sentence is ({paragraphs[0]})')
        documents.append(' '.join(paragraphs))
    print('===============================')

    documents_clean = [_clean_text(document) for document in documents]

    print(documents_clean)
    return documents_clean


def get_similar_articles(q, df, vectorizer=None, docs=None):
    """Print the documents ranked by cosine similarity to the query ``q``.

    Parameters
    ----------
    q : str
        Free-text query.
    df : pandas.DataFrame
        Term-document matrix: one row per vocabulary term, one column per
        document.
    vectorizer : optional
        Object whose ``transform([q]).toarray()`` yields the query weights
        over the same vocabulary as the rows of ``df``. Defaults to the
        notebook-level ``vectorizer``.
    docs : sequence of str, optional
        Texts to print for the matches, positionally aligned with the columns
        of ``df``. Defaults to the notebook-level ``docs``.

    Returns
    -------
    None
        Results are printed: the similarity of every document, then each
        document with a non-zero similarity, best match first.

    Raises
    ------
    NameError
        If ``vectorizer`` / ``docs`` are neither passed nor defined at
        notebook level.
    """
    # Fall back to the notebook globals so existing two-argument calls keep working.
    if vectorizer is None or docs is None:
        notebook_ns = globals()
        try:
            if vectorizer is None:
                vectorizer = notebook_ns['vectorizer']
            if docs is None:
                docs = notebook_ns['docs']
        except KeyError as missing:
            raise NameError(
                f"name {missing} is not defined; pass it explicitly or run the cell that creates it"
            ) from None

    print("query:", q)
    print("Article with the highest cosine similarity value: ")
    q_vec = vectorizer.transform([q]).toarray().reshape(df.shape[0],)
    print(f'QVec shape is ({q_vec.shape})')

    # Score every column at once instead of assuming exactly ten documents.
    doc_matrix = df.to_numpy(dtype=float)
    dots = q_vec @ doc_matrix
    # cosine = dot / (|d| * |q|); the product of the norms must be the divisor.
    norms = np.linalg.norm(doc_matrix, axis=0) * np.linalg.norm(q_vec)
    # A zero norm (empty document or query outside the vocabulary) means
    # "no similarity", not NaN.
    scores = np.divide(dots, norms, out=np.zeros_like(dots, dtype=float), where=norms != 0)
    sim = {position: float(score) for position, score in enumerate(scores)}

    print(sim)
    sim_sorted = sorted(sim.items(), key=lambda item: item[1], reverse=True)

    for k, v in sim_sorted:
        if v != 0.0:
            print("Similarity Value:", v)
            print(docs[k])
            print()


In [3]:
# Scrape and clean the articles once. `docs` is reused below to fit the TF-IDF
# model and is read by get_similar_articles when it prints the matching texts.
docs = retrieve_docs_and_clean()


Number of links is (10)
second link is 

https://www.kompas.com/sports/read/2023/11/28/21055618/hasil-piala-dunia-u17-2023-kalahkan-mali-perancis-tantang-jerman-di-final?page=all
number of sentences is 18 and first sentence is (KOMPAS.com - Ketua Umum PSSI, Erick Thohir, mengatakan bahwa Piala Dunia U17 2023 menjadi sebuah kesempatan bagi Indonesia untuk membangun dasar kuat sepak bola.)
number of sentences is 27 and first sentence is ()
number of sentences is 22 and first sentence is ()
number of sentences is 22 and first sentence is (KOMPAS.com – Irfan Bachdim menjalani debut yang manis bersama tim barunya, Persik Kediri pada laga pekan ke-20 Liga 1 2023-2024.)
number of sentences is 33 and first sentence is ()
number of sentences is 24 and first sentence is ()
number of sentences is 17 and first sentence is (KOMPAS.com - Argentina akan bersua Jerman pada babak semifinal Piala Dunia U17 2023. Link live streaming Argentina vs Jerman termuat dalam artikel ini.)
number of sentences is 2

In [4]:
# Fit TF-IDF on the cleaned articles. `vectorizer` is kept at notebook level
# because get_similar_articles uses it to project queries onto the same vocabulary.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(docs)
# Displayed as (number of documents, vocabulary size).
X.shape

(10, 1255)

In [5]:
# Transpose into a term-document table: one row per vocabulary term (used as
# the index), one column per article, each cell holding the term's TF-IDF weight.
df = pd.DataFrame(X.T.toarray(), index=vectorizer.get_feature_names_out())
print(df.shape)
df.head(20)

(1255, 10)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
ac,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.032532,0.0,0.128822
acuna,0.0,0.0,0.0,0.0,0.063899,0.0,0.0,0.0,0.0,0.0
ada,0.0,0.022336,0.0,0.0,0.0,0.051512,0.0,0.025305,0.0,0.02004
adalah,0.092792,0.018134,0.027891,0.0,0.0,0.020911,0.02865,0.061632,0.0,0.0
adapun,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.030308
adeyemi,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.030308
adidas,0.0,0.0,0.0,0.0,0.0,0.077904,0.0,0.0,0.0,0.0
adli,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.030308
adu,0.0,0.025123,0.0,0.0,0.11881,0.0,0.0,0.0,0.070032,0.0
afrika,0.0,0.0,0.051955,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# Peek at the last rows of the term-document table.
df.tail(20)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
video,0.0,0.03378,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
villalba,0.0,0.0,0.0,0.0,0.095849,0.0,0.0,0.0,0.0,0.0
von,0.0,0.0,0.0,0.0,0.063899,0.0,0.0,0.0,0.0,0.0
vs,0.042129,0.0,0.101303,0.0335,0.015574,0.0,0.182104,0.0,0.02295,0.073869
wakil,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.030308
waktu,0.0,0.020059,0.0,0.061215,0.037945,0.023131,0.0,0.0,0.027958,0.0
waktunya,0.0,0.0,0.0,0.0,0.0,0.038952,0.0,0.0,0.0,0.0
wasit,0.0,0.057432,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.025764
wayan,0.0,0.0,0.0,0.068724,0.0,0.0,0.0,0.0,0.0,0.0
wib,0.0,0.020059,0.030852,0.0,0.0,0.0,0.031691,0.0,0.027958,0.017997


In [7]:
# NOTE(review): the recorded output is 0.0 for every article, so nothing is
# listed — presumably 'windy' is not in the fitted vocabulary; confirm.
q = 'windy'
get_similar_articles(q, df)


query: windy
Article with the highest cosine similarity value: 
QVec shape is ((1255,))
{0: 0.0, 1: 0.0, 2: 0.0, 3: 0.0, 4: 0.0, 5: 0.0, 6: 0.0, 7: 0.0, 8: 0.0, 9: 0.0}


In [8]:
# NOTE(review): same outcome as the previous query (all similarities 0.0) —
# presumably 'wang' is not in the fitted vocabulary either; confirm.
q = 'wang'
get_similar_articles(q, df)

query: wang
Article with the highest cosine similarity value: 
QVec shape is ((1255,))
{0: 0.0, 1: 0.0, 2: 0.0, 3: 0.0, 4: 0.0, 5: 0.0, 6: 0.0, 7: 0.0, 8: 0.0, 9: 0.0}
