In [30]:
%pip install beautifulsoup4 scikit-learn pandas

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [31]:
import json
import pandas as pd
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [32]:
# Load the product records from the JSON sample file.
# When the file is absent, report it and continue with an empty list.
try:
    with open('lab4_docs/tesco_sample.json', 'r', encoding='utf-8') as json_file:
        data = json.loads(json_file.read())
except FileNotFoundError:
    data = []
    print("File not found")
else:
    print(f"Products in dataset: {len(data)}.")

Products in dataset: 266.


In [33]:
# Parallel lists: product_names[k] is the name of the product whose
# tag-free description is cleaned_descriptions[k].
product_names = []
cleaned_descriptions = []

print("Procesare...")

for item in data:
    # dict.get(key, default) only uses the default when the key is MISSING.
    # A key that is present with a JSON null comes back as None, which
    # BeautifulSoup rejects, so coalesce falsy values with `or` instead.
    raw_desc = item.get('description') or ''
    name = item.get('name') or 'Unknown Product'

    # Strip HTML tags; the space separator keeps text from adjacent tags
    # from being glued together into one token.
    text_desc = BeautifulSoup(raw_desc, "html.parser").get_text(separator=" ")

    product_names.append(name)
    cleaned_descriptions.append(text_desc)

print("Text curatat.")

Procesare...
Text curatat.


In [34]:
# Set up the TF-IDF vectorizer, dropping English stop words.
vectorizer = TfidfVectorizer(stop_words='english')

# Fit on the cleaned descriptions and encode them as a matrix
# (one row per description).
tfidf_matrix = vectorizer.fit_transform(cleaned_descriptions)

print("Matrix created. Dimensions: {}".format(tfidf_matrix.shape))

Matrix created. Dimensions: (266, 4215)


In [35]:
# Pairwise cosine similarity between the rows of the TF-IDF matrix;
# entry [i][j] is the similarity of description i to description j.
cosine_sim_matrix = cosine_similarity(tfidf_matrix)

In [36]:
def _most_similar_pair(sim_matrix):
    """Return the index pair (i, j), i < j, with the highest similarity.

    Only the strict upper triangle is scanned, so the diagonal
    (an item compared with itself) and the mirrored (j, i) entries are
    never considered.

    Args:
        sim_matrix: square matrix indexable as ``sim_matrix[i][j]``.

    Returns:
        The ``(i, j)`` tuple of the best-scoring pair, or ``None`` when the
        matrix has fewer than two rows and therefore no pair exists.
    """
    size = len(sim_matrix)
    upper_triangle = ((i, j) for i in range(size) for j in range(i + 1, size))
    # max() returns the first maximal element, so ties resolve to the
    # earliest pair in row-major order, same as a strict `>` scan.
    return max(
        upper_triangle,
        key=lambda pair: sim_matrix[pair[0]][pair[1]],
        default=None,
    )


best_pair = _most_similar_pair(cosine_sim_matrix)

if best_pair is None:
    # No distinct pair exists; do not report product 0 paired with itself.
    max_score = 0
    print("Not enough products to compare (need at least two).")
else:
    idx1, idx2 = best_pair
    max_score = cosine_sim_matrix[idx1][idx2]

    # NOTE(review): the recorded run reports two entries with the same name
    # at score 1.0000, which looks like duplicated catalogue entries. If a
    # non-trivial pair is wanted, deduplicate the products first.
    print(f"scor similaritate maxim: {max_score:.4f}")
    print(f"Prod 1: {product_names[idx1]}")
    print(f"Prod 2: {product_names[idx2]}")

scor similaritate maxim: 1.0000
Prod 1: Unibond Sealant Re-New
Prod 2: Unibond Sealant Re-New


In [38]:
# Write the similarity matrix to a text file: one matrix row per line,
# values tab-separated and rounded to four decimals.
with open("./lab4_docs/similarity_matrix.txt", "w", encoding="utf-8") as out_file:
    out_file.writelines(
        "\t".join(format(cell, ".4f") for cell in sim_row) + "\n"
        for sim_row in cosine_sim_matrix
    )