# In [63]:
import numpy as np
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import re
from collections import defaultdict
import json
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from gensim.corpora import Dictionary
from gensim.models import TfidfModel
from gensim.similarities import SparseMatrixSimilarity
import random
import pandas as pd

# In [66]:
# Load datasets
# Article records; later code reads each record's "text" and "title" keys.
with open("./intermediate_data/article_data.json", "r", encoding="utf-8") as article_file:
    articles = json.loads(article_file.read())

# Raw product export; the product records are stored under "categories".
with open("./intermediate_data/digikey_product_data.json", "r", encoding="utf-8") as product_file:
    raw_data = json.loads(product_file.read())

# Fall back to an empty product list when the "categories" key is absent.
products = raw_data.get("categories", [])

# Preprocess function
def preprocess(text):
    """Tokenize ``text`` and drop stopwords.

    The text is split into tokens by gensim's ``simple_preprocess``; every
    token that appears in gensim's ``STOPWORDS`` set is discarded.

    Returns:
        The remaining tokens as a list, in their original order.
    """
    kept_tokens = []
    for word in simple_preprocess(text):
        if word in STOPWORDS:
            continue
        kept_tokens.append(word)
    return kept_tokens

# Preprocess article texts.
# Keep exactly one token list per article (an empty list when the article has
# no text) so that row i of the similarity index always corresponds to
# articles[i]. Skipping text-less articles instead would shift every later row
# and make the article/product split below cut at the wrong position.
# An empty token list yields an empty vector, i.e. similarity 0 to any query.
# NOTE: these empty placeholders still count as documents when the TF-IDF
# statistics are fitted.
article_texts = [preprocess(article.get("text") or "") for article in articles]

# Preprocess product descriptions (combine name + category)
product_texts = [preprocess(f"{p['name']} {p.get('category', '')}") for p in products]

# Combine all for dictionary: articles first, then products. The split below
# relies on this ordering.
all_texts = article_texts + product_texts

# Create dictionary and corpus
dictionary = Dictionary(all_texts)
corpus = [dictionary.doc2bow(text) for text in all_texts]

# Build TF-IDF model
tfidf = TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

# Split article and product vectors at the number of article documents that
# were actually placed in the corpus.
num_article_docs = len(article_texts)
article_tfidf_vectors = corpus_tfidf[:num_article_docs]
product_tfidf_vectors = corpus_tfidf[num_article_docs:]

# Create similarity index for ARTICLES (reversed): products are the queries,
# articles are the indexed documents.
article_index = SparseMatrixSimilarity(article_tfidf_vectors, num_features=len(dictionary))

#########################

# 1. Keyword Matching – Find articles that mention this product
def find_articles_mentioning_product(product_name):
    """Return the articles whose text contains ``product_name``.

    Matching is a case-insensitive substring test against each article's
    ``'text'`` value. Articles whose text is missing or empty never match.

    Args:
        product_name: Name to look for. ``None``, an empty string or a
            whitespace-only string yields no matches.

    Returns:
        A list of ``(index, article)`` tuples, where ``index`` is the
        article's position in the module-level ``articles`` list. Results
        keep the order of that list.
    """
    # An empty needle is a substring of every text, which would report every
    # article as a "mention"; treat a blank name as matching nothing instead.
    if not product_name or not product_name.strip():
        return []

    needle = product_name.lower()
    return [
        (position, article)
        for position, article in enumerate(articles)
        if needle in (article.get('text') or '').lower()
    ]


# 2. Semantic Matching – Find similar articles for a product
def recommend_similar_articles(product_name, product_category, top_n=5):
    """Return the articles most similar to a product, best match first.

    The query text ``"<product_name> <product_category>"`` is preprocessed,
    converted to a bag-of-words with the module-level ``dictionary``, weighted
    with ``tfidf`` and scored against ``article_index``. Row ``i`` of the
    index is looked up as ``articles[i]``.

    Args:
        product_name: Product name used as the first part of the query.
        product_category: Product category appended to the query.
        top_n: Maximum number of distinct articles to return. Values of zero
            or below yield an empty list.

    Returns:
        A list of ``(article, score)`` tuples ordered by descending score,
        where ``score`` is the index similarity multiplied by 100. At most
        one entry is returned per article title.
    """
    # Without this guard the loop below would append one result before it
    # ever checked the limit.
    if top_n <= 0:
        return []

    product_text = f"{product_name} {product_category}"
    bow = dictionary.doc2bow(preprocess(product_text))
    sims = article_index[tfidf[bow]]
    ranked = sorted(enumerate(sims), key=lambda pair: pair[1], reverse=True)

    # Removing duplicates using article name
    seen_titles = set()
    unique_results = []

    for i, score in ranked:
        article = articles[i]
        title = article.get('title')
        # Articles without a title are keyed by their position so that they
        # neither raise KeyError nor get merged into a single "untitled" entry.
        dedup_key = title if title is not None else ('__untitled__', i)
        if dedup_key in seen_titles:
            continue

        seen_titles.add(dedup_key)
        unique_results.append((article, score * 100))

        if len(unique_results) >= top_n:
            break

    return unique_results

# Select random products
noOfProducts = 300
# Seeding with a constant makes the sample reproducible between runs.
random.seed(noOfProducts)
random_products = random.sample(products, min(noOfProducts, len(products)))

# Not populated anywhere in this section; kept in case other cells use it.
all_results = []

# Number of top articles to include per product
top_n = 3

# Declare the output schema up front so the CSV always has the same header
# and column order, even when no product reaches top_n matches or when no
# product matches at all.
output_columns = ['Product Name', 'Product url']
for rank in range(1, top_n + 1):
    output_columns += [f'Article_{rank}_Score', f'Article_{rank}_Title']

final_rows = []

for product in random_products:

    # Get top_n article recommendations
    results = recommend_similar_articles(product['name'], product.get('category', ''), top_n=top_n)

    # Filter only results with score > 0
    valid_results = [(article, score) for article, score in results if score > 0]

    # Skip product if no valid articles
    if not valid_results:
        continue

    # Build row only for products with valid articles
    row = {
        'Product Name': product['name'],
        'Product url': product.get('url', '')
    }

    # Add article scores and titles, numbered from 1 in ranking order
    for article_idx, (article, score) in enumerate(valid_results, start=1):
        row[f'Article_{article_idx}_Score'] = round(score, 3)
        row[f'Article_{article_idx}_Title'] = article.get('title', '')
    final_rows.append(row)

# One row per product; Article_k_* cells stay empty for products with fewer
# than k matching articles.
df_wide = pd.DataFrame(final_rows, columns=output_columns)

# Save to CSV
df_wide.to_csv('./intermediate_data/product_article_matches_flat.csv', index=False)