In [3]:
import numpy as np
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import re
from collections import defaultdict
import json
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from gensim.corpora import Dictionary
from gensim.models import TfidfModel
from gensim.similarities import SparseMatrixSimilarity

In [20]:

# Load datasets: article data and DigiKey product data, both UTF-8 JSON files
with open("./intermediate_data/article_data.json", encoding="utf-8") as f:
    articles = json.load(f)

with open("./intermediate_data/digikey_product_data.json", encoding="utf-8") as f:
    raw_data = json.load(f)

# Product entries are read from the top-level "categories" key;
# a missing key falls back to an empty list.
products = raw_data.get("categories", [])

# Preprocess function
def preprocess(text):
    """Tokenise ``text`` with ``simple_preprocess`` and drop every token found in ``STOPWORDS``.

    Returns the surviving tokens as a list, in their original order.
    """
    return list(filter(lambda token: token not in STOPWORDS, simple_preprocess(text)))

# Token lists for every article body
article_texts = list(map(preprocess, (article["text"] for article in articles)))

# Token lists for every product: name plus category (empty string when absent)
product_texts = list(
    map(preprocess, (f"{p['name']} {p.get('category', '')}" for p in products))
)

# Articles first, products second; the split further down relies on this order
all_texts = [*article_texts, *product_texts]

# One shared vocabulary and one bag-of-words vector per document
dictionary = Dictionary(all_texts)
corpus = list(map(dictionary.doc2bow, all_texts))

# TF-IDF weights are fitted on articles and products together
tfidf = TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

# The first len(articles) vectors belong to articles, the remainder to products
article_tfidf_vectors = corpus_tfidf[:len(articles)]
product_tfidf_vectors = corpus_tfidf[len(articles):]

# Similarity index over the product vectors only
product_index = SparseMatrixSimilarity(
    product_tfidf_vectors,
    num_features=len(dictionary),
)

#########################

# 1. Keyword Matching – Find mentioned products in each article
def find_mentioned_products(article_text, catalog=None):
    """Return the catalogue entries whose name is mentioned in ``article_text``.

    Matching is case-insensitive and anchored on word boundaries, so a product
    named "Logic" is not reported for the word "technological". Products whose
    name is empty or whitespace-only are never reported.

    Args:
        article_text: Article body to scan.
        catalog: Optional iterable of product dicts, each with a ``"name"``
            key. Defaults to the module-level ``products`` list.

    Returns:
        A list of the matching product dicts, in catalogue order.
    """
    if catalog is None:
        catalog = products

    text_lower = article_text.lower()
    mentions = []
    for product in catalog:
        name = product['name'].strip().lower()
        if not name:
            # An empty string is a substring of every text; never report it.
            continue
        if name not in text_lower:
            # Cheap substring pre-check avoids running a regex for most products.
            continue
        # (?<!\w) / (?!\w) instead of \b so that names beginning or ending
        # with punctuation are still anchored correctly.
        if re.search(r'(?<!\w)' + re.escape(name) + r'(?!\w)', text_lower):
            mentions.append(product)
    return mentions

# 2. Semantic Matching – Recommend similar products for an article
def recommend_similar_products(article_text, top_n=5):
    """Return the ``top_n`` distinct products most similar to an article.

    The article is preprocessed, converted to a TF-IDF vector with the
    module-level ``dictionary`` and ``tfidf`` model, and scored against
    ``product_index``. Products sharing the same (name, category) pair are
    reported once, keeping the highest-scoring occurrence.

    Args:
        article_text: Article body to find products for.
        top_n: Maximum number of products to return; values <= 0 yield ``[]``.

    Returns:
        A list of ``(product, score)`` tuples ordered from most to least
        similar, where ``score`` is the index similarity multiplied by 100.
    """
    if top_n <= 0:
        return []

    bow = dictionary.doc2bow(preprocess(article_text))
    tfidf_vec = tfidf[bow]
    sims = product_index[tfidf_vec]
    # Highest similarity first; position i in `sims` corresponds to products[i].
    ranked = sorted(enumerate(sims), key=lambda x: x[1], reverse=True)

    seen_products = set()
    unique_results = []
    for i, score in ranked:
        product = products[i]
        # Category is treated as optional, matching how the product texts are built.
        product_key = (product['name'], product.get('category', ''))
        if product_key in seen_products:
            # A higher-scoring entry with this key was already kept.
            continue
        seen_products.add(product_key)
        unique_results.append((product, score * 100))
        if len(unique_results) >= top_n:
            break

    return unique_results

# Print keyword matches and semantic recommendations for every article
for i, article in enumerate(articles):
    print(f"\nArticle {i+1}: {article['title']}")

    # 1. Keyword matching: products whose name appears in the article text
    mentioned = find_mentioned_products(article["text"])
    if not mentioned:
        print("No direct product mentions.")
    else:
        print("Products mentioned in article:")
        for prod in mentioned:
            print(f"   - {prod['name']} ({prod['category']})")

    # 2. Semantic matching: de-duplicated products ranked by similarity
    print("Top 5 similar products (semantic):")
    results = recommend_similar_products(article["text"])
    if results:
        for product, score in results:
            print(f"   Score: {score:.3f} - {product['name']} ({product['category']})")
    else:
        print("   No similar products found.")



Article 1: Elektra Awards 2025 open for entries
Products mentioned in article:
   - Electromechanical (Other)
   - Logic (Other)
   - Power (Other)
Top 5 similar products (semantic):
   Score: 5.662 - Discrete Semiconductor Products (Other)
   Score: 3.031 - Power Entry Connectors (Cables & Connectors)
   Score: 2.976 - View All (Other)
   Score: 2.548 - Magnetic Strip, Smart Card Readers (Other)
   Score: 2.065 - Automotive Relays (Other)

Article 2: NMI hosts industry conference in Glasgow with theme of growth
Products mentioned in article:
   - Speakers (Audio Products)
   - Semiconductors (Other)
Top 5 similar products (semantic):
   Score: 16.700 - Semiconductors (Other)
   Score: 12.996 - Discrete Semiconductor Products (Other)
   Score: 6.389 - Speakers (Audio Products)
   Score: 4.873 - Digital Isolators (Other)
   Score: 4.754 - TVS - Mixed Technology (Other)

Article 3: DigiKey introduces own-brand DigiKey Standard product line
Products mentioned in article:
   - Power (Othe