In [30]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score


In [31]:
n_users = 100
n_items = 200

users = pd.DataFrame({'user_id': range(n_users)})

news = pd.DataFrame({
    'news_id': range(n_items),
    'title': [f"Article {i}" for i in range(n_items)],
    'text': [f"This is a news article about {topic}" for topic in np.random.choice(['sports', 'tech', 'politics', 'health'], n_items)]
})

interactions = pd.DataFrame({
    'user_id': np.random.choice(n_users, size=1000),
    'news_id': np.random.choice(n_items, size=1000),
    'clicked': np.random.randint(0, 2, size=1000)
})


In [36]:
from scipy.sparse.linalg import svds

user_item_matrix = interactions.pivot_table(index='user_id', columns='news_id', values='clicked')
user_item_matrix = user_item_matrix.reindex(columns=range(n_items), fill_value=0).fillna(0)
matrix = user_item_matrix.values.astype(float)

U, sigma, Vt = svds(matrix, k=20)
U, sigma, Vt = U[:, ::-1], sigma[::-1], Vt[::-1, :]

user_factors = U
item_factors = Vt.T
collab_scores = np.dot(user_factors, item_factors.T)


In [37]:
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(news['text'])
content_sim_matrix = cosine_similarity(tfidf_matrix)


In [38]:
# Normalize both scores
collab_norm = (collab_scores - collab_scores.min()) / (collab_scores.max() - collab_scores.min())
content_norm = (content_sim_matrix - content_sim_matrix.min()) / (content_sim_matrix.max() - content_sim_matrix.min())

# Create hybrid scores: average of collab and content similarities
hybrid_scores = np.zeros_like(collab_norm)

for user_id in range(n_users):
    user_interactions = user_item_matrix.values[user_id]
    viewed_indices = np.where(user_interactions > 0)[0]

    if len(viewed_indices) > 0:
        avg_content_score = content_norm[viewed_indices].mean(axis=0)
    else:
        avg_content_score = np.zeros(n_items)

    hybrid_scores[user_id] = 0.5 * collab_norm[user_id] + 0.5 * avg_content_score


In [39]:
hybrid_scores

array([[0.30001042, 0.2638072 , 0.23440052, ..., 0.29885988, 0.21440158,
        0.32072835],
       [0.11836865, 0.60920852, 0.12668971, ..., 0.6091145 , 0.60791419,
        0.12886456],
       [0.1119669 , 0.63316771, 0.11947373, ..., 0.60450159, 0.61010303,
        0.10654488],
       ...,
       [0.10978674, 0.61966986, 0.12076788, ..., 0.63705945, 0.6092528 ,
        0.13813493],
       [0.18616283, 0.31835863, 0.20533843, ..., 0.32076353, 0.37447371,
        0.19733959],
       [0.22326739, 0.40863152, 0.13259441, ..., 0.33426357, 0.33344102,
        0.11924878]], shape=(100, 200))