# In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# 1. Read the shared-articles CSV, discard rows missing either the text body
#    or the title, then keep a single row per contentId (pandas keeps the
#    first occurrence by default).
df = (
    pd.read_csv("shared_articles.csv")
    .dropna(subset=["text", "title"])
    .drop_duplicates(subset="contentId")
)

# 2. Vectorise the article text with TF-IDF (English stop words removed) and
#    take the pairwise dot products of the resulting rows. With the
#    vectoriser's default L2 row normalisation these dot products are the
#    cosine similarities.
tfidf = TfidfVectorizer(stop_words="english")
tfidf_matrix = tfidf.fit_transform(df["text"])
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

# 3. Label both axes of the square similarity array with contentId so scores
#    can be looked up by article id rather than by row position.
df_results = pd.DataFrame(
    data=cosine_sim,
    index=df["contentId"],
    columns=df["contentId"],
)

# 4. Define content recommender
def recommend_content(content_id, top_k=5, sim_matrix=None):
    """Return the ids of the items most similar to ``content_id``.

    Args:
        content_id: Label of the query item; it is looked up in the index of
            the similarity matrix.
        top_k: Maximum number of recommendations to return. Zero or a
            negative value yields an empty list.
        sim_matrix: Optional ``DataFrame`` of pairwise similarity scores
            whose index and columns are item ids (labels are expected to be
            unique). Defaults to the module-level ``df_results``.

    Returns:
        A list of at most ``top_k`` ids ordered from most to least similar,
        never containing ``content_id`` itself. An empty list is returned
        when ``content_id`` is not a label of the matrix.
    """
    matrix = df_results if sim_matrix is None else sim_matrix

    # head() with a negative count means "all but the last |k| rows", which
    # is never what a caller asking for recommendations wants.
    if top_k <= 0:
        return []

    try:
        is_known = content_id in matrix.index
    except TypeError:
        # Unhashable ids (e.g. a list) can never be index labels.
        is_known = False
    if not is_known:
        return []

    # Take the query item out before ranking so it cannot occupy a slot;
    # errors="ignore" tolerates a matrix whose columns lack the query id.
    candidates = matrix.loc[content_id].drop(content_id, errors="ignore")

    # A stable sort keeps tied scores in matrix column order, so repeated
    # runs return the same recommendations.
    ranked = candidates.sort_values(ascending=False, kind="stable")
    return ranked.head(top_k).index.tolist()

# 5. Build recommendation CSV
# Number of recommendations requested per article; it also fixes how many
# recommendationTitle columns the output file has.
TOP_K = 5

# Build the id -> title table once instead of scanning the whole frame for
# every recommended id. Dropping duplicates first keeps the first title seen
# for an id, matching a ".values[0]" lookup.
title_by_id = (
    df.drop_duplicates(subset="contentId")
    .set_index("contentId")["title"]
    .to_dict()
)

results = []
for content_id in df["contentId"]:
    row = {"contentId": content_id}
    # recommend_content returns [] for ids it cannot resolve and the title
    # lookup falls back to "Unknown", so no per-row exception handler is
    # needed here.
    rec_ids = recommend_content(content_id, top_k=TOP_K)
    for rank, rec_id in enumerate(rec_ids, start=1):
        row[f"recommendationTitle{rank}"] = title_by_id.get(rec_id, "Unknown")
    results.append(row)

# Pin the column order so the CSV schema does not depend on how many
# recommendations the first row happened to receive (or on there being any
# rows at all).
output_columns = ["contentId"] + [
    f"recommendationTitle{rank}" for rank in range(1, TOP_K + 1)
]
pd.DataFrame(results, columns=output_columns).to_csv(
    "ContentFiltering.csv", index=False
)
