# Interest-based Post Recommender (Hybrid: Content + CF)

_A clean, Colab-style notebook you can run end-to-end._

In [None]:
# %% [markdown]
# ## Imports & Setup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import TruncatedSVD
from scipy.sparse import csr_matrix, vstack as sp_vstack
import warnings
warnings.filterwarnings("ignore")


In [None]:
# %% [markdown]
# ## Load Data
# Input locations for the three source tables; edit these when the CSVs live
# somewhere other than /mnt/data.
USERS_PATH, POSTS_PATH, ENG_PATH = (
    "/mnt/data/Users.csv",
    "/mnt/data/Posts.csv",
    "/mnt/data/Engagements.csv",
)

# Read each CSV into its own DataFrame, in the same order as the paths above.
users, posts, eng = (
    pd.read_csv(csv_path) for csv_path in (USERS_PATH, POSTS_PATH, ENG_PATH)
)

# Last expression of the cell: preview the first rows of every table.
(users.head(), posts.head(), eng.head())


In [None]:
# %% [markdown]
# ## Quick EDA
# Shape and column names of each input table.
print("Users:", users.shape, users.columns.tolist())
print("Posts:", posts.shape, posts.columns.tolist())
print("Engagements:", eng.shape, eng.columns.tolist())

# Distinct-ID counts (nunique ignores missing values).
n_unique_users = users["user_id"].nunique()
n_unique_posts = posts["post_id"].nunique()
n_unique_creators = posts["creator_id"].nunique()
print("\nUnique users:", n_unique_users)
print("Unique posts:", n_unique_posts)
print("Unique creators:", n_unique_creators)

# Bar chart of how often each engagement value occurs (missing values included),
# drawn only when the column is present.
if "engagement" in eng.columns:
    engagement_counts = eng["engagement"].value_counts(dropna=False).sort_index()
    ax = engagement_counts.plot(kind="bar")
    ax.set_title("Engagement Value Counts")
    ax.set_xlabel("engagement")
    ax.set_ylabel("count")
    plt.show()


In [None]:
# %% [markdown]
# ## Preprocess: Split Interests, Normalize Post Text

# Split and clean interests.
# `top_3_interests` is a comma-separated string; expand it into exactly three
# stripped string columns, with "" for any slot a user did not fill.
INTEREST_COLS = ["interest_1", "interest_2", "interest_3"]
if "top_3_interests" in users.columns:
    interest_parts = (
        users["top_3_interests"].fillna("").astype(str).str.split(",", expand=True)
        # Force exactly three positional columns: extras are dropped, missing ones
        # are added (as NaN, blanked out below).
        .reindex(columns=list(range(len(INTEREST_COLS))))
    )
    interest_parts.columns = INTEREST_COLS
    for col in INTEREST_COLS:
        interest_parts[col] = interest_parts[col].fillna("").astype(str).str.strip()
    # Drop interest columns left over from a previous run of this cell before
    # attaching the fresh ones. Without this, re-running the cell would append a
    # second set of identically named columns and `users["interest_1"]` would
    # become a DataFrame instead of a Series.
    users = pd.concat(
        [users.drop(columns=INTEREST_COLS, errors="ignore"), interest_parts],
        axis=1,
    )

# Normalize posts text from content_type + tags.
# Missing values become empty strings so the concatenation below never yields NaN.
posts["tags"] = posts["tags"].fillna("").astype(str)
posts["content_type"] = posts["content_type"].fillna("").astype(str)
# Underscores/hyphens in content_type and commas in tags become spaces so each
# word is tokenized separately. `regex=False` makes the comma replacement a
# literal match regardless of the pandas version's default.
posts["post_text"] = (
    posts["content_type"].str.replace(r"[_\-]", " ", regex=True)
    + " "
    + posts["tags"].str.replace(",", " ", regex=False)
)
posts[["post_id", "post_text"]].head()


In [None]:
# %% [markdown]
# ## TF-IDF for Content-Based Similarity
# Unigram + bigram TF-IDF over the normalized post text. The token pattern
# accepts word tokens of one or more characters.
tfidf = TfidfVectorizer(
    lowercase=True,
    token_pattern=r"(?u)\b\w+\b",
    ngram_range=(1, 2),
    min_df=1,
)
# Learn the vocabulary from the posts and vectorize them in one step.
post_tfidf = tfidf.fit_transform(posts["post_text"])
post_tfidf.shape


In [None]:
# %% [markdown]
# ## Build User Content Profiles (Interests + History)

def interests_to_text(row):
    """Join a user's up-to-three interest fields into one space-separated string.

    `row` is anything with a dict-style `.get` (e.g. a row yielded by
    `DataFrame.iterrows()`); interest fields that are absent count as "".
    """
    fields = (
        row.get("interest_1", ""),
        row.get("interest_2", ""),
        row.get("interest_3", ""),
    )
    return " ".join(str(field) for field in fields).strip()


def interests_to_vector(row):
    """Return the single-row TF-IDF matrix for one user's interests.

    Uses the already-fitted module-level `tfidf` vectorizer, so the result lives
    in the same feature space as `post_tfidf`.
    """
    return tfidf.transform([interests_to_text(row)])


# Interest vectors: vectorize every user's interest text in ONE transform call
# instead of one call per user followed by a vstack. TF-IDF transforms each
# document independently, so the rows are the same; row order follows `users`.
interest_texts = [interests_to_text(row) for _, row in users.iterrows()]
if interest_texts:
    user_interest_mat = tfidf.transform(interest_texts)
else:
    # No users: keep an empty matrix with the right number of feature columns.
    user_interest_mat = csr_matrix((0, post_tfidf.shape[1]))

# Map IDs to indices (position of each ID in `users` / `posts`, as strings).
user_ids = users["user_id"].astype(str).tolist()
post_ids = posts["post_id"].astype(str).tolist()
u_index = {uid: pos for pos, uid in enumerate(user_ids)}
p_index = {pid: pos for pos, pid in enumerate(post_ids)}

# Fail fast on duplicate IDs. A duplicate collapses into one dict entry, so the
# index maps would no longer line up with the row-aligned matrices built from
# `users` / `posts`, and later cells would fail with unrelated-looking shape or
# index errors.
if len(u_index) != len(user_ids):
    raise ValueError(
        f"Users contains {len(user_ids) - len(u_index)} duplicate user_id value(s); "
        "user_id must be unique."
    )
if len(p_index) != len(post_ids):
    raise ValueError(
        f"Posts contains {len(post_ids) - len(p_index)} duplicate post_id value(s); "
        "post_id must be unique."
    )

# Ensure types in engagements: compare IDs as strings, like the maps above.
# Work on a copy so the frame loaded from disk is not modified in place.
eng = eng.copy()
eng["user_id"] = eng["user_id"].astype(str)
eng["post_id"] = eng["post_id"].astype(str)

# Keep only known users/posts in engagements
known_user = eng["user_id"].isin(user_ids)
known_post = eng["post_id"].isin(post_ids)
eng = eng[known_user & known_post]

# Build a history-weighted user profile from engaged posts.
# Each user's history vector is the engagement-weighted average of the TF-IDF
# vectors of the posts they interacted with; users with no interactions, or
# whose weights do not sum to a positive number, keep an all-zero row.
num_users = len(u_index)
user_history_mat = csr_matrix((num_users, post_tfidf.shape[1]))

if eng.shape[0] > 0:
    hist_rows = eng["user_id"].map(u_index).to_numpy(dtype=np.int64)
    hist_cols = eng["post_id"].map(p_index).to_numpy(dtype=np.int64)
    hist_vals = eng["engagement"].astype(float).to_numpy()

    # Per-user sum of engagement weights (the denominator of the average).
    weight_totals = np.bincount(hist_rows, weights=hist_vals, minlength=num_users)
    # Keep only interactions of users with a positive total. A NaN total compares
    # False here, so such users get a zero row rather than a NaN profile.
    usable = weight_totals[hist_rows] > 0

    # (users x posts) matrix of normalized weights; repeated (user, post) pairs
    # are summed by the sparse constructor. One sparse product then yields every
    # user's weighted average, replacing a per-user Python loop.
    norm_weights = csr_matrix(
        (
            hist_vals[usable] / weight_totals[hist_rows[usable]],
            (hist_rows[usable], hist_cols[usable]),
        ),
        shape=(num_users, post_tfidf.shape[0]),
    )
    user_history_mat = norm_weights.dot(post_tfidf).tocsr()

alpha = 0.6  # interests vs history weight
user_content_vec = (alpha * user_interest_mat) + ((1 - alpha) * user_history_mat)
user_content_vec.shape


In [None]:
# %% [markdown]
# ## Collaborative Filtering via SVD

# Sparse user-item matrix: one row per user, one column per post, values are the
# engagement scores (repeated (user, post) pairs are summed by the constructor).
num_items = len(p_index)
row_idx = eng["user_id"].map(u_index).values
col_idx = eng["post_id"].map(p_index).values
data_vals = eng["engagement"].astype(float).values

ui = csr_matrix((data_vals, (row_idx, col_idx)), shape=(num_users, num_items))

# Number of latent factors: at most 20, and below the smaller matrix dimension.
smaller_dim = min(num_users, num_items)
if smaller_dim > 2:
    k = min(20, smaller_dim - 1)
else:
    # Tiny data: keep the fallback of 2 components where the item dimension
    # allows it, but never request more components than there are item columns
    # to support (TruncatedSVD rejects n_components above the feature count).
    k = max(1, min(2, num_items - 1))

svd = TruncatedSVD(n_components=k, random_state=42)
user_f = svd.fit_transform(ui)   # U * Sigma
item_f = svd.components_.T       # V
# Low-rank reconstruction of the user-item matrix = predicted affinity.
cf_scores = np.dot(user_f, item_f.T)  # (num_users x num_items)
cf_scores.shape


In [None]:
# %% [markdown]
# ## Content-Based Scores (Cosine Similarity)
# Cosine similarity of every user profile (rows) against every post vector (columns).
content_scores = cosine_similarity(
    X=user_content_vec,
    Y=post_tfidf,
)  # (num_users x num_items)
content_scores.shape


In [None]:
# %% [markdown]
# ## Hybrid Scores & Top-3 Recommendations
lambda_hybrid = 0.5  # blend CF and content
hybrid_scores = lambda_hybrid * content_scores + (1 - lambda_hybrid) * cf_scores

# Exclude already positively engaged posts: user_id -> set of post_ids.
already = eng[eng["engagement"] > 0].groupby("user_id")["post_id"].apply(set).to_dict()

top_k = 3
recs = []
for uid in user_ids:
    scores = hybrid_scores[u_index[uid]]
    exclude = already.get(uid, set())

    # Walk the posts from best to worst score, skipping already-engaged ones,
    # until top_k picks are collected.
    picks, pick_scores = [], []
    for j in np.argsort(-scores):  # descending
        if len(picks) >= top_k:
            break
        pid = post_ids[j]
        if pid in exclude:
            continue
        picks.append(pid)
        pick_scores.append(float(scores[j]))

    # Pad with None when fewer than top_k candidates remain, so every user row
    # has the same columns.
    shortfall = top_k - len(picks)
    if shortfall > 0:
        picks.extend([None] * shortfall)
        pick_scores.extend([None] * shortfall)

    # One rec_<rank>/score_<rank> column pair per slot. The columns follow
    # `top_k` instead of being hard-coded to three, so changing `top_k` neither
    # raises nor silently truncates; for top_k = 3 the layout is unchanged.
    rec_row = {"user_id": uid}
    for rank, (pid, score) in enumerate(zip(picks, pick_scores), start=1):
        rec_row[f"rec_{rank}"] = pid
        rec_row[f"score_{rank}"] = score
    recs.append(rec_row)

recs_df = pd.DataFrame(recs)
recs_df.head()


In [None]:
# %% [markdown]
# ## Quick (Simulated) Validation: Hit-Rate@3
# Hold out ~10% of the positive interactions, refit both scorers on the remaining
# interactions, and check whether any held-out post lands in the user's top 3.
# Scoring `recs_df` directly cannot work: it was built after removing every
# positively engaged post (held-out ones included), so it can never contain a
# held-out item and the hit rate would be 0 by construction. The scores behind
# it were also fitted on the held-out rows.
np.random.seed(42)
eval_k = 3  # the "@3" in the metric name
hit_rate_at3 = np.nan

pos = eng[eng["engagement"] > 0].copy()
if len(pos) > 0:
    pos["rand"] = np.random.rand(len(pos))
    holdout = pos[pos["rand"] < 0.1]
    holdout_map = holdout.groupby("user_id")["post_id"].apply(set).to_dict()

    if holdout_map:
        # Training data: drop every interaction of a held-out (user, post) pair so
        # the pair is neither learned from nor filtered out as "already seen".
        held_pairs = set(zip(holdout["user_id"], holdout["post_id"]))
        is_held = np.array(
            [pair in held_pairs for pair in zip(eng["user_id"], eng["post_id"])],
            dtype=bool,
        )
        train_eng = eng[~is_held]

        tr_rows = train_eng["user_id"].map(u_index).to_numpy(dtype=np.int64)
        tr_cols = train_eng["post_id"].map(p_index).to_numpy(dtype=np.int64)
        tr_vals = train_eng["engagement"].astype(float).to_numpy()

        # Content side: engagement-weighted average of the training posts' TF-IDF
        # vectors per user (zero row when the weights do not sum to a positive
        # number), blended with the interest vectors using the same `alpha`.
        totals = np.bincount(tr_rows, weights=tr_vals, minlength=num_users)
        usable = totals[tr_rows] > 0
        hist_weights = csr_matrix(
            (
                tr_vals[usable] / totals[tr_rows[usable]],
                (tr_rows[usable], tr_cols[usable]),
            ),
            shape=(num_users, post_tfidf.shape[0]),
        )
        train_profiles = (alpha * user_interest_mat) + (
            (1 - alpha) * hist_weights.dot(post_tfidf)
        )
        val_content = cosine_similarity(train_profiles, post_tfidf)

        # CF side: same factorization settings, fitted on training interactions only.
        train_ui = csr_matrix(
            (tr_vals, (tr_rows, tr_cols)), shape=(num_users, num_items)
        )
        val_svd = TruncatedSVD(n_components=k, random_state=42)
        val_cf = np.dot(val_svd.fit_transform(train_ui), val_svd.components_)

        val_scores = lambda_hybrid * val_content + (1 - lambda_hybrid) * val_cf

        # Posts a user engaged with positively in the training data are excluded
        # from their candidates, mirroring the recommendation step.
        train_pos = train_eng[train_eng["engagement"] > 0]
        train_seen = {}
        for uid, pid in zip(train_pos["user_id"], train_pos["post_id"]):
            train_seen.setdefault(uid, set()).add(pid)

        hits, total = 0, 0
        for uid, held_items in holdout_map.items():
            seen = train_seen.get(uid, set())
            top_posts = []
            for j in np.argsort(-val_scores[u_index[uid]]):  # best score first
                if len(top_posts) == eval_k:
                    break
                pid = post_ids[j]
                if pid not in seen:
                    top_posts.append(pid)
            total += 1
            # A hit = at least one held-out post appears in the user's top-k.
            if held_items.intersection(top_posts):
                hits += 1

        hit_rate_at3 = (hits / total) if total > 0 else np.nan

print("Hit-Rate@3:", hit_rate_at3)


In [None]:
# %% [markdown]
# ## Save Outputs (Recommendations CSV + Short Report)
# Write the per-user recommendations table.
recs_path = "/mnt/data/recommendations.csv"
recs_df.to_csv(recs_path, index=False)
print("Saved:", recs_path)

# Short Markdown report. Weights, rank and list length are read from the
# variables the pipeline actually used (`alpha`, `lambda_hybrid`, `k`, `top_k`)
# so the report cannot drift from the run that produced it.
report = f"""# Hybrid Recommender (Content + CF)

**Data**: Users (n={len(users)}), Posts (n={len(posts)}), Engagements (n={len(eng)}).

## Method
1. **Content-based**:
   - TF-IDF over `content_type + tags` for each post.
   - User profile = {alpha:g} × TF-IDF(interests) + {1 - alpha:g} × TF-IDF(history-weighted).
   - Score = cosine similarity between user profile and post vectors.

2. **Collaborative Filtering**:
   - User–Item matrix from `engagement`.
   - Low-rank factors via TruncatedSVD (k={k}).
   - Score = reconstructed dot product between user and item factors.

3. **Hybrid**:
   - Final score = {lambda_hybrid:g} × Content + {1 - lambda_hybrid:g} × CF.
   - Exclude posts already positively engaged by the user.
   - Return Top-{top_k} per user.

## Quick Validation (simulated)
- ~10% random holdout of positive interactions (no timestamps available).
- **Hit-Rate@3** = {hit_rate_at3:.3f} (nan if no positives/holdouts).

## Notes & Extensions
- If timestamps exist, do a **time-based split** to avoid leakage.
- Tune weights `alpha` (interests vs history) and `lambda` (content vs CF) by validation.
- Replace TF-IDF with **SentenceTransformers** if post text is richer.
- Replace SVD with **implicit ALS** or **Neural CF** for stronger collaborative signal.
- Add **diversity** constraints (MMR) and **freshness** boosts for production.
"""

report_path = "/mnt/data/report.md"
# The report contains non-ASCII characters (×, –); write it as UTF-8 explicitly
# rather than relying on the platform's default encoding.
with open(report_path, "w", encoding="utf-8") as f:
    f.write(report)
print("Saved:", report_path)


In [None]:
# %% [markdown]
# ## Next Steps
# - Use time-based split if timestamps exist to avoid leakage.
# - Hyperparameter tuning for `alpha` and `lambda_hybrid`.
# - Consider implicit ALS (e.g., `implicit` library) or Neural CF for stronger CF.
# - Use SentenceTransformers for richer text embeddings if post descriptions are long.
# - Add re-ranking for diversity and freshness.
