<a href="https://colab.research.google.com/github/aman8934/post_recommendation/blob/main/post_recommendation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import re

In [2]:
user_df=pd.read_csv('/content/Assessment data - users.csv')

In [3]:
user_df

Unnamed: 0,user_id,interested_in
0,2,"Memes,Money,News,Personal Finance,Learn,Invest..."
1,3,"Memes,Money,News,Personal Finance,Learn,Invest..."
2,4,"Memes,Money,News,Personal Finance,Learn,Invest..."
3,5,"Memes,Money,News,Personal Finance,Learn,Invest..."
4,6,"Memes,Money,News,Personal Finance,Learn,Invest..."
...,...,...
92,96,
93,97,
94,98,
95,99,


In [4]:
user_df.columns

Index(['user_id', 'interested_in'], dtype='object')

# Step 1: clean user interests and create user profile text

In [5]:
def clean_and_split(s):
  if pd.isna(s):
    return []
  # replace unusual separators, remove extra whitespace
  s = re.sub(r'[\n\r;]+', ',', str(s))
  parts=[p.strip().lower() for p in s.split(',') if p.strip()]
  return parts
user_df['interests_list']=user_df['interested_in'].apply(clean_and_split)
# create a single string profile (join interests) — useful for TF-IDF / embeddings
user_df['profile_text'] = user_df['interests_list'].apply(lambda x: ' '.join(x) if x else "")

In [6]:
user_df.head(2)

Unnamed: 0,user_id,interested_in,interests_list,profile_text
0,2,"Memes,Money,News,Personal Finance,Learn,Invest...","[memes, money, news, personal finance, learn, ...",memes money news personal finance learn invest...
1,3,"Memes,Money,News,Personal Finance,Learn,Invest...","[memes, money, news, personal finance, learn, ...",memes money news personal finance learn invest...


In [7]:
print(user_df.loc[user_df['user_id']==2, ['user_id','interests_list','profile_text']].to_dict('records'))
user_df.to_csv('user_df_cleaned.csv', index=False)


[{'user_id': 2, 'interests_list': ['memes', 'money', 'news', 'personal finance', 'learn', 'investing 101', 'market trends', 'alternate investments', 'shopping'], 'profile_text': 'memes money news personal finance learn investing 101 market trends alternate investments shopping'}]


SAME STEP FOR POST'S CONTENT DATA

In [8]:
df_posts=pd.read_csv('/content/Assessment data - posts.csv')
df_posts.columns

Index(['post_id', 'user_id', 'content', 'is_anonymous', 'created_at',
       'updated_at', 'topics', 'like_user_ids', 'shares', 'reports', 'likes'],
      dtype='object')

In [9]:
def clean_and_split_for_postcontent(s):
  if pd.isna(s):
    return ""
  # replace unusual separators, remove extra whitespace
  s = str(s).lower()
  s = re.sub(r'[^a-z0-9\s]', ' ', s)      # remove punctuation
  s = re.sub(r'\s+', ' ', s).strip()
  return s
df_posts['clean_content'] = df_posts['content'].apply(clean_and_split_for_postcontent)

#topic column inclusion
if 'topics' in df_posts.columns:
  df_posts['topics']=df_posts['topics'].fillna('')
  df_posts['clean_topics'] = df_posts['topics'].apply(clean_and_split_for_postcontent)
  df_posts['text_for_vector'] = df_posts['clean_content'] + ' ' + df_posts['clean_topics']
else:
  df_posts['text_for_vector'] = df_posts['clean_content']

In [10]:
type(df_posts['topics'][0])

str

In [11]:
print(df_posts[['post_id', 'text_for_vector']].head(3).to_dict('records'))


[{'post_id': 4858, 'text_for_vector': 'indian companies exposure to us the list contains indian companies with total buisness exposure to us some of them are highly exposed to us market for buisness '}, {'post_id': 4857, 'text_for_vector': 'many of ipo are oversubscribing to 100x 200x and as a result many of us are not getting allotments that s the story of many of us personally i tried to apply in many and got very few allotments '}, {'post_id': 4830, 'text_for_vector': 'where do you see the potential of this stock in coming months or years '}]


In [12]:
df_posts.to_csv("posts_cleaned.csv", index=False)


 Vectorize user profiles and posts using TF-IDF

In [13]:
df_users= pd.read_csv("/content/user_df_cleaned.csv")
df_posts = pd.read_csv("/content/posts_cleaned.csv")

In [14]:
print(df_posts.shape)
print(df_users.shape)

(1000, 14)
(97, 4)


In [15]:
print(df_posts['text_for_vector'].isnull().sum())
print(df_users['profile_text'].isnull().sum())


0
45


In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


# Vectorize all texts (user profiles and post content)
all_texts = list(df_users['profile_text'].fillna('')) + list(df_posts['text_for_vector'].fillna(''))
tfidf = TfidfVectorizer(max_features=10000, stop_words='english')
tfidf.fit(all_texts)

post_vecs = tfidf.transform(df_posts['text_for_vector'].fillna(''))
user_vecs=tfidf.transform(df_users['profile_text'].fillna(''))

content_similarity=cosine_similarity(user_vecs,post_vecs)


In [17]:
content_similarity.shape

(97, 1000)

In [18]:
import numpy as np

Role and evaluation of popularity of a content


In [19]:
content_similarity[0]

array([0.02235108, 0.        , 0.        , 0.        , 0.01802426,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.02063887, 0.        , 0.        , 0.00563613,
       0.        , 0.02232629, 0.        , 0.06156599, 0.05461746,
       0.        , 0.00784725, 0.05195687, 0.        , 0.        ,
       0.        , 0.07683651, 0.        , 0.02008186, 0.02655949,
       0.        , 0.        , 0.        , 0.34995557, 0.02081095,
       0.0146051 , 0.        , 0.0438171 , 0.09512461, 0.04232845,
       0.        , 0.06205823, 0.06205823, 0.14222348, 0.        ,
       0.        , 0.        , 0.        , 0.02168854, 0.        ,
       0.02500082, 0.        , 0.        , 0.        , 0.        ,
       0.02152317, 0.        , 0.        , 0.0325001 , 0.09974473,
       0.0337083 , 0.0512379 , 0.05595427, 0.0168063 , 0.04446699,
       0.        , 0.01431092, 0.07588658, 0.        , 0.01725

In [20]:
from datetime import datetime

df_posts['popularity'] = np.log1p(df_posts['likes'] + df_posts['shares'])

df_posts['created_at'] = pd.to_datetime(df_posts['created_at'], errors='coerce')


In [21]:
# recency
current_date = datetime.now()
df_posts['age_days'] = (current_date - df_posts['created_at']).dt.days
# recent posts → higher score
df_posts['recency'] = 1 / (1 + df_posts['age_days'])
df_posts['recency'] = df_posts['recency'].fillna(df_posts['recency'].mean())

print(df_posts[['post_id', 'likes', 'shares', 'popularity', 'recency']].head())


   post_id  likes  shares  popularity   recency
0     4858     15      26    3.737670  0.041667
1     4857      5      20    3.258097  0.041667
2     4830      4      23    3.332205  0.041667
3     4829      3      33    3.610918  0.041667
4     4855      9      25    3.555348  0.041667


Scoring Criteria - Hybrid Scoring -> Content + Popularity + Recency

In [22]:
from sklearn.preprocessing import MinMaxScaler
content_scores = np.array(content_similarity)

# ensure all scores are numpy arrays
scaler=MinMaxScaler()
# normalize content_similarity per user (row-wise)
content_norm = np.zeros_like(content_scores)
for i in range(content_scores.shape[0]):
    content_norm[i, :] = scaler.fit_transform(content_scores[i, :].reshape(-1, 1)).flatten()

# get popularity and recency as 1-D arrays aligned with posts
popularity = df_posts['popularity'].to_numpy()
recency = df_posts['recency'].to_numpy()

boost = 0.5 * popularity + 0.5 * recency
boost = scaler.fit_transform(boost.reshape(-1, 1)).flatten()



In [23]:
df_posts['popularity']


Unnamed: 0,popularity
0,3.737670
1,3.258097
2,3.332205
3,3.610918
4,3.555348
...,...
995,2.564949
996,2.890372
997,2.484907
998,2.484907


In [24]:
''' for initial guesses we will give more weights(80%) to content matching of
post with user's interest and 20% to other factors e.g.
popularity ,recency'''

final_score = 0.8 * content_norm + 0.2 * boost   # start with content-heavy weight

final_score

array([[0.23190807, 0.15780094, 0.16135703, ..., 0.11967375, 0.11967375,
        0.11967375],
       [0.23190807, 0.15780094, 0.16135703, ..., 0.11967375, 0.11967375,
        0.11967375],
       [0.23190807, 0.15780094, 0.16135703, ..., 0.11967375, 0.11967375,
        0.11967375],
       ...,
       [0.18081339, 0.15780094, 0.16135703, ..., 0.11967375, 0.11967375,
        0.11967375],
       [0.18081339, 0.15780094, 0.16135703, ..., 0.11967375, 0.11967375,
        0.11967375],
       [0.18081339, 0.15780094, 0.16135703, ..., 0.11967375, 0.11967375,
        0.11967375]])

In [25]:
final_score.shape

(97, 1000)

# Recommendation step

In [26]:
recommendations = []
for i , user_id in enumerate(df_users['user_id']):
      # get the 10 highest-scoring post indices

  top_indices = np.argsort(-final_score[i])[:10]
   # map to actual post_ids
  top_posts = df_posts.iloc[top_indices]['post_id'].astype(str).tolist()
  recommendations.append((user_id, top_posts))
  # store as a comma-separated string

# make a DataFrame
df_recs = pd.DataFrame(recommendations)
df_recs.columns = ['user_id', 'recommended_posts']
df_recs


Unnamed: 0,user_id,recommended_posts
0,2,"[4658, 3289, 3638, 4149, 3649, 1611, 1672, 370..."
1,3,"[4658, 3289, 3638, 4149, 3649, 1611, 1672, 370..."
2,4,"[4658, 3289, 3638, 4149, 3649, 1611, 1672, 370..."
3,5,"[4658, 3289, 3638, 4149, 3649, 1611, 1672, 370..."
4,6,"[4658, 3289, 3638, 4149, 3649, 1611, 1672, 370..."
...,...,...
92,96,"[3930, 4289, 3677, 3626, 3714, 4740, 4816, 397..."
93,97,"[3930, 4289, 3677, 3626, 3714, 4740, 4816, 397..."
94,98,"[3930, 4289, 3677, 3626, 3714, 4740, 4816, 397..."
95,99,"[3930, 4289, 3677, 3626, 3714, 4740, 4816, 397..."


In [27]:
df_recs.to_csv("recommendations.csv", index=False)

In [35]:
type(df_posts['like_user_ids'][0])

str

In [38]:
df_posts.head(1)

Unnamed: 0,post_id,user_id,content,is_anonymous,created_at,updated_at,topics,like_user_ids,shares,reports,likes,clean_content,clean_topics,text_for_vector,popularity,age_days,recency
0,4858,18.0,Indian Companies Exposure To US . The List Con...,False,2025-10-06 12:50:00,"Oct. 6, 2025, 12:54 p.m.",,4106691413262119712358,26,0,15,indian companies exposure to us the list conta...,,indian companies exposure to us the list conta...,3.73767,23.0,0.041667


In [36]:
df_posts.columns

Index(['post_id', 'user_id', 'content', 'is_anonymous', 'created_at',
       'updated_at', 'topics', 'like_user_ids', 'shares', 'reports', 'likes',
       'clean_content', 'clean_topics', 'text_for_vector', 'popularity',
       'age_days', 'recency'],
      dtype='object')

In [48]:
type(df_recs['recommended_posts'][0])

list

In [50]:
def accuracy(df_recs, df_posts):
    # Create a dictionary mapping post_id to a set of user_ids who liked it
    post_likes_dict = {}
    splitter = lambda x: set(str(x).split(',')) if pd.notna(x) else set()
    for index, row in df_posts.iterrows():
        post_likes_dict[row['post_id']] = splitter(row['like_user_ids'])

    hit_count = 0
    total_recommendations = 0

    # Iterate through each user's recommendations
    for index, row in df_recs.iterrows():
        user_id = str(row['user_id'])
        recommended_posts = row['recommended_posts'] # Access the list directly

        total_recommendations += len(recommended_posts)

        # Check if each recommended post was liked by the user
        for post_id in recommended_posts:
            if int(post_id) in post_likes_dict and user_id in post_likes_dict[int(post_id)]:
                hit_count += 1

    # Calculate accuracy (Precision@k, where k=10 in this case)
    # It's the number of relevant recommendations (liked posts) divided by the total number of recommendations
    accuracy = hit_count / total_recommendations if total_recommendations > 0 else 0

    return accuracy

acc = accuracy(df_recs, df_posts)
print(f"Accuracy (Precision): {acc*100} %")

Accuracy (Precision): 13.195876288659795 %


from the above frequency method TF-IDF  accuracy is too low .

Using ML algorithms


In [53]:
positive_pairs = []
for _, post in df_posts.iterrows():
    if pd.isna(post['like_user_ids']):
        continue
    liked_users = [int(u) for u in str(post['like_user_ids']).split(',') if u.strip().isdigit()]
    for uid in liked_users:
        positive_pairs.append([uid, post['post_id'], 1])
df_pos = pd.DataFrame(positive_pairs, columns=['user_id', 'post_id', 'liked'])
print(df_pos.head())


   user_id  post_id  liked
0        4     4858      1
1       10     4858      1
2        6     4858      1
3       69     4858      1
4       14     4858      1


In [52]:
import numpy as np
import pandas as pd

# collect all post IDs
all_posts = df_posts['post_id'].tolist()

neg_samples = []

# pick a few random posts for each user
for uid in df_users['user_id']:
    # posts this user liked
    liked_posts = df_pos[df_pos['user_id'] == uid]['post_id'].tolist()

    # posts user has NOT liked
    unliked_posts = [p for p in all_posts if p not in liked_posts]

    # take only a few random ones (say 5)

    for pid in unliked_posts:
        neg_samples.append([uid, pid, 0])   # liked = 0

df_neg = pd.DataFrame(neg_samples, columns=['user_id','post_id','liked'])

print("Negative samples created:", df_neg.shape)
print(df_neg.head())


Negative samples created: (88692, 3)
   user_id  post_id  liked
0        2     4658      0
1        2     4603      0
2        2     4417      0
3        2     4416      0
4        2     4320      0


In [57]:
df_posts.shape

(1000, 17)

In [58]:
df_users.shape

(97, 4)

In [56]:
df_train = pd.concat([df_pos, df_neg], ignore_index=True)
print(df_train.shape)

(97541, 3)


In [59]:
df_train = df_train.merge(
    df_posts[['post_id', 'popularity', 'recency', 'shares', 'reports', 'is_anonymous']],
    on='post_id', how='left'
)
print("After adding post features:", df_train.shape)


After adding post features: (97541, 8)


In [61]:
df_train.head(2)

Unnamed: 0,user_id,post_id,liked,popularity,recency,shares,reports,is_anonymous
0,4,4858,1,3.73767,0.041667,26,0,False
1,10,4858,1,3.73767,0.041667,26,0,False


In [62]:
df_users['interest_count'] = df_users['interested_in'].apply(lambda x: len(str(x).split(',')))
df_train = df_train.merge(df_users[['user_id', 'interest_count']], on='user_id', how='left')


In [64]:
content_similarity.shape

(97, 1000)

In [66]:
def get_content_sim(uid, pid):
    try:
        u_idx = df_users.index[df_users['user_id']==uid][0]
        p_idx = df_posts.index[df_posts['post_id']==pid][0]
        return content_similarity[u_idx, p_idx]
    except:
        return 0

df_train['content_sim'] = df_train.apply(lambda x: get_content_sim(x.user_id, x.post_id), axis=1)


In [67]:
df_train

Unnamed: 0,user_id,post_id,liked,popularity,recency,shares,reports,is_anonymous,interest_count,content_sim
0,4,4858,1,3.737670,0.041667,26,0,False,9.0,0.022351
1,10,4858,1,3.737670,0.041667,26,0,False,9.0,0.022351
2,6,4858,1,3.737670,0.041667,26,0,False,9.0,0.022351
3,69,4858,1,3.737670,0.041667,26,0,False,1.0,0.000000
4,14,4858,1,3.737670,0.041667,26,0,False,9.0,0.022351
...,...,...,...,...,...,...,...,...,...,...
97536,100,1479,0,2.564949,0.020297,6,0,False,1.0,0.000000
97537,100,1477,0,2.890372,0.020297,6,0,False,1.0,0.000000
97538,100,1499,0,2.484907,0.020297,5,0,False,1.0,0.000000
97539,100,1476,0,2.484907,0.020297,6,0,False,1.0,0.000000


In [73]:
from sklearn.model_selection import train_test_split

# select the numeric feature columns
feature_cols = ['content_sim', 'popularity', 'recency', 'shares',
                 'is_anonymous', 'interest_count']

X = df_train[feature_cols]
y = df_train['liked']

# split into train/test
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
print("Train shape:", X_train.shape, "Validation shape:", X_val.shape)


Train shape: (78032, 6) Validation shape: (19509, 6)


In [74]:
import seaborn as sns
import matplotlib.pyplot as plt
plt.figure(figsize=(10, 8))
# sns.heatmap(df_train.corr(),annot=True)

<Figure size 1000x800 with 0 Axes>

<Figure size 1000x800 with 0 Axes>

In [76]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, accuracy_score

# Handle missing values in the features
X_train = X_train.fillna(0)
X_val = X_val.fillna(0)


model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# predictions
y_pred_prob = model.predict_proba(X_val)[:,1]
y_pred = (y_pred_prob > 0.5).astype(int)

# evaluate
print("AUC:", roc_auc_score(y_val, y_pred_prob))
print("Accuracy:", accuracy_score(y_val, y_pred))

AUC: 0.8498914103846643
Accuracy: 0.912963247731816
