
Load Dataset

In [6]:
import pandas as pd
merged_df = pd.read_csv("merged_df.csv")
merged_df.head()

Unnamed: 0,news_id,category,subcategory,title,abstract,url,title_entities,abstract_entities,user_id,clicked,content
0,N37243,finance,finance-real-estate,the 25 most desirable places to live in the us...,check out where u s residents would live if th...,https://assets.msn.com/labs/mind/AABvlID.html,[],"[{""Label"": ""United States"", ""Type"": ""G"", ""Wiki...",unknown,1.0,the 25 most desirable places to live in the us...
1,N25540,finance,finance-saving-investing,take heart millennials investing is within you...,news headlines might lead you to believe that ...,https://assets.msn.com/labs/mind/AAEmGBr.html,[],[],unknown,1.0,take heart millennials investing is within you...
2,N37129,finance,finance-taxes,don t be like these celebrities convicted of t...,these celebs were in hot water with the taxman,https://assets.msn.com/labs/mind/AAEGGF9.html,"[{""Label"": ""Tax evasion"", ""Type"": ""C"", ""Wikida...",[],U73032,0.0,don t be like these celebrities convicted of t...
3,N36064,finance,finance-companies,more store closings coming the list of retaile...,more than 10 months into 2019 more than 8 600 ...,https://assets.msn.com/labs/mind/AADN84N.html,[],[],unknown,1.0,more store closings coming the list of retaile...
4,N63006,finance,finance-savemoney,17 surprising ways penny pinching costs you more,frugal living could end up costing you in the ...,https://assets.msn.com/labs/mind/AAB4M2y.html,[],[],U67894,0.0,17 surprising ways penny pinching costs you mo...


1 Content Base Recommendation System

In [23]:
# ---------------------------
#  TF-IDF Vectorization
# ---------------------------
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

vectorizer = TfidfVectorizer(stop_words="english", max_df=0.8, min_df=5, ngram_range=(1,2))
tfidf_matrix = vectorizer.fit_transform(merged_df['content'])
# print(tfidf_matrix)

# ---------------------------
#  Cosine Similarity Matrix
# ---------------------------
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
# print(cosine_sim)
# jese ki cosine mei sabse check kr rha hai toh wo sjab khud se check krta hai toh uska 1 output aata hai 

# Reset index to get clean mapping
merged_df = merged_df.reset_index(drop=True)

[[1. 0. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 [0. 0. 1. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 1. 0. 0.]
 [0. 0. 0. ... 0. 1. 0.]
 [0. 0. 0. ... 0. 0. 1.]]


In [24]:
# ---------------------------
#  Recommendation Function
# ---------------------------
import re
def content_base_rec(title, top_n=5):
    # cleaning
    title = re.sub(r'\W+', ' ', title)  # Remove special chars
    title = title.lower().strip()
    
    # Vectorize input title
    title_vec = vectorizer.transform([title])

    # Compute cosine similarity
    sim_scores = cosine_similarity(title_vec, tfidf_matrix).flatten()

    # Get top N indices
    top_indices = sim_scores.argsort()[::-1][:top_n]

    return merged_df.loc[top_indices, ['news_id', 'title', 'category', 'subcategory', 'url', 'abstract']]


content_base_rec("Top investment strategies in 2025")

Unnamed: 0,news_id,title,category,subcategory,url,abstract
2841,N35029,china s hottest investment overpriced sneakers,finance,markets,https://assets.msn.com/labs/mind/BBWwqU4.html,forget stocks real estate even cryptocurrencie...
1222,N3461,5 medium risk investments for high returns,finance,finance-saving-investing,https://assets.msn.com/labs/mind/AAJXp3M.html,a few high return investing strategies that co...
2736,N8988,shoppers share black friday strategies that ac...,finance,finance-savemoney,https://assets.msn.com/labs/mind/BBWqXL6.html,if at first you don t succeed talk to someone ...
1810,N15796,ubs tightens investment banking belt as earnin...,finance,finance-companies,https://assets.msn.com/labs/mind/AAJ9Rzi.html,ubs tightens investment banking belt as earnin...
1570,N25302,the main reason billionaires move has nothing ...,finance,finance-real-estate,https://assets.msn.com/labs/mind/AAJa8Vj.html,middle class americans can learn some financia...


In [25]:
content_base_rec("How to save more money on taxes this year")

Unnamed: 0,news_id,title,category,subcategory,url,abstract
66,N23833,17 moves that will make or save you money,finance,finance-savemoney,https://assets.msn.com/labs/mind/AAJWHEe.html,1
65,N36300,23 ways you re wasting money at costco,finance,finance-savemoney,https://assets.msn.com/labs/mind/AAHAsXO.html,the warehouse club can save you money or it ca...
1985,N54203,the strangest state taxes across america,finance,finance-taxes,https://assets.msn.com/labs/mind/BBVhK8m.html,from taxes on bagels to sheep here are strange...
450,N45779,25 things you should never buy and what to buy...,finance,finance-savemoney,https://assets.msn.com/labs/mind/AAHTjWa.html,if you really want to save money become a more...
1764,N52088,couples weigh strategic divorce to save on taxes,finance,finance-taxes,https://assets.msn.com/labs/mind/AAIRPeA.html,financial advisors warn that couples who split...


2 Collaborative Filtering (User-Based or Implicit)


In [26]:
from sklearn.preprocessing import LabelEncoder

# 1. Encode user_id and news_id to numeric
user_encoder = LabelEncoder()
item_encoder = LabelEncoder()

merged_df['user_enc'] = user_encoder.fit_transform(merged_df['user_id'])
merged_df['news_enc'] = item_encoder.fit_transform(merged_df['news_id'])

# 2. Create user-item matrix (rows=user, cols=news, values=clicked)
user_item_matrix = merged_df.pivot_table(
    index='user_enc', columns='news_enc', values='clicked', fill_value=0
)

# 3. Compute cosine similarity between users
user_similarity = cosine_similarity(user_item_matrix)

Recommendation Function

In [28]:
def collaborative_base_rec(input_user, df, top_k=5):
    # Encode user_id to index
    user_idx = user_encoder.transform([input_user])[0]
    
    # Get similarity scores for this user with others
    sim_scores = user_similarity[user_idx]
    
    # Get clicked items by this user
    user_clicks = user_item_matrix.iloc[user_idx]
    
    # Calculate weighted sum of clicks by similarity scores
    weighted_scores = sim_scores @ user_item_matrix.values
    
    # Remove already clicked items from recommendations
    weighted_scores[user_clicks == 1] = 0
    
    # Get top news indices based on scores
    top_news_indices = weighted_scores.argsort()[::-1][:top_k]
    
    # Decode back to news_ids
    recommended_news_ids = item_encoder.inverse_transform(top_news_indices)
    
    # Filter original df for those news and drop duplicates
    recommended_news = df[df['news_id'].isin(recommended_news_ids)][
        ['news_id', 'title', 'category', 'subcategory', 'url', 'abstract']
    ].drop_duplicates(subset='news_id')
    
    return recommended_news.reset_index(drop=True)

recommendations = collaborative_base_rec(
    input_user='U91836', 
    df=merged_df,
)
recommendations

Unnamed: 0,news_id,title,category,subcategory,url,abstract
0,N9947,10 job skills worth six figure salaries,finance,finance-career-education,https://assets.msn.com/labs/mind/AAEzJ4H.html,learn these skills to get you closer to six fi...
1,N9871,gm strike has already cost automaker more than...,finance,finance-companies,https://assets.msn.com/labs/mind/AAIB1jw.html,financial toll is rising for both the car comp...
2,N9988,everybody is calling twin snow storms mean goo...,finance,financenews,https://assets.msn.com/labs/mind/AAJytd9.html,with slick snowy roads leading to a lot of sli...
3,N9875,there are doubts about china s promise to purc...,finance,markets,https://assets.msn.com/labs/mind/AAIROfI.html,president trump said china promised to increas...
4,N9989,how should billionaires spend their money to f...,finance,markets,https://assets.msn.com/labs/mind/BBWE4pA.html,is it better to invest in developing clean ene...


3 Hybrid Recommendation System

In [29]:
def hybrid_recommendations(user_id,title, df,top_n=5):
    # clean title
    title = re.sub(r'\W+', ' ', title)  # Remove special chars
    title = title.lower().strip()
    
    
    # Get content-based recommendations
    content_recs = content_base_rec(title)
    
    # Get collaborative-based recommendations
    collab_recs = collaborative_base_rec(user_id, df)

    # Merge the recommendations
    combined_recs = pd.concat([content_recs, collab_recs]).drop_duplicates().reset_index(drop=True)
    
    # Limit the number of recommendations to the top 'n' 
    combined_recs = combined_recs.head(top_n)
    
    return combined_recs

hybrid_recommendations(
    user_id='U91836',
    title="The 1 reason you shouldnâ€™t hesitate to claim Social Security",
    df=merged_df,
)

Unnamed: 0,news_id,title,category,subcategory,url,abstract
0,N36226,the 1 reason you shouldn t hesitate to claim s...,finance,finance-retirement,https://assets.msn.com/labs/mind/BBWKwml.html,seniors are often told to wait on social secur...
1,N17087,the one reason to claim social security at 62 ...,finance,finance-retirement,https://assets.msn.com/labs/mind/AAJJ7Jo.html,should you claim early if you don t really nee...
2,N9751,11 social security mistakes you can avoid,finance,finance-retirement,https://assets.msn.com/labs/mind/BBV0eGF.html,social security s complicated but it s worth t...
3,N11593,what millennials get wrong about social security,finance,finance-billstopay,https://assets.msn.com/labs/mind/AAGb1aa.html,few issues unite millennials like the future o...
4,N63702,will you get what social security promises,finance,finance-retirement,https://assets.msn.com/labs/mind/AAJjqg8.html,the social security administration will happil...
