In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt

In [3]:
users_df = pd.read_csv("users_table.csv")
videos_df = pd.read_csv("videos_table.csv")
engagements_df = pd.read_csv("engagements_table.csv")

In [4]:
users_df.head(5)

Unnamed: 0,user_id,name,age,gender,location,language
0,U1,Maria,46,Female,Victoriamouth,Hindi
1,U2,Eric,36,Female,Fordton,English
2,U3,Angela,27,Female,North James,Telugu
3,U4,Brooke,46,Female,New Dalton,English
4,U5,Kevin,27,Female,North Becky,English


In [6]:
videos_df.head(5)

Unnamed: 0,vid_id,length,categories
0,V1,91,Drama
1,V2,3,Action
2,V3,131,Comedy
3,V4,5,Drama
4,V5,94,Horror


In [8]:
engagements_df.head(5)

Unnamed: 0,user_id,vid_id,likes,comments,shares,watch_time,rating
0,U5,V9,1,2,0,0.56,3
1,U7,V11,1,1,1,0.54,4
2,U4,V14,1,2,3,0.99,4
3,U10,V13,1,4,1,0.93,3
4,U6,V14,1,3,0,0.58,1


In [9]:
data = engagements_df.merge(videos_df[['vid_id', 'categories']], on='vid_id', how='left')

In [10]:
data.head(5)

Unnamed: 0,user_id,vid_id,likes,comments,shares,watch_time,rating,categories
0,U5,V9,1,2,0,0.56,3,Drama
1,U7,V11,1,1,1,0.54,4,Action
2,U4,V14,1,2,3,0.99,4,Romance
3,U10,V13,1,4,1,0.93,3,Documentary
4,U6,V14,1,3,0,0.58,1,Romance


In [11]:
data_encoded = pd.get_dummies(data, columns=['categories'])

In [16]:
data_encoded.head(5)

Unnamed: 0,user_id,vid_id,likes,comments,shares,watch_time,rating,categories_Action,categories_Comedy,categories_Documentary,categories_Drama,categories_Horror,categories_Romance,categories_Thriller,user_id_enc
0,U5,V9,1,2,0,0.56,3,False,False,False,True,False,False,False,5
1,U7,V11,1,1,1,0.54,4,True,False,False,False,False,False,False,7
2,U4,V14,1,2,3,0.99,4,False,False,False,False,False,True,False,4
3,U10,V13,1,4,1,0.93,3,False,False,True,False,False,False,False,1
4,U6,V14,1,3,0,0.58,1,False,False,False,False,False,True,False,6


In [17]:
data_encoded['user_id_enc'] = data_encoded['user_id'].str.replace('U', '').astype(int)

In [18]:
data_encoded.head(5)

Unnamed: 0,user_id,vid_id,likes,comments,shares,watch_time,rating,categories_Action,categories_Comedy,categories_Documentary,categories_Drama,categories_Horror,categories_Romance,categories_Thriller,user_id_enc
0,U5,V9,1,2,0,0.56,3,False,False,False,True,False,False,False,5
1,U7,V11,1,1,1,0.54,4,True,False,False,False,False,False,False,7
2,U4,V14,1,2,3,0.99,4,False,False,False,False,False,True,False,4
3,U10,V13,1,4,1,0.93,3,False,False,True,False,False,False,False,10
4,U6,V14,1,3,0,0.58,1,False,False,False,False,False,True,False,6


In [19]:
X = data_encoded[['user_id_enc'] + [col for col in data_encoded.columns if 'categories_' in col]]
y = data_encoded[['likes', 'comments', 'shares', 'watch_time', 'rating']]

In [20]:
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X, y)

In [29]:
# Predict engagement per user × category

categories_encoded = [col for col in data_encoded.columns if 'categories_' in col]
user_category_predictions = []

# Weight factors for engagement metrics
weights = {
    'likes': 1.0,
    'comments': 0.8,
    'shares': 1.5,
    'watch_time': 2.0,
    'rating': 2.5
}

for user_idx, user in zip(users_df['user_id_enc'], users_df['user_id']):
    for cat in categories_encoded:
        # Build input row with same feature names as during training
        X_row_dict = {col: 0 for col in ['user_id_enc'] + categories_encoded}
        X_row_dict['user_id_enc'] = user_idx
        X_row_dict[cat] = 1
        X_row_df = pd.DataFrame([X_row_dict])

        # Predict engagement metrics
        pred = rf.predict(X_row_df)[0]

        # Weighted total score (custom equation)
        total_score = (
            pred[0] * weights['likes'] +
            pred[1] * weights['comments'] +
            pred[2] * weights['shares'] +
            pred[3] * weights['watch_time'] +
            pred[4] * weights['rating']
        )

        # Save results
        user_category_predictions.append({
            'user_id': user,
            'category': cat.replace('categories_', ''),
            'predicted_likes': pred[0],
            'predicted_comments': pred[1],
            'predicted_shares': pred[2],
            'predicted_watch_time': pred[3],
            'predicted_rating': pred[4],
            'total_score': total_score
        })

pred_df = pd.DataFrame(user_category_predictions)

In [31]:
pred_df.head(10)

Unnamed: 0,user_id,category,predicted_likes,predicted_comments,predicted_shares,predicted_watch_time,predicted_rating,total_score
0,U1,Action,0.708333,4.655,1.960833,0.304408,2.951667,15.361567
1,U1,Comedy,0.845,3.103667,1.422667,0.567813,3.805,16.11006
2,U1,Documentary,0.92,2.990667,1.4145,0.606952,4.87,18.823187
3,U1,Drama,0.956667,2.946667,0.880833,0.730508,2.01,11.121267
4,U1,Horror,0.78,3.973667,1.3685,0.490197,3.42,15.542077
5,U1,Romance,0.54,3.604,0.853333,0.548663,2.791333,12.77886
6,U1,Thriller,0.86,3.897333,1.16,0.479883,3.71,15.952633
7,U2,Action,0.315,4.745,0.935,0.39595,3.865,15.9679
8,U2,Comedy,0.605,2.995667,0.906667,0.577483,3.98,15.4665
9,U2,Documentary,0.32,2.528667,0.4485,0.504472,4.35,14.899627


In [32]:
final_table = pred_df.sort_values(['user_id', 'total_score'], ascending=[True, False])

In [34]:
final_table.head(10)

Unnamed: 0,user_id,category,predicted_likes,predicted_comments,predicted_shares,predicted_watch_time,predicted_rating,total_score
2,U1,Documentary,0.92,2.990667,1.4145,0.606952,4.87,18.823187
1,U1,Comedy,0.845,3.103667,1.422667,0.567813,3.805,16.11006
6,U1,Thriller,0.86,3.897333,1.16,0.479883,3.71,15.952633
4,U1,Horror,0.78,3.973667,1.3685,0.490197,3.42,15.542077
0,U1,Action,0.708333,4.655,1.960833,0.304408,2.951667,15.361567
5,U1,Romance,0.54,3.604,0.853333,0.548663,2.791333,12.77886
3,U1,Drama,0.956667,2.946667,0.880833,0.730508,2.01,11.121267
63,U10,Action,0.265,4.8425,0.725,0.2787,3.9075,15.55265
67,U10,Horror,0.095,4.4895,0.2795,0.770445,3.8085,15.16799
65,U10,Documentary,0.93,3.8,0.67,0.6835,3.45,14.967


In [37]:
# Get Top 3 Categories per User

top_n = 3  # number of top categories to keep per user
user_top_categories = (
    final_table.groupby('user_id')
    .head(top_n)[['user_id', 'category', 'total_score']]
    .reset_index(drop=True)
)

In [38]:
user_top_categories.head(10)

Unnamed: 0,user_id,category,total_score
0,U1,Documentary,18.823187
1,U1,Comedy,16.11006
2,U1,Thriller,15.952633
3,U10,Action,15.55265
4,U10,Horror,15.16799
5,U10,Documentary,14.967
6,U2,Action,15.9679
7,U2,Thriller,15.569527
8,U2,Comedy,15.4665
9,U3,Thriller,15.132057


In [39]:
# Find Matching Video IDs for those Categories

n_videos = 3  # number of videos to recommend per user
recommendations = []

for user in user_top_categories['user_id'].unique():
    
    top_categories = user_top_categories[user_top_categories['user_id'] == user]['category'].tolist()
    matched_videos = videos_df[videos_df['categories'].isin(top_categories)]

    selected_videos = matched_videos.head(n_videos)

    for _, video in selected_videos.iterrows():
        
        cat_score = user_top_categories[
            (user_top_categories['user_id'] == user) &
            (user_top_categories['category'] == video['categories'])
        ]['total_score'].values[0]

        recommendations.append({
            'user_id': user,
            'recommended_vid_id': video['vid_id'],
            'category': video['categories'],
            'total_score': cat_score
        })

In [42]:
final_recommendations = pd.DataFrame(recommendations)

In [46]:
final_recommendations.head(10)

Unnamed: 0,user_id,recommended_vid_id,category,total_score
0,U1,V3,Comedy,16.11006
1,U1,V6,Comedy,16.11006
2,U1,V7,Documentary,18.823187
3,U10,V2,Action,15.55265
4,U10,V5,Horror,15.16799
5,U10,V7,Documentary,14.967
6,U2,V2,Action,15.9679
7,U2,V3,Comedy,15.4665
8,U2,V6,Comedy,15.4665
9,U3,V3,Comedy,14.94373


In [55]:
final_recommendations = pd.DataFrame(recommendations).sort_values(
    by=['user_id', 'total_score'], ascending=[True, False]
).reset_index(drop=True)

In [56]:
final_recommendations.head(10)

Unnamed: 0,user_id,recommended_vid_id,category,total_score
0,U1,V7,Documentary,18.823187
1,U1,V3,Comedy,16.11006
2,U1,V6,Comedy,16.11006
3,U10,V2,Action,15.55265
4,U10,V5,Horror,15.16799
5,U10,V7,Documentary,14.967
6,U2,V2,Action,15.9679
7,U2,V3,Comedy,15.4665
8,U2,V6,Comedy,15.4665
9,U3,V7,Documentary,15.08434


In [57]:
final_recommendations_video_id = final_recommendations[['user_id', 'recommended_vid_id']].reset_index(drop=True)

In [58]:
final_recommendations_video_id.head(10)

Unnamed: 0,user_id,recommended_vid_id
0,U1,V7
1,U1,V3
2,U1,V6
3,U10,V2
4,U10,V5
5,U10,V7
6,U2,V2
7,U2,V3
8,U2,V6
9,U3,V7
