## Loading The Required Libraries

In [5]:
import numpy as np
from implicit.als import AlternatingLeastSquares
from scipy.sparse import coo_matrix
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## Loading The Dataset

In [245]:
# Read the interactions CSV, discard the Population, Post_Title and Upvotes
# columns, and shorten the within-community PageRank column name.
filename = 'overall_df.csv'
overall_df = (
    pd.read_csv(filename)
    .drop(columns=['Population', 'Post_Title', 'Upvotes'])
    .rename(columns={'PageRank_Within_Community': 'PR_Community'})
)
overall_df.head()

Unnamed: 0,Subreddit,Date,User_ID,Community_Label,PageRank,PR_Community,Category
0,AdviceAnimals,8/31/2022,Lost-My-Mind-,0,0.000407,0.001599,Humor
1,AdviceAnimals,8/31/2022,SgtDoughnut,0,0.000137,0.000528,Humor
2,AdviceAnimals,8/31/2022,JasonDJ,0,8.2e-05,0.000312,Humor
3,AdviceAnimals,8/31/2022,tacknosaddle,0,0.000199,0.000771,Humor
4,AdviceAnimals,8/31/2022,jezra,0,8.1e-05,0.000309,Humor


## Creating List Of Community Dataframes

- The aim of this section is to create a list of dataframes, where each dataframe represents a specific community. The reason for doing this is that the collaborative filtering process will be run on each community separately.

In [233]:
def create_community_dataframes(df):
  """Split ``df`` into one DataFrame per distinct ``Community_Label`` value.

  Args:
    df: DataFrame containing a ``Community_Label`` column.

  Returns:
    A list of DataFrames, one per community, in the order ``groupby`` yields
    the groups (sorted by label with pandas' default settings). Each frame
    has the ``Community_Label`` column removed, since it is constant within
    a group.
  """
  return [
      members.drop(columns=['Community_Label'])
      for _, members in df.groupby('Community_Label')
  ]

In [246]:
# Split the interactions by community and parse each community's 'Date'
# column into datetimes so that date differences can be computed later.
community_dfs = [
    community_df.assign(Date=pd.to_datetime(community_df['Date']))
    for community_df in create_community_dataframes(overall_df)
]
# Show the first few rows of one selected community DataFrame
print("\nCommunity X DataFrame:")
community_dfs[9].head(5)


Community X DataFrame:


Unnamed: 0,Subreddit,Date,User_ID,PageRank,PR_Community,Category
25616,AnimalsBeingDerps,2022-09-02,Sniflix,0.00018,0.002661,Animals and Pets
25618,AnimalsBeingDerps,2022-07-09,Sniflix,0.00018,0.002661,Animals and Pets
25634,AnimalsBeingDerps,2023-03-26,Sniflix,0.00018,0.002661,Animals and Pets
25700,AnimalsBeingDerps,2023-03-20,Sniflix,0.00018,0.002661,Animals and Pets
25941,AnimalsBeingDerps,2022-08-08,Sniflix,0.00018,0.002661,Animals and Pets


## Data Preprocessing

- The aim of this section is to work out a score (rating) that best describes a user's preference for a given subreddit.
- I first compute two weights, one for subreddits and one for categories. Each weight is the average number of distinct subreddits (or categories) that a user in the community interacts with.
- I then compute the partial scores, which are the engagement score for the subreddit and the engagement score for the category. These partial scores give more importance to the subreddits/categories that the user has interacted with more often. For example, if the user has performed four interactions in subreddit X and two interactions in subreddit Y, then subreddit X is given more importance than subreddit Y.
- Finally, I compute the decay factor, which gives more importance to a subreddit that a user has interacted with several times within a short period than to a subreddit the same user has interacted with several times on days that are far apart. The decay factor follows a logarithmic approach in order to give importance to the recency of the interaction date: for each interaction, it depends on the number of days between that interaction and the user's most recent interaction with the same subreddit. For instance, if user x interacted with subreddit y today and yesterday, the value of the decay factor won't be the same as when user x interacted with subreddit y today and 3 months ago.

In [135]:
def calculate_subreddit_weight(df):
    """Return the mean number of distinct subreddits per user in ``df``.

    Args:
        df: DataFrame with ``User_ID`` and ``Subreddit`` columns.

    Returns:
        The average, over users, of how many different ``Subreddit`` values
        appear in that user's rows.
    """
    return df.groupby('User_ID')['Subreddit'].nunique().mean()

def calculate_category_weight(df):
    """Return the mean number of distinct categories per user in ``df``.

    Args:
        df: DataFrame with ``User_ID`` and ``Category`` columns.

    Returns:
        The average, over users, of how many different ``Category`` values
        appear in that user's rows.
    """
    return df.groupby('User_ID')['Category'].nunique().mean()

In [158]:
def _add_engagement_score(df, key, score_col):
    """Return a copy of ``df`` with ``score_col`` appended.

    For every row, the score is the number of rows sharing that row's
    (``User_ID``, ``key``) pair divided by the number of rows sharing its
    ``User_ID`` -- i.e. the share of the user's interactions that fall in
    that ``key`` value.

    Args:
        df: DataFrame with ``User_ID`` and ``key`` columns.
        key: Name of the column to compute the share for.
        score_col: Name of the column that receives the ratio.

    Returns:
        A new DataFrame (the merges produce a fresh default integer index);
        ``df`` itself is not modified.
    """
    # Reserved names for the temporary count columns, so that an input column
    # that happens to be called 'interactions' or 'total_interactions' is not
    # suffixed by the merge and then dropped or missed.
    key_count_col = '__key_interactions'
    user_count_col = '__user_interactions'

    per_key = df.groupby(['User_ID', key]).size().reset_index(name=key_count_col)
    per_user = df.groupby('User_ID').size().reset_index(name=user_count_col)

    # Left merges keep every input row and broadcast the counts onto it.
    scored = (
        df.merge(per_key, on=['User_ID', key], how='left')
          .merge(per_user, on='User_ID', how='left')
    )
    scored[score_col] = scored[key_count_col] / scored[user_count_col]
    return scored.drop(columns=[key_count_col, user_count_col])

def calculate_interaction_ratio_subreddit(df):
    """Append ``Engagement_Score_Subreddit`` to a copy of ``df``.

    The score is the fraction of a user's rows that belong to the row's
    ``Subreddit``. ``df`` needs ``User_ID`` and ``Subreddit`` columns.
    """
    return _add_engagement_score(df, 'Subreddit', 'Engagement_Score_Subreddit')

def calculate_interaction_ratio_category(df):
    """Append ``Engagement_Score_Category`` to a copy of ``df``.

    The score is the fraction of a user's rows that belong to the row's
    ``Category``. ``df`` needs ``User_ID`` and ``Category`` columns.
    """
    return _add_engagement_score(df, 'Category', 'Engagement_Score_Category')

In [137]:
def max_date(df):
    """Attach the latest ``Date`` of each (user, subreddit) pair to its rows.

    Args:
        df: DataFrame with ``User_ID``, ``Subreddit`` and ``Date`` columns.

    Returns:
        A new DataFrame equal to ``df`` merged with an extra
        ``Last_Interaction_Date`` column holding the maximum ``Date`` seen
        for the row's (``User_ID``, ``Subreddit``) combination.
    """
    latest_per_pair = (
        df.groupby(['User_ID', 'Subreddit'])['Date']
          .max()
          .rename('Last_Interaction_Date')
          .reset_index()
    )
    # Default (inner) merge on the pair keys copies the maximum onto every row
    return df.merge(latest_per_pair, on=['User_ID', 'Subreddit'])

def calculate_decay_factor(row, half_life_days=7):
    """Return a recency weight in (0, 1] for one interaction row.

    The weight is based on the absolute number of days between the row's
    ``Date`` and its ``Last_Interaction_Date``:

        min(1, 1 / log1p(1 + days / half_life_days))

    Args:
        row: Mapping / Series with ``Date`` and ``Last_Interaction_Date``
            entries (anything ``pd.to_datetime`` accepts).
        half_life_days: Time scale, in days, dividing the day gap before the
            logarithm is taken. Note that despite its name the factor does
            not halve every ``half_life_days`` days; the decay is
            logarithmic. Defaults to 7.

    Returns:
        1.0 when both dates fall on the same day, otherwise the capped
        logarithmic decay value.
    """
    days_diff = abs((pd.to_datetime(row['Date']) - pd.to_datetime(row['Last_Interaction_Date'])).days)
    if days_diff == 0:
        return 1.0
    raw_factor = 1 / np.log1p(1 + (days_diff / half_life_days))
    # For small gaps log1p(1 + x) is below 1, so the uncapped value would be
    # greater than the same-day weight of 1 and older interactions would
    # outrank same-day ones. Capping keeps the factor non-increasing in the
    # gap. np.minimum (unlike the builtin min) also propagates NaN.
    return np.minimum(raw_factor, 1.0)

In [247]:
# One weight per community, in the same order as community_dfs:
# the average number of distinct subreddits / categories per user.
subreddit_weight = [
    calculate_subreddit_weight(community_df) for community_df in community_dfs
]

category_weight = [
    calculate_category_weight(community_df) for community_df in community_dfs
]

In [248]:
# Stage 1: add the per-subreddit engagement score to every community frame.
temp_dfs_1 = [
    calculate_interaction_ratio_subreddit(community_df)
    for community_df in community_dfs
]

# Stage 2: add the per-category engagement score on top of stage 1.
temp_dfs_2 = [
    calculate_interaction_ratio_category(scored_df) for scored_df in temp_dfs_1
]

In [249]:
# Attach each row's latest (user, subreddit) interaction date ...
max_date_dfs = [max_date(scored_df) for scored_df in temp_dfs_2]
# ... and derive the row-wise decay factor from it.
for dated_df in max_date_dfs:
    dated_df['Decay_Factor'] = dated_df.apply(calculate_decay_factor, axis=1)
# Shallow copy: a new list that still references the same DataFrame objects.
overall_dfs = list(max_date_dfs)
overall_dfs[9].head()

Unnamed: 0,Subreddit,Date,User_ID,PageRank,PR_Community,Category,Engagement_Score_Subreddit,Engagement_Score_Category,Last_Interaction_Date,Decay_Factor
0,AnimalsBeingDerps,2022-09-02,Sniflix,0.00018,0.002661,Animals and Pets,0.42,0.42,2023-06-22,0.264485
1,AnimalsBeingDerps,2022-07-09,Sniflix,0.00018,0.002661,Animals and Pets,0.42,0.42,2023-06-22,0.253438
2,AnimalsBeingDerps,2023-03-26,Sniflix,0.00018,0.002661,Animals and Pets,0.42,0.42,2023-06-22,0.373265
3,AnimalsBeingDerps,2023-03-20,Sniflix,0.00018,0.002661,Animals and Pets,0.42,0.42,2023-06-22,0.365468
4,AnimalsBeingDerps,2022-08-08,Sniflix,0.00018,0.002661,Animals and Pets,0.42,0.42,2023-06-22,0.259119


## Score Calculation

- In this section I compute the implicit score. First I compute `Total_Score` = (subreddit weight * engagement score subreddit + category weight * engagement score category) * decay factor. `Total_Score` is then combined with the within-community PageRank score to obtain the actual implicit score: `Implicit_Score` = (1 - alpha) * `Total_Score` + alpha * `PR_Community`. Alpha is a parameter that controls the balance between the `Total_Score` and the PageRank score: a higher alpha gives more weight to the PageRank score, making the recommendations more influenced by the user's importance in the community. Finally, I compute lower and upper bounds based on the interquartile range (IQR) and cap the `Implicit_Score` values that fall outside these bounds. This helps in mitigating the impact of outliers on the recommendations.

In [250]:
def calculate_implicit_score(df, subreddit_weight, category_weight, alpha):
    """Return a copy of ``df`` with ``Total_Score`` and ``Implicit_Score`` added.

    ``Total_Score`` is
    ``(subreddit_weight * Engagement_Score_Subreddit
       + category_weight * Engagement_Score_Category) * Decay_Factor``.
    ``Implicit_Score`` is ``(1 - alpha) * Total_Score + alpha * PR_Community``,
    capped to ``[Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]`` where the quartiles are
    taken over the uncapped scores of ``df``.

    Args:
        df: DataFrame with ``Engagement_Score_Subreddit``,
            ``Engagement_Score_Category``, ``Decay_Factor`` and
            ``PR_Community`` columns.
        subreddit_weight: Multiplier for the subreddit engagement score.
        category_weight: Multiplier for the category engagement score.
        alpha: Blend factor; larger values give ``PR_Community`` more weight.

    Returns:
        A new DataFrame. The input ``df`` is not modified.
    """
    # Work on a copy from the start so the caller's frame does not silently
    # gain the two score columns.
    scored = df.copy()
    scored['Total_Score'] = (
        subreddit_weight * scored['Engagement_Score_Subreddit']
        + category_weight * scored['Engagement_Score_Category']
    ) * scored['Decay_Factor']
    blended = (1 - alpha) * scored['Total_Score'] + alpha * scored['PR_Community']

    lower_quartile = blended.quantile(0.25)
    upper_quartile = blended.quantile(0.75)
    iqr = upper_quartile - lower_quartile

    # Calculating lower and upper bounds
    lower_bound = lower_quartile - 1.5 * iqr
    upper_bound = upper_quartile + 1.5 * iqr

    # Cap scores outside the bounds to the nearest bound
    scored['Implicit_Score'] = blended.clip(lower=lower_bound, upper=upper_bound)
    return scored

In [251]:
# alpha balances Total_Score against the within-community PageRank score.
alpha = 0.7
main_dfs = []
# The three lists are aligned by community, so walk them in lockstep.
for community_df, community_subreddit_weight, community_category_weight in zip(
    overall_dfs, subreddit_weight, category_weight
):
    main_dfs.append(
        calculate_implicit_score(
            community_df,
            community_subreddit_weight,
            community_category_weight,
            alpha,
        )
    )
main_dfs[9].head()

Unnamed: 0,Subreddit,Date,User_ID,PageRank,PR_Community,Category,Engagement_Score_Subreddit,Engagement_Score_Category,Last_Interaction_Date,Decay_Factor,Total_Score,Implicit_Score
0,AnimalsBeingDerps,2022-09-02,Sniflix,0.00018,0.002661,Animals and Pets,0.42,0.42,2023-06-22,0.264485,0.24003,0.079254
1,AnimalsBeingDerps,2022-07-09,Sniflix,0.00018,0.002661,Animals and Pets,0.42,0.42,2023-06-22,0.253438,0.230005,0.079254
2,AnimalsBeingDerps,2023-03-26,Sniflix,0.00018,0.002661,Animals and Pets,0.42,0.42,2023-06-22,0.373265,0.338752,0.103489
3,AnimalsBeingDerps,2023-03-20,Sniflix,0.00018,0.002661,Animals and Pets,0.42,0.42,2023-06-22,0.365468,0.331676,0.101366
4,AnimalsBeingDerps,2022-08-08,Sniflix,0.00018,0.002661,Animals and Pets,0.42,0.42,2023-06-22,0.259119,0.235161,0.079254
