In [None]:
import pandas as pd
import numpy as np
from lenskit import batch,topn
from lenskit.metrics import topn as tn
from lenskit.algorithms import als
from lenskit.matrix import  CSR,RatingMatrix
from scipy import spatial
import matplotlib.pyplot as plt
from lenskit import topn
from IPython.core.debugger import set_trace
from lenskit.metrics import predict
import random

# Data Prep
Here we import the ratings data file and the truncated user ratings file (which consists of the ratings that our sampled casual users gave to popular movies).

In [None]:
# Ratings for all users (training data) and the truncated ratings file for the
# sampled casual users (the profiles that are folded in later).
train=pd.read_csv('/project/naray190/ml-20m/ratings.csv')
test=pd.read_csv('/project/naray190/ml-20m/test_casual_user_ratings.csv')
# recommend_for_users() reads the global `movie_data`; load it here so that a
# fresh "Restart & Run All" does not hit a NameError at the first call.
movie_data=pd.read_csv("/project/naray190/movie_data_20M.csv",sep=",")
movie_data.columns=['item','popularity','avgRating']


We are renaming the columns of the data to match with the values used by the functions of the "lenskit" package. 

In [None]:
# Keep only the rating triples and give them the (user, item, rating) column
# names that the lenskit functions work with.
train = train[['userId', 'movieId', 'rating']].rename(
    columns={'userId': 'user', 'movieId': 'item'}
)
test = test[['userId', 'movieId', 'rating']].rename(
    columns={'userId': 'user', 'movieId': 'item'}
)


## Rating Normalization

Here, we are normalizing the ratings of our data sets. We are removing both user biases and item biases from the two sets of ratings data that we have.
We normalize the data ourselves because our fold-in function has to override some of the functionality of the original matrix factorization class implemented in LensKit. Overriding would be difficult without manually removing the biases and keeping two separate user indexes, because the user index is tied into the biasing method implemented in LensKit.

In [None]:
# Global bias: the mean of the training ratings, removed from both rating sets.
gbias = train['rating'].mean()
train = train.assign(rating=train['rating'] - gbias)
test = test.assign(rating=test['rating'] - gbias)

In [None]:

# Damped item bias: sum of an item's (globally centred) training ratings divided
# by its rating count plus 5.
ratings_by_item = train.groupby('item')['rating']
item_biases = ratings_by_item.sum() / (ratings_by_item.count() + 5)
item_bias_frame = item_biases.to_frame()

# The joined bias column arrives as `rating_im`; subtract it from the ratings.
train = train.join(item_bias_frame, on="item", how="inner", rsuffix="_im")
train = train.assign(rating=train['rating'] - train['rating_im'])

# Inner join: test rows whose item has no training bias are dropped.
test = test.join(item_bias_frame, on="item", how="inner", rsuffix="_im")
test = test.assign(rating=test['rating'] - test['rating_im'])

In [None]:
# Damped user bias for the training profiles (same +5 damping as the item bias).
train_by_user = train.groupby('user')['rating']
user_biases_train = train_by_user.sum() / (train_by_user.count() + 5)
train = train.join(user_biases_train.to_frame(), on="user", how="inner", rsuffix="_um")
train = train.assign(rating=train['rating'] - train['rating_um'])

# The test profiles get their own user bias, computed from the test ratings only.
test_by_user = test.groupby('user')['rating']
user_biases_test = test_by_user.sum() / (test_by_user.count() + 5)
test = test.join(user_biases_test.to_frame(), on="user", how="inner", rsuffix="_um")
test = test.assign(rating=test['rating'] - test['rating_um'])


In [None]:
# Drop the helper bias columns added by the joins; only the normalised
# (user, item, rating) triples are needed from here on.
train, test = (frame[['user', 'item', 'rating']] for frame in (train, test))

# Folding-In Function

Here we implement a fold-in function: given a ratings matrix, it generates user feature vectors using the item feature vectors of the already-trained model.

In [None]:
class FoldIn(als.BiasedMF):
    """``als.BiasedMF`` subclass that can fold new rating profiles into a fitted model.

    ``self.bias`` is set to ``None`` after the parent constructor runs; the
    notebook removes the global, item and user biases from the ratings by hand
    before fitting (presumably this disables the parent's own bias handling --
    confirm against the installed lenskit version).
    """
    def __init__(self,*args,**kwargs):
        super().__init__(*args,**kwargs)
        self.bias=None

    def fold_in(self,new_ratings):
        """Compute user feature vectors for ``new_ratings`` from the trained item features.

        Args:
            new_ratings(pandas.DataFrame): (user, item, rating) triples. The items
                are looked up in ``self.item_index_``, so the model must be fitted.

        Returns:
            tuple: ``(umat, users)`` -- the matrix returned by ``als._train_matrix``
            and the user index built from ``new_ratings`` (presumably row ``i`` of
            ``umat`` belongs to ``users[i]``; the notebook indexes it that way).
        """
        # Build the rating matrix on the model's item index so its columns line
        # up with the rows of self.item_features_.
        rmat, users, _ = sparse_ratings(new_ratings,iidx=self.item_index_)
        umat = als._train_matrix(rmat.N, self.item_features_, self.regularization)
        return umat,users
    

def sparse_ratings(ratings, scipy=False,uidx=None,iidx=None):
    """
    Convert a rating table to a sparse matrix of ratings.
    Args:
        ratings(pandas.DataFrame): a data table of (user, item, rating) triples.
        scipy: if ``True``, return a SciPy matrix instead of :py:class:`CSR`.
        uidx(pandas.Index): existing user index to use for the rows; when ``None``
            it is built from the unique users in ``ratings``.
        iidx(pandas.Index): existing item index to use for the columns; when
            ``None`` it is built from the unique items in ``ratings``.
    Returns:
        RatingMatrix:
            a named tuple containing the sparse matrix, user index, and item index.
    Raises:
        ValueError: if ``ratings`` contains a user or item that is missing from a
            supplied index.
    """
    if uidx is None:
        uidx = pd.Index(ratings['user'].unique(), name='user')
    if iidx is None:
        iidx = pd.Index(ratings['item'].unique(), name='item')

    row_ind = uidx.get_indexer(ratings['user']).astype(np.int32)
    col_ind = iidx.get_indexer(ratings['item']).astype(np.int32)

    # get_indexer() marks labels it cannot find with -1; such a value must not be
    # used as a matrix coordinate, so reject it explicitly.
    if (row_ind < 0).any():
        raise ValueError("ratings contain users that are missing from the user index")
    if (col_ind < 0).any():
        raise ValueError("ratings contain items that are missing from the item index")

    if 'rating' in ratings.columns:
        vals = np.require(ratings['rating'].values, np.float64)
    else:
        vals = None

    matrix = CSR.from_coo(row_ind, col_ind, vals, (len(uidx), len(iidx)))
    if scipy:
        matrix = CSR.to_scipy(matrix)

    return RatingMatrix(matrix, uidx, iidx)

# Fit our fold-in capable model (instead of the stock one) on the normalised
# training ratings.
algo = FoldIn(features=25, iterations=50, reg=0.1)
algo.fit(train)

# User feature matrix learned during fitting.
regularumat = algo.user_features_

# fold_in returns the user matrix for our sample users together with the user
# index needed to look rows up in it.
poponlyumat, popuindex = algo.fold_in(test)


In [None]:
all_users=set(train.user.unique())

In [None]:
users=set(test.user.unique())
other_users= all_users-users

Here we are calculating the average cosine similarity between two users who are not being tested. This is for a baseline when looking at the cosine similarity between the two profiles of our test users. 

In [None]:
# Average cosine similarity between random pairs of users outside the test set.
N_PAIRS=10000  # number of random pairs to average over
# random.sample() requires a sequence: passing a set raises TypeError on
# Python >= 3.11 (and was deprecated from 3.9), so materialise a list first.
candidate_users=sorted(other_users)
ux=algo.user_index_
avgcos=0
for _ in range(N_PAIRS):
    user_a,user_b=random.sample(candidate_users,2)
    u1v=regularumat[ux.get_loc(user_a)]
    u2v=regularumat[ux.get_loc(user_b)]
    # scipy returns the cosine *distance*; 1 - distance is the similarity.
    avgcos+=1-spatial.distance.cosine(u1v,u2v)
print(avgcos/N_PAIRS)

# User Similarity and Baseline Pairwise Similarity

In [None]:
# Cosine similarity between each test user's full-profile vector and the vector
# obtained by folding in the popular-only profile.
testusers=test['user'].unique().tolist()

# Collect the rows first and build the frame once, rather than enlarging a
# DataFrame one .loc[] assignment at a time.
simscore_rows=[]
for user in testusers:
    full_v=regularumat[algo.user_index_.get_loc(user)]
    pop_v=poponlyumat[popuindex.get_loc(user)]
    simscore_rows.append({'user':user,'simscore':1-spatial.distance.cosine(full_v,pop_v)})

user_simscore=pd.DataFrame(simscore_rows,columns=['user','simscore'])

Baseline pairwise similarity between the users of our test group. We look at the baseline similarity for both profiles. 

In [None]:
# Baseline for the full profiles: cosine similarity between each test user and
# the next one in `testusers`, accumulated so the following cell can average it.
pairwise_sim_score=0
count=0
# zip() over the list and its one-step shift yields the consecutive pairs
# directly, instead of searching the list with .index() on every iteration.
for user1,user2 in zip(testusers,testusers[1:]):
    u1v=regularumat[algo.user_index_.get_loc(user1)]
    u2v=regularumat[algo.user_index_.get_loc(user2)]
    pairwise_sim_score+=1-spatial.distance.cosine(u1v,u2v)
    count+=1
    


In [None]:
pairwise_sim_score/count

In [None]:
# Baseline for the folded-in (popular-only) profiles: cosine similarity between
# each test user and the next one in `testusers`.
pairwise_sim_score_pop=0
count=0
# zip() over the list and its one-step shift yields the consecutive pairs
# directly, instead of searching the list with .index() on every iteration.
for user1,user2 in zip(testusers,testusers[1:]):
    u1v=poponlyumat[popuindex.get_loc(user1)]
    u2v=poponlyumat[popuindex.get_loc(user2)]
    pairwise_sim_score_pop+=1-spatial.distance.cosine(u1v,u2v)
    count+=1
    


In [None]:
pairwise_sim_score_pop/count

In [None]:
users=test.user.unique().tolist()

In [None]:
itemmat=algo.item_features_

In [None]:
users

In [None]:
predicts=pd.DataFrame(columns=['user','item','score','rank'])

In [None]:
pop_predicts=pd.DataFrame(columns=['user','item','score','rank'])

# Scoring the items here

The following function is used to find candidate items for our user, score the items, and rank them by the score, and return the top-k recommendations. 

In [None]:

def recommend_for_users(algo,users,user_index,user_matrix,predicts,train,k):
    """Score unrated candidate items for each user and add the top-k to ``predicts``.

    Besides its arguments, this function reads the notebook-level globals
    ``movie_data``, ``item_biases``, ``user_biases_train`` and ``gbias``; all of
    them must be defined before it is called.

    Args:
        algo: fitted model providing ``lookup_items`` and ``item_features_``.
        users: iterable of user ids to recommend for.
        user_index(pandas.Index): maps a user id to its row in ``user_matrix``.
        user_matrix: array of user feature vectors, one row per user.
        predicts(pandas.DataFrame): frame the new recommendations are added to.
        train(pandas.DataFrame): (user, item, rating) triples; supplies the item
            universe and the items each user has already rated.
        k(int): number of recommendations kept per user.

    Returns:
        pandas.DataFrame: ``predicts`` followed by the top-k rows
        (item, rank, score, user) of every user.
    """
    # Loop-invariant work: the candidate universe is every training item except
    # those with popularity below 10.
    unpopular=set(movie_data.loc[movie_data["popularity"]<10]['item'].values)
    eligible=set(train['item'].unique())-unpopular

    # Collect the rated items of the requested users in one pass instead of
    # scanning the whole training frame once per user.
    users=list(users)
    rated_rows=train.loc[train['user'].isin(users)]
    rated_by_user={user:set(items) for user,items in rated_rows.groupby('user')['item']}

    frames=[]
    for user in users:
        uvfull=user_matrix[user_index.get_loc(user)]
        candidates=list(eligible-rated_by_user.get(user,set()))
        iix=algo.lookup_items(candidates)
        score=np.matmul(algo.item_features_[iix],uvfull)
        # Add the biases back so the score is on the original rating scale.
        # NOTE(review): the training user bias is used for every profile, including
        # the folded-in ones; it is constant per user, so ranking is unaffected.
        score=score+item_biases.loc[candidates]+user_biases_train[user]+gbias
        scores=pd.DataFrame({"item":candidates,"score":score})
        scores['user']=user
        scores=scores.sort_values('score',ascending=False).head(k)
        scores=scores.assign(rank=scores['score'].rank(ascending=0))
        frames.append(scores)

    # One concat at the end replaces the repeated DataFrame.append() calls.
    return pd.concat([predicts]+frames,sort=True)

Here, we generate recommendations for the two sets of profiles to compare the recommendations that the full profile and the popular only profile receives.

In [None]:
predicts=recommend_for_users(algo,users,algo.user_index_,algo.user_features_,predicts,train,20)

In [None]:
# Start from the empty `pop_predicts` frame. Passing `predicts` here would copy
# every full-profile recommendation into the popular-only results.
pop_predicts=recommend_for_users(algo,users,popuindex,poponlyumat,pop_predicts,train,20)

In [None]:
all_users=set(train.user.unique())

In [None]:
other=all_users-set(users)

In [None]:
# `other` is a set; random.sample() needs a sequence (sets raise TypeError on
# Python >= 3.11), so sample from a sorted list instead.
other_users=random.sample(sorted(other),1000)

In [None]:
ou_list=list(other_users)

In [None]:
other_predicts=pd.DataFrame(columns=['user','item','score','rank'])

In [None]:
other_predicts=recommend_for_users(algo,ou_list,algo.user_index_,algo.user_features_,other_predicts,train,20)

In [None]:
# Average overlap between the recommendation lists of random pairs of test users.
overlap = 0
for _ in range(10000):
    first_user, second_user = random.sample(users, 2)
    first_recs = set(predicts.item.loc[predicts.user == first_user])
    second_recs = set(predicts.item.loc[predicts.user == second_user])
    overlap += len(first_recs & second_recs)

# Only divides by 1000 here; the following cell divides by a further 10, which
# together gives the mean over the 10000 draws.
overlap = overlap / 1000

In [None]:
overlap/10

In [None]:
overlap=0# calculates the average overlap between recs generated for users outside of the test set.
for _ in range(10000):
    u=random.sample(other_users,2)
    rec1=set(other_predicts.item.loc[other_predicts.user == u[0]])
    rec2=set(other_predicts.item.loc[other_predicts.user == u[1]])
    overlap += len(rec1.intersection(rec2))

# Average once, after the loop. Dividing inside the loop rescaled the running
# sum on every iteration and did not produce a mean.
overlap=overlap/10000

In [None]:
overlap

In [None]:
# `all_users` is a set; random.sample() needs a sequence (sets raise TypeError on
# Python >= 3.11), so sample from a sorted list instead.
test_users=random.sample(sorted(all_users),100)

In [None]:
# For every test user, find the highest cosine similarity to any other user in
# the system (floored at 0, as the running maximum used to start from 0). This
# provides some insight on the similarity scores we observe between the two
# profiles of a user.
#
# Assumes the rows of `regularumat` are exactly the users in `all_users`; both
# are derived from `train`. Normalising the rows once turns each user's cosine
# similarities into a single matrix-vector product instead of one scipy call
# per pair of users.
row_norms=np.linalg.norm(regularumat,axis=1,keepdims=True)
with np.errstate(divide='ignore',invalid='ignore'):
    unit_umat=regularumat/row_norms  # zero-length rows become NaN

maxsimscores=[]
for u1 in users:
    u1ix=algo.user_index_.get_loc(u1)
    sims=unit_umat@unit_umat[u1ix]
    sims[u1ix]=0.0            # never compare a user with themselves
    sims[np.isnan(sims)]=0.0  # undefined similarities never win the maximum
    maxsimscores.append(max(0.0,float(sims.max())))
            

In [None]:
# Average over however many users were scored, rather than a hard-coded 1000.
sum(maxsimscores)/len(maxsimscores)

In [None]:
predicts.to_csv("ALS25FT20RecsReg01fwithrank.tsv",sep="\t",index=None)

In [None]:
pop_predicts.to_csv("ALS25FT20RecsPopProfileReg01fwithrank.tsv",sep="\t",index=None)

In [None]:
movie_data=pd.read_csv("/project/naray190/movie_data_20M.csv",sep=",")

In [None]:
movie_data.columns=['item','popularity','avgRating']

In [None]:
user_data=pd.read_csv("/project/naray190/user_data_20M.csv",sep=",")

In [None]:
user_data

In [None]:
user_data.columns=['user','count','popcount','unpopcount','percentpop']

Here, we look at the impressions (the number of times a movie is recommended overall) of the movies recommended to the test set of users.

In [None]:
movie_impressions=pd.DataFrame(columns=['item','impression'])

In [None]:
# Impressions = number of rows in which each movie appears in the full-profile
# recommendations. groupby(sort=False) keeps the movies in order of first
# appearance. This replaces a row-by-row loop that used chained assignment on a
# frame whose index labels were all 0.
movie_impressions=(
    predicts.groupby('item',sort=False)
    .size()
    .reset_index(name='impression')
)
            

In [None]:
pop_movie_impressions=pd.DataFrame(columns=['item','impression'])

In [None]:
# Impressions = number of rows in which each movie appears in the popular-only
# profile recommendations. groupby(sort=False) keeps the movies in order of
# first appearance. This replaces a row-by-row loop that used chained
# assignment on a frame whose index labels were all 0.
pop_movie_impressions=(
    pop_predicts.groupby('item',sort=False)
    .size()
    .reset_index(name='impression')
)
    

In [None]:
mov_data=pd.merge(movie_impressions,movie_data,on=['item'])

In [None]:
pop_mov_data=pd.merge(pop_movie_impressions,movie_data,on=['item'])

In [None]:
mov_data.loc[mov_data.popularity<20].sort_values(by="impression",ascending=False)

In [None]:
pop_mov_data.loc[pop_mov_data.popularity<20].sort_values(by="popularity",ascending=True)

In [None]:
mov_data.to_csv("movie_recommendation_impressions_ALS25Freg01filtered.csv",sep="\t",index=None)
pop_mov_data.to_csv("movie_pop_recommendation_impressions_ALS25Freg01filtered.csv",sep="\t",index=None)

In [None]:
pred_movies=set(predicts['item'].unique())
pop_pred_movies=set(pop_predicts['item'].unique())

Calculating the overlap between the recommendations received by a test user's full and popular only profile.

In [None]:
user_overlap=pd.DataFrame(columns=["user","overlap"])

In [None]:

# Number of items shared by a user's full-profile and popular-only
# recommendation lists. Rows are collected first and the frame is built once,
# instead of calling DataFrame.append() for every user.
overlap_rows=[]
for user in users:
    frecs=set(predicts.item.loc[predicts.user == user])
    precs=set(pop_predicts.item.loc[pop_predicts.user == user])
    overlap_rows.append({"user":user,"overlap":len(frecs.intersection(precs))})
user_overlap=pd.DataFrame(overlap_rows,columns=["user","overlap"])

In [None]:
# Ids of the movies treated as popular: those with popularity above 1450.
# NOTE(review): the original note says this threshold extracts the top 2500
# movies in the system -- confirm against movie_data.
popIds=set(movie_data.item.loc[movie_data.popularity>1450])

Here, we generate a summary of the recommendation the test users receive. This includes how many of the movies are popular vs unpopular and the average score of the movies and the average rating of the movies being recommended. 

In [None]:
user_full_recs_summary=pd.DataFrame(columns=["user","popcount","unpopcount","avgscore","avgrating"])

In [None]:
# Per-user summary of the full-profile recommendations: how many recommended
# movies are popular vs. unpopular, the mean predicted score, and the mean
# average rating of the recommended movies.
full_summary_rows=[]
for user in users:
    recs=predicts.loc[predicts.user == user]
    recset=set(recs.item)
    recpop=int(recs.item.isin(popIds).sum())
    # Count against the rows actually present rather than assuming 20 of them.
    recunpop=len(recs)-recpop
    recavgscore=recs.score.mean()
    recavgrating=movie_data.avgRating.loc[movie_data.item.isin(recset)].mean()
    full_summary_rows.append([user,recpop,recunpop,recavgscore,recavgrating])

# Build the frame once instead of calling DataFrame.append() for every user.
user_full_recs_summary=pd.DataFrame(full_summary_rows,columns=["user","popcount","unpopcount","avgscore","avgrating"])
    

In [None]:
user_full_recs_summary.to_csv("ALS25Freg01recsummaryfull.csv",sep=",",header=True,index=None)

In [None]:
user_pop_recs_summary=pd.DataFrame(columns=["user","popcount","unpopcount","avgscore","avgrating"])

In [None]:
# Per-user summary of the popular-only profile recommendations: how many
# recommended movies are popular vs. unpopular, the mean predicted score, and
# the mean average rating of the recommended movies.
pop_summary_rows=[]
for user in users:
    recs=pop_predicts.loc[pop_predicts.user == user]
    recset=set(recs.item)
    recpop=int(recs.item.isin(popIds).sum())
    # Count against the rows actually present rather than assuming 20 of them.
    recunpop=len(recs)-recpop
    recavgscore=recs.score.mean()
    recavgrating=movie_data.avgRating.loc[movie_data.item.isin(recset)].mean()
    pop_summary_rows.append([user,recpop,recunpop,recavgscore,recavgrating])

# Build the frame once instead of calling DataFrame.append() for every user.
user_pop_recs_summary=pd.DataFrame(pop_summary_rows,columns=["user","popcount","unpopcount","avgscore","avgrating"])
    

In [None]:
user_pop_recs_summary.to_csv("ALS25Freg01recssummarypop.csv",sep=",",header=True,index=None)

Here, we calculate the diversity of each user's recommendations. We do this by looking at how far apart the item feature vectors of the items recommended to a user are.

In [None]:
user_diversity=pd.DataFrame(columns=['user','diversity'])

In [None]:
# Diversity of a user's full-profile recommendations: the mean cosine distance
# between the item feature vectors of consecutive recommended items.
diversity_rows=[]
for user in users:
    rec_items=list(predicts.item.loc[predicts.user == user])
    # Pair each item with its successor by position. Looking positions up with
    # list.index() returns the first occurrence and mis-pairs repeated items.
    distances=[
        spatial.distance.cosine(
            algo.item_features_[algo.item_index_.get_loc(item)],
            algo.item_features_[algo.item_index_.get_loc(next_item)],
        )
        for item,next_item in zip(rec_items,rec_items[1:])
    ]
    # Fewer than two recommendations leaves no pair to measure.
    diversity=sum(distances)/len(distances) if distances else np.nan
    diversity_rows.append({'user':user,'diversity':diversity})

# Build the frame once instead of calling DataFrame.append() for every user.
user_diversity=pd.DataFrame(diversity_rows,columns=['user','diversity'])
                  
    

In [None]:
user_diversity_pop=pd.DataFrame(columns=['user','diversity'])

In [None]:
# Diversity of a user's popular-only profile recommendations: the mean cosine
# distance between the item feature vectors of consecutive recommended items.
diversity_pop_rows=[]
for user in users:
    rec_items=list(pop_predicts.item.loc[pop_predicts.user == user])
    # Pair each item with its successor by position. Looking positions up with
    # list.index() returns the first occurrence and mis-pairs repeated items.
    distances=[
        spatial.distance.cosine(
            algo.item_features_[algo.item_index_.get_loc(item)],
            algo.item_features_[algo.item_index_.get_loc(next_item)],
        )
        for item,next_item in zip(rec_items,rec_items[1:])
    ]
    # Fewer than two recommendations leaves no pair to measure.
    diversity=sum(distances)/len(distances) if distances else np.nan
    diversity_pop_rows.append({'user':user,'diversity':diversity})

# Build the frame once instead of calling DataFrame.append() for every user.
user_diversity_pop=pd.DataFrame(diversity_pop_rows,columns=['user','diversity'])
                  
    

The next two cells calculate the average diversity of all the movies that are recommended overall.

In [None]:
# Mean cosine distance between consecutive movies in the list of distinct
# movies recommended to the full profiles.
count=0
diver=0
pred_movie_list=list(predicts.item.unique())
# zip() over the list and its one-step shift yields the consecutive pairs
# without an O(n) list.index() search on every iteration.
for movie,next_movie in zip(pred_movie_list,pred_movie_list[1:]):
    iu=algo.item_features_[algo.item_index_.get_loc(movie)]
    iv=algo.item_features_[algo.item_index_.get_loc(next_movie)]
    diver+=spatial.distance.cosine(iu,iv)
    count+=1
# With fewer than two movies there is no pair to average over.
print(diver/count if count else float('nan'))
    
    

In [None]:
# Mean cosine distance between consecutive movies in the list of distinct
# movies recommended to the popular-only profiles.
count=0
diver=0
pred_movie_list=list(pop_predicts.item.unique())
# zip() over the list and its one-step shift yields the consecutive pairs
# without an O(n) list.index() search on every iteration.
for movie,next_movie in zip(pred_movie_list,pred_movie_list[1:]):
    iu=algo.item_features_[algo.item_index_.get_loc(movie)]
    iv=algo.item_features_[algo.item_index_.get_loc(next_movie)]
    diver+=spatial.distance.cosine(iu,iv)
    count+=1
# With fewer than two movies there is no pair to average over.
print(diver/count if count else float('nan'))
    
    

We now look at how personalized the recommendations are. We take the top 20 most popular movies (which is what the most unpersonalized recommender would recommend) and look at the overlap between this top-20 list and the top-20 list recommended to a user.

In [None]:
top_movies=movie_data.sort_values('popularity',ascending=False)
top_movies=set(top_movies.head(20).item)

In [None]:
top_movies

In [None]:
personalization_recs=pd.DataFrame(columns=['user','full','popularonly'])

In [None]:
# For each user, count how many of the 20 most popular movies appear in the
# full-profile and in the popular-only profile recommendations.
personalization_rows=[]
for user in users:
    movlistf=set(predicts.loc[predicts['user']==user]['item'])
    movlistp=set(pop_predicts.loc[pop_predicts['user']==user]['item'])
    personalization_rows.append({
        'user':user,
        'full':len(movlistf.intersection(top_movies)),
        'popularonly':len(movlistp.intersection(top_movies)),
    })

# Build the frame once instead of calling DataFrame.append() for every user.
personalization_recs=pd.DataFrame(personalization_rows,columns=['user','full','popularonly'])
    

In [None]:
personalization_recs.to_csv('ALS25Fr01filteredfvsp.csv',sep=",",header=True,index=None)

In [None]:
user_diversity_pop.to_csv("ALS25FReg01user_diversity_pop.csv",sep=",",header=True,index=None)

In [None]:
user_diversity.to_csv("ALS25FReg01user_diversity.csv",sep=",",header=True,index=None)

In [None]:
user_overlap.to_csv("ALS25Freg01filteredoverlap.csv",sep=",",header=True,index=None)

In [None]:
# Write the full-profile recommendations as a tab-separated file.
# NOTE(review): the file name says "30F" and "folding_in", but the model in this
# notebook is built with features=25 and the frame written is `predicts`, not
# `pop_predicts` -- confirm the intended file name and frame.
predicts.to_csv('ALS_30F_recs_folding_in.tsv',sep='\t',header=True,index=None)

In [None]:
user_simscore.to_csv('user_simscore_folded_in_ALS_25reg01filtered.tsv',sep='\t',header=True,index=None)