In [1]:
import pandas as pd
import numpy as np
import random

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel
from scipy import sparse

In [2]:
df = pd.read_csv('../100_processed0.csv')
df.head()

Unnamed: 0,username,review,movie,rating
0,Imme-van-Gorp,movie full suspense makes guess real happens w...,10 Cloverfield Lane,7
1,sonofocelot-1,leave review fairly concise film originally ca...,10 Cloverfield Lane,5
2,mhodaee,give credit owe original writers fascinating ...,10 Cloverfield Lane,5
3,fil-nik09,first must say expecting something different c...,10 Cloverfield Lane,5
4,DVR_Brale,ive always loved movies strong atmosphere thre...,10 Cloverfield Lane,7


## remove null records

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 106011 entries, 0 to 106010
Data columns (total 4 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   username  106011 non-null  object
 1   review    106008 non-null  object
 2   movie     106011 non-null  object
 3   rating    106011 non-null  object
dtypes: object(4)
memory usage: 3.2+ MB


In [4]:
df = df.drop(df[df.rating == 'Null'].index).reset_index(drop = True)
df.rating = df['rating'].astype(int)
df = df.dropna()
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 96108 entries, 0 to 96110
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   username  96108 non-null  object
 1   review    96108 non-null  object
 2   movie     96108 non-null  object
 3   rating    96108 non-null  int32 
dtypes: int32(1), object(3)
memory usage: 3.3+ MB


## TF-IDF Vector Matrix

In [5]:
%%time
tfidf_vect = TfidfVectorizer(max_features = 2000)
tfidf = tfidf_vect.fit_transform(df.review)
tfidf 

Wall time: 7.42 s


<96108x2000 sparse matrix of type '<class 'numpy.float64'>'
	with 5750361 stored elements in Compressed Sparse Row format>

## similar users function

In [127]:
def similar_users(tgt_usr,tgt_mov,th = 0.5):
    """
    Function to find similar users from a given movie rating of a user,
    returns upto 10 similar users
    """
    # get review id
    review_id = df[(df.username == tgt_usr) & (df.movie == tgt_mov)].index[0]
    # calculate cosine similarity between that review and all other
    cos_sim = linear_kernel(tfidf[review_id:review_id+1], tfidf).flatten()
    # Indices of Similar (Users) reviews
    indexes = cos_sim.argsort()[::-1] 
    #print('Similar Users of [',tgt_usr,'] with ',th,' threshold:')
    sim_users = []
    for i in indexes[1:]:
        if(cos_sim[i]>th):
            user = df.iloc[i].tolist()[0]
#             print(user,'-----------',round(cos_sim[i]*100,2),'%')
            sim_users.append(user)
    if len(sim_users) > 0:
        return [user for user in sim_users if user!=tgt_usr][:10]
    else:
        #print(' None')
        return similar_users(tgt_usr,tgt_mov,th=round(th-0.05,2))

In [128]:
def similar_users2(tgt_usr,tgt_mov):
    """
    Function to find best 15 similar users irrespective of threshold
    """
    review_id = df[(df.username == tgt_usr) & (df.movie == tgt_mov)].index[0]
    cos_sim = linear_kernel(tfidf[review_id:review_id+1], tfidf).flatten()
    sim_userids = cos_sim.argsort()[:-16:-2]

    sim_users = []
    for i in sim_userids:
        #print(reverse_user_dict[i])
        sim_users.append(df.iloc[i].tolist()[0])
#         print(df.iloc[i].tolist()[0],'-----------',round(cos_sim[i]*100,2),'%')
        
    return sim_users[1:]

In [45]:
df.head()

Unnamed: 0,username,review,movie,rating
0,Imme-van-Gorp,movie full suspense makes guess real happens w...,10 Cloverfield Lane,7
1,sonofocelot-1,leave review fairly concise film originally ca...,10 Cloverfield Lane,5
2,mhodaee,give credit owe original writers fascinating ...,10 Cloverfield Lane,5
3,fil-nik09,first must say expecting something different c...,10 Cloverfield Lane,5
4,DVR_Brale,ive always loved movies strong atmosphere thre...,10 Cloverfield Lane,7


In [46]:
similar_users('Imme-van-Gorp','10 Cloverfield Lane')

andrisca ----------- 34.31 %
mchaney-72388 ----------- 33.63 %
neisesjason ----------- 33.4 %
DKosty123 ----------- 33.03 %
BigYeehaw ----------- 32.24 %
shawnman-56268 ----------- 32.02 %


['andrisca',
 'mchaney-72388',
 'neisesjason',
 'DKosty123',
 'BigYeehaw',
 'shawnman-56268']

In [122]:
similar_users2('Imme-van-Gorp','10 Cloverfield Lane')

Imme-van-Gorp ----------- 100.0 %
mchaney-72388 ----------- 33.63 %
DKosty123 ----------- 33.03 %
shawnman-56268 ----------- 32.02 %
BenMilliron ----------- 28.3 %
margaridamgrusso ----------- 28.07 %
doomsday-99 ----------- 27.9 %
bobdobalina-18772 ----------- 27.7 %


['mchaney-72388',
 'DKosty123',
 'shawnman-56268',
 'BenMilliron',
 'margaridamgrusso',
 'doomsday-99',
 'bobdobalina-18772']

In [123]:
similar_users('ssmp_btc','Batman v Superman_ Dawn of Justice')

quinnquintillus ----------- 48.02 %
TheFirstExecutioner ----------- 47.84 %
brando647 ----------- 47.13 %
Johnny-the-Film-Sentinel-2187 ----------- 46.95 %
schroeder-gustavo ----------- 46.75 %
sikandarv-60898 ----------- 45.32 %
talarisw ----------- 45.28 %
siksokori ----------- 45.18 %


['quinnquintillus',
 'TheFirstExecutioner',
 'brando647',
 'Johnny-the-Film-Sentinel-2187',
 'schroeder-gustavo',
 'sikandarv-60898',
 'talarisw',
 'siksokori']

In [124]:
similar_users2('ssmp_btc','Batman v Superman_ Dawn of Justice')

dimitri-dhaese ----------- 100.0 %
TheFirstExecutioner ----------- 47.84 %
Johnny-the-Film-Sentinel-2187 ----------- 46.95 %
sikandarv-60898 ----------- 45.32 %
siksokori ----------- 45.18 %
moriander ----------- 44.31 %
asrivarde ----------- 43.02 %
anjali-mohindra ----------- 42.9 %


['TheFirstExecutioner',
 'Johnny-the-Film-Sentinel-2187',
 'sikandarv-60898',
 'siksokori',
 'moriander',
 'asrivarde',
 'anjali-mohindra']

In [106]:
r1 = df[df.username == 'ssmp_btc']
r1

Unnamed: 0,username,review,movie,rating
96106,ssmp_btc,finally childhood dream came true matter much ...,Batman v Superman_ Dawn of Justice,8


In [107]:
r1.at[r1.index[0],'review']

'finally childhood dream came true matter much love spider x animated series dc superheros best past time cartoon seriesthe quality respective animated series always kept excited live action adaptation addition nostalgic memories dark knight trilogy involvement christopher nolan made hopeful justice league zack snider ruined meno matter critics told like steel bvs well expectationhowever say bad fantastic four definitely good avenger age ultron mediocre successor brilliant avenger movie batman superman scene either fighting interaction give goosebumps afleck gal gadot looked perfect roles much likable characterbut superman story arc meh lois lane supermanlex luther believable amy adams character lois lane times either smarter batman dumber loboshe acted painfully bad henry cavil present lex luther acting like sheldon cooper however may go character change next movie 1st hour like initial introduction every stage video game casual viewers wondering whats happening important plot lines l

In [108]:
r2 = df[df.username == 'dimitri-dhaese']
r2

Unnamed: 0,username,review,movie,rating
96109,dimitri-dhaese,god wanted love movie needed love movie hope g...,Batman v Superman_ Dawn of Justice,7


In [111]:
r2.at[r2.index[0],'review']

'god wanted love movie needed love movie hope gets better second viewing dont think mess big incoherent mess theres really good things going nothing save movie affleck batman right money jeremy irons alfred theyre working script couldnt save tree gal gadots wonder woman looks part setup standalone movie pretty obvious script mess theres explanation things heroes theres real macguffin future justice league movie carrot movie dumb donkey chasing obvious zack snyder doesnt know superman look ive superman collector  years trust love character love movie whatsoever dont understand motivation film think reason hes going batman clark well superman makes big hypocrite add time hes saving people expression similar daughters tell clean room didnt like superman movie love loïs clark stuff course special effects charts needed much movie tries way much fails miserably placeholder movie real icing justice league movie coming  years dont know still edit second viewing saw second time today liked time

## user_movie matrix

In [115]:
%%time

# Creating User Movie Matrix
user_movie = pd.DataFrame(df.username.unique(), columns = ['user']).set_index('user')
# Create Columns
for movie in df.movie.unique():
    user_movie[movie] = None
# Mapping the ratings
for i,user in enumerate(user_movie.index):
    this_user_ratings = df[df.username == user]
    for row in range(this_user_ratings.shape[0]):
        movie_name = this_user_ratings.movie.iloc[row]
        rating = this_user_ratings.rating.iloc[row]
        #user_movie.loc[user][] =  
        user_movie[movie_name][i] = rating
user_movie

Wall time: 6min 21s


Unnamed: 0_level_0,10 Cloverfield Lane,10 Things I Hate About You,12 Angry Men,12 Monkeys,12 Strong,12 Years a Slave,127 Hours,13 Hours,1408,1917,...,Back to the Future,Back to the Future Part II,Back to the Future Part III,Bad Teacher,Balls of Fury,Barry Lyndon,BASEketball,Batman Begins,Batman Returns,Batman v Superman_ Dawn of Justice
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Imme-van-Gorp,7,,,,,,,,,,...,,,,,,,,,,
sonofocelot-1,5,,,,,,,,,,...,,,,,,,,,,
mhodaee,5,,,,,,,,,,...,,,,,,,,,,
fil-nik09,5,,,,,,,,,,...,,,,,,,,,,7
DVR_Brale,7,,10,9,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
kristian_tolima,,,,,,,,,,,...,,,,,,,,,,7
ssmp_btc,,,,,,,,,,,...,,,,,,,,,,8
dkemathas,,,,,,,,,,,...,,,,,,,,,,7
dimitri-dhaese,,,,,,,,,,,...,,,,,,,,,,7


## recommend function

In [23]:
# defining recommender function
def recommend(target, sim_users, user_mov):
    user_movie = user_mov.replace(np.nan,0)
    # get similar users ratings for all movies / subsetting similar users from user_movie matrix
    similar_users = user_movie[user_movie.index.isin(sim_users)]
    #print(similar_users)
    # get average review given by these similar users to all movies
    similar_users = similar_users.mean(axis=0)
    #print(similar_users)
    similar_users_df = pd.DataFrame(similar_users, columns=['mean'])
    user_df = user_movie[user_movie.index == target]
    user_df_transposed = user_df.transpose()
    user_df_transposed.columns = ['rating']
    # select movies which are not watched by the user
    user_df_transposed = user_df_transposed[user_df_transposed['rating']==0]
    movies_unseen = user_df_transposed.index.tolist()
    # get average rating from similar users of the unseen movies by target user
    similar_users_df_filtered = similar_users_df[similar_users_df.index.isin(movies_unseen)]
    similar_users_df_ordered = similar_users_df_filtered.sort_values(by=['mean'], ascending=False)
    #print(similar_users_df_ordered)
    return similar_users_df_ordered[similar_users_df_ordered['mean'] > 0].index.tolist()[:10]
    

In [24]:
# Use cases
# 'nishitsingh' -> watched Avengers_ Endgame
# 'doeszdude' -> watched Avengers_ Endgame
# 'dude_14192' -> watched Avengers_ Endgame
# 'Sparky-string' -> watched Avengers_ Endgame
# 'maxshelley' -> watched Angust Rush
# 'nadiamica' -> watched Batman vs Superman

## random recommender function

In [152]:
def random_rec():
    """
    random recommender to check recommendations
    """
    all_users = df.username.unique().tolist()
    tgt_usr = random.choice(all_users)
    mov_of_usr = df.movie[df.username == tgt_usr].tolist()
    tgt_mov = random.choice(mov_of_usr)
    print('Movies watched by [',tgt_usr,']:\n**',df.movie[df.username == tgt_usr].tolist())
    print('input user  =',tgt_usr)
    print('input movie =',tgt_mov)
    print("\nRecommendations")
    return recommend(tgt_usr, similar_users(tgt_usr, tgt_mov), user_movie)

In [164]:
random_rec()

Movies watched by [ troubledyouth66 ]:
** ['2012', 'Avatar']
input user  = troubledyouth66
input movie = Avatar

Recommendations


['2001_ A Space Odyssey',
 'Arrival',
 '28 Days Later...',
 'Avengers_ Age of Ultron',
 'Alien_ Covenant']

In [None]:
# using second similar users function.

In [150]:
def random_rec2():
    """
    random recommender which uses similar_users2 function
    """
    all_users = df.username.unique().tolist()
    tgt_usr = random.choice(all_users)
    mov_of_usr = df.movie[df.username == tgt_usr].tolist()
    tgt_mov = random.choice(mov_of_usr)
    print('Movies watched by [',tgt_usr,']:\n**',df.movie[df.username == tgt_usr].tolist())
    print('input user  =',tgt_usr)
    print('input movie =',tgt_mov)
    print("\nRecommendations")
    return recommend(tgt_usr, similar_users2(tgt_usr, tgt_mov), user_movie)

In [174]:
random_rec2()

Movies watched by [ kirbylee70-599-526179 ]:
** ['12 Monkeys', 'Angels & Demons', 'Aquaman', 'Avengers_ Endgame', 'Avengers_ Infinity War']
input user  = kirbylee70-599-526179
input movie = Aquaman

Recommendations


['Avengers_ Age of Ultron',
 'Arrival',
 'A Star Is Born',
 'Ant-Man',
 '12 Years a Slave',
 '1917',
 'Batman v Superman_ Dawn of Justice',
 '13 Hours',
 'American Made',
 'American Sniper']