In [1]:
# !pip install lightfm

Collecting lightfm
  Downloading lightfm-1.15.tar.gz (302 kB)
Building wheels for collected packages: lightfm
  Building wheel for lightfm (setup.py): started
  Building wheel for lightfm (setup.py): finished with status 'done'
  Created wheel for lightfm: filename=lightfm-1.15-cp37-cp37m-win_amd64.whl size=418663 sha256=5e299088364b556b8658d2597f869668ccde3569ef55ab9d4ed50ad7dc94692f
  Stored in directory: c:\users\wolf\appdata\local\pip\cache\wheels\f0\cd\a5\b07914aa223c05ed61880d4c59f64a7febf117dbd2c2cbcf49
Successfully built lightfm
Installing collected packages: lightfm
Successfully installed lightfm-1.15


In [2]:
import numpy as np
import pandas as pd
from lightfm.datasets import fetch_movielens
from lightfm import LightFM



In [3]:
with open('movies.csv', 'r') as movies_file:
    for i in range(5):
        print(movies_file.readline())

movieId,title,genres

1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy

2,Jumanji (1995),Adventure|Children|Fantasy

3,Grumpier Old Men (1995),Comedy|Romance

4,Waiting to Exhale (1995),Comedy|Drama|Romance



In [4]:
data_movies = pd.read_csv('movies.csv')
data_ratings = pd.read_csv('ratings.csv')

In [5]:
# Search duplicates

data_movies.title.value_counts()

War of the Worlds (2005)             2
Men with Guns (1997)                 2
River's Edge (1986)                  1
Talaash (2012)                       1
Boondock Saints, The (2000)          1
                                    ..
99 francs (2007)                     1
We're No Angels (1955)               1
Krull (1983)                         1
Since You Went Away (1944)           1
Children of the Revolution (1996)    1
Name: title, Length: 10327, dtype: int64

In [6]:
data_movies[data_movies.title == 'Men with Guns (1997)']

Unnamed: 0,movieId,title,genres
1403,1788,Men with Guns (1997),Action|Drama
6270,26982,Men with Guns (1997),Drama


In [7]:
data_movies[data_movies.title == 'War of the Worlds (2005)']

Unnamed: 0,movieId,title,genres
6662,34048,War of the Worlds (2005),Action|Adventure|Sci-Fi|Thriller
7963,64997,War of the Worlds (2005),Action|Sci-Fi


In [8]:
# Replace duplicated id

indexes = data_ratings[data_ratings.movieId == 64997].index
data_ratings.loc[indexes, 'movieId'] = 34048

indexes = data_ratings[data_ratings.movieId == 26982].index
data_ratings.loc[indexes, 'movieId'] = 1788

In [9]:
# Drop duplicates, only two samples

drop_indexes = data_movies[data_movies['movieId'] == 64997].index
data_movies.drop(drop_indexes, inplace=True)

drop_indexes = data_movies[data_movies['movieId'] == 26982].index
data_movies.drop(drop_indexes, inplace=True)

In [10]:
def get_score(scores, threshold, mean_total_score):
    
    """
    Formula for movie scoring
        (V / V+M)*R + (M / V+M)*C
    V - кол-во голосов за фильм
    M - порог голосов
    R - среднее арифметическое всех голосов за фильм
    С - средний рейтинг всех фильмов
    """
    
    num_votes = len(scores)
    mean_movie_score = np.mean(scores)
    
    movie_score = (
        (num_votes / (num_votes + threshold)) * mean_movie_score +
        (threshold / (num_votes + threshold)) * mean_total_score)
    
    return movie_score

In [11]:
threshold = 3.0
mean_total_score = data_ratings.rating.mean()

movie_scores = []

for index, row in data_movies.iterrows():
    movie_id = row['movieId']
    scores = (data_ratings[data_ratings['movieId'] == movie_id]
                .rating.to_list())
    movie_score = get_score(scores, threshold, mean_total_score)
    movie_scores.append(movie_score)

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


In [12]:
movie_scores = pd.Series(movie_scores)
movie_scores.name = 'score'
movie_scores

0        3.902343
1        3.358427
2        3.205747
3        2.967897
4        3.262316
           ...   
10322    3.637638
10323    3.262638
10324    3.387638
10325    3.925092
10326    3.637638
Name: score, Length: 10327, dtype: float64

In [13]:
# Movies without scores

movie_scores.isna().sum()

4

In [14]:
mean_movie_score = movie_scores.mean()

In [15]:
data_movies.shape

(10327, 3)

In [16]:
# Fill the missing by mean

data_movies = (data_movies.join(movie_scores)
               .fillna({'score': mean_movie_score}))

In [17]:
data_movies.score.isna().sum()

0

In [309]:
data_movies

Unnamed: 0,movieId,title,genres,score
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3.902343
1,2,Jumanji (1995),Adventure|Children|Fantasy,3.358427
2,3,Grumpier Old Men (1995),Comedy|Romance,3.205747
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,2.967897
4,5,Father of the Bride Part II (1995),Comedy,3.262316
...,...,...,...,...
10324,146684,Cosmic Scrat-tastrophe (2015),Animation|Children|Comedy,3.387638
10325,146878,Le Grand Restaurant (1966),Comedy,3.925092
10326,148238,A Very Murray Christmas (2015),Comedy,3.637638
10327,148626,The Big Short (2015),Drama,3.391152


In [310]:
data_ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,16,4.0,1217897793
1,1,24,1.5,1217895807
2,1,32,4.0,1217896246
3,1,47,4.0,1217896556
4,1,50,4.0,1217896523
...,...,...,...,...
105334,668,142488,4.0,1451535844
105335,668,142507,3.5,1451535889
105336,668,143385,4.0,1446388585
105337,668,144976,2.5,1448656898


In [30]:
import pandas as pd
import numpy as np
from scipy import sparse
from lightfm import LightFM
from sklearn.metrics.pairwise import cosine_similarity


def get_interaction_matrix(df, users, items, ratings, 
                              normalize=False, threshold=None):   
    interaction_matrix = (df.groupby([users, items])[ratings]
                        .sum().unstack().reset_index()
                        .fillna(0).set_index(users))
    if normalize:
        interaction_matrix = (interaction_matrix.applymap(
            lambda x: 1 if x > threshold else 0))
    return interaction_matrix



In [308]:
interactions = get_interaction_matrix(df=data_ratings, users='userId', 
    items='movieId', ratings='rating', threshold=threshold)

# interactions.shape

NameError: name 'get_interaction_matrix' is not defined

In [32]:
interactions

movieId,1,2,3,4,5,6,7,8,9,10,...,144482,144656,144976,146344,146656,146684,146878,148238,148626,149532
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,5.0,0.0,2.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,3.0,0.0,3.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
664,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
665,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
666,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [89]:
# def get_user_dict

user_dict = (data_ratings.groupby('userId').count()
             .reset_index()[['userId']])
user_dict

Unnamed: 0,userId
0,1
1,2
2,3
3,4
4,5
...,...
663,664
664,665
665,666
666,667


In [159]:
movies_dict = (data_movies.groupby(['movieId', 'title'])
             .sum()
             .reset_index()[['movieId', 'title']]
             .set_index('movieId'))

In [94]:
# def run_predictor(interactions, n_components=30, loss='warp', k=15, epoch=30,n_jobs=-1):
#     x = sparse.csr_matrix(interactions.values)
#     model = LightFM(no_components= n_components, loss=loss,k=k)
#     model.fit(x,epochs=epoch,num_threads = n_jobs)
#     return model

# mf_model = runMF(interactions = interactions,
#                  n_components = 30,
#                  loss = 'warp',
#                  k =15,
#                  epoch = 30,
#                  n_jobs = 4)

x = sparse.csr_matrix(interactions.values)
model = LightFM(no_components=30, loss='warp', k=15)
model.fit(x, epochs=30, num_threads = 4)



<lightfm.lightfm.LightFM at 0x196004c5688>

In [95]:
model

<lightfm.lightfm.LightFM at 0x196004c5688>

In [193]:
# def sample_recommendation_user

# model, interactions, user_id, user_dict, 
# item_dict,threshold = 0,nrec_items = 10, show = True
user_id=10 
threshold=4
nrec_items=10


n_users, n_items = interactions.shape
user_x = user_dict.loc[user_id]

scores = pd.Series(model.predict(user_ids=user_x, 
                                 item_ids=np.arange(n_items)))
scores.index = interactions.columns

rated_movies = (interactions.loc[user_id, :]
                [interactions.loc[user_id, :] > 0]
                .sort_values(ascending=False))

recommend_ids = (scores[~(interactions.loc[user_id, :] > 0)]
                 .sort_values(ascending=False)
                 [:nrec_items])

# Show rated and recommended
(movies_dict.loc[rated_movies.index.to_list()], 
 movies_dict.loc[recommend_ids.index.to_list()])


(                                                     title
 movieId                                                   
 39                                         Clueless (1995)
 2081                            Little Mermaid, The (1989)
 7451                                     Mean Girls (2004)
 2572                     10 Things I Hate About You (1999)
 3362                              Dog Day Afternoon (1975)
 2528                                    Logan's Run (1976)
 2390                                   Little Voice (1998)
 253      Interview with the Vampire: The Vampire Chroni...
 356                                    Forrest Gump (1994)
 1093                                     Doors, The (1991)
 81591                                    Black Swan (2010)
 4880                                Life as a House (2001)
 3159                                  Fantasia 2000 (1999)
 1727                           Horse Whisperer, The (1998)
 3186                              Girl,

In [291]:
# def sample_recommendation_item

# model = mf_model,
# interactions = interactions,
item_id = 51
# user_dict = user_dict,
# item_dict = movies_dict,
number_of_user = 15

n_users, n_items = interactions.shape
x = np.array(interactions.columns)
scores = pd.Series(model.predict(
    np.arange(n_users), 
    np.repeat(x.searchsorted(item_id), n_users)))

similar_users = scores.sort_values(ascending=False)[:number_of_user].index.to_list()
similar_users

[378, 662, 444, 92, 584, 1, 456, 594, 570, 647, 224, 301, 95, 449, 265]

In [263]:
similarities = cosine_similarity(
    sparse.csr_matrix(model.item_embeddings))
item_emdedding_distance_matrix = pd.DataFrame(similarities)
item_emdedding_distance_matrix.columns = interactions.columns
item_emdedding_distance_matrix.index = interactions.columns

item_emdedding_distance_matrix

movieId,1,2,3,4,5,6,7,8,9,10,...,144482,144656,144976,146344,146656,146684,146878,148238,148626,149532
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,0.744691,0.370390,0.260378,0.511054,0.565728,0.568575,-0.004583,-0.231928,0.556536,...,-0.264735,-0.557663,-0.692606,-0.569276,-0.334060,-0.452378,-0.399604,-0.696777,-0.368320,-0.430646
2,0.744691,1.000000,0.508692,0.183482,0.558701,0.611110,0.644875,0.023530,-0.136803,0.801006,...,-0.128570,-0.321282,-0.725036,-0.352115,-0.143900,-0.454270,-0.475902,-0.497002,-0.300195,-0.167047
3,0.370390,0.508692,1.000000,0.305321,0.773694,0.347169,0.603282,0.253674,0.330179,0.384357,...,-0.141158,-0.183937,-0.297129,-0.193653,-0.224544,-0.158280,-0.171066,-0.247391,-0.158259,-0.023801
4,0.260378,0.183482,0.305321,1.000000,0.326979,0.314381,0.402603,0.517955,0.467044,0.389882,...,-0.117448,-0.194975,-0.232293,-0.095813,-0.094927,-0.104862,-0.008614,-0.140214,-0.237739,-0.050893
5,0.511054,0.558701,0.773694,0.326979,1.000000,0.396878,0.778763,0.367771,0.315620,0.443602,...,-0.161987,-0.234907,-0.299958,-0.275708,-0.242143,-0.219014,-0.289114,-0.316650,-0.176587,-0.093724
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
146684,-0.452378,-0.454270,-0.158280,-0.104862,-0.219014,-0.239735,-0.460850,0.112494,0.461578,-0.372449,...,0.231123,0.341799,0.528599,0.357504,0.456470,1.000000,0.889398,0.623812,0.362128,0.196460
146878,-0.399604,-0.475902,-0.171066,-0.008614,-0.289114,-0.363378,-0.504186,0.208007,0.480803,-0.408023,...,0.328680,0.467891,0.399366,0.435921,0.564292,0.889398,1.000000,0.673211,0.200698,0.340995
148238,-0.696777,-0.497002,-0.247391,-0.140214,-0.316650,-0.386523,-0.419051,0.093678,0.413629,-0.366654,...,0.274701,0.890128,0.560011,0.906857,0.696315,0.623812,0.673211,1.000000,0.239955,0.798903
148626,-0.368320,-0.300195,-0.158259,-0.237739,-0.176587,-0.202013,-0.137692,-0.043629,0.254198,-0.265670,...,-0.009581,0.102113,0.688411,0.099445,0.045669,0.362128,0.200698,0.239955,1.000000,-0.063986


In [300]:
# def item_item_recommendation


item_id = 49
item_dict = movies_dict.index.to_list()
n_items = 10

recommended_items = (item_emdedding_distance_matrix.loc[item_id, :]
 .sort_values(ascending = False)[1: n_items+1])

movies_dict.loc[recommended_items.index.to_list()]

Unnamed: 0_level_0,title
movieId,Unnamed: 1_level_1
467,Live Nude Girls (1995)
1548,"War at Home, The (1996)"
844,"Story of Xinghua, The (Xinghua san yue tian) (..."
200,"Tie That Binds, The (1995)"
988,Grace of My Heart (1996)
979,Nothing Personal (1995)
439,Dangerous Game (1993)
679,"Run of the Country, The (1995)"
1121,Glory Daze (1995)
2101,Squanto: A Warrior's Tale (1994)


In [307]:
# item_id
movies_dict.loc[item_id]

title    When Night Is Falling (1995)
Name: 49, dtype: object

In [58]:




# #print training and testing data
# print(repr(data['train']))
# print(repr(data['test']))

# #create model
# model = LightFM(loss = 'warp')

# #train mode
# model.fit(data['train'], epochs=30, num_threads=2)

# #recommender fucntion
# def sample_recommendation(model, data, user_ids):
#     #number of users and movies in training data
#     n_users, n_items = data['train'].shape
#     for user_id in user_ids:
#     	#movies they already like
#         known_positives = data['item_labels'][data['train'].tocsr()[user_id].indices]

#         #movies our model predicts they will like
#         scores = model.predict(user_id, np.arange(n_items))
#         #sort them in order of most liked to least
#         top_items = data['item_labels'][np.argsort(-scores)]
#         #print out the results
#         print("User %s" % user_id)
#         print("     Known positives:")

#         for x in known_positives[:3]:
#             print("        %s" % x)

#         print("     Recommended:")

#         for x in top_items[:3]:
#             print("        %s" % x)
            
# sample_recommendation(model, data, [3, 25, 451])
