In [1]:
# !pip install lightfm

Collecting lightfm
  Downloading lightfm-1.15.tar.gz (302 kB)
Building wheels for collected packages: lightfm
  Building wheel for lightfm (setup.py): started
  Building wheel for lightfm (setup.py): finished with status 'done'
  Created wheel for lightfm: filename=lightfm-1.15-cp37-cp37m-win_amd64.whl size=418663 sha256=5e299088364b556b8658d2597f869668ccde3569ef55ab9d4ed50ad7dc94692f
  Stored in directory: c:\users\wolf\appdata\local\pip\cache\wheels\f0\cd\a5\b07914aa223c05ed61880d4c59f64a7febf117dbd2c2cbcf49
Successfully built lightfm
Installing collected packages: lightfm
Successfully installed lightfm-1.15


In [1]:
import numpy as np
import pandas as pd
from lightfm.datasets import fetch_movielens
from lightfm import LightFM



In [2]:
with open('movies.csv', 'r') as movies_file:
    for i in range(5):
        print(movies_file.readline())

movieId,title,genres

1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy

2,Jumanji (1995),Adventure|Children|Fantasy

3,Grumpier Old Men (1995),Comedy|Romance

4,Waiting to Exhale (1995),Comedy|Drama|Romance



In [3]:
data_movies = pd.read_csv('movies.csv')
data_ratings = pd.read_csv('ratings.csv')

In [4]:
# Search duplicates

data_movies.title.value_counts()

Men with Guns (1997)             2
War of the Worlds (2005)         2
Ocean's Twelve (2004)            1
Porky's Revenge (1985)           1
Showgirls (1995)                 1
                                ..
Family Plot (1976)               1
Zelary (2003)                    1
Mr. Holmes (2015)                1
Heiress, The (1949)              1
Hot Tub Time Machine 2 (2015)    1
Name: title, Length: 10327, dtype: int64

In [5]:
data_movies[data_movies.title == 'Men with Guns (1997)']

Unnamed: 0,movieId,title,genres
1403,1788,Men with Guns (1997),Action|Drama
6270,26982,Men with Guns (1997),Drama


In [6]:
data_movies[data_movies.title == 'War of the Worlds (2005)']

Unnamed: 0,movieId,title,genres
6662,34048,War of the Worlds (2005),Action|Adventure|Sci-Fi|Thriller
7963,64997,War of the Worlds (2005),Action|Sci-Fi


In [7]:
# Replace duplicated id

indexes = data_ratings[data_ratings.movieId == 64997].index
data_ratings.loc[indexes, 'movieId'] = 34048

indexes = data_ratings[data_ratings.movieId == 26982].index
data_ratings.loc[indexes, 'movieId'] = 1788

In [8]:
# Drop duplicates, only two samples

drop_indexes = data_movies[data_movies['movieId'] == 64997].index
data_movies.drop(drop_indexes, inplace=True)

drop_indexes = data_movies[data_movies['movieId'] == 26982].index
data_movies.drop(drop_indexes, inplace=True)

In [9]:
def get_score(scores, threshold, mean_total_score):
    
    """
    Formula for movie scoring
        (V / V+M)*R + (M / V+M)*C
    V - кол-во голосов за фильм
    M - порог голосов
    R - среднее арифметическое всех голосов за фильм
    С - средний рейтинг всех фильмов
    """
    
    num_votes = len(scores)
    mean_movie_score = np.mean(scores)
    
    movie_score = (
        (num_votes / (num_votes + threshold)) * mean_movie_score +
        (threshold / (num_votes + threshold)) * mean_total_score)
    
    return movie_score

In [10]:
threshold = 3.5
mean_total_score = data_ratings.rating.mean()

movie_scores = []

for index, row in data_movies.iterrows():
    movie_id = row['movieId']
    scores = (data_ratings[data_ratings['movieId'] == movie_id]
                .rating.to_list())
    movie_score = get_score(scores, threshold, mean_total_score)
    movie_scores.append(movie_score)

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


In [18]:
movie_scores = pd.Series(movie_scores)
movie_scores.name = 'score'
movie_scores

0        3.901524
1        3.359256
2        3.208276
3        2.986826
4        3.264259
           ...   
10322    3.624217
10323    3.290884
10324    3.401995
10325    3.893689
10326    3.624217
Name: score, Length: 10327, dtype: float64

In [30]:
# Movies without scores

movie_scores.isna().sum()

4

In [31]:
mean_movie_score = movie_scores.mean()

In [32]:
data_movies.shape

(10327, 3)

In [35]:
# Fill the missing by mean

data_movies = (data_movies.join(movie_scores)
               .fillna({'score': mean_movie_score}))

In [36]:
data_movies.score.isna().sum()

0

In [37]:
data_movies

Unnamed: 0,movieId,title,genres,score
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3.901524
1,2,Jumanji (1995),Adventure|Children|Fantasy,3.359256
2,3,Grumpier Old Men (1995),Comedy|Romance,3.208276
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,2.986826
4,5,Father of the Bride Part II (1995),Comedy,3.264259
...,...,...,...,...
10324,146684,Cosmic Scrat-tastrophe (2015),Animation|Children|Comedy,3.401995
10325,146878,Le Grand Restaurant (1966),Comedy,3.893689
10326,148238,A Very Murray Christmas (2015),Comedy,3.624217
10327,148626,The Big Short (2015),Drama,3.401874


In [58]:




# #print training and testing data
# print(repr(data['train']))
# print(repr(data['test']))

# #create model
# model = LightFM(loss = 'warp')

# #train mode
# model.fit(data['train'], epochs=30, num_threads=2)

# #recommender fucntion
# def sample_recommendation(model, data, user_ids):
#     #number of users and movies in training data
#     n_users, n_items = data['train'].shape
#     for user_id in user_ids:
#     	#movies they already like
#         known_positives = data['item_labels'][data['train'].tocsr()[user_id].indices]

#         #movies our model predicts they will like
#         scores = model.predict(user_id, np.arange(n_items))
#         #sort them in order of most liked to least
#         top_items = data['item_labels'][np.argsort(-scores)]
#         #print out the results
#         print("User %s" % user_id)
#         print("     Known positives:")

#         for x in known_positives[:3]:
#             print("        %s" % x)

#         print("     Recommended:")

#         for x in top_items[:3]:
#             print("        %s" % x)
            
# sample_recommendation(model, data, [3, 25, 451])
