In [1]:
# !pip install lightfm

In [2]:
# ! pip install pytelegrambotapi

In [3]:
import numpy as np
import pandas as pd
import telebot
import datetime

from lightfm import LightFM
from scipy import sparse
from sklearn.metrics.pairwise import cosine_similarity
from telebot import types



In [4]:
with open('movies.csv', 'r') as movies_file:
    for i in range(5):
        print(movies_file.readline())

movieId,title,genres

1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy

2,Jumanji (1995),Adventure|Children|Fantasy

3,Grumpier Old Men (1995),Comedy|Romance

4,Waiting to Exhale (1995),Comedy|Drama|Romance



In [5]:
data_movies = pd.read_csv('movies.csv')
data_ratings = pd.read_csv('ratings.csv')

In [6]:
data_ratings['rating']

0         4.0
1         1.5
2         4.0
3         4.0
4         4.0
         ... 
105521    4.0
105522    1.0
105523    1.0
105524    1.0
105525    5.0
Name: rating, Length: 105526, dtype: float64

In [7]:
# Search duplicates

data_movies.title.value_counts()

War of the Worlds (2005)            2
Men with Guns (1997)                2
CQ (2001)                           1
1900 (Novecento) (1976)             1
Mystery of the Wax Museum (1933)    1
                                   ..
Welcome to Me (2014)                1
Saludos Amigos (1943)               1
Big Momma's House 2 (2006)          1
Myth of Fingerprints, The (1997)    1
In Bed (En la cama) (2005)          1
Name: title, Length: 10327, dtype: int64

In [8]:
data_movies[data_movies.title == 'Men with Guns (1997)']

Unnamed: 0,movieId,title,genres
1403,1788,Men with Guns (1997),Action|Drama
6270,26982,Men with Guns (1997),Drama


In [9]:
data_movies[data_movies.title == 'War of the Worlds (2005)']

Unnamed: 0,movieId,title,genres
6662,34048,War of the Worlds (2005),Action|Adventure|Sci-Fi|Thriller
7963,64997,War of the Worlds (2005),Action|Sci-Fi


In [10]:
# Replace duplicated id

indexes = data_ratings[data_ratings.movieId == 64997].index
data_ratings.loc[indexes, 'movieId'] = 34048

indexes = data_ratings[data_ratings.movieId == 26982].index
data_ratings.loc[indexes, 'movieId'] = 1788

In [11]:
# Drop duplicates (only two)

drop_indexes = data_movies[data_movies['movieId'] == 64997].index
data_movies.drop(drop_indexes, inplace=True)

drop_indexes = data_movies[data_movies['movieId'] == 26982].index
data_movies.drop(drop_indexes, inplace=True)

In [13]:
data_ratings.tail()

Unnamed: 0,userId,movieId,rating,timestamp
105521,385515568,1682,4.0,1606674317
105522,385515568,778,1.0,1606674321
105523,385515568,5445,1.0,1606674324
105524,385515568,7361,1.0,1606674326
105525,385515568,1617,5.0,1606674331


In [15]:
# Save the prepared files

data_movies.to_csv('df_movies.csv')
data_ratings.to_csv('df_ratings.csv')

====================================================================

## Telegram bot

In [527]:
class MovieRecommender():
    
    """
    Telegram chat-bot which can recommend films and save user scores
    """
    
    def __init__(self, data_movies, data_ratings):
        
        self.data_movies = data_movies.copy()
        self.data_ratings = data_ratings.copy()
        
        # Default values
        self.model_parameters = {
            'no_components': 30,
            'loss': 'warp',
            'k': 15
        }

        self.fit_parameters = {
            'epochs': 30,
            'num_threads': 4
        }
        
        self.user_id = 0
        self.selected_movie_id = 0
        
        self.n_rec_items = 6
        self.interactions = None
        self.similarity_matrix = None
        self.model = None
        self.threshold = 4.0
        
        self.users_df = None
        self.movies_df = None
        
        self.user_id_col = 'userId' 
        self.movie_id_col = 'movieId'
        self.rating_col = 'rating'
        self.title_col = 'title'
        
        self.rating_output_filename = 'df_ratings.csv'
        
#         self.mean_total_score = None
        
        
    def learn(self, model_kwargs=None, fit_kwargs=None):
        
        if not model_kwargs:
            model_kwargs = self.model_parameters
        if not fit_kwargs:
            fit_kwargs = self.fit_parameters
        
        self.interactions = self.get_interaction_matrix()
        self.users_df = pd.DataFrame(self.get_interaction_matrix().index)
        self.movies_df = (self.data_movies[[self.movie_id_col, self.title_col]]
                              .set_index([self.movie_id_col]))
        self.model = self.prediction_model(model_kwargs, fit_kwargs)
        self.similarity_matrix = self.get_items_similarity_matrix()
        
        return self.model
    
    
    def run(self):
        
        """
        It runs the telegram chat-bot. You must use your own api key.
        The bot can recommend N movies, save new scores and relearn.
        """
         
        with open ('api_key_for_bot.txt') as api_key_file:
            api_key = api_key_file.read().strip()
        
        bot = telebot.TeleBot(api_key)     
        
        @bot.message_handler(content_types=['text'])
        def get_text_messages(message):
            
            self.user_id = message.from_user.id

            # Possible scores 0.5 , 1.0 or 1, ... 5 or 5.0 
            possible_scores = (list(map(str, np.arange(0.5, 5.1, 0.5))) 
                               + list(map(str, np.arange(1, 6))))
                
            if message.text.lower() in ['привет', 'подскажи', 'далее', 'ещё', 'еще']:
                keyboard = types.InlineKeyboardMarkup()
                recommendations = self.get_recommendation_for_user()[self.title_col].items()
                for movie_id, movie_name in recommendations:
                    key = types.InlineKeyboardButton(text=movie_name, callback_data=movie_id)
                    keyboard.add(key)
                    
                if message.text.lower() in ['привет']:
                    question = "Привет, а эти фильмы смотрел?"
                else:
                    question = "А эти фильмы смотрел?"    
                bot.send_message(message.from_user.id, text=question, reply_markup=keyboard)
                bot.send_message(message.from_user.id, 
                    "Выбери фильм и оцени его (0.5-5)")
            
            elif message.text in possible_scores and self.selected_movie_id:
                self.save_score(self.selected_movie_id, float(message.text), 
                                filename=self.rating_output_filename)
                print('data ratings:\n', self.data_ratings.tail(1))
                
                # Reset the movie id and locker
                self.selected_movie_id = 0
               
            elif message.text.lower() in ['переучить', 'обучить', 'переобучить', 
                                          'запомнить', 'сохранить']:
                # Relearn the model
                self.data_ratings = pd.read_csv(self.rating_output_filename)
                self.learn()
                
            elif message.text == "/help":
                bot.send_message(message.from_user.id, 
                    'Напиши привет или ещё, чтобы получить рекомендацию. '
                    'Далее выбери фильм и введи оценку (0.5 - 5.0), если уже смотрел(а) его. '
                    'Можно оценить все из предложенных фильмов по очереди. '
                    'Далее чат-бот можно переобучить на основе ваших интересов. ' 
                    'Для этого после введённых оценок напишите сохранить или переобучить') 
                
            else:
                bot.send_message(message.from_user.id, 
                    "Я тебя не понимаю. Напиши /help, привет или ещё.")   
            
        @bot.callback_query_handler(func=lambda call: True)
        def select_movie(call):
            self.selected_movie_id = call.data      
            print('selected_movie_id:', self.selected_movie_id)

        # Run the bot
        bot.polling(none_stop=True, interval=0)
        
        
    def get_score(scores, threshold, mean_total_score):

        """
        Formula for calculating the movie score
            (V / (V+M)) * R + (M / (V+M)) * C
        V - number of votes
        M - threshold 
        R - average score for the movie среднее
        С - total average score for all movies
        """

        num_votes = len(scores)
        mean_movie_score = np.mean(scores)
        movie_score = (
            (num_votes / (num_votes + threshold)) * mean_movie_score +
            (threshold / (num_votes + threshold)) * mean_total_score)
        return movie_score
    
    
    def get_movie_scores(df_movies, df_ratings, threshold, mean_total_score):
    
        """
        Calculate the scores for all movies
        """
        
        movie_scores = []
        for index, row in df_movies.iterrows():
            movie_id = row[self.movie_id_col]
            scores = (df_ratings[df_ratings[self.movie_id_col] == movie_id]
                      [self.rating_col].to_list())
            movie_score = get_score(scores, threshold, mean_total_score)
            movie_scores.append(movie_score)
        movie_scores = pd.Series(movie_scores)
        movie_scores.name = 'score'
        return movie_scores


    def get_interaction_matrix(self, binary=False, threshold=None):
        
        """
        Return the movie-user interaction matrix, where the cells take the rating values
        """
        
        if not threshold:
            threshold = self.threshold
        
        interaction_matrix = (self.data_ratings
                              .groupby([self.user_id_col, self.movie_id_col])[self.rating_col]
                              .sum().unstack().reset_index()
                              .fillna(0).set_index(self.user_id_col))
        if binary:
            interaction_matrix = (interaction_matrix.applymap(
                lambda x: 1 if x > threshold else 0))
        return interaction_matrix
    
    
    def prediction_model(self, model_kwargs, fit_kwargs):

        """
        Learning the prediction model with a sparsed interaction matrix
        """

        x_train = sparse.csr_matrix(self.interactions.values)
        model = LightFM(**model_kwargs)
        model.fit(x_train, **fit_kwargs)
        return model
    

    def get_recommendation_for_user(self, show=False):
        
        """
        The model predicts the movie ratings for current user. 
        Next it returns N top movies for current user
        """
        
        # Cold start with a new user        
        if not self.user_id in self.users_df.values:
            user_id = self.users_df.sample(1).values[0, 0]
        else:
            user_id = self.user_id
            
        print('User ID =', user_id)
        
        n_users, n_items = self.interactions.shape
        user_index = self.users_df[self.users_df == user_id].index[0]
        scores = pd.Series(
            self.model.predict(user_ids=user_index, item_ids=np.arange(n_items)))
        scores.index = self.interactions.columns
        
        rated_movies = (self.interactions.loc[user_id]
                        [self.interactions.loc[user_id] > 0]
                        .sort_values(ascending=False))
        recommend_ids = (scores[~(self.interactions.loc[user_id, :] > 0)]
                         .sort_values(ascending=False)
                         [:self.n_rec_items])
        
        rated = self.movies_df.loc[rated_movies.index]
        recommedations = self.movies_df.loc[recommend_ids.index]
        
        if show:
            print('Top watched \n')
            for value in rated[:self.n_rec_items].values:
                print('\t', value[0])
            print('\nRecommedations \n')
            for value in recommedations[:self.n_rec_items].values:
                print('\t', value[0]) 
                
        return recommedations
    
    
    def get_similar_users(self, number_of_user=10):
        
        """
        Return N users with similar interests
        """
        
        favorite_movies = (self.data_ratings[self.data_ratings[self.user_id_col] == self.user_id]
                           .sort_values(by=self.rating_col, ascending=False)[self.movie_id_col]
                           .head(10))
        
        # !TO DO: Choose all movies
        if favorite_movies.empty:
            movie_id = 1
        else:
            movie_id = np.random.choice(favorite_movies.values)
            
        n_users, n_items = self.interactions.shape
        movie_ids = np.array(self.interactions.columns)
        scores = pd.Series(
            self.model.predict(np.arange(n_users), 
                          np.repeat(movie_ids.searchsorted(movie_id), n_users)))
        similar_users = (scores.sort_values(ascending=False)[:number_of_user]
                         .index.to_list())
        return similar_users
    
    
    def get_items_similarity_matrix(self):
    
        """
        Return the movie-movie similarity matrix
        """
        
        similarity_matrix = pd.DataFrame(
            cosine_similarity(sparse.csr_matrix(self.model.item_embeddings)))
        similarity_matrix.columns = self.interactions.columns
        similarity_matrix.index = self.interactions.columns
        
        return similarity_matrix
    
    
    def item_item_recommendation(self, movie_id, n_items=6, show=False):
        
        """
        Return the movie-movie recommendation based on the similarity matrix
        """
        
        recommended_movies = (self.similarity_matrix
                              .loc[movie_id, :]
                              .sort_values(ascending=False)
                              [1: n_items+1])
        recommendation = self.movies_df.loc[recommended_movies.index]
        if show:
            print(f'Recommendations:\n')
            for value in recommendation.values:
                print('\t', value[0])
        return recommendation

            
    def save_score(self, movie_id, score, by_name=False, filename=None):
        
        """
        The function saves new scores of the current user. 
        It can save due the run or in a file 
        """
        
        if by_name:
            # In case when movie_id is a film description / title
            movie_id = self.movies_df[
                (self.movies_df.iloc[:, 0] == movie_id)].index[0]
            
        timestamp = int(datetime.datetime.now().timestamp())
        
        self.data_ratings.loc[self.data_ratings.shape[0]] = [
            self.user_id, movie_id, score, timestamp]
        
        if filename:
            self.data_ratings.to_csv(filename, index=False)       
#         return self.data_ratings

In [528]:
movie_rec = MovieRecommender(data_movies, data_ratings)

In [529]:
movie_rec.learn()

<lightfm.lightfm.LightFM at 0x1f125442648>

In [530]:
movie_rec.get_recommendation_for_user()

User ID = 380


Unnamed: 0_level_0,title
movieId,Unnamed: 1_level_1
296,Pulp Fiction (1994)
356,Forrest Gump (1994)
110,Braveheart (1995)
593,"Silence of the Lambs, The (1991)"
47,Seven (a.k.a. Se7en) (1995)
293,Léon: The Professional (a.k.a. The Professiona...


In [531]:
# movie_rec.run()

In [532]:
movie_rec.get_interaction_matrix()

movieId,1,2,3,4,5,6,7,8,9,10,...,144482,144656,144976,146344,146656,146684,146878,148238,148626,149532
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,5.0,0.0,2.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,3.0,0.0,3.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
665,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
666,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
668,3.0,3.0,2.0,0.0,2.5,5.0,3.0,0.0,0.0,3.5,...,0.0,0.0,2.5,0.0,0.0,0.0,0.0,0.0,4.5,0.0


In [533]:
movie_rec.item_item_recommendation(movie_id=148626)

Unnamed: 0_level_0,title
movieId,Unnamed: 1_level_1
98296,Deadfall (2012)
115216,"Salvation, The (2014)"
116887,Exodus: Gods and Kings (2014)
68552,Crossing Over (2009)
101895,42 (2013)
90719,J. Edgar (2011)


In [526]:
movie_id = 148626

recommended_movies = (movie_rec.similarity_matrix
                               .loc[movie_id, :]
                               .sort_values(ascending=False)
                               [1: n_items+1])
movie_rec.movies_df.loc[recommended_movies.index].values
# recommended_movies
# recommendation = self.movie_series.loc[recommended_movies.index.to_list()]
# if show:
#     print(f'Recommendations:\n')
# for value in recommendation.values:
#     print('\t', value[0])
# return recommendation

array([['Exodus: Gods and Kings (2014)'],
       ['J. Edgar (2011)'],
       ['Calvary (2014)'],
       ...,
       ['Under the Rainbow (1981)'],
       ['Doctor Dolittle (1967)'],
       ['7th Voyage of Sinbad, The (1958)']], dtype=object)