In [1]:
    # установим недостающие библиотеки
!pip install surprise



In [58]:
# определим класс, который будет рекомендовать
# на основе модели гибрида (блендинга)[]
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import NearestNeighbors
from surprise import SVD
from surprise import Dataset
from surprise import accuracy
from surprise import Reader
from collections import defaultdict
from sklearn.preprocessing import normalize

class MyRecommender:
  default_models = ['random_forest','knn','svd']
  default_weight = [0.3,0.3,0.4]
  default_opts = {
      'random_forest':{
          'max_depth':8,
          'max_features':3,
          'n_estimators':500,
      },
      'knn':{
          'n_neighbors':20,
          'n_jobs':-1,
          'metric':'manhattan',
      },
      'svd':{
          'n_factors': 90,
          'n_epochs': 100,
          'lr_all': 0.01,
          'reg_all': 0.1,
          'biased': True,
      },
  }
  default_meta_opts = {
    'item_id_attr':'item_id',
    'user_id_attr':'user_id',
    'rating_attr':'rating',
    'timestamp_attr':'timestamp',
  }
  models = {}
  opts = {}
  meta_opts = {}
  weight = []
  predictions = {}
  X = None
  Y = None

  all_users = []
  all_items = []

  base_df = None

  def __init__(self,models:list = [],model_weight:list = [],model_opts:dict = {},meta_opts:dict={}):
    if(len(models) == 0):
      models = self.default_models
    if(len(model_weight) == 0):
      weight = self.default_weight
    if(len(models) != len(model_weight)):
      raise Exception("Model count ({}) and weight count ({}) are differs".format(len(models),len(weight)))

    if(sum(model_weight) != 1.0):
      raise Exception('Sum of weights must be 1. Current is {}'.format(sum(model_weight)))

    for i,model in enumerate(models):
      if model not in self.default_models:
        raise Exception('Model name {} not supported'.format(model))
      if model not in model_opts:
        model_opts[model] = self.default_opts[model]
      #if model not in meta_opts and model in self.default_meta_opts:
      #  meta_opts[model] = self.default_meta_opts[model]  
      self.models[model] = None
      self.weight.append(model_weight[i])

    for key in self.default_meta_opts:
      if key not in meta_opts:
        meta_opts[key] = self.default_meta_opts[key]

    self.opts = model_opts
    self.meta_opts = meta_opts

  def prepare_random_forest(self,DATA):
    X_columns = DATA.columns.tolist()
    X_columns.remove(self.meta_opts['user_id_attr'])
    X_columns.remove(self.meta_opts['item_id_attr'])
    X_columns.remove(self.meta_opts['timestamp_attr'])
    return DATA[X_columns]

  def fit_random_forest(self,DATA):
    X_cols = DATA.columns.tolist()
    X_cols.remove(self.meta_opts['rating_attr'])
    model = RandomForestRegressor(**self.opts['random_forest'])
    model.fit(DATA[X_cols],DATA[self.meta_opts['rating_attr']])
    return model

  def predict_random_forest(self,user_id,num):
    if(type(user_id) != list):
        user_id = [user_id]
      
    u_predictions = {}
    for uid in user_id:
      item_df = pd.DataFrame(data=self.all_items,columns=[self.meta_opts['item_id_attr']])
      user_df = self.DATA.query(self.meta_opts['user_id_attr']+' == @uid')[[self.meta_opts['item_id_attr'],self.meta_opts['rating_attr']]]
      df = item_df.merge(user_df,on=self.meta_opts['item_id_attr'],how='left')

      df_unwatched = df \
                     .query(self.meta_opts['rating_attr']+' != '+self.meta_opts['rating_attr'])
      df_to_learn =  df_unwatched \
                     .merge(self.DATA.drop_duplicates(subset=[self.meta_opts['item_id_attr']]),on=self.meta_opts['item_id_attr'],how='left',suffixes=['','_r']) \
                     .drop([self.meta_opts['user_id_attr'],self.meta_opts['item_id_attr'],self.meta_opts['timestamp_attr'],self.meta_opts['rating_attr'],self.meta_opts['rating_attr']+'_r'],axis=1)
      predictions = self.models['random_forest'].predict(df_to_learn)
      df_unwatched['rating'] = predictions

      df = df.merge(df_unwatched,on=self.meta_opts['item_id_attr'],how='left',suffixes=['','_r'])
 
      df['score'] = df.loc[:,['rating','rating_r']].apply(lambda row: row[0] if row[1] != row[1] else row[1],axis=1)
      df['rating_r'] = df['rating_r'].apply(lambda x: True if x != x else False)
      df = df \
           .drop([self.meta_opts['rating_attr']],axis=1) \
           .rename(columns={'rating_r':'watched'}) \
           .query('watched == False') \
           .sort_values(by=['score'],ascending=False) \
           .head(num)
      df['score'] = df['score'].apply(lambda x: 0 if x != x else x)
      df['score'] = normalize([df['score'].to_numpy()])[0]
      u_predictions[uid] = df
    return u_predictions

  def build_random_forest(self,DATA):
    X_indexed = DATA.set_index([self.meta_opts['item_id_attr']])
    df = self.get_user_item_df()
    for idx,row in df.query('rating != rating').iterrows():
      item_id = idx[1]
      X_row = X_indexed.query(self.meta_opts['item_id_attr']+' == @item_id').iloc[0]
      X_row = X_row.drop(self.meta_opts['user_id_attr'])
      val = self.models['random_forest'].predict([X_row])
      df.loc[idx]['rating'] = val[0]
    
  def prepare_knn(self,DATA,with_item_id=False):
    X_columns = DATA.columns.tolist()
    X_columns.remove(self.meta_opts['user_id_attr'])
    if not with_item_id:
      X_columns.remove(self.meta_opts['item_id_attr'])
    X_columns.remove(self.meta_opts['rating_attr'])
    X_columns.remove(self.meta_opts['timestamp_attr'])
    return DATA.drop_duplicates(subset=[self.meta_opts['item_id_attr']])[X_columns]

  def fit_knn(self,DATA):
    model = NearestNeighbors(**self.opts['knn'])
    model.fit(DATA)
    return model

  def predict_knn(self,user_id,num):
    if(type(user_id) != list):
        user_id = [user_id]
      
    u_predictions = {}
    for uid in user_id:
      item_df = pd.DataFrame(data=self.all_items,columns=[self.meta_opts['item_id_attr']])
      user_df = self.DATA.query(self.meta_opts['user_id_attr']+' == @uid')[[self.meta_opts['item_id_attr'],self.meta_opts['rating_attr'],self.meta_opts['timestamp_attr']]]
      df = item_df.merge(user_df,on=self.meta_opts['item_id_attr'],how='left')
      df['watched'] = df[self.meta_opts['rating_attr']].apply(lambda x: False if x != x else True) 
        
      df_user_top = df \
                     .query('watched == True') \
                     .sort_values(by=[self.meta_opts['rating_attr'],self.meta_opts['timestamp_attr']],ascending=False) \
                     .drop(['watched',self.meta_opts['timestamp_attr']],axis=1) \
                     .head(10)
      df_to_learn =  df_user_top \
                     .merge(self.DATA.drop_duplicates(subset=[self.meta_opts['item_id_attr']]),on=self.meta_opts['item_id_attr'],how='left',suffixes=['','_r']) \
                     .drop([self.meta_opts['user_id_attr'],self.meta_opts['item_id_attr'],self.meta_opts['timestamp_attr'],self.meta_opts['rating_attr'],self.meta_opts['rating_attr']+'_r'],axis=1)
      predictions = self.models['knn'].kneighbors(df_to_learn,return_distance=True)
      df_pred = pd.DataFrame({
          'indices':predictions[1].flatten(),
          'distances':predictions[0].flatten(),
      }).sort_values(by=['distances'],ascending=True)
      indices = df_pred['indices'].to_numpy()
      
      df_recommend = self.prepare_knn(self.DATA,with_item_id=True).iloc[indices][[self.meta_opts['item_id_attr']]]
      df_recommend['score'] = df_pred['distances']
      df = df \
           .drop([self.meta_opts['rating_attr'],self.meta_opts['timestamp_attr']],axis=1) \
           .merge(df_recommend,on=self.meta_opts['item_id_attr'],how='left') \
           .query('watched == False') \
           .sort_values(by=['score'],ascending=True,na_position='last') \
           .head(num)
      df['score'] = df['score'].apply(lambda x: 0 if x != x else x)
      df['score'] = normalize([df['score'].apply(lambda x: df['score'].max() - x).to_numpy()])[0]
      u_predictions[uid] = df
    return u_predictions 

  def prepare_svd(self,DATA):
    rating_min = DATA[self.meta_opts['rating_attr']].min()
    rating_max = DATA[self.meta_opts['rating_attr']].max()
    df = pd.DataFrame({
        'uid':DATA[self.meta_opts['user_id_attr']].to_numpy().flatten(),
        'iid':DATA[self.meta_opts['item_id_attr']].to_numpy().flatten(),
        'rating':DATA[self.meta_opts['rating_attr']].to_numpy().flatten(),
    })
    reader = Reader(rating_scale=(rating_min, rating_max))
    data = Dataset.load_from_df(df, reader)
    return data.build_full_trainset()

  def fit_svd(self,DATA):
    model = SVD(**self.opts['svd'])
    model.fit(DATA)
    return model

  def predict_svd(self,user_id,num):
    if(type(user_id) != list):
        user_id = [user_id]
      
    u_predictions = {}
    testset = self.prepare_svd(self.DATA).build_anti_testset()
    predictions = self.models['svd'].test(testset)
    top_n = defaultdict(list)
    
    for uid, iid, true_r, est, _ in predictions:
      if uid in user_id:
        top_n[uid].append((iid, est))

    for uid, user_ratings in top_n.items():
      user_ratings.sort(key=lambda x: x[1], reverse=True)
      top_n[uid] = user_ratings

    for uid in user_id:
      item_df = pd.DataFrame(data=self.all_items,columns=[self.meta_opts['item_id_attr']])
      user_df = self.DATA.query(self.meta_opts['user_id_attr']+' == @uid')[[self.meta_opts['item_id_attr'],self.meta_opts['rating_attr'],self.meta_opts['timestamp_attr']]]
      df = item_df.merge(user_df,on=self.meta_opts['item_id_attr'],how='left')
      df['watched'] = df[self.meta_opts['rating_attr']].apply(lambda x: False if x != x else True) 
      
      df_recommend = pd.DataFrame(data=top_n[uid],columns=[self.meta_opts['item_id_attr'],'score'])
      df = df \
           .drop([self.meta_opts['rating_attr'],self.meta_opts['timestamp_attr']],axis=1) \
           .merge(df_recommend,on=self.meta_opts['item_id_attr'],how='left') \
           .query('watched == False') \
           .sort_values(by=['score'],ascending=False,na_position='last') \
           .head(num)
      df['score'] = df['score'].apply(lambda x: 0 if x != x else x)
      df['score'] = normalize([df['score'].to_numpy()])[0]
      u_predictions[uid] = df
    return u_predictions 
        

  def fit(self,DATA):
        
    self.DATA = DATA
    self.all_users = DATA[self.meta_opts['user_id_attr']].unique()
    self.all_items = DATA[self.meta_opts['item_id_attr']].unique()
    
    for model in self.models:
      self.fit_model(model)
      #self.build_model(model,X,Y)
      pass

  def fit_model(self,model:str):
    fit_method = 'fit_'+model
    prepare_method = 'prepare_'+model
    DATA_prepared = self.DATA
    if hasattr(self,prepare_method):
      DATA_prepared = getattr(self,prepare_method)(self.DATA)
    self.models[model] = getattr(self,fit_method)(DATA_prepared)

  def build_model(self,model:str):
    build_method = 'build_'+model
    #self.predictions[model] = getattr(self,build_method)(self.DATA)
    
  def get_user_item_df(self):
    if self.base_df is not None:
        return self.base_df

    df_data = []
    for user_id in self.all_users:
      for item_id in self.all_items:
        df_data.append([user_id,item_id,None])
        
    df = pd.DataFrame(data=df_data,columns=['user_id','item_id','rating'])\
              .set_index(['user_id','item_id'])
    del df_data
    
    cur_data_df = pd.DataFrame({
        'user_id':self.DATA[self.meta_opts['user_id_attr']],
        'item_id':self.DATA[self.meta_opts['item_id_attr']],
        'rating':self.DATA[self.meta_opts['rating_attr']],
    }).set_index(['user_id','item_id'])

    df = df.join(cur_data_df,on=['user_id','item_id'],rsuffix='_r') \
           .drop('rating',axis=1) \
           .rename(columns={'rating_r':'rating'})
    self.base_df = df
    return self.base_df
        
  def predict_model(self,model,user_id,num):
    predict_method = 'predict_'+model
    self.predictions[model] = getattr(self,predict_method)(user_id,num)

  def weight_predictions(self,user_id,num):
    r = {}
    for i,model in enumerate(self.models):
      for uid in self.predictions[model]:
        if uid not in r:
          r[uid] = {}
        for idx,row in self.predictions[model][uid].iterrows():
          if row[self.meta_opts['item_id_attr']] not in r[uid]:
            r[uid][row[self.meta_opts['item_id_attr']]] = 0
          r[uid][row[self.meta_opts['item_id_attr']]] += self.weight[i]*row['score']
        r[uid] = dict(sorted(r[uid].items(), key=lambda item:item[1],reverse=True))
    
    lst = {}
    for uid in r:
      if uid not in lst:
        lst[uid] = []
      for item in r[uid]:
        lst[uid].append(item)
      lst[uid] = lst[uid][:num]

    return lst
    
  def recommend(self,user_id, num:int = 10):
    for model in self.models:
        self.predict_model(model,user_id,num)
    if(len(self.predictions) == 1):
        return self.predictions[model]
    return self.weight_predictions(user_id,num)


In [3]:
# получим данные

import pandas as pd

links = pd.read_csv('https://raw.githubusercontent.com/ALKONDR/netology-recsys/master/lecture-1/links.csv')
movies = pd.read_csv('https://github.com/ALKONDR/netology-recsys/raw/master/lecture-1/movies.csv')
ratings = pd.read_csv('https://github.com/ALKONDR/netology-recsys/raw/master/lecture-1/ratings.csv')
tags = pd.read_csv('https://github.com/ALKONDR/netology-recsys/raw/master/lecture-1/tags.csv')

In [4]:
# соберем данные в одну табличку

import scipy.stats
import numpy as np

all_data = ratings.copy()

# добавляем к рейтингам теги и приводим их к нижнему регистру
all_data = all_data.join(tags[['userId','movieId','tag']].set_index(['userId','movieId']),on=['userId','movieId'],rsuffix='_tags')

# добавляем к рейтингам фильмы и приводим их к нижнему регистру
all_data = all_data.join(movies.set_index('movieId'),on='movieId',rsuffix='_movies')

# добавляем среднюю оценку (mean)
ratings_mean = ratings.groupby(['movieId'],as_index=False).agg({'rating':np.mean})
all_data = all_data.join(ratings_mean.set_index('movieId'),on='movieId',rsuffix='_mean')

# добавляем кол-во отзывов
ratings_len = ratings.groupby(['movieId'],as_index=False).agg({'rating':len})
all_data = all_data.join(ratings_len.set_index('movieId'),on='movieId',rsuffix='_cnt')

# добавляем медианную оценку
ratings_median = ratings.groupby(['movieId'],as_index=False).agg({'rating':np.median})
all_data = all_data.join(ratings_median.set_index('movieId'),on='movieId',rsuffix='_median')

# добавляем оценку вариативности
ratings_variance = ratings.groupby(['movieId'],as_index=False).agg({'rating':lambda arr: np.var(arr) if len(arr)>0 else 0.0})
all_data = all_data.join(ratings_variance.set_index('movieId'),on='movieId',rsuffix='_variance')

# добавляем моду
ratings_mode = ratings.groupby(['movieId'],as_index=False).agg({'rating':lambda arr: scipy.stats.mode(arr,keepdims=False)[0]})
all_data = all_data.join(ratings_mode.set_index('movieId'),on='movieId',rsuffix='_mode')

# приведем все теги и жанры к нижнему регистру, попутно избавившись от NaN
all_data['tag'] = all_data['tag'].apply(lambda x: x.lower() if isinstance(x,str) else '')
all_data['genres'] = all_data['genres'].apply(lambda x: x.lower() if isinstance(x,str) else '')

all_data

Unnamed: 0,userId,movieId,rating,timestamp,tag,title,genres,rating_mean,rating_cnt,rating_median,rating_variance,rating_mode
0,1,1,4.0,964982703,,Toy Story (1995),adventure|animation|children|comedy|fantasy,3.920930,215,4.0,0.693748,4.0
1,1,3,4.0,964981247,,Grumpier Old Men (1995),comedy|romance,3.259615,52,3.0,1.091254,3.0
2,1,6,4.0,964982224,,Heat (1995),action|crime|thriller,3.946078,102,4.0,0.661308,4.0
3,1,47,5.0,964983815,,Seven (a.k.a. Se7en) (1995),mystery|thriller,3.975369,203,4.0,0.846684,4.0
4,1,50,5.0,964982931,,"Usual Suspects, The (1995)",crime|mystery|thriller,4.237745,204,4.5,0.638330,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...
100831,610,166534,4.0,1493848402,,Split (2017),drama|horror|thriller,3.333333,6,4.0,2.055556,4.0
100832,610,168248,5.0,1493850091,heroic bloodshed,John Wick: Chapter Two (2017),action|crime|thriller,4.142857,7,4.0,0.479592,4.0
100833,610,168250,5.0,1494273047,,Get Out (2017),horror,3.633333,15,4.0,0.882222,3.0
100834,610,168252,5.0,1493846352,,Logan (2017),action|sci-fi,4.280000,25,4.5,0.401600,5.0


In [5]:
# преобразуем строковые категориальные признаки с помощью Tf Idf
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer, CountVectorizer

tfidf_transformer = TfidfVectorizer(tokenizer=lambda x: x.split('|'))
X_tag_tfidf = tfidf_transformer.fit_transform(all_data['tag']).toarray()

tfidf_transformer = TfidfVectorizer(tokenizer=lambda x: x.split('|'))
X_genre_tfidf = tfidf_transformer.fit_transform(all_data['genres']).toarray()

# комбинируем tf idf признаки в одно пространство
X_tfidf = np.hstack((X_tag_tfidf,X_genre_tfidf))
X_tfidf_df = pd.DataFrame(data=X_tfidf)

In [6]:
# скомбинируем данные в более удобное представление

DATA = all_data[['userId','title','timestamp','rating_mean','rating_cnt','rating_median','rating_variance','rating_mode','rating']].merge(X_tfidf_df,left_index=True,right_index=True)
# change all tfidf column names to string
rename_columns = {}
for col in DATA.columns:
  if type(col) == int or col.isnumeric():
    rename_columns[col] = 'tfidf_'+str(col)

DATA.rename(columns=rename_columns,inplace=True)

In [59]:
import warnings
warnings.filterwarnings('ignore')

recommender = MyRecommender(models=['random_forest','knn','svd'],model_weight=[0.2,0.3,0.5],meta_opts={'user_id_attr':'userId','item_id_attr':'title'})
recommender.fit(DATA)

In [60]:
recommender.recommend(user_id=222,num=5)

{222: ['Willy Wonka & the Chocolate Factory (1971)',
  'Seve (2014)',
  'Enter the Void (2009)',
  'Jetée, La (1962)',
  'Come and See (Idi i smotri) (1985)']}

In [61]:
recommender.recommend(user_id=555,num=5)

{555: ["Monty Python's Life of Brian (1979)",
  'Belle époque (1992)',
  'Crossing Delancey (1988)',
  'Autumn Sonata (Höstsonaten) (1978)',
  'Raiders of the Lost Ark: The Adaptation (1989)']}

In [62]:
recommender.recommend(user_id=333,num=5)

{333: ['NeverEnding Story, The (1984)',
  'Bill Hicks: Revelations (1993)',
  'Bad Boy Bubby (1993)',
  'Belle époque (1992)',
  "Guess Who's Coming to Dinner (1967)"]}