In [1]:
# установим недостающие библиотеки
!pip install surprise



In [31]:
# определим класс, который будет рекомендовать
# на основе модели гибрида (блендинга)[]
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import NearestNeighbors
from surprise import SVD
from surprise import Dataset
from surprise import accuracy
from surprise import Reader

class MyRecommender:
  default_models = ['random_forest','knn','svd']
  default_weight = [0.3,0.3,0.4]
  default_opts = {
      'random_forest':{
          'max_depth':8,
          'max_features':3,
          'n_estimators':500,
      },
      'knn':{
          'n_neighbors':20,
          'n_jobs':-1,
          'metric':'euclidean',
      },
      'svd':{
          'n_factors': 90,
          'n_epochs': 100,
          'lr_all': 0.01,
          'reg_all': 0.1,
          'biased': True,
      },
  }
  default_meta_opts = {
    'item_id_attr':'item_id',
    'user_id_attr':'user_id',
    'rating_id_attr':'rating',
    'rating_min':0.5,
    'rating_max':5,
  }
  models = {}
  opts = {}
  meta_opts = {}
  weight = []
  predictions = {}

  all_users = []
  all_items = []

  base_df = None

  def __init__(self,models:list = [],model_weight:list = [],model_opts:dict = {},meta_opts:dict={}):
    if(len(models) == 0):
      models = self.default_models
    if(len(model_weight) == 0):
      weight = self.default_weight
    if(len(models) != len(model_weight)):
      raise Exception("Model count ({}) and weight count ({}) are differs".format(len(models),len(weight)))

    if(sum(model_weight) != 1.0):
      raise Exception('Sum of weights must be 1. Current is {}'.format(sum(model_weight)))

    for i,model in enumerate(models):
      if model not in self.default_models:
        raise Exception('Model name {} not supported'.format(model))
      if model not in model_opts:
        model_opts[model] = self.default_opts[model]
      #if model not in meta_opts and model in self.default_meta_opts:
      #  meta_opts[model] = self.default_meta_opts[model]  
      self.models[model] = None
      self.weight.append(model_weight[i])

    for key in self.default_meta_opts:
      if key not in meta_opts:
        meta_opts[key] = self.default_meta_opts[key]

    self.opts = model_opts
    self.meta_opts = meta_opts

    print(self.models)
    print(self.opts)
    print(self.meta_opts)
    print(self.weight)

  def prepare_random_forest(self,X,Y):
    X_columns = X.columns.tolist()
    X_columns.remove(self.meta_opts['user_id_attr'])
    X_columns.remove(self.meta_opts['item_id_attr'])
    return X[X_columns]

  def fit_random_forest(self,X,Y):
    model = RandomForestRegressor(**self.opts['random_forest'])
    model.fit(X,Y)
    return model

  def build_random_forest(self,X,Y):
    X_indexed = X.set_index([self.meta_opts['item_id_attr']])
    df = self.get_user_item_df(X,Y)
    for idx,row in df.query('rating != rating').iterrows():
      item_id = idx[1]
      X_row = X_indexed.query(self.meta_opts['item_id_attr']+' == @item_id').iloc[0]
      X_row = X_row.drop(self.meta_opts['user_id_attr'])
      val = self.models['random_forest'].predict([X_row])
      df.loc[idx]['rating'] = val
    
  def prepare_knn(self,X,Y):
    X_columns = X.columns.tolist()
    X_columns.remove(self.meta_opts['user_id_attr'])
    X_columns.remove(self.meta_opts['item_id_attr'])
    return X[X_columns]

  def fit_knn(self,X,Y):
    model = NearestNeighbors(**self.opts['knn'])
    model.fit(X)
    return model

  def prepare_svd(self,X,Y):
    df = pd.DataFrame({
        'uid':X[self.meta_opts['user_id_attr']].to_numpy().flatten(),
        'iid':X[self.meta_opts['item_id_attr']].to_numpy().flatten(),
        'rating':Y.to_numpy().flatten(),
    })
    reader = Reader(rating_scale=(self.meta_opts['rating_min'], self.meta_opts['rating_max']))
    data = Dataset.load_from_df(df, reader)
    return data.build_full_trainset()

  def fit_svd(self,X,Y):
    model = SVD(**self.opts['svd'])
    model.fit(X)
    return model

  def fit(self,X,Y):
        
    self.all_users = X[self.meta_opts['user_id_attr']].unique()
    self.all_items = X[self.meta_opts['item_id_attr']].unique()
    
    for model in self.models:
      self.fit_model(model,X,Y)
      self.build_model(model,X,Y)

  def fit_model(self,model:str,X,Y):
    print("Fits {} model".format(model))
    fit_method = 'fit_'+model
    prepare_method = 'prepare_'+model
    X_prepared = X
    if hasattr(self,prepare_method):
      X_prepared = getattr(self,prepare_method)(X,Y)
    self.models[model] = getattr(self,fit_method)(X_prepared,Y)

  def build_model(self,model:str,X,Y):
    build_method = 'build_'+model
    self.predictions[model] = getattr(self,build_method)(X,Y)
    
  def get_user_item_df(self,X,Y):
    if self.base_df is not None:
        return self.base_df

    df_data = []
    for user_id in self.all_users:
      for item_id in self.all_items:
        df_data.append([user_id,item_id,None])
        
    df = pd.DataFrame(data=df_data,columns=['user_id','item_id','rating'])\
              .set_index(['user_id','item_id'])
    del df_data
    
    cur_data_df = pd.DataFrame({
        'user_id':X[self.meta_opts['user_id_attr']],
        'item_id':X[self.meta_opts['item_id_attr']],
        'rating':Y
    }).set_index(['user_id','item_id'])

    df = df.join(cur_data_df,on=['user_id','item_id'],rsuffix='_r') \
           .drop('rating',axis=1) \
           .rename(columns={'rating_r':'rating'})
    self.base_df = df
    return self.base_df
        
  def recommend(self,user_id):
    pass

In [3]:
# получим данные

import pandas as pd

links = pd.read_csv('https://raw.githubusercontent.com/ALKONDR/netology-recsys/master/lecture-1/links.csv')
movies = pd.read_csv('https://github.com/ALKONDR/netology-recsys/raw/master/lecture-1/movies.csv')
ratings = pd.read_csv('https://github.com/ALKONDR/netology-recsys/raw/master/lecture-1/ratings.csv')
tags = pd.read_csv('https://github.com/ALKONDR/netology-recsys/raw/master/lecture-1/tags.csv')

In [4]:
# соберем данные в одну табличку

import scipy.stats
import numpy as np

all_data = ratings.copy()

# добавляем к рейтингам теги и приводим их к нижнему регистру
all_data = all_data.join(tags[['userId','movieId','tag']].set_index(['userId','movieId']),on=['userId','movieId'],rsuffix='_tags')

# добавляем к рейтингам фильмы и приводим их к нижнему регистру
all_data = all_data.join(movies.set_index('movieId'),on='movieId',rsuffix='_movies')

# добавляем среднюю оценку (mean)
ratings_mean = ratings.groupby(['movieId'],as_index=False).agg({'rating':np.mean})
all_data = all_data.join(ratings_mean.set_index('movieId'),on='movieId',rsuffix='_mean')

# добавляем кол-во отзывов
ratings_len = ratings.groupby(['movieId'],as_index=False).agg({'rating':len})
all_data = all_data.join(ratings_len.set_index('movieId'),on='movieId',rsuffix='_cnt')

# добавляем медианную оценку
ratings_median = ratings.groupby(['movieId'],as_index=False).agg({'rating':np.median})
all_data = all_data.join(ratings_median.set_index('movieId'),on='movieId',rsuffix='_median')

# добавляем оценку вариативности
ratings_variance = ratings.groupby(['movieId'],as_index=False).agg({'rating':lambda arr: np.var(arr) if len(arr)>0 else 0.0})
all_data = all_data.join(ratings_variance.set_index('movieId'),on='movieId',rsuffix='_variance')

# добавляем моду
ratings_mode = ratings.groupby(['movieId'],as_index=False).agg({'rating':lambda arr: scipy.stats.mode(arr,keepdims=False)[0]})
all_data = all_data.join(ratings_mode.set_index('movieId'),on='movieId',rsuffix='_mode')

# приведем все теги и жанры к нижнему регистру, попутно избавившись от NaN
all_data['tag'] = all_data['tag'].apply(lambda x: x.lower() if isinstance(x,str) else '')
all_data['genres'] = all_data['genres'].apply(lambda x: x.lower() if isinstance(x,str) else '')

all_data

Unnamed: 0,userId,movieId,rating,timestamp,tag,title,genres,rating_mean,rating_cnt,rating_median,rating_variance,rating_mode
0,1,1,4.0,964982703,,Toy Story (1995),adventure|animation|children|comedy|fantasy,3.920930,215,4.0,0.693748,4.0
1,1,3,4.0,964981247,,Grumpier Old Men (1995),comedy|romance,3.259615,52,3.0,1.091254,3.0
2,1,6,4.0,964982224,,Heat (1995),action|crime|thriller,3.946078,102,4.0,0.661308,4.0
3,1,47,5.0,964983815,,Seven (a.k.a. Se7en) (1995),mystery|thriller,3.975369,203,4.0,0.846684,4.0
4,1,50,5.0,964982931,,"Usual Suspects, The (1995)",crime|mystery|thriller,4.237745,204,4.5,0.638330,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...
100831,610,166534,4.0,1493848402,,Split (2017),drama|horror|thriller,3.333333,6,4.0,2.055556,4.0
100832,610,168248,5.0,1493850091,heroic bloodshed,John Wick: Chapter Two (2017),action|crime|thriller,4.142857,7,4.0,0.479592,4.0
100833,610,168250,5.0,1494273047,,Get Out (2017),horror,3.633333,15,4.0,0.882222,3.0
100834,610,168252,5.0,1493846352,,Logan (2017),action|sci-fi,4.280000,25,4.5,0.401600,5.0


In [5]:
# преобразуем строковые категориальные признаки с помощью Tf Idf
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer, CountVectorizer

tfidf_transformer = TfidfVectorizer(tokenizer=lambda x: x.split('|'))
X_tag_tfidf = tfidf_transformer.fit_transform(all_data['tag']).toarray()

tfidf_transformer = TfidfVectorizer(tokenizer=lambda x: x.split('|'))
X_genre_tfidf = tfidf_transformer.fit_transform(all_data['genres']).toarray()

# комбинируем tf idf признаки в одно пространство
X_tfidf = np.hstack((X_tag_tfidf,X_genre_tfidf))
X_tfidf_df = pd.DataFrame(data=X_tfidf)

In [6]:
# разобьем данные на тестовую и тренировочные выборки

from sklearn.model_selection import train_test_split

BASE_X = all_data[['userId','title','rating_mean','rating_cnt','rating_median','rating_variance','rating_mode']].merge(X_tfidf_df,left_index=True,right_index=True)
# change all tfidf column names to string
for col in BASE_X.columns:
  if type(col) == int or col.isnumeric():
    BASE_X.rename(columns={col:'tfidf_'+str(col)},inplace=True)

BASE_Y = all_data['rating']

TRAIN_X, TEST_X, TRAIN_Y, TEST_Y = train_test_split(BASE_X,BASE_Y,test_size=0.2,random_state=10)

In [None]:
import warnings
warnings.filterwarnings('ignore')

recommender = MyRecommender(models=['random_forest'],model_weight=[1],meta_opts={'user_id_attr':'userId','item_id_attr':'title'})
recommender.fit(TRAIN_X,TRAIN_Y)

recommender.recommend(user_id=[320,240])

{'random_forest': None}
{'random_forest': {'max_depth': 8, 'max_features': 3, 'n_estimators': 500}}
{'user_id_attr': 'userId', 'item_id_attr': 'title', 'rating_id_attr': 'rating', 'rating_min': 0.5, 'rating_max': 5}
[1, 1]
Fits random_forest model
