In [1]:
import pandas as pd
import numpy as np
from math import sqrt
import json
from tqdm import tqdm_notebook

## Константы из ЛК

In [2]:
# user_id of the target user: later cells select this user's column of
# sparse_df and score the movies this user has not rated.
id_to_predict = 758

## Загрузим датасеты

`u.data` содержит все оценки

In [3]:
# Read the header-less, tab-separated ratings file and name its four columns.
df_data = pd.read_csv(
    '../laba06/ml-100k/u.data',
    sep='\t',
    names=['user_id', 'item_id', 'rating', 'timestamp'],
    header=None,
)

`u.item` — список всех фильмов

In [4]:
# Names assigned, in order, to the 24 columns read from `u.item`.
item_col_lst = [
    'movie_id', 'movie_title', 'release_date', 'video_release_date', 'imdb_url',
    'unknown', 'action', 'adventure', 'animation', 'children', 'comedy',
    'crime', 'documentary', 'drama', 'fantasy', 'film-Noir', 'horror',
    'musical', 'mystery', 'romance', 'sci-Fi', 'thriller', 'war', 'western',
]

In [5]:
# Read the header-less, pipe-separated movie list and name its columns
# with item_col_lst.
#
# The encoding is given explicitly so the result does not depend on the
# pandas version: without it, older releases silently replace undecodable
# bytes while newer ones raise UnicodeDecodeError on non-UTF-8 input.
# NOTE(review): the MovieLens 100k `u.item` file is commonly reported to be
# ISO-8859-1 (Latin-1) encoded — confirm against the actual file.
df_item = pd.read_csv(
    '../laba06/ml-100k/u.item',
    delimiter='|',
    header=None,
    names=item_col_lst,
    index_col=None,
    engine='python',
    encoding='latin-1',
)

## Базовые предикторы

### Глобальное среднее 𝞵 (average_rating) по всему датасету: `сумма всех оценок по всем фильмам / количество всех оценок по всем фильмам`.

In [6]:
# Global mean: the sum of every rating divided by the number of ratings.
all_ratings = df_data['rating']
average_rating = all_ratings.sum() / all_ratings.count()
average_rating

3.52986

### Базовый предиктор для каждого пользователя (суммирование по фильмам, оцененным данным пользователем).

In [7]:
def _user_bias(user_ratings):
    """Sum of (rating - global mean) for one user, scaled by 1 / (count + 10)."""
    return 1 / (user_ratings.count() + 10) * (user_ratings - average_rating).sum()


# One row per user_id; the aggregated 'rating' column is renamed to 'b_u'.
bias_per_user = df_data.groupby('user_id', as_index=False).agg({'rating': _user_bias})
user_df = bias_per_user.rename(columns={'rating': 'b_u'})
user_df.head()

Unnamed: 0,user_id,b_u
0,1,0.077582
1,2,0.154843
2,3,-0.618944
3,4,0.567158
4,5,-0.620138


### Базовый предиктор для каждого фильма (суммирование по пользователям, поставившим оценку данному фильму).

In [181]:
# Item bias b_i: for every movie, the sum over its ratings of
# (rating - b_u - global mean), scaled by 1 / (number of ratings + 25).
#
# The residual is computed once as a column and aggregated with built-in
# groupby reductions. This avoids `groupby(...).apply` with a lambda that
# returns a Series, whose output layout depends on the pandas version and
# which runs a Python call per group.

# Every rating together with the bias of the user who gave it.
ratings_with_bu = df_data.merge(user_df, on='user_id', how='inner')
ratings_with_bu['residual'] = (
    ratings_with_bu['rating'] - ratings_with_bu['b_u'] - average_rating
)

# Per movie: the summed residual and the number of ratings it received.
item_stats = (
    ratings_with_bu.groupby('item_id', as_index=False)
                   .agg({'residual': 'sum', 'user_id': 'count'})
)

item_df = pd.DataFrame({
    'item_id': item_stats['item_id'],
    'b_i': 1 / (item_stats['user_id'] + 25) * item_stats['residual'],
})

item_df.head()

Unnamed: 0,item_id,b_i
0,1,0.292925
1,2,-0.213032
2,3,-0.320284
3,4,-0.019912
4,5,-0.158643


### Надо собрать b_ui для КАЖДОГО пользователя и КАЖДОГО фильма (все возможные вариации)

In [182]:
from itertools import product

# Every (user_id, item_id) pair: distinct users crossed with distinct items.
unique_users = df_data['user_id'].unique()
unique_items = df_data['item_id'].unique()
all_comb_list = [*product(unique_users, unique_items)]
len(all_comb_list)

1586126

In [183]:
# Build the full user x item table in one pipeline:
#   1. all pairs, ordered by user and then by item;
#   2. the observed rating where one exists (NaN otherwise);
#   3. the user bias b_u and the item bias b_i;
#   4. the baseline b_ui = global mean + b_u + b_i;
#   5. the cleaned rating r - b_ui, set to 0 for pairs without a rating.
all_comb_df = (
    pd.DataFrame.from_records(all_comb_list, columns=['user_id', 'item_id'])
      .sort_values(by=['user_id', 'item_id'], ascending=[True, True])
      .merge(df_data[['user_id', 'item_id', 'rating']],
             on=['user_id', 'item_id'],
             how='left')
      .merge(user_df, on='user_id', how='inner')
      .merge(item_df, on='item_id', how='inner')
      .assign(b_ui=lambda frame: average_rating + frame['b_u'] + frame['b_i'])
      .assign(r_minus_b_ui=lambda frame: (frame['rating'] - frame['b_ui']).fillna(0))
)

print(all_comb_df.shape)
all_comb_df.head()

(1586126, 7)


Unnamed: 0,user_id,item_id,rating,b_u,b_i,b_ui,r_minus_b_ui
0,1,1,5.0,0.077582,0.292925,3.900367,1.099633
1,2,1,4.0,0.154843,0.292925,3.977628,0.022372
2,3,1,,-0.618944,0.292925,3.203841,0.0
3,4,1,,0.567158,0.292925,4.389943,0.0
4,5,1,4.0,-0.620138,0.292925,3.202647,0.797353


## Часть 4. Item-item CF

Разреженный датафрейм для расчета косинусных расстояний

In [184]:
# Item x user matrix of cleaned ratings (rows: item_id, columns: user_id);
# any cell left empty by the pivot is filled with 0.
residual_matrix = all_comb_df.pivot(index='item_id',
                                    columns='user_id',
                                    values='r_minus_b_ui')
sparse_df = residual_matrix.fillna(0)

print(sparse_df.shape)
sparse_df.head()

(1682, 943)


user_id,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.099633,0.022372,0.0,0.0,0.797353,0.076765,0.0,0.0,0.0,-0.464567,...,-1.984765,-1.135753,-0.025153,0.0,0.416409,0.0,0.0,0.822744,0.0,0.0
2,-0.39441,0.0,0.0,0.0,0.30331,0.0,0.0,0.0,0.0,0.0,...,0.521191,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.795624
3,0.712842,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.588057,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-0.58753,0.0,0.0,0.0,0.0,0.0,1.065194,0.0,0.0,-0.15173,...,1.328072,0.0,0.0,0.0,0.0,0.0,-1.444179,0.0,0.0,0.0
5,-0.448799,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Найдите попарные меры близости (косинус) для всех фильмов, используя очищенные оценки

In [185]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between the rows (movies) of sparse_df,
# labelled on both axes with the movie ids.
movie_ids = sparse_df.index.tolist()
similarity_matrix = cosine_similarity(sparse_df, dense_output=False)
cos_df = pd.DataFrame(similarity_matrix, index=movie_ids, columns=movie_ids)
cos_df

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
1,1.000000,-0.003603,-0.012158,-0.035144,0.039609,0.022780,0.021228,0.052695,-0.004018,-0.015940,...,0.077118,0.0,0.00000,0.00000,0.028896,0.0,0.0,0.0,0.039629,-0.018718
2,-0.003603,1.000000,-0.001984,0.050458,0.002228,-0.003639,-0.016975,0.040146,-0.089526,-0.007177,...,0.000000,0.0,0.00000,0.00000,0.000000,0.0,0.0,0.0,0.022621,0.016415
3,-0.012158,-0.001984,1.000000,-0.113536,-0.005417,0.060713,-0.024408,-0.094746,-0.016745,-0.039964,...,0.000000,0.0,0.00000,0.00000,0.183583,0.0,0.0,0.0,0.000000,0.004880
4,-0.035144,0.050458,-0.113536,1.000000,-0.149507,-0.019089,0.005145,0.110511,0.033087,0.010968,...,0.000000,0.0,-0.11599,-0.11599,0.089827,0.0,0.0,0.0,0.001824,-0.048334
5,0.039609,0.002228,-0.005417,-0.149507,1.000000,-0.031989,-0.015494,0.038203,-0.042447,-0.036512,...,0.000000,0.0,0.00000,0.00000,0.000000,0.0,0.0,0.0,0.000000,0.027241
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1678,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.0,0.00000,0.00000,0.000000,1.0,1.0,1.0,0.000000,0.000000
1679,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.0,0.00000,0.00000,0.000000,1.0,1.0,1.0,0.000000,0.000000
1680,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.0,0.00000,0.00000,0.000000,1.0,1.0,1.0,0.000000,0.000000
1681,0.039629,0.022621,0.000000,0.001824,0.000000,0.000000,0.041703,0.117235,0.041731,0.000000,...,0.000000,0.0,0.00000,0.00000,0.000000,0.0,0.0,0.0,1.000000,0.000000


### 30 ближайших фильмов-соседей для этого фильма (среди всех фильмов, а не фильмов, оценённых пользователем)

Словарь с индексами 30 соседей для каждого фильма (фильтр потом)

In [186]:
# Number of nearest neighbours kept for every movie.
N_NEIGHBORS = 30

# movie id -> [neighbour ids, neighbour similarities], both ordered from the
# most to the least similar neighbour.
neighb_dict = {}

for col in cos_df.columns.tolist():
    similarities = cos_df[col]
    # Exclude the movie itself by label. Skipping "the first element after
    # sorting" is not reliable: other movies can tie with it at similarity 1.0,
    # so the movie is not guaranteed to sort first.
    others = similarities[similarities.index != col]
    # Sort once and reuse the result for both the ids and the values.
    nearest = others.sort_values(ascending=False)[:N_NEIGHBORS]
    neighb_dict[col] = [nearest.index.tolist(), nearest.values.tolist()]

# Show a single entry as a sanity check instead of dumping the whole dictionary.
dict(list(neighb_dict.items())[:1])

{1: [[588,
   933,
   174,
   870,
   928,
   426,
   1117,
   845,
   95,
   539,
   294,
   28,
   71,
   115,
   1606,
   786,
   210,
   1354,
   161,
   742,
   591,
   1199,
   523,
   82,
   819,
   894,
   1275,
   1605,
   926,
   423],
  [0.1854337353592464,
   0.1490537545492982,
   0.1487320778086005,
   0.14828287200080073,
   0.14784792233675637,
   0.14462732148956708,
   0.14396169480587273,
   0.13970744719358047,
   0.1369520528370985,
   0.1358408517049723,
   0.13379638628199336,
   0.12832609936464173,
   0.12520737465843002,
   0.1251636356829603,
   0.12273811110449903,
   0.11992689793769917,
   0.11579520714791984,
   0.113942170567524,
   0.11325108185581573,
   0.11051143099017605,
   0.10936581421211881,
   0.10589450175220687,
   0.10501436645288796,
   0.10394084723833896,
   0.1035753627507202,
   0.0987731941182127,
   0.09667741795114418,
   0.09517154817361967,
   0.09460343019958847,
   0.09413904251116541]],
 2: [[385,
   566,
   1419,
   54,
   233,

Target Vector и Расчет оценок рейтингов для незаполненных

In [187]:
# Cleaned ratings (r - b_ui) of the target user, indexed by item_id.
# Movies the user has not rated hold 0 (filled when the matrix was built).
user_residuals = sparse_df[id_to_predict]
rated_movies_idx = user_residuals[user_residuals != 0].index.tolist()
rated_movies_set = set(rated_movies_idx)  # constant-time membership tests

# Baseline b_ui of the target user for every movie, looked up once instead of
# scanning all_comb_df on every loop iteration.
user_rows = all_comb_df.loc[all_comb_df['user_id'] == id_to_predict]
b_ui_by_item = dict(zip(user_rows['item_id'].values, user_rows['b_ui'].values))

# Only movies without a rating from the target user are scored.
target_vector = user_residuals[user_residuals == 0]
result_list = []

for item in target_vector.index.tolist():

    b_ui = b_ui_by_item[item]

    # All stored neighbours of the movie and their similarities.
    neighbors_idx, neighbors_cos = neighb_dict[item]

    num = 0.0
    denom = 0.0

    # Use only the neighbours that the target user has rated.
    for neighbor, similarity in zip(neighbors_idx, neighbors_cos):
        if neighbor in rated_movies_set:
            num += similarity * user_residuals.loc[neighbor]
            denom += abs(similarity)

    # Fall back to the baseline when there is nothing to average over: either
    # no neighbour was rated, or every rated neighbour has similarity 0. The
    # explicit check also prevents a silent 0.0 / 0.0 = NaN, which would not
    # raise and would break the sort below.
    if denom > 0:
        rating = b_ui + num / denom
    else:
        rating = b_ui

    result_list.append([item, rating])

predicators_top10 = sorted(result_list, key=lambda x: x[1], reverse=True)[:10]
predicators_top10

[[1416, 5.396102257568369],
 [360, 5.278632807602977],
 [1018, 5.246351812217993],
 [989, 5.2457961444931716],
 [1121, 5.103013109732725],
 [190, 5.089691992209969],
 [1011, 5.080877775015453],
 [1097, 5.047774868754669],
 [878, 5.037650226836837],
 [1456, 5.0223189034871725]]

### Рекомендуйте пользователю 10 фильмов (predicators_top10) с самыми высокими оценками из фильмов соседей

Только для соседей с положительной близостью

In [188]:
# Cleaned ratings (r - b_ui) of the target user, indexed by item_id.
# Movies the user has not rated hold 0 (filled when the matrix was built).
user_residuals = sparse_df[id_to_predict]
rated_movies_idx = user_residuals[user_residuals != 0].index.tolist()
rated_movies_set = set(rated_movies_idx)  # constant-time membership tests

# Baseline b_ui of the target user for every movie, looked up once instead of
# scanning all_comb_df on every loop iteration.
user_rows = all_comb_df.loc[all_comb_df['user_id'] == id_to_predict]
b_ui_by_item = dict(zip(user_rows['item_id'].values, user_rows['b_ui'].values))

# Only movies without a rating from the target user are scored.
target_vector = user_residuals[user_residuals == 0]
result_list = []

for item in target_vector.index.tolist():

    b_ui = b_ui_by_item[item]

    # All stored neighbours of the movie and their similarities.
    neighbors_idx, neighbors_cos = neighb_dict[item]

    num = 0.0
    denom = 0.0

    # Use only rated neighbours whose similarity is strictly positive.
    for neighbor, similarity in zip(neighbors_idx, neighbors_cos):
        if similarity > 0 and neighbor in rated_movies_set:
            num += similarity * user_residuals.loc[neighbor]
            denom += abs(similarity)

    # Fall back to the baseline when no rated neighbour with positive
    # similarity exists. An explicit check replaces the bare `except`, which
    # would have hidden any unrelated error as well.
    if denom > 0:
        rating = b_ui + num / denom
    else:
        rating = b_ui

    result_list.append([item, rating])

predicators_positive_top10 = sorted(result_list, key=lambda x: x[1], reverse=True)[:10]
predicators_positive_top10

[[1416, 5.396102257568369],
 [360, 5.278632807602977],
 [1018, 5.246351812217993],
 [989, 5.2457961444931716],
 [1121, 5.103013109732725],
 [190, 5.089691992209969],
 [1011, 5.080877775015453],
 [1097, 5.047774868754669],
 [878, 5.037650226836837],
 [1456, 5.0223189034871725]]

## Результат

In [189]:
# Keep only the movie ids (first element of every [item, rating] pair).
top10_positive_ids = [movie_id for movie_id, _rating in predicators_positive_top10]
top10_ids = [movie_id for movie_id, _rating in predicators_top10]

# Answer payload: the rounded global mean plus the two top-10 id lists.
result_dic = dict(
    average_rating=round(average_rating, 4),
    predicators_positive_top10=top10_positive_ids,
    predicators_top10=top10_ids,
)

In [190]:
# Serialise result_dic to JSON text and write it to ../lab08s.json.
with open('../lab08s.json', 'w') as outfile:
    outfile.write(json.dumps(result_dic))