In [1]:
import gzip
import json

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from  sklearn.metrics.pairwise import pairwise_distances

from math import sqrt



def parse(path):
  g = gzip.open(path, 'rb')
  for l in g:
    yield json.loads(l)

def getDF(path):
  i = 0
  df = {}
  for d in parse(path):
    df[i] = d
    i += 1
  return pd.DataFrame.from_dict(df, orient='index')




In [2]:
df = getDF('Software_5.json.gz')
df.head()

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,vote,image
0,4.0,False,"10 20, 2010",A38NELQT98S4H8,321719816,{'Format:': ' DVD-ROM'},WB Halper,I've been using Dreamweaver (and it's predeces...,A solid overview of Dreamweaver CS5,1287532800,,
1,4.0,False,"10 18, 2010",A3QJU4FEN8PQSZ,321719816,{'Format:': ' DVD-ROM'},Grimmy,"The demo is done with the PC version, with ref...",A good value,1287360000,,
2,5.0,False,"10 16, 2010",ACJT8MUC0LRF0,321719816,{'Format:': ' DVD-ROM'},D. Fowler,If you've been wanting to learn how to create ...,This is excellent software for those who want ...,1287187200,3.0,
3,5.0,False,"10 12, 2010",AYUF7YETYOLNX,321719816,{'Format:': ' DVD-ROM'},Bryan Newman,I've been creating websites with Dreamweaver f...,A Fantastic Overview of Dream Weaver and Web D...,1286841600,,
4,5.0,False,"10 7, 2010",A31ICLWQ9CSHRS,321719816,{'Format:': ' DVD-ROM'},Al Swanson,I decided (after trying a number of other prod...,Excellent Tutorials!,1286409600,,


In [3]:
#выбираем нужные столбцы и переименовываем для удобства
df = df[['overall','reviewerID','asin']]
df.rename(columns={'overall': 'rating', 'reviewerID': 'user','asin':'product'}, inplace=True)
df

Unnamed: 0,rating,user,product
0,4.0,A38NELQT98S4H8,0321719816
1,4.0,A3QJU4FEN8PQSZ,0321719816
2,5.0,ACJT8MUC0LRF0,0321719816
3,5.0,AYUF7YETYOLNX,0321719816
4,5.0,A31ICLWQ9CSHRS,0321719816
...,...,...,...
12800,4.0,A1E50L7PCVXLN4,B01FFVDY9M
12801,3.0,AVU1ILDDYW301,B01HAP3NUG
12802,4.0,A2LW5AL0KQ9P1M,B01HAP3NUG
12803,3.0,AZ515FFZ7I2P7,B01HAP47PQ


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12805 entries, 0 to 12804
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   rating   12805 non-null  float64
 1   user     12805 non-null  object 
 2   product  12805 non-null  object 
dtypes: float64(1), object(2)
memory usage: 400.2+ KB


## EDA

In [5]:
from plotly.offline import init_notebook_mode, plot, iplot
import plotly.graph_objs as go
init_notebook_mode(connected=True)

data = df['rating'].value_counts().sort_index(ascending=False)
trace = go.Bar(x = data.index,
               text = ['{:.1f} %'.format(val) for val in (data.values / df.shape[0] * 100)],
               textposition = 'auto',
               textfont = dict(color = '#000000'),
               y = data.values,
               )
# Create layout
layout = dict(title = 'Распределение {} рейтингов'.format(df.shape[0]),
              xaxis = dict(title = 'Рейтинг'),
              yaxis = dict(title = 'Количество'))
# Create plot
fig = go.Figure(data=[trace], layout=layout)
iplot(fig)

In [6]:
data = df.groupby('product')['rating'].count().clip(upper=50)

trace = go.Histogram(x = data.values,
                     name = 'Ratings',
                     xbins = dict(start = 0,
                                  end = 50,
                                  size = 2))
layout = go.Layout(title = 'Распределение рейтингов по продуктам',
                   xaxis = dict(title = 'Число рейтингов по продукту'),
                   yaxis = dict(title = 'Количество'),
                   bargap = 0.2)

fig = go.Figure(data=[trace], layout=layout)
iplot(fig)

In [7]:
data = df.groupby('user')['rating'].count().clip(upper=50)

trace = go.Histogram(x = data.values,
                     name = 'Ratings',
                     xbins = dict(start = 0,
                                  end = 50,
                                  size = 2))
layout = go.Layout(title = 'Распределение рейтингов по юзерам',
                   xaxis = dict(title = 'Рейтинг по юзеру'),
                   yaxis = dict(title = 'Количество'),
                   bargap = 0.2)

fig = go.Figure(data=[trace], layout=layout)
iplot(fig)

In [8]:
print('Unique users count: {}'.format(len(df['user'].unique())))
print('Unique movies count: {}'.format(len(df['product'].unique())))
print('DataFrame shape: {}'.format(df.shape))

Unique users count: 1826
Unique movies count: 802
DataFrame shape: (12805, 3)


Подготовим данные в виде матрицы, для этого пронумеруем продукты и юзеров и добавим столбцы

In [9]:
products = df['product'].unique()
users = df['user'].unique()
# products


def scale_product(item):
    scaled = np.where(products == item)[0][0] + 1
    return scaled

def scale_user(item):
    scaled = np.where(users == item)[0][0] + 1
    return scaled

df['item_num'] = df['product'].apply(scale_product)
df['user_num'] = df['user'].apply(scale_user)
df

Unnamed: 0,rating,user,product,item_num,user_num
0,4.0,A38NELQT98S4H8,0321719816,1,1
1,4.0,A3QJU4FEN8PQSZ,0321719816,1,2
2,5.0,ACJT8MUC0LRF0,0321719816,1,3
3,5.0,AYUF7YETYOLNX,0321719816,1,4
4,5.0,A31ICLWQ9CSHRS,0321719816,1,5
...,...,...,...,...,...
12800,4.0,A1E50L7PCVXLN4,B01FFVDY9M,776,15
12801,3.0,AVU1ILDDYW301,B01HAP3NUG,778,564
12802,4.0,A2LW5AL0KQ9P1M,B01HAP3NUG,778,1386
12803,3.0,AZ515FFZ7I2P7,B01HAP47PQ,779,929


In [10]:
train_data, test_data = train_test_split(df, test_size=0.2)

print('Train shape: {}'.format(train_data.shape))
print('Test shape: {}'.format(test_data.shape))

Train shape: (10244, 5)
Test shape: (2561, 5)


In [11]:
def rmse(prediction, ground_truth):
    # Оставим оценки, предсказанные алгоритмом, только для соотвествующего набора данных
    prediction = np.nan_to_num(prediction)[ground_truth.nonzero()].flatten()
    # Оставим оценки, которые реально поставил пользователь, только для соотвествующего набора данных
    ground_truth = np.nan_to_num(ground_truth)[ground_truth.nonzero()].flatten()
    
    mse = mean_squared_error(prediction, ground_truth)
    return sqrt(mse)

Собрираем матрицы для тренировочных и тестовых данных

In [12]:
n_users = len(df['user'].unique())
n_products = len(df['product'].unique())

train_data_matrix = np.zeros((n_users, n_products))
train_data_matrix
for line in train_data.itertuples():
    train_data_matrix[line[5] - 1, line[4] - 1] = line[1]
    
test_data_matrix = np.zeros((n_users, n_products))
for line in test_data.itertuples():
    test_data_matrix[line[5] - 1, line[4] - 1] = line[1]

In [13]:
train_data_matrix

array([[4., 0., 0., ..., 0., 0., 0.],
       [4., 0., 4., ..., 0., 0., 0.],
       [5., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [14]:
user_similarity = pairwise_distances(train_data_matrix, metric='cosine')
product_similarity = pairwise_distances(train_data_matrix.T, metric='cosine')

## Обучаем модели

### Наивные рекомедации

In [15]:
def naive_predict(top):

    top_similar_ratings = np.zeros((n_users, top, n_products))

    for i in range(n_users):
        top_sim_users = user_similarity[i].argsort()[1:top + 1]
        top_similar_ratings[i] = train_data_matrix[top_sim_users]

    pred = np.zeros((n_users, n_products))
    for i in range(n_users):
        pred[i] = top_similar_ratings[i].sum(axis=0) / top
    
    return pred


def naive_predict_item(top):
    top_similar_ratings = np.zeros((n_products, top, n_users))

    for i in range(n_products):
        top_sim_products = product_similarity[i].argsort()[1:top + 1]
        top_similar_ratings[i] = train_data_matrix.T[top_sim_products]
        
    pred = np.zeros((n_products, n_users))
    for i in range(n_products):
        pred[i] = top_similar_ratings[i].sum(axis=0) / top
    
    return pred.T

naive_pred = naive_predict(7)
print('User-based CF RMSE: ', rmse(naive_pred, test_data_matrix))

naive_pred_item = naive_predict_item(7)
print('Item-based CF RMSE: ', rmse(naive_pred_item, test_data_matrix))

User-based CF RMSE:  3.8465073717989666
Item-based CF RMSE:  3.741528704343827


### Рекомендации с учётом средних оценок похожих пользователей

In [16]:
def k_fract_predict(top):
    top_similar = np.zeros((n_users, top))
    
    for i in range(n_users):
        user_sim = user_similarity[i]
        top_sim_users = user_sim.argsort()[1:top + 1]#[-top:]

        for j in range(top):
            top_similar[i, j] = top_sim_users[j]
            
    abs_sim = np.abs(user_similarity)
    pred = np.zeros((n_users, n_products))
    
    for i in range(n_users):
        indexes = top_similar[i].astype(np.int)
        numerator = user_similarity[i][indexes]
        
        product = numerator.dot(train_data_matrix[indexes])
        
        denominator = abs_sim[i][top_similar[i].astype(np.int)].sum()
        
        pred[i] = product / denominator
    
    return pred


def k_fract_predict_item(top):
    flag = True
    top_similar = np.zeros((n_products, top))
    
    for i in range(n_products):
        products_sim = product_similarity[i]
        top_sim_products = products_sim.argsort()[1:top + 1]

        for j in range(top):
            top_similar[i, j] = top_sim_products.T[j]
            
    abs_sim = np.abs(product_similarity)
    pred = np.zeros((n_products, n_users))
    
    
    for i in range(n_products):
        indexes = top_similar[i].astype(np.int)
        numerator = product_similarity[i][indexes]
        
        product = numerator.dot(train_data_matrix.T[indexes])
        
        denominator = abs_sim[i][indexes].sum()
        denominator = denominator if denominator != 0 else 1
        
        pred[i] = product / denominator
        
    return pred.T


k_predict = k_fract_predict(7)
print('User-based CF RMSE: ', rmse(k_predict, test_data_matrix))

k_predict_item = k_fract_predict_item(7)
print('Item-based CF RMSE: ', rmse(k_predict_item, test_data_matrix))

User-based CF RMSE:  3.8716772091616045
Item-based CF RMSE:  3.861062956880518



invalid value encountered in true_divide



### Рекомендации на основе средних оценок пользователей и матрицы “похожести”

In [17]:
def k_fract_mean_predict(top):
    top_similar = np.zeros((n_users, top))
    
    for i in range(n_users):
        user_sim = user_similarity[i]
        top_sim_users = user_sim.argsort()[1:top + 1]

        for j in range(top):
            top_similar[i, j] = top_sim_users[j]
            
    abs_sim = np.abs(user_similarity)
    pred = np.zeros((n_users, n_products))
    
    for i in range(n_users):
        indexes = top_similar[i].astype(np.int)
        numerator = user_similarity[i][indexes]
        
        mean_rating = np.array([x for x in train_data_matrix[i] if x > 0]).mean()
        diff_ratings = train_data_matrix[indexes] - train_data_matrix[indexes].mean()
        numerator = numerator.dot(diff_ratings)
        denominator = abs_sim[i][top_similar[i].astype(np.int)].sum()
        
        pred[i] = mean_rating + numerator / denominator
        
    return pred

def k_fract_mean_predict_item(top):
    top_similar = np.zeros((n_products, top))
    
    for i in range(n_products):
        movie_sim = product_similarity[i]
        top_sim_products = movie_sim.argsort()[1:top + 1]
        
        for j in range(top):
            top_similar[i, j] = top_sim_products[j]
    
    abs_sim = np.abs(product_similarity)
    pred = np.zeros((n_products, n_users))
    
    for i in range(n_products):
        indexes = top_similar[i].astype(np.int)
        numerator = product_similarity[i][indexes]
        
        diff_ratings = train_data_matrix.T[indexes] - train_data_matrix.T[indexes].mean()
        numerator = numerator.dot(diff_ratings)
        denominator = abs_sim[i][top_similar[i].astype(np.int)].sum()
        denominator = denominator if denominator != 0 else 1
        
        mean_rating = np.array([x for x in train_data_matrix.T[i] if x > 0]).mean()
        mean_rating = 0 if np.isnan(mean_rating) else mean_rating
        pred[i] = mean_rating + numerator / denominator
                
    return pred.T

k_predict = k_fract_mean_predict(7)
print('User-based CF RMSE: ', rmse(k_predict, test_data_matrix))

k_predict_item = k_fract_mean_predict_item(7)
print('Item-based CF RMSE: ', rmse(k_predict_item, test_data_matrix))


Mean of empty slice.


invalid value encountered in double_scalars


invalid value encountered in true_divide



User-based CF RMSE:  1.4809146677156997



Mean of empty slice.


invalid value encountered in double_scalars



Item-based CF RMSE:  1.290638814108221


Делаем вывод что Рекомендации на основе средних оценок пользователей и матрицы “похожести” в нашем случае работают лучше всего