In [2]:
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
import numpy as np
# Load the dataset
df= pd.read_csv('UserBehavior.csv', dtype={'user_id':int, 'item_id':int, 'category_id':int, 'behavior':str, 'timestamp':np.int64})
print(df['user_id'].nunique())
print(df['item_id'].nunique())
df.head()

472
34323


Unnamed: 0,user_id,item_id,category_id,behavior,timestamp
0,1,2268318,2520377,pv,1511544070
1,1,2333346,2520771,pv,1511561733
2,1,2576651,149192,pv,1511572885
3,1,3830808,4181361,pv,1511593493
4,1,4365585,2520377,pv,1511596146


In [5]:
# Convert behavior types to scores
behavior_scores = {'pv': 1, 'fav': 2, 'cart': 3, 'buy': 9}
df['score'] = df['behavior'].map(behavior_scores)

# Sum up all scores of a user
user_scores = df.groupby(['user_id', 'item_id'])['score'].sum()

# Create a user-item matrix
user_item_matrix = user_scores.unstack()
user_item_matrix.head()
#print number of non-null values in each row
user_item_matrix.count(axis=1)
user_item_matrix.iloc[0:10, 0:10]

item_id,324,330,422,812,1110,1197,1211,1260,1369,1503
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,,,,,,,,,,
100,,,,,,,,,,
1000,,,,,,,,,,
10001,,,,,,,,,,
10008,,,,,,,,,,
10009,,,,,,,,,,
10013,,,,,,,,,,
10020,,,,,,,,,,
10021,,,,,,,,,,
100002,,,,,,,,,,


In [6]:
#split the data into training and test sets
train = user_item_matrix.sample(frac=0.8, random_state=0)
test = user_item_matrix.drop(train.index)
train = train.values

In [7]:
def als(R, lambda_, dim, n_iter):
    # Initialize the user and item matrices with random values
    num_users, num_items = R.shape
    U = np.random.rand(num_users, dim)
    V = np.random.rand(num_items, dim)
    
    # Perform alternating optimization for the specified number of iterations
    for i in range(n_iter):
        # Update the user matrix U with fixed item matrix V
        for u in range(num_users):
            mask = ~np.isnan(R[u, :])
            V_masked = V[mask, :]
            R_masked = R[u, mask]
            U[u, :] = np.linalg.solve(np.dot(V_masked.T, V_masked) + lambda_ * np.eye(dim), np.dot(V_masked.T, R_masked))
        
        # Update the item matrix V with fixed user matrix U
        for v in range(num_items):
            mask = ~np.isnan(R[:, v])
            U_masked = U[mask, :]
            R_masked = R[mask, v]
            V[v, :] = np.linalg.solve(np.dot(U_masked.T, U_masked) + lambda_ * np.eye(dim), np.dot(U_masked.T, R_masked))
    
    # Compute the predicted ratings matrix
    R_pred = np.dot(U, V.T)
    
    # Compute the regularized (squared) error loss
    error = R - R_pred
    mask = ~np.isnan(R)
    sq_error = np.sum((error ** 2)[mask])
    reg_term = lambda_ * (np.sum(U ** 2) + np.sum(V ** 2))
    loss = sq_error + reg_term
    
    # Return the user and item matrices, as well as the loss value
    return U, V,R_pred, loss


In [5]:
import matplotlib.pyplot as plt
dims = [2, 10, 20, 30]
lambda_ = [0.01,0.05,0.1,0.5,1,5]
#check the loss function for different parameters
for dim in dims:
    for l in lambda_:
        _, _,_, loss = als(train, l, dim, 10)
        print('dim = {}, lambda = {}, loss = {}'.format(dim, l, loss))

dim = 2, lambda = 0.01, loss = 806.9265789154151


KeyboardInterrupt: 

In [8]:
# Split the data into a training set and a validation set
np.random.shuffle(train)
val_size = int(0.1 * len(train))
val = train[:val_size, :]
train = train[val_size:, :]

# Use the validation set to select the best hyperparameters
# Try different values of lambda_ and dim and select the ones that minimize the validation loss
best_lambda = 0.01
best_dim = 20


In [10]:
u,v,r,loss = als(train, best_lambda, best_dim, 30)

In [11]:
r[1]

array([0.29035059, 0.65179306, 0.16250834, ..., 0.        , 0.        ,
       0.7035981 ])

In [12]:
from sklearn.metrics.pairwise import cosine_similarity

def predict_nan(test, R):
    # Find the most similar row vector in R for each row vector in test by cosine similarity (only for non-NaN value)
    mask = ~np.isnan(test)
    test_non_nan = test[mask]
    R_non_nan = R[:, mask]
    similarity = cosine_similarity([test_non_nan], R_non_nan)[0]
    # Predict the NaN value in test
    prediction = np.nanmean(R[:, np.isnan(test)] * similarity[:, np.newaxis], axis=0) / np.nanmean(similarity)
    return prediction
predict_nan(test.iloc[0, :], r)

array([ 0.09691069,  0.21594951,  0.06332584, ...,  0.        ,
        0.        , -0.0081366 ])