# Recommender system - final project
## 1. Exploratory data analysis

In [1]:
# imports
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
from pandas.api.types import CategoricalDtype
import random
from scipy.sparse import coo_matrix
import scipy.sparse as sparse
from scipy.sparse.linalg import spsolve
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Load the purchases table: the item bought at the end of each session.
# There is one purchase row per session_id.

train_purchases = pd.read_csv("data/train_purchases.csv")
train_purchases.head()

Unnamed: 0,session_id,item_id,date
0,3,15085,2020-12-18 21:26:47.986
1,13,18626,2020-03-13 19:36:15.507
2,18,24911,2020-08-26 19:20:32.049
3,19,12534,2020-11-02 17:16:45.92
4,24,13226,2020-02-26 18:27:44.114


The items that were viewed in a session. The "date" column is a timestamp with millisecond precision. A session is equal to a day, 
so a session is one user's activity on one day. The session goes up to, but does not include, the first time the user viewed 
the item that they bought in the end. The last item in the session will be the last item viewed before viewing the item that they bought. 
To find the item they bought, link to train_purchases.csv on session_id.

In [None]:
# Load the per-session item views; each row links to its purchase in
# train_purchases through session_id.
train_sessions = pd.read_csv("data/train_sessions.csv")
train_sessions.head()

The label data of items. A feature_category_id represents an aspect of the item such as "colour"; the feature_value_id is the value for that aspect, 
e.g. "blue". Some items may not share many feature_category_ids if they are different types of items; for example, trousers will share almost 
nothing with shirts. Even things like colour will not be shared: the colour aspect for trousers and the colour aspect for shirts are two different feature_category_ids.

## 2. Matrix factorization with implicit feedback

### Create training/test split

In [None]:
# Attach an interaction weight to every row: a purchase is worth 50.0,
# a single view is worth 1.0.
train_purchases['action'] = np.full(len(train_purchases), 50.0)
train_sessions['action'] = np.full(len(train_sessions), 1.0)

In [None]:
# Stack purchases and views into a single interaction log.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat produces the same row-wise result on every pandas version.
df = pd.concat([train_purchases, train_sessions])

In [None]:
# Order the log by session and discard the timestamp column.
df = df.sort_values('session_id').drop(columns='date')
df.head()

In [None]:
# Hold out sessions for evaluation. The unique session ids are shuffled with
# a fixed seed so that Restart & Run All reproduces the same split; the first
# N_TRAIN_SESSIONS ids go to training and the remainder to testing.
SPLIT_SEED = 42
N_TRAIN_SESSIONS = 900000

users = np.random.default_rng(SPLIT_SEED).permutation(df['session_id'].unique())
users_train = users[:N_TRAIN_SESSIONS]
users_test = users[N_TRAIN_SESSIONS:]

In [None]:
# Training interactions: every row whose session was assigned to the train split.
in_train = df['session_id'].isin(users_train)
df_train = df.loc[in_train]
df_train.head()

In [None]:
# Views belonging to the held-out sessions. The explicit copy detaches the
# result from train_sessions, so later in-place edits of df_test do not act
# on a slice (which triggers SettingWithCopyWarning).
df_test = train_sessions[train_sessions['session_id'].isin(users_test)].copy()

In [None]:
# Keep only the (session, item) pairs of the held-out sessions. Rebinding the
# name instead of dropping in place avoids mutating a slice of
# train_sessions, which raises SettingWithCopyWarning.
df_test = df_test.drop(columns=['action', 'date'])
df_test.head()

In [None]:
# Preview the training interactions again before the columns are renamed.
df_train.head()

In [None]:
# Positional rename: this requires df_train to have exactly three columns.
# NOTE(review): presumably in the order session_id, item_id, action - confirm.
df_train.columns = ['user', 'item', 'rating']

In [None]:
# Collapse repeated (user, item) rows into one, summing the remaining columns.
# The result is indexed by (user, item).
df_train = df_train.groupby(['user', 'item']).sum()

In [None]:
# Show the (user, item) pairs with the largest summed rating.
df_train.sort_values('rating', ascending=False).head()

In [None]:
# Turn the (user, item) index levels back into ordinary columns.
df_train = df_train.reset_index()

In [None]:
# Confidence grows linearly with the summed rating: c = 1 + 40 * rating.
df_train['confidence'] = 1 + 40 * df_train['rating']

In [None]:
# The raw rating is no longer needed once the confidence has been derived.
df_train = df_train.drop(columns='rating')

In [None]:
# Binary preference flag: 0 when the confidence is exactly 41.0, 1 otherwise.
df_train['preference'] = (df_train['confidence'] != 41.0).astype('int64')

In [None]:
# Positional rename: this requires df_test to have exactly two columns.
df_test.columns = ['user', 'item']
df_test.head()

In [None]:
# Matrix factorization test

In [None]:
# Drop the preference flag, then rename the remaining three columns
# positionally to the names used by the matrix-building cells.
df_train = df_train.drop(columns='preference')
df_train.columns = ['user_id', 'item_id', 'rating']

In [None]:
# Shrink the problem: keep only the first 3% of distinct items, in order of
# first appearance, and the interactions that involve them.
unique_items = df_train.item_id.unique()
n_keep = int(np.round(len(unique_items) * 0.03))
item_split = list(unique_items)[:n_keep]

df_train = df_train[df_train['item_id'].isin(item_split)]

In [None]:
# Check how many interactions remain after the item subsample.
df_train.shape

In [None]:
# create a sparse matrix

users = df_train["user_id"].unique()
items = df_train["item_id"].unique()

# Create indices for users and items: each raw id is mapped to its position
# in the sorted list of ids, giving contiguous row/column numbers.
user_cat = CategoricalDtype(categories=sorted(users), ordered=True)
item_cat = CategoricalDtype(categories=sorted(items), ordered=True)
user_index = df_train["user_id"].astype(user_cat).cat.codes
item_index = df_train["item_id"].astype(item_cat).cat.codes

# Conversion via COO matrix.
# The first word of each name is the row axis: sparse_user_item is
# users x items (recommend() indexes it by user row) and sparse_item_user is
# items x users (the matrix handed to the ALS fit).
coo = coo_matrix((df_train["rating"], (user_index, item_index)), shape=(len(users), len(items)))
sparse_user_item = coo.tocsr()
coo = coo_matrix((df_train["rating"], (item_index, user_index)), shape=(len(items), len(users)))
sparse_item_user = coo.tocsr()

In [None]:
# NOTE: most of these imports repeat the notebook's first cell; only `sys`
# and `implicit` are new here.
import sys
import pandas as pd
import numpy as np
import scipy.sparse as sparse
from scipy.sparse.linalg import spsolve
import random

from sklearn.preprocessing import MinMaxScaler

import implicit # The Cython library

# Initialize the als model and fit it using the sparse item-user matrix
# NOTE(review): which orientation fit() expects depends on the installed
# `implicit` release - confirm that sparse_item_user matches it.
model = implicit.als.AlternatingLeastSquares(factors=20, regularization=0.1, iterations=20)

# Calculate the confidence by multiplying it by our alpha value.
alpha_val = 15
data_conf = (sparse_item_user * alpha_val).astype('double')

# Fit the model
model.fit(data_conf)

In [None]:
def recommend(user_id, sparse_user_item, user_vecs, item_vecs, num_items=10, item_lookup=None):
    """Rank the items a user has not interacted with yet.

    Args:
        user_id (int): Row index of the user in ``sparse_user_item`` and
            ``user_vecs`` (a matrix position, not a raw id).

        sparse_user_item (csr_matrix): Sparse user-by-item interaction matrix.

        user_vecs (csr_matrix): User factors of size users-by-features.

        item_vecs (csr_matrix): Item factors of size items-by-features.

        num_items (int): How many recommendations to return.

        item_lookup (sequence or mapping, optional): Translates an item
            column index into the label to report, e.g. the sorted list of
            raw item ids used to build the matrix. When omitted, the column
            indices themselves are reported.

    Returns:
        pd.DataFrame: Columns ``items`` and ``score``, best match first.
        Scores are min-max scaled to [0, 1]; items the user already
        interacted with are scored 0.
    """

    # Build a mask over items: adding 1 turns "no interaction" (0) into 1,
    # and anything that ends up above 1 (an existing interaction) is zeroed.
    user_interactions = sparse_user_item[user_id, :].toarray()
    user_interactions = user_interactions.reshape(-1) + 1
    user_interactions[user_interactions > 1] = 0

    # Predicted affinity of this user for every item.
    rec_vector = user_vecs[user_id, :].dot(item_vecs.T).toarray()

    # Scale to [0, 1], then multiply by the mask to remove seen items.
    min_max = MinMaxScaler()
    rec_vector_scaled = min_max.fit_transform(rec_vector.reshape(-1, 1))[:, 0]
    recommend_vector = user_interactions * rec_vector_scaled

    # Indices of the highest scores, best first.
    item_idx = np.argsort(recommend_vector)[::-1][:num_items]

    # Report either the raw column index or its translated label.
    if item_lookup is None:
        items = [int(idx) for idx in item_idx]
    else:
        items = [item_lookup[idx] for idx in item_idx]
    scores = [recommend_vector[idx] for idx in item_idx]

    recommendations = pd.DataFrame({'items': items, 'score': scores})

    return recommendations

# Get the trained user and item vectors. We convert them to 
# csr matrices to work with our previous recommend function.
user_vecs = sparse.csr_matrix(model.user_factors)
item_vecs = sparse.csr_matrix(model.item_factors)

# Create recommendations for the user at row index 3 of the matrix
# (recommend() uses this as a matrix row position, not a raw session id).
user_id = 3

recommendations = recommend(user_id, sparse_user_item, user_vecs, item_vecs)

# print(recommendations)

In [None]:
# Inspect the held-out test interactions.
df_test

In [None]:
# Brute-force cosine nearest-neighbour index, 50 neighbours by default.
# NOTE(review): `csr` is not defined in any cell shown here - confirm which
# matrix it refers to and that it exists on a fresh kernel.
knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=50)
knn.fit(csr)

In [None]:
# Collect nearest-neighbour candidates for the first 100 test users.
df_users_test = df_test['user'].unique()[:100]
answers = []

for x, user in enumerate(df_users_test):

    # Seed items: at most the first 20 items recorded for this user.
    filter1 = df_test.loc[df_test['user'] == user, 'item'].tolist()[:20]

    distances1 = []
    indices1 = []
    for i in filter1:
        distances, indices = knn.kneighbors(csr[i], n_neighbors=10)
        # Drop the first neighbour (presumably the query row itself) and
        # keep the other nine.
        indices = indices.flatten()[1:]
        indices1.extend(indices)

    answers.append(indices1)

In [None]:
def implicit_als(sparse_data, alpha_val=40, iterations=10, lambda_val=0.1, features=10):

    """ Implementation of Alternating Least Squares with implicit data. We iteratively
    compute the user (x_u) and item (y_i) vectors using the following formulas:

    x_u = ((Y.T*Y + Y.T*(Cu - I) * Y) + lambda*I)^-1 * (Y.T * Cu * p(u))
    y_i = ((X.T*X + X.T*(Ci - I) * X) + lambda*I)^-1 * (X.T * Ci * p(i))

    Cu and Ci are diagonal confidence matrices (1 + alpha * interaction) and
    p(u), p(i) are the binary preference vectors (1 where an interaction exists).

    The initial factors are drawn with np.random.normal, so seed NumPy's
    global generator beforehand if reproducible output is needed.

    Args:
        sparse_data (csr_matrix): Our sparse user-by-item matrix

        alpha_val (int): The rate in which we'll increase our confidence
        in a preference with more interactions.

        iterations (int): How many times we alternate between fixing and
        updating our user and item vectors

        lambda_val (float): Regularization value

        features (int): How many latent features we want to compute.

    Returns:
        X (csr_matrix): user vectors of size users-by-features

        Y (csr_matrix): item vectors of size items-by-features
     """

    # Calculate the confidence for each value in our data
    confidence = sparse_data * alpha_val

    # Get the size of user rows and item columns
    user_size, item_size = sparse_data.shape

    # We create the user vectors X of size users-by-features, the item vectors
    # Y of size items-by-features and randomly assign the values.
    X = sparse.csr_matrix(np.random.normal(size=(user_size, features)))
    Y = sparse.csr_matrix(np.random.normal(size=(item_size, features)))

    # Precompute I and lambda * I
    X_I = sparse.eye(user_size)
    Y_I = sparse.eye(item_size)

    I = sparse.eye(features)
    lI = lambda_val * I

    # Start main loop. For each iteration we first compute X and then Y
    for iteration in range(iterations):
        print('iteration %d of %d' % (iteration + 1, iterations))

        # --- User step: Y is held fixed -------------------------------------
        yTy = Y.T.dot(Y)

        for u in range(user_size):

            # Get the user row.
            u_row = confidence[u, :].toarray()

            # Calculate the binary preference p(u)
            p_u = u_row.copy()
            p_u[p_u != 0] = 1.0

            # Calculate Cu - I (the scaled interactions) and Cu
            CuI = sparse.diags(u_row, [0])
            Cu = CuI + Y_I

            # Put it all together and compute the final formula
            yT_CuI_y = Y.T.dot(CuI).dot(Y)
            yT_Cu_pu = Y.T.dot(Cu).dot(p_u.T)
            X[u] = spsolve(yTy + yT_CuI_y + lI, yT_Cu_pu)

        # --- Item step: X is held fixed -------------------------------------
        # X.T*X must be taken from the user vectors that were just updated;
        # computing it before the user step would mix old and new X in the
        # item solve.
        xTx = X.T.dot(X)

        # A separate loop name keeps the iteration counter above untouched.
        for item in range(item_size):

            # Get the item column and transpose it.
            i_row = confidence[:, item].T.toarray()

            # Calculate the binary preference p(i)
            p_i = i_row.copy()
            p_i[p_i != 0] = 1.0

            # Calculate Ci - I (the scaled interactions) and Ci
            CiI = sparse.diags(i_row, [0])
            Ci = CiI + X_I

            # Put it all together and compute the final formula
            xT_CiI_x = X.T.dot(CiI).dot(X)
            xT_Ci_pi = X.T.dot(Ci).dot(p_i.T)
            Y[item] = spsolve(xTx + xT_CiI_x + lI, xT_Ci_pi)

    return X, Y