# Recommender system - final project
## 1. Data cleaning

In [366]:
# imports
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
from pandas.api.types import CategoricalDtype
import random
from scipy.sparse import coo_matrix
import scipy.sparse as sparse
from scipy.sparse.linalg import spsolve
from sklearn.preprocessing import MinMaxScaler
import sys
import implicit

In [367]:
train_purchases = pd.read_csv("data/train_purchases.csv")
train_purchases.head()

Unnamed: 0,session_id,item_id,date
0,3,15085,2020-12-18 21:26:47.986
1,13,18626,2020-03-13 19:36:15.507
2,18,24911,2020-08-26 19:20:32.049
3,19,12534,2020-11-02 17:16:45.92
4,24,13226,2020-02-26 18:27:44.114


In [368]:
train_sessions = pd.read_csv("data/train_sessions.csv")
train_sessions.head()

Unnamed: 0,session_id,item_id,date
0,3,9655,2020-12-18 21:25:00.373
1,3,9655,2020-12-18 21:19:48.093
2,13,15654,2020-03-13 19:35:27.136
3,18,18316,2020-08-26 19:18:30.833
4,18,2507,2020-08-26 19:16:31.211


In [369]:
train_purchases['action'] = np.repeat(20.0, train_purchases.shape[0])
train_sessions['action'] = np.ones(train_sessions.shape[0])

In [370]:
users_train = sorted(train_purchases['session_id'].unique())[:950000]

In [371]:
filtered_train_purchases = train_purchases[train_purchases['session_id'].isin(users_train)]

In [372]:
df = filtered_train_purchases.append(train_sessions)

In [373]:
df.sort_values('session_id', inplace=True)
df.drop('date', axis=1, inplace=True)
df.head()

Unnamed: 0,session_id,item_id,action
0,3,15085,20.0
1,3,9655,1.0
0,3,9655,1.0
1,13,18626,20.0
2,13,15654,1.0


In [374]:
df = df.groupby(['session_id', 'item_id']).sum()

In [375]:
df.reset_index(inplace=True)

In [376]:
df.columns = ['user_id', 'item_id', 'rating']

In [377]:
df.sort_values('rating', ascending=False, inplace=True)

In [378]:
df.head()

Unnamed: 0,user_id,item_id,rating
1753107,1523169,24454,97.0
3044105,2642287,27807,76.0
1803365,1567039,773,49.0
1439855,1252954,26478,48.0
1531104,1331858,16769,46.0


### 2. Model creation - matrix factorization

In [379]:
# create a sparse matrix for the training set

users = df["user_id"].unique()
items = df["item_id"].unique()

# Create indices for users and items
user_cat = CategoricalDtype(categories=sorted(users), ordered=True)
item_cat = CategoricalDtype(categories=sorted(items), ordered=True)
user_index = df["user_id"].astype(user_cat).cat.codes
item_index = df["item_id"].astype(item_cat).cat.codes

# Conversion via COO matrix
coo = coo_matrix((df["rating"], (user_index, item_index)), shape=(len(users), len(items)))
sparse_item_user = coo.tocsr()
coo = coo_matrix((df["rating"], (item_index, user_index)), shape=(len(items), len(users)))
sparse_user_item = coo.tocsr()

In [380]:
#sparse_item_user_train = sparse_item_user[:950000]
#sparse_item_user_test = sparse_item_user[950000:]

In [381]:
#sparse_item_user_train

In [382]:
#sparse_item_user_test

In [383]:
# Initialize the als model and fit it using the sparse item-user matrix
model = implicit.als.AlternatingLeastSquares(factors=20, regularization=0.2, iterations=20, random_state=42)

# Calculate the confidence by multiplying it by our alpha value.
alpha_val = 20
data_conf = (sparse_item_user * alpha_val).astype('double')

# Fit the model
model.fit(data_conf)

  0%|          | 0/20 [00:00<?, ?it/s]

### 3. Creating the recommendations

In [384]:
final_recommendations_ids = []

In [385]:
users_test = sorted(users)[950000:]

In [386]:
for i in range(len(users_test)):
    user = i + 950000
    user_id_interactions = sparse_item_user[user]
    
    recommended = model.recommend(user, user_id_interactions, N=100)

    item_ids = recommended[0]
    final_recommendations_ids.append(item_ids)

In [387]:
indexes = []

In [388]:
for i in range(len(final_recommendations_ids)):
    _list = final_recommendations_ids[0]
    user_id = users_test[i]
    real_recommendation = train_purchases[train_purchases['session_id'] == user_id]['item_id'].values[0]
    location = np.where(_list == real_recommendation)
    if(len(location[0])>0):
        index = location[0][0]+1
    else:
        index = 0
    indexes.append(index)

In [389]:
from collections import Counter
results = dict(Counter(indexes))

In [390]:
sum = 0
for key, value in results.items():
    if(key != 0):
        sum += (1/key) * value
result = sum/50000

In [391]:
"{:.8f}".format(float(result))

'0.00013346'