In [1]:
import numpy as np
from tqdm import tqdm
from sklearn.linear_model import ElasticNet
import pandas as pd
from scipy.sparse import csr_matrix, lil_matrix, save_npz, load_npz
from joblib import Parallel, delayed
import csv
from sklearn.metrics import mean_squared_error


In [2]:
# Directory holding the input CSV files (a relative path, resolved against the working directory).
DATA_DIR = 'data'
# Training ratings table; later cells read its 'userId', 'movieId' and 'rating' columns.
train_ratings = pd.read_csv(f'{DATA_DIR}/train_ratings.csv')

In [3]:
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds
from sklearn.metrics import mean_squared_error
import numpy as np

def create_user_item_matrix(df):
    """Build a sparse user x item rating matrix from a long-format ratings table.

    :param df: DataFrame with 'userId', 'movieId' and 'rating' columns.
    :return: Tuple ``(matrix, user_id_to_idx, movie_id_to_idx)``. ``matrix`` is a CSR
        matrix of shape (n_users, n_movies); the two dicts map raw ids to their row /
        column position. Positions follow the order in which ids first appear in ``df``.

    Note: if ``df`` contains the same (userId, movieId) pair more than once, the
    COO-style ``csr_matrix`` constructor sums those ratings into a single cell.
    """
    # unique() keeps first-appearance order, so the numbering is stable for a given df.
    distinct_users = df['userId'].unique()
    distinct_movies = df['movieId'].unique()

    user_id_to_idx = dict(zip(distinct_users, range(len(distinct_users))))
    movie_id_to_idx = dict(zip(distinct_movies, range(len(distinct_movies))))

    # Translate every rating row into its (row, column) coordinate in the matrix.
    row_positions = df['userId'].map(user_id_to_idx)
    col_positions = df['movieId'].map(movie_id_to_idx)
    matrix_shape = (len(distinct_users), len(distinct_movies))

    matrix = csr_matrix((df['rating'], (row_positions, col_positions)), shape=matrix_shape)
    return matrix, user_id_to_idx, movie_id_to_idx

# Build the sparse user-item rating matrix for the training data, keeping the
# raw-id -> row index and raw-id -> column index lookups alongside it.
train_matrix, train_user_id_to_idx, train_movie_id_to_idx = create_user_item_matrix(train_ratings)

In [4]:
# Densify the sparse rating matrix; (user, item) pairs without a rating become explicit 0.0.
# NOTE(review): train_matrix is rebound from sparse to dense, so re-running this cell on
# its own fails (an ndarray has no .toarray()).
train_matrix = train_matrix.toarray()

In [5]:
# Rating-scale bounds and the overall mean rating of the training set.
# The pandas reductions are vectorised and skip NaN; the Python builtins min()/max()
# iterate element by element and give order-dependent results when NaN is present.
# float() keeps the bounds as plain Python floats.
min_rating = float(train_ratings['rating'].min())
max_rating = float(train_ratings['rating'].max())
global_mean = train_ratings['rating'].mean()
min_rating

0.5

In [6]:
# Rescale the ratings by the width of the rating scale (max_rating - min_rating).
# NOTE(review): min_rating is not subtracted, so this is a pure rescale rather than a
# min-max normalisation; with min_rating > 0 the top rating maps above 1.0 — confirm intent.
# NOTE(review): the cell rebinds train_matrix, so running it twice rescales twice.
train_matrix = (train_matrix)/(max_rating - min_rating)

In [7]:
# Preview the rescaled matrix (NumPy abbreviates large arrays in its repr).
train_matrix

array([[0.66666667, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.88888889, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.66666667, ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [8]:
# Create batches for ElasticNet training: contiguous half-open (start, stop) ranges over
# the item columns of train_matrix, each at most `increment` columns wide. The last
# batch is clipped to the number of columns.
increment = 3000
batches = [
    (start, min(start + increment, train_matrix.shape[1]))
    for start in range(0, train_matrix.shape[1], increment)
]
batches

[(0, 3000), (3000, 6000), (6000, 8983)]

In [None]:
def train_slim_parallel(data, l1_reg=0.001, l2_reg=0.001, n_jobs=-1, batch_size=3000):
    """
    Train a SLIM item-item weight matrix.

    For every item column j, an ElasticNet regression (non-negative coefficients, no
    intercept) is fitted that predicts column j from the full matrix with column j
    zeroed out, so an item cannot be used to predict itself. The fitted coefficient
    vector becomes column j of W.

    Items are processed in contiguous batches of ``batch_size`` columns. After each batch
    the weights learned so far are written to ``npz_result_<batch end>.npz`` in the
    current working directory.

    :param data: Dense 2-D user-rating array (rows are users, columns are items).
    :param l1_reg: L1 regularization parameter.
    :param l2_reg: L2 regularization parameter.
    :param n_jobs: Number of parallel jobs (forwarded to joblib.Parallel).
    :param batch_size: Number of item columns fitted between two checkpoints.
    :return: Weight matrix W as a CSR matrix of shape (n_items, n_items).
    :raises ValueError: If batch_size is not positive or l1_reg + l2_reg is not positive.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    # ElasticNet is parameterised by an overall strength and an L1 share; a zero total
    # would make the share undefined (division by zero), so reject it up front.
    alpha = l1_reg + l2_reg
    if alpha <= 0:
        raise ValueError("l1_reg + l2_reg must be positive")
    l1_ratio = l1_reg / alpha

    n_items = data.shape[1]
    sim_matrix = lil_matrix((n_items, n_items))

    def train_item(matrix, item):
        """Fit the regression for one item column and return its coefficient vector."""
        target = matrix[:, item].ravel()

        # Work on a private copy: the target column has to be zeroed without
        # disturbing the matrix used by the other tasks.
        predictors = matrix.copy()
        predictors[:, item] = 0

        model = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, fit_intercept=False, positive=True)
        model.fit(predictors, target)

        return model.coef_

    # Batches are derived from the matrix being trained on rather than read from a
    # notebook-level variable, so the function works for any number of item columns.
    for start in range(0, n_items, batch_size):
        stop = min(start + batch_size, n_items)
        # The matrix is handed over as an argument (not captured by the closure), which is
        # the route joblib documents for sharing large arrays with worker processes.
        results = Parallel(n_jobs=n_jobs)(
            delayed(train_item)(data, item) for item in tqdm(range(start, stop))
        )
        for offset, coef in enumerate(results):
            sim_matrix[:, start + offset] = coef
        # Checkpoint everything learned so far; the file name carries the batch end index.
        with open(f"npz_result_{stop}.npz", 'wb') as f:
            save_npz(f, csr_matrix(sim_matrix))

    return csr_matrix(sim_matrix)


# Fit the SLIM weights on the rescaled dense rating matrix with l1_reg = l2_reg = 0.001.
# NOTE(review): long-running cell — one ElasticNet fit per item column.
w_matrix = train_slim_parallel(train_matrix,0.001,0.001)

  3%|▎         | 96/3000 [00:20<02:30, 19.33it/s]

In [None]:
# Show the sparse-matrix summary (shape, dtype, number of stored weights).
w_matrix


<8983x8983 sparse matrix of type '<class 'numpy.float64'>'
	with 245578 stored elements in Compressed Sparse Row format>

In [None]:
# Convert the learned item-item weights to a dense ndarray.
# NOTE(review): w_matrix is rebound from sparse to dense, so re-running this cell on its
# own fails (an ndarray has no .toarray()).
w_matrix = w_matrix.toarray()