In [None]:
import numpy as np
import pycuda.driver as cuda
import pycuda.autoinit
import ctypes
from pycuda.compiler import SourceModule
import time
#define CLIP(x, min, max) (fminf(fmaxf(x, min), max))

N = 50  # Increase to capture more user preferences
  # Adjust as needed
totalMoviesUsingPerPerson=400

# Define the Row structure
class Row(ctypes.Structure):
    _fields_ = [("col1", ctypes.c_int), ("col2", ctypes.c_float)]
class RowTest(ctypes.Structure):
    _fields_ = [("col1", ctypes.c_int), ("col2", ctypes.c_float),("user",ctypes.c_int)]





# Create some example row data
rows_data = []

# Define the parallel matrix multiplication kernel
kernel_code = """
struct Row {
    int col1;
    float col2;
};

__global__ void matmul(float *A, float *B, int N, int raters, int movies, Row *rows, float learning_rate) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;  // Movies
    int col = blockIdx.x * blockDim.x + threadIdx.x;  // Raters
    int totalMoviesUsingPerPerson=400;
    if (row < movies && col < raters && col > 0 && row > 0) {
        int found = 0;
        float actual_rating = 0.0;

        // Find actual rating from stored data
        
        int start = 0;
        int end = totalMoviesUsingPerPerson-1;  // Fix here to prevent out-of-bounds access
        while (start <= end) {
            int mid = start + (end - start) / 2;
            if (rows[(col - 1) * totalMoviesUsingPerPerson + mid].col1 == row){
                found = 1;
                actual_rating = rows[(col - 1) * totalMoviesUsingPerPerson + mid].col2;
                break;
            } else if (rows[(col - 1) * totalMoviesUsingPerPerson + mid].col1==0 || rows[(col - 1) * totalMoviesUsingPerPerson + mid].col1 > row) {
                end = mid - 1;
            } else {
                start = mid + 1;
            }
        }

        if (found) {
            float predicted_rating = 0.0;
            for (int k = 0; k < N; k++) {
                predicted_rating += A[row * N + k] * B[k * raters + col];
            }
            
            float error = actual_rating - predicted_rating;
            
            for (int k = 0; k < N; k++) {
                float lambda = 0.01;
                float regularization = lambda * (A[row * N + k] * A[row * N + k] + B[k * raters + col] * B[k * raters + col]);
                float grad_A = -2.0 * error * B[k * raters + col] + regularization;
                float grad_B = -2.0 * error * A[row * N + k] + regularization;
                                
                // Apply updates
                atomicAdd(&A[row * N + k], -learning_rate * grad_A);
                atomicAdd(&B[k * raters + col], -learning_rate * grad_B);

                atomicExch(&A[row * N + k], fminf(fmaxf(A[row * N + k], -1.0f), 1.0f));
                atomicExch(&B[k * raters + col], fminf(fmaxf(B[k * raters + col], -1.0f), 1.0f));
            }

            

        }

    }
}

"""

mod = SourceModule(kernel_code)


def read_csv_to_list_rating(file_path):
    """Read a CSV file into a list of lists with specific column types."""
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
        matrix = []
        header = []
        for line in file:
            values = line.strip().split('::')
            row = [int(values[0]), int(values[1]), float(values[2]), int(values[3])]
            matrix.append(row)
        return header, matrix

def read_csv_to_list_movies(file_path):
    """Read a CSV file into a list of lists with specific column types."""
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
        matrix = []
        header = []
        for line in file:
            values = line.strip().split('::')
            row = [int(values[0]), str(values[1]), str(values[2])]
            matrix.append(row)
        return header, matrix

headerRating, matrixrating = read_csv_to_list_rating('ratings.txt')
headerMovies, matrixMovies = read_csv_to_list_movies('movies.txt')

totalRaters = int(matrixrating[-1][0])+1  # total rows
totalMovies = int(matrixMovies[-1][0])+1
print(totalRaters)
print(totalMovies)



d = {}
# row 0 is userID

for row in matrixrating:
    if row[0] not in d:
        d[row[0]] = []
    else:
        d[row[0]].append(list(row[1:4]))

for key, val in d.items():
    d[key] = sorted(val, key=lambda x: x[2])

for key, val in d.items():
    d[key] = d[key][::-1]

newD = {}
for key, val in d.items():
    newD[key] = sorted(d[key][:totalMoviesUsingPerPerson], key=lambda x: x[0])

totalUsers = len(newD)
total_rat_us=0
rows_data = []
for i in range(1, (len(newD) + 1)):
    row = newD[i]
    for j in range(totalMoviesUsingPerPerson):
        if j < len(row):
            total_rat_us+=1
            rows_data.append(Row(row[j][0], row[j][1]))
        else:
            rows_data.append(Row(0, 0.0))
print("Total rating used ",end="")
print(total_rat_us)
matrixAMovies = np.zeros((totalMovies, N), dtype=np.float32)
matrixBUsers = np.zeros((N, totalRaters), dtype=np.float32)


# Fill matrices with random values in the range -0.01 to 0.01
matrixAMovies = np.random.uniform(-0.1, 0.1, size=(totalMovies, N)).astype(np.float32)
matrixBUsers = np.random.uniform(-0.1, 0.1, size=(N, totalRaters)).astype(np.float32)

matrixAMovies = np.ascontiguousarray(matrixAMovies, dtype=np.float32)
matrixBUsers = np.ascontiguousarray(matrixBUsers, dtype=np.float32)

A_gpu_matrixAMovies = cuda.mem_alloc(matrixAMovies.nbytes)
B_gpu_matrixBUsers = cuda.mem_alloc(matrixBUsers.nbytes)

if A_gpu_matrixAMovies is None or B_gpu_matrixBUsers is None:
    print("Damn")

rows = (Row * len(rows_data))(*rows_data)

# Allocate memory on the GPU for the rows
rows_gpu = cuda.mem_alloc(ctypes.sizeof(rows))
if rows_gpu is None:
    print("damn")

# Transfer the rows data to the GPU
cuda.memcpy_htod(rows_gpu, rows)

cuda.memcpy_htod(A_gpu_matrixAMovies, matrixAMovies)
cuda.memcpy_htod(B_gpu_matrixBUsers, matrixBUsers)


matmul = mod.get_function("matmul")

# Define grid and block sizes
block_size = (16, 16, 1)  # 16x16 threads per block
grid_size = ((totalRaters + block_size[1] - 1) // block_size[1], 
             (totalMovies + block_size[0] - 1) // block_size[0])


print(f"Matrix A shape: {matrixAMovies.shape}")
print(f"Matrix B shape: {matrixBUsers.shape}")

# Calculate the shared memory size
shared_mem_size = ctypes.sizeof(Row) * min(totalRaters * totalMoviesUsingPerPerson, block_size[0] * block_size[1])
# Run the parallel CUDA kernel
cuda.Context.synchronize()  # Ensure synchronization before execution]
print("A GPU Memory Address:", int(A_gpu_matrixAMovies))
print("B GPU Memory Address:", int(B_gpu_matrixBUsers))
print("Rows GPU Memory Address:", int(rows_gpu))

# start_time = time.time()
# matmul(A_gpu_matrixAMovies, B_gpu_matrixBUsers, np.int32(N), np.int32(totalRaters), np.int32(totalMovies), rows_gpu,learning_rate,
#        block=block_size, grid=grid_size, shared=shared_mem_size)
# cuda.Context.synchronize()  # Ensure synchronization after execution
# end_time = time.time()
# # Copy result back to CPU

# # 🚀 First, copy updated A and B back to CPU
# matrixAMovies_result = np.empty_like(matrixAMovies)
# matrixBUsers_result = np.empty_like(matrixBUsers)

# cuda.memcpy_dtoh(matrixAMovies_result, A_gpu_matrixAMovies)
# cuda.memcpy_dtoh(matrixBUsers_result, B_gpu_matrixBUsers)
# rows_result = np.zeros(len(rows_data), dtype=[('col1', np.int32), ('col2', np.float32)])
# cuda.memcpy_dtoh(rows_result, rows_gpu)




6041
3953
Total rating used 855363
Matrix A shape: (3953, 50)
Matrix B shape: (50, 6041)
A GPU Memory Address: 47346511872
B GPU Memory Address: 47343206400
Rows GPU Memory Address: 47380955136


In [None]:
epochs = 30  # Number of iterations

initial_lr = np.float32(0.001)  # Start slightly lower than 0.003
decay_factor = np.float32(0.95)   # Reduce by 10% every 5 epochs
decay_step = 5

min_lr = np.float32(0.0005)  # Ensure learning rate never goes too low
def normalize_embeddings(matrix):
    norms = np.linalg.norm(matrix, axis=1, keepdims=True)  # Compute row-wise norm
    norms[norms == 0] = 1  # Prevent division by zero
    return matrix / norms
learning_rate = np.float32(0.001)
for epoch in range(epochs):
    # learning_rate = np.float32(initial_lr * np.exp(-0.05 * epoch))
    if epoch==3:
        learning_rate = np.float32(0.0005)


    matrixAMovies_result = np.empty_like(matrixAMovies)
    matrixBUsers_result = np.empty_like(matrixBUsers)
    rows_result = np.zeros(len(rows_data), dtype=[('col1', np.int32), ('col2', np.float32)])
    start_time = time.time()
    # if epoch % 5 == 0:  # Every 5 epochs, reduce learning rate
    #     learning_rate *= 0.9  # Reduce by 10%

    matmul(A_gpu_matrixAMovies, B_gpu_matrixBUsers, np.int32(N), np.int32(totalRaters),
           np.int32(totalMovies), rows_gpu, learning_rate, block=block_size, grid=grid_size,
           shared=shared_mem_size)
    
    cuda.Context.synchronize()  # Wait for CUDA execution

    # Copy updated matrices and rows back to CPU
    cuda.memcpy_dtoh(matrixAMovies_result, A_gpu_matrixAMovies)
    cuda.memcpy_dtoh(matrixBUsers_result, B_gpu_matrixBUsers)
    cuda.memcpy_dtoh(rows_result, rows_gpu)


    if np.isnan(matrixAMovies_result).any() or np.isnan(matrixBUsers_result).any():
        print("Error: NaN detected in A or B matrix!")
        exit()

    if np.isinf(matrixAMovies_result).any() or np.isinf(matrixBUsers_result).any():
        print("Error: Inf detected in A or B matrix!")
        exit()

    # Compute loss
    mse = 0
    count = 0
    for i in range(len(rows_result)):
        if rows_result[i]['col1'] > 0:
            actual = rows_result[i]['col2']
            predicted = np.dot(matrixAMovies_result[rows_result[i]['col1'], :], matrixBUsers_result[:, i // totalMoviesUsingPerPerson])
            
            if np.isnan(predicted) or np.isinf(predicted):
                print(f"Error in prediction: row {i}, actual {actual}, predicted {predicted}")
                exit()

            mse += (actual - predicted) ** 2
            count += 1

    mse = mse / count if count > 0 else float('inf')  # Avoid division by zero

    end_time = time.time()
    print(f"Epoch {epoch+1}/{epochs}, MSE: {mse:.5f}, Time: {end_time - start_time:.5f} sec")





# Track execution time for CUDA

print(f"Parallel GPU Execution Time (CUDA): {end_time - start_time:.5f} seconds")


user_id = 2  # Example user (change as needed)
movie_id = rows_result[17]['col1']  # Get the movie ID from rows_result

predicted = np.dot(matrixAMovies_result[movie_id, :], matrixBUsers_result[:, user_id])
print(f"Predicted rating for user {user_id} and movie {movie_id}: {predicted:.2f}")







listUnused =[]
for m in (matrixrating[55:180]):
    hit=0
    for i in range(1*totalMoviesUsingPerPerson,2*totalMoviesUsingPerPerson):
        if int(m[1]) ==int(rows_data[i].col1):
            
            hit=1
            break
    if hit==0:
        listUnused.append([int(m[1]),int(m[2])])

listUnused2 =[]
for m in (matrixrating[55:180]):
    hit=0
    for i in range(1*totalMoviesUsingPerPerson,2*totalMoviesUsingPerPerson):
        if int(m[1]) ==int(rows_data[i].col1):
            
            hit=1
            break
    if hit==1:
        listUnused2.append([int(m[1]),int(m[2])])


for i in range(len(listUnused)):
    user_id = 2  # Example user (change as needed)
    movie_id = listUnused[i][0]  # Get the movie ID from rows_result
    actualPred = listUnused[i][1]
    predicted = np.dot(matrixAMovies_result[movie_id, :], matrixBUsers_result[:, user_id])
    print(f"Predicted rating for user {user_id} and movie {movie_id}: {predicted:.2f}     Actual {actualPred}")

print()

Epoch 1/30, MSE: 13.99952, Time: 4.28683 sec
Epoch 2/30, MSE: 8.26048, Time: 4.24870 sec
Epoch 3/30, MSE: 3.61113, Time: 4.40834 sec
Epoch 4/30, MSE: 2.62249, Time: 4.20624 sec
Epoch 5/30, MSE: 2.16217, Time: 4.23969 sec
Epoch 6/30, MSE: 1.85183, Time: 4.31850 sec
Epoch 7/30, MSE: 1.64910, Time: 4.27681 sec
Epoch 8/30, MSE: 1.50845, Time: 4.30024 sec
Epoch 9/30, MSE: 1.40977, Time: 4.26602 sec
Epoch 10/30, MSE: 1.33884, Time: 4.22616 sec
Epoch 11/30, MSE: 1.28675, Time: 4.26149 sec
Epoch 12/30, MSE: 1.24798, Time: 4.30060 sec
Epoch 13/30, MSE: 1.21883, Time: 4.31560 sec
Epoch 14/30, MSE: 1.19659, Time: 4.29249 sec
Epoch 15/30, MSE: 1.17946, Time: 4.36096 sec
Epoch 16/30, MSE: 1.16599, Time: 4.30867 sec
Epoch 17/30, MSE: 1.15554, Time: 4.24148 sec
Epoch 18/30, MSE: 1.14720, Time: 4.28903 sec
Epoch 19/30, MSE: 1.14064, Time: 4.21613 sec
Epoch 20/30, MSE: 1.13522, Time: 4.26694 sec
Epoch 21/30, MSE: 1.13098, Time: 4.24596 sec
Epoch 22/30, MSE: 1.12753, Time: 4.22337 sec
Epoch 23/30, MSE: 

In [20]:
print("HERE")
print()
for i in range(len(listUnused2)):
    user_id = 2  # Example user (change as needed)
    movie_id = listUnused2[i][0]  # Get the movie ID from rows_result
    actualPred = listUnused2[i][1]
    predicted = np.dot(matrixAMovies_result[movie_id, :], matrixBUsers_result[:, user_id])
    print(f"Predicted rating for user {user_id} and movie {movie_id}: {predicted:.2f}     Actual {actualPred}")

HERE

Predicted rating for user 2 and movie 1537: 3.42     Actual 4
Predicted rating for user 2 and movie 647: 3.06     Actual 3
Predicted rating for user 2 and movie 2194: 3.74     Actual 4
Predicted rating for user 2 and movie 648: 3.54     Actual 4
Predicted rating for user 2 and movie 2268: 3.98     Actual 5
Predicted rating for user 2 and movie 2628: 10.38     Actual 3
Predicted rating for user 2 and movie 1103: 3.08     Actual 3
Predicted rating for user 2 and movie 2916: 3.57     Actual 3
Predicted rating for user 2 and movie 3468: 3.44     Actual 5
Predicted rating for user 2 and movie 1210: 7.61     Actual 4
Predicted rating for user 2 and movie 1792: 2.63     Actual 3
Predicted rating for user 2 and movie 1687: 2.82     Actual 3
Predicted rating for user 2 and movie 1213: 3.31     Actual 2
Predicted rating for user 2 and movie 3578: 5.79     Actual 5
Predicted rating for user 2 and movie 2881: 2.79     Actual 3
Predicted rating for user 2 and movie 3030: 3.49     Actual 4
Pre

In [25]:
learning_rate = np.float32(0.001)

In [26]:
for epoch in range(3):

    cuda.memcpy_htod(rows_gpu, rows_result)

    cuda.memcpy_htod(A_gpu_matrixAMovies, matrixAMovies_result)
    cuda.memcpy_htod(B_gpu_matrixBUsers, matrixBUsers_result)
    # learning_rate = np.float32(initial_lr * np.exp(-0.05 * epoch))



    matrixAMovies_result = np.empty_like(matrixAMovies)
    matrixBUsers_result = np.empty_like(matrixBUsers)
    rows_result = np.zeros(len(rows_data), dtype=[('col1', np.int32), ('col2', np.float32)])
    start_time = time.time()
    # if epoch % 5 == 0:  # Every 5 epochs, reduce learning rate
    #     learning_rate *= 0.9  # Reduce by 10%

    matmul(A_gpu_matrixAMovies, B_gpu_matrixBUsers, np.int32(N), np.int32(totalRaters),
           np.int32(totalMovies), rows_gpu, learning_rate, block=block_size, grid=grid_size,
           shared=shared_mem_size)
    
    cuda.Context.synchronize()  # Wait for CUDA execution

    # Copy updated matrices and rows back to CPU
    cuda.memcpy_dtoh(matrixAMovies_result, A_gpu_matrixAMovies)
    cuda.memcpy_dtoh(matrixBUsers_result, B_gpu_matrixBUsers)
    cuda.memcpy_dtoh(rows_result, rows_gpu)


    if np.isnan(matrixAMovies_result).any() or np.isnan(matrixBUsers_result).any():
        print("Error: NaN detected in A or B matrix!")
        exit()

    if np.isinf(matrixAMovies_result).any() or np.isinf(matrixBUsers_result).any():
        print("Error: Inf detected in A or B matrix!")
        exit()

    # Compute loss
    mse = 0
    count = 0
    for i in range(len(rows_result)):
        if rows_result[i]['col1'] > 0:
            actual = rows_result[i]['col2']
            predicted = np.dot(matrixAMovies_result[rows_result[i]['col1'], :], matrixBUsers_result[:, i // totalMoviesUsingPerPerson])
            
            if np.isnan(predicted) or np.isinf(predicted):
                print(f"Error in prediction: row {i}, actual {actual}, predicted {predicted}")
                exit()

            mse += (actual - predicted) ** 2
            count += 1

    mse = mse / count if count > 0 else float('inf')  # Avoid division by zero

    end_time = time.time()
    print(f"Epoch {epoch+1}/{epochs}, MSE: {mse:.5f}, Time: {end_time - start_time:.5f} sec")

Epoch 1/30, MSE: 1.12056, Time: 4.31409 sec
Epoch 2/30, MSE: 1.12554, Time: 4.24295 sec
Epoch 3/30, MSE: 1.13615, Time: 4.20705 sec


In [27]:
listUnused =[]
for m in (matrixrating[55:180]):
    hit=0
    for i in range(1*totalMoviesUsingPerPerson,2*totalMoviesUsingPerPerson):
        if int(m[1]) ==int(rows_data[i].col1):
            
            hit=1
            break
    if hit==0:
        listUnused.append([int(m[1]),int(m[2])])

listUnused2 =[]
for m in (matrixrating[55:180]):
    hit=0
    for i in range(1*totalMoviesUsingPerPerson,2*totalMoviesUsingPerPerson):
        if int(m[1]) ==int(rows_data[i].col1):
            
            hit=1
            break
    if hit==1:
        listUnused2.append([int(m[1]),int(m[2])])


print("HERE")
print()
for i in range(len(listUnused2)):
    user_id = 2  # Example user (change as needed)
    movie_id = listUnused2[i][0]  # Get the movie ID from rows_result
    actualPred = listUnused2[i][1]
    predicted = np.dot(matrixAMovies_result[movie_id, :], matrixBUsers_result[:, user_id])
    print(f"Predicted rating for user {user_id} and movie {movie_id}: {predicted:.2f}     Actual {actualPred}")

HERE

Predicted rating for user 2 and movie 1537: 3.78     Actual 4
Predicted rating for user 2 and movie 647: 3.39     Actual 3
Predicted rating for user 2 and movie 2194: 4.01     Actual 4
Predicted rating for user 2 and movie 648: 3.46     Actual 4
Predicted rating for user 2 and movie 2268: 3.99     Actual 5
Predicted rating for user 2 and movie 2628: 3.75     Actual 3
Predicted rating for user 2 and movie 1103: 3.68     Actual 3
Predicted rating for user 2 and movie 2916: 3.63     Actual 3
Predicted rating for user 2 and movie 3468: 4.05     Actual 5
Predicted rating for user 2 and movie 1210: 4.22     Actual 4
Predicted rating for user 2 and movie 1792: 3.08     Actual 3
Predicted rating for user 2 and movie 1687: 3.04     Actual 3
Predicted rating for user 2 and movie 1213: 4.06     Actual 2
Predicted rating for user 2 and movie 3578: 4.32     Actual 5
Predicted rating for user 2 and movie 2881: 3.19     Actual 3
Predicted rating for user 2 and movie 3030: 4.13     Actual 4
Pred