In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.metrics import f1_score
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity

# Load the dataset
data = pd.read_csv('collab_dataset.csv')
ratings_matrix = data.values

# Print dataset shape to verify it's loaded
print(f"Dataset loaded with shape: {ratings_matrix.shape}")


Dataset loaded with shape: (1000, 5000)


In [3]:
# Define the random seed for reproducibility
np.random.seed(42)

# Prepare training, validation, and test indices
non_nan_indices = np.array([(i, j) for i, j in zip(*np.where(~np.isnan(ratings_matrix)))])
np.random.shuffle(non_nan_indices)

# Split indices (80-10-10)
train_size = int(len(non_nan_indices) * 0.8)
valid_size = int(len(non_nan_indices) * 0.1)

train_indices = non_nan_indices[:train_size]
valid_indices = non_nan_indices[train_size:train_size + valid_size]
test_indices = non_nan_indices[train_size + valid_size:]

# Create matrices with NaNs for splitting data
train_matrix = np.full_like(ratings_matrix, np.nan)
for i, j in train_indices:
    train_matrix[i, j] = ratings_matrix[i, j]

# Print the size of each split
print(f"Training set size: {train_matrix.shape}")
print(f"Validation set size: {len(valid_indices)}")
print(f"Test set size: {len(test_indices)}")


Training set size: (1000, 5000)
Validation set size: 1100
Test set size: 1100


In [5]:
# Global mean for centering
global_mean = np.nanmean(train_matrix)
train_matrix_filled = np.nan_to_num(train_matrix - global_mean, nan=0)

# Print the global mean and the filled training matrix shape
print(f"Global mean: {global_mean:.4f}")
print(f"Shape of filled training matrix: {train_matrix_filled.shape}")


Global mean: 47.8138
Shape of filled training matrix: (1000, 5000)


In [7]:
# SVD for Latent Factor Model
def svd_predict(matrix_filled, n_components=100):
    svd = TruncatedSVD(n_components=n_components, random_state=42)
    svd.fit(matrix_filled)
    transformed = svd.transform(matrix_filled)
    predicted_matrix = svd.inverse_transform(transformed) + global_mean
    return predicted_matrix

# SVD-predicted matrix
predicted_matrix_svd = svd_predict(train_matrix_filled)

# Print a sample of the predicted matrix
print(f"SVD-predicted matrix (first 5 rows):")
print(predicted_matrix_svd[:5])


SVD-predicted matrix (first 5 rows):
[[3.12117695e-02 4.79255563e+01 4.85038186e+01 ... 4.82000189e+01
  4.79686139e+01 4.80686615e+01]
 [4.78297433e+01 4.77391185e+01 4.84881783e+01 ... 4.83612435e+01
  4.77062232e+01 4.82696689e+01]
 [2.03733782e+00 4.76717722e+01 4.87309421e+01 ... 4.77256567e+01
  4.73043626e+01 4.74180951e+01]
 [3.00375180e+00 4.78332579e+01 4.66920078e+01 ... 4.79382555e+01
  4.81190305e+01 4.75847470e+01]
 [4.00588368e+00 4.77860084e+01 4.74092309e+01 ... 4.77394736e+01
  4.86148976e+01 4.75260185e+01]]


In [9]:
# Item-based collaborative filtering using cosine similarity
item_similarity_matrix = cosine_similarity(np.nan_to_num(train_matrix.T, nan=0))
predicted_matrix_item_based = np.dot(np.nan_to_num(train_matrix, nan=0), item_similarity_matrix) / (np.sum(item_similarity_matrix, axis=1) + 1e-9)

# Print the first 5 rows of item-based predicted matrix
print(f"Item-based predicted matrix (first 5 rows):")
print(predicted_matrix_item_based[:5])


Item-based predicted matrix (first 5 rows):
[[0.01881289 0.         0.03733716 ... 0.         0.         0.        ]
 [0.01384475 0.         0.         ... 0.         0.         0.        ]
 [0.02208098 0.00490314 0.01307683 ... 0.00852632 0.         0.        ]
 [0.03235241 0.00735471 0.01961524 ... 0.01278947 0.         0.        ]
 [0.0566806  0.00980628 0.02615365 ... 0.01705263 0.         0.        ]]


In [11]:
# User-based collaborative filtering using cosine similarity
user_similarity_matrix = cosine_similarity(np.nan_to_num(train_matrix, nan=0))

# Calculate predicted ratings for the user-based approach
predicted_matrix_user_based = np.dot(user_similarity_matrix, np.nan_to_num(train_matrix, nan=0))

# Normalize the predictions by dividing by the sum of similarities
user_similarity_sum = np.sum(np.abs(user_similarity_matrix), axis=1)
predicted_matrix_user_based /= (user_similarity_sum[:, np.newaxis] + 1e-9)  # Prevent division by zero

# Print the user similarity matrix and first 5 rows of predicted user-based matrix
print(f"User similarity matrix (first 5 rows):")
print(user_similarity_matrix[:5])

print(f"User-based predicted matrix (first 5 rows):")
print(predicted_matrix_user_based[:5])


User similarity matrix (first 5 rows):
[[1.         0.         0.         ... 0.         0.         0.        ]
 [0.         1.         0.         ... 0.         0.         0.        ]
 [0.         0.         1.         ... 0.22486277 0.22487355 0.22485937]
 [0.         0.         0.06805382 ... 0.3026123  0.30262681 0.30260773]
 [0.         0.         0.07233372 ... 0.32164357 0.32165899 0.32163871]]
User-based predicted matrix (first 5 rows):
[[3.48269093e+01 0.00000000e+00 1.09081103e-02 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 [2.58465400e+01 0.00000000e+00 0.00000000e+00 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 [4.98661375e+02 3.80454790e-03 2.32024919e-02 ... 8.24239841e-03
  0.00000000e+00 0.00000000e+00]
 [4.98231189e+02 3.80080035e-03 2.31796370e-02 ... 8.23427950e-03
  0.00000000e+00 0.00000000e+00]
 [4.96519015e+02 3.78628592e-03 2.30911190e-02 ... 8.20283457e-03
  0.00000000e+00 0.00000000e+00]]


In [13]:
# Combine SVD and user-based predictions
predicted_matrix = 0.7 * predicted_matrix_svd + 0.3 * predicted_matrix_user_based  # Emphasis on SVD for higher-order patterns

# Print combined prediction matrix sample
print(f"Combined predicted matrix (first 5 rows):")
print(predicted_matrix[:5])


Combined predicted matrix (first 5 rows):
[[ 10.46992103  33.54788938  33.95594548 ...  33.74001324  33.57802976
   33.64806308]
 [ 41.2347823   33.41738296  33.94172484 ...  33.85287046  33.39435623
   33.78876824]
 [151.02454903  33.37138188  34.11862019 ...  33.41043244  33.11305379
   33.19266657]
 [151.5719831   33.48442076  32.69135936 ...  33.55924911  33.68332132
   33.3093229 ]
 [151.75982318  33.45134178  33.19338898 ...  33.42009238  34.03042835
   33.26821293]]


In [15]:
# Validate and Fine-Tune Threshold for F1
best_f1, best_threshold = 0, 3.5  # Starting threshold for positive ratings
thresholds = np.arange(3.0, 4.5, 0.1)

for threshold in thresholds:
    valid_actual, valid_predicted = [], []

    for i, j in valid_indices:
        if not np.isnan(train_matrix[i, j]):
            valid_actual.append(train_matrix[i, j] >= threshold)
            valid_predicted.append(predicted_matrix[i, j] >= threshold)

    # Check if valid_actual or valid_predicted is empty
    if valid_actual and valid_predicted:
        f1 = f1_score(valid_actual, valid_predicted)
        if f1 > best_f1:
            best_f1, best_threshold = f1, threshold

# Print the best threshold and F1 score
print(f"Best Threshold: {best_threshold:.2f}")
print(f"Best F1 Score: {best_f1:.4f}")


Best Threshold: 3.50
Best F1 Score: 0.0000


In [17]:
# Apply best threshold to test data for final F1
test_actual, test_predicted = [], []

for i, j in test_indices:
    if not np.isnan(ratings_matrix[i, j]):
        test_actual.append(ratings_matrix[i, j] >= best_threshold)
        test_predicted.append(predicted_matrix[i, j] >= best_threshold)

# Handle zero division issue
f1_test = f1_score(test_actual, test_predicted, zero_division=1)

# Print final F1 score for test data
print(f"Test F1 Score: {f1_test:.4f}")


Test F1 Score: 0.7021


In [19]:
# Function to find top 5 similar users
def top_5_similar_users(user_id, user_similarity_matrix):
    similar_users = user_similarity_matrix[user_id]
    top_5_users = np.argsort(similar_users)[::-1][1:6]  # Exclude the user itself
    return top_5_users, similar_users[top_5_users]

# Test for User 0 (can be changed to any user)
user_id = 0
top_users, scores = top_5_similar_users(user_id, user_similarity_matrix)

# Print top 5 similar users
print(f"Top 5 similar users for User {user_id}:")
for i, score in zip(top_users, scores):
    print(f"User {i} with similarity score: {score:.4f}")


Top 5 similar users for User 0:
User 443 with similarity score: 0.5654
User 5 with similarity score: 0.5550
User 548 with similarity score: 0.3380
User 527 with similarity score: 0.2715
User 21 with similarity score: 0.2013


In [35]:
def recommend_courses(user_id, train_matrix, item_similarity_matrix, top_n=5):
    # Get the ratings of the user for all courses
    user_ratings = train_matrix[user_id]
    
    # Calculate the predicted ratings for all courses for the user
    predicted_ratings = np.dot(item_similarity_matrix, np.nan_to_num(train_matrix[user_id], nan=0)) / (np.sum(item_similarity_matrix, axis=1) + 1e-9)
    
    # Get the predicted ratings for this specific user
    user_predicted_ratings = predicted_ratings
    
    # Get the indices of the top N recommended courses (excluding courses the user has already rated)
    recommended_courses = np.argsort(user_predicted_ratings)[::-1]  # Sort in descending order of predicted ratings
    recommended_courses = [course for course in recommended_courses if np.isnan(user_ratings[course])][:top_n]
    
    # Get the actual ratings for the recommended courses
    actual_ratings = [user_ratings[course] for course in recommended_courses]
    
    return recommended_courses, user_predicted_ratings[recommended_courses], actual_ratings

# Test for User 0 (can be changed to any user)
user_id = 345
top_courses, scores, actual_ratings = recommend_courses(user_id, train_matrix, item_similarity_matrix, top_n=5)

# Print the top recommended courses with their predicted and actual ratings
print(f"Top 5 recommended courses for User {user_id}:")
for i, (score, actual) in zip(top_courses, zip(scores, actual_ratings)):
    print(f"Course {i} with predicted rating: {score:.4f} and actual rating: {actual if not np.isnan(actual) else 'Not rated'}")


Top 5 recommended courses for User 345:
Course 2508 with predicted rating: 11.3420 and actual rating: Not rated
Course 2689 with predicted rating: 7.3089 and actual rating: Not rated
Course 1711 with predicted rating: 6.6083 and actual rating: Not rated
Course 4362 with predicted rating: 6.5660 and actual rating: Not rated
Course 639 with predicted rating: 6.5579 and actual rating: Not rated
