In [1]:
import gzip
import numpy as np
import pandas as pd
from collections import defaultdict

def readGz(path):
    for l in gzip.open(path, 'rt'):
        yield eval(l)

def readCSV(path):
    f = gzip.open(path, 'rt')
    f.readline()
    for l in f:
        yield l.strip().split(',')

### Rating Prediction Task
___

In [15]:
def readCSV(path):
    f = gzip.open(path, 'rt')
    f.readline()
    for l in f:
        yield l.strip().split(',')

allRatings = []
ratingsTrain = []
userRatings = defaultdict(list)
bookRatings = defaultdict(list)

for user, book, r in readCSV("train_Interactions.csv.gz"):
    r = int(r)
    allRatings.append(r)
    ratingsTrain.append((user, book, r))
    userRatings[user].append(r)
    bookRatings[book].append(r)

globalAverage = sum(allRatings) / len(allRatings)

best_lambda = 0.1  
num_epochs = 50
learning_rate = 0.005
latent_dim = 12 

alpha = globalAverage

beta_u = defaultdict(float)
beta_i = defaultdict(float)

user_factors = defaultdict(lambda: [0.1] * latent_dim)
book_factors = defaultdict(lambda: [0.1] * latent_dim)

for epoch in range(num_epochs):
    for user, book, rating in ratingsTrain:
        #dot product of latent factors
        dot_product = sum([pu * qi for pu, qi in zip(user_factors[user], book_factors[book])])

        prediction = alpha + beta_u[user] + beta_i[book] + dot_product
        error = rating - prediction

        alpha += learning_rate * error

        beta_u[user] += learning_rate * (error - best_lambda * beta_u[user])
        beta_i[book] += learning_rate * (error - best_lambda * beta_i[book])


        for k in range(latent_dim):

            user_factor_k = user_factors[user][k]
            book_factor_k = book_factors[book][k]

            user_factors[user][k] += learning_rate * (error * book_factor_k - best_lambda * user_factor_k)
            book_factors[book][k] += learning_rate * (error * user_factor_k - best_lambda * book_factor_k)

    if (epoch + 1) % 10 == 0:
        print(f"Epoch {epoch + 1}/{num_epochs} completed.")

predictions = open("predictions_Rating.csv", 'w')
for l in open("pairs_Rating.csv"):
    if l.startswith("userID"):
        predictions.write(l)  # Write header
        continue
    user, book = l.strip().split(',')

    dot_product = sum([pu * qi for pu, qi in zip(user_factors[user], book_factors[book])])

    prediction = alpha + beta_u.get(user, 0) + beta_i.get(book, 0) + dot_product

    prediction = max(1, min(5, prediction))

    predictions.write(f"{user},{book},{prediction}\n")

predictions.close()

print("Predictions saved to 'predictions_Rating.csv'.")

Epoch 1/50
[1m704/704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - loss: 15.0841 - mse: 15.0841 - val_loss: 8.8008 - val_mse: 8.7929
Epoch 2/50
[1m704/704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - loss: 5.5129 - mse: 5.5129 - val_loss: 2.6823 - val_mse: 2.6722
Epoch 3/50
[1m704/704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - loss: 1.6951 - mse: 1.6951 - val_loss: 2.0154 - val_mse: 2.0053
Epoch 4/50
[1m704/704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - loss: 1.1182 - mse: 1.1182 - val_loss: 1.9290 - val_mse: 1.9218
Epoch 5/50
[1m704/704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - loss: 0.8717 - mse: 0.8717 - val_loss: 1.9829 - val_mse: 1.9760
Epoch 6/50
[1m704/704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - loss: 0.6903 - mse: 0.6903 - val_loss: 2.0848 - val_mse: 2.0766
Epoch 7/50
[1m704/704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step 

IndexError: tuple index out of range

In [14]:
import tensorflow as tf
import numpy as np
from sklearn.model_selection import train_test_split
from collections import defaultdict
import gzip

# Helper function to read data
def readCSV(path):
    f = gzip.open(path, 'rt')
    f.readline()
    for l in f:
        yield l.strip().split(',')

# Step 1: Prepare the data
users = []
books = []
ratings = []

# Read training data
user_id_map = {}
book_id_map = {}
next_user_id = 0
next_book_id = 0

for user, book, r in readCSV("train_Interactions.csv.gz"):
    if user not in user_id_map:
        user_id_map[user] = next_user_id
        next_user_id += 1
    if book not in book_id_map:
        book_id_map[book] = next_book_id
        next_book_id += 1
    users.append(user_id_map[user])
    books.append(book_id_map[book])
    ratings.append(float(r))

users = np.array(users)
books = np.array(books)
ratings = np.array(ratings)

# Split into training and validation sets
train_users, val_users, train_books, val_books, train_ratings, val_ratings = train_test_split(
    users, books, ratings, test_size=0.1, random_state=42
)

# Step 2: Define the TensorFlow model
class LatentFactorModel(tf.keras.Model):
    def __init__(self, num_users, num_books, latent_dim):
        super().__init__()
        self.user_embedding = tf.keras.layers.Embedding(num_users, latent_dim)
        self.book_embedding = tf.keras.layers.Embedding(num_books, latent_dim)

    def call(self, inputs):
        user_input, book_input = inputs
        user_latent = self.user_embedding(user_input)
        book_latent = self.book_embedding(book_input)
        dot_product = tf.reduce_sum(user_latent * book_latent, axis=1)  # Compute dot product
        return dot_product

# Parameters
latent_dim = 12  # Number of latent dimensions
num_users = len(user_id_map)
num_books = len(book_id_map)

# Instantiate the model
model = LatentFactorModel(num_users, num_books, latent_dim)

# Compile the model
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.005),
              loss='mse',
              metrics=['mse'])

# Step 3: Train the model
history = model.fit(
    [train_users, train_books], train_ratings,
    validation_data=([val_users, val_books], val_ratings),
    epochs=50,
    batch_size=256
)

# Step 4: Predict ratings for the test set
test_users = []
test_books = []

for l in open("pairs_Rating.csv"):
    if l.startswith("userID"):
        continue
    user, book = l.strip().split(',')
    test_users.append(user_id_map.get(user, -1))  # Default to -1 for unseen users
    test_books.append(book_id_map.get(book, -1))  # Default to -1 for unseen books

test_users = np.array(test_users)
test_books = np.array(test_books)

# Handle unseen users or books by assigning a default latent vector (e.g., zero vectors)
default_user = tf.zeros((latent_dim,))
default_book = tf.zeros((latent_dim,))

predictions = []
for i in range(len(test_users)):
    user_idx = test_users[i]
    book_idx = test_books[i]
    if user_idx == -1 or book_idx == -1:
        # Use the dot product of default latent vectors
        prediction = tf.reduce_sum(default_user * default_book).numpy()
    else:
        # Ensure the input shape is correct for model.predict
        user_input = np.array([user_idx])  # Shape (1,)
        book_input = np.array([book_idx])  # Shape (1,)
        prediction = model.predict([user_input, book_input])[0]  # Output is (1,)
    # Clip prediction to the valid range (1 to 5)
    prediction = max(1, min(5, prediction))
    predictions.append(prediction)


# Save predictions
with open("predictions_Rating.csv", 'w') as f:
    f.write("userID,bookID,rating\n")
    for l, pred in zip(open("pairs_Rating.csv"), predictions):
        if l.startswith("userID"):
            continue
        user, book = l.strip().split(',')
        f.write(f"{user},{book},{pred}\n")

print("Predictions saved to 'predictions_Rating.csv'.")


Total interactions: 200000
Unique users: 27943
Unique books: 6688
Number of users: 27943
Number of books: 6688
Positive samples: 4396
Negative samples: 4396
Total samples after negative sampling: 8792
Training samples: 8792
Validation samples: 0


Epoch 1/20




[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 62ms/step - accuracy: 0.5092 - loss: 0.6945
Epoch 2/20
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.7440 - loss: 0.6925

  self.gen.throw(typ, value, traceback)
  current = self.get_monitor_value(logs)


[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step - accuracy: 0.7470 - loss: 0.6925
Epoch 3/20
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step - accuracy: 0.9083 - loss: 0.6902
Epoch 4/20
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step - accuracy: 0.9714 - loss: 0.6878
Epoch 5/20
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step - accuracy: 0.9905 - loss: 0.6853
Epoch 6/20
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step - accuracy: 0.9970 - loss: 0.6824
Epoch 7/20
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step - accuracy: 0.9986 - loss: 0.6791
Epoch 8/20
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step - accuracy: 0.9995 - loss: 0.6754
Epoch 9/20
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step - accuracy: 0.9996 - loss: 0.6711
Epoch 10/20
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2

ValueError: math domain error

In [None]:
predictions = []
for i in range(len(test_users)):
    user_idx = test_users[i]
    book_idx = test_books[i]
    if user_idx == -1 or book_idx == -1:
        # Use the dot product of default latent vectors
        prediction = tf.reduce_sum(default_user * default_book).numpy()
    else:
        # Ensure the input shape is correct for model.predict
        user_input = np.array([user_idx])  # Shape (1,)
        book_input = np.array([book_idx])  # Shape (1,)
        prediction = model.predict([user_input, book_input])[0]  # Output is (1,)
    # Clip prediction to the valid range (1 to 5)
    prediction = max(1, min(5, prediction))
    predictions.append(prediction)


# Save predictions
with open("predictions_Rating.csv", 'w') as f:
    f.write("userID,bookID,rating\n")
    for l, pred in zip(open("pairs_Rating.csv"), predictions):
        if l.startswith("userID"):
            continue
        user, book = l.strip().split(',')
        f.write(f"{user},{book},{pred}\n")

print("Predictions saved to 'predictions_Rating.csv'.")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 87ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21

### Read Prediction Task
____

In [3]:
from sklearn.model_selection import train_test_split

def readCSV(path):
    with gzip.open(path, 'rt') if path.endswith('.gz') else open(path, 'r') as f:
        for line in f:
            yield line.strip().split(',')

def jaccard_similarity(set1, set2):
    intersection = len(set1 & set2)
    union = len(set1 | set2)
    return intersection / union if union > 0 else 0

bookToUsers = defaultdict(set)
userToBooks = defaultdict(set)
bookCount = defaultdict(int)

for user, book, _ in readCSV("train_Interactions.csv.gz"):
    bookToUsers[book].add(user)
    userToBooks[user].add(book)
    bookCount[book] += 1


mostPopular = [(bookCount[x], x) for x in bookCount]
mostPopular.sort(reverse=True)

top_blank_percent = int(len(mostPopular) * 0.30)
return1 = set(book for _, book in mostPopular[:top_blank_percent])

threshold = 0.005  
predictions = open("predictions_Read.csv", 'w')

for l in open("pairs_Read.csv"):
    if l.startswith("userID"):
        predictions.write(l)  
        continue
    u, b = l.strip().split(',')
    
    popular_prediction = b in return1

    jaccard_prediction = False
    if u in userToBooks:
        max_similarity = 0
        for b_prime in userToBooks[u]:
            max_similarity = max(max_similarity, jaccard_similarity(bookToUsers[b], bookToUsers[b_prime]))
        jaccard_prediction = max_similarity > threshold

#hybrid prediction 
    final_prediction = popular_prediction or jaccard_prediction


    predictions.write(f"{u},{b},{int(final_prediction)}\n")

predictions.close()
