In [18]:
#imports
import numpy as np
import numpy.ma as ma
from numpy import genfromtxt
from collections import defaultdict
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
import tabulate
# import utils.py

# utils code from Andrew Ng (could not be imported as a module)
# replaced pickle5 with pickle to suit the latest version of Python

import pickle
from tensorflow.keras.models import Model
import csv
import re
import tabulate

def load_data():
    """Read the content-based-filtering data set from the working directory.

    Returns an 8-tuple, in this order:
        item_train, user_train, y_train: arrays parsed with ``genfromtxt`` from
            the ``content_*_train.csv`` files.
        item_features, user_features: first row of the matching
            ``*_header.txt`` files (lists of strings).
        item_vecs: array parsed from ``content_item_vecs.csv``.
        movie_dict: ``defaultdict(dict)`` keyed by integer movie id, each entry
            holding ``"title"`` and ``"genres"``.
        user_to_genre: the object unpickled from
            ``content_user_to_genre.pickle``.
    """
    def _read_matrix(path):
        # Comma-separated numeric table -> ndarray.
        return genfromtxt(path, delimiter=',')

    def _read_header(path):
        # The header files hold the column names on their first CSV row.
        with open(path, newline='') as handle:
            return list(csv.reader(handle))[0]

    item_train = _read_matrix('content_item_train.csv')
    user_train = _read_matrix('content_user_train.csv')
    y_train = _read_matrix('content_y_train.csv')

    item_features = _read_header('content_item_train_header.txt')
    user_features = _read_header('content_user_train_header.txt')

    item_vecs = _read_matrix('content_item_vecs.csv')

    movie_dict = defaultdict(dict)
    with open('content_movie_list.csv', newline='') as movie_file:
        rows = csv.reader(movie_file, delimiter=',', quotechar='"')
        next(rows)  # first row is the column header, not a movie
        for row in rows:
            entry = movie_dict[int(row[0])]
            entry["title"] = row[1]
            entry["genres"] = row[2]

    # NOTE: pickle executes code on load; only use with a trusted data file.
    with open('content_user_to_genre.pickle', 'rb') as pickled:
        user_to_genre = pickle.load(pickled)

    return (item_train, user_train, y_train, item_features, user_features,
            item_vecs, movie_dict, user_to_genre)


def pprint_train(x_train, features, vs, u_s, maxcount=5, user=True):
    """Render the first rows of a training matrix as an HTML table.

    Args:
        x_train: 2-D array; columns 0 and 1 are shown as integers, column 2 and
            all later columns as floats.
        features: list of column names for ``x_train``.
        vs: index of the first "vector" column; names before it form the head.
        u_s: number of leading head names to wrap in ``[...]``.
        maxcount: maximum number of data rows to show.
        user: if True the columns from index 3 on use one decimal place,
            otherwise none.

    Returns:
        The HTML string produced by ``tabulate.tabulate``.
    """
    trailing_fmt = ".1f" if user else ".0f"
    flist = [".0f", ".0f", ".1f"] + [trailing_fmt] * (x_train.shape[1] - 3)

    # Slicing copies, so bracketing the names below leaves `features` untouched.
    head = features[:vs]
    if vs < u_s:
        print(f"error, vector start {vs} should be greater than user start {u_s}")
    for pos in range(u_s):
        head[pos] = "[" + head[pos] + "]"
    header_row = split_str(head + features[vs:], 5)

    n_rows = min(maxcount, x_train.shape[0])
    data_rows = [
        [
            x_train[r, 0].astype(int),
            x_train[r, 1].astype(int),
            x_train[r, 2].astype(float),
            *x_train[r, 3:].astype(float),
        ]
        for r in range(n_rows)
    ]

    return tabulate.tabulate(
        [header_row] + data_rows,
        tablefmt='html',
        headers="firstrow",
        floatfmt=flist,
        numalign='center',
    )


def pprint_data_tab(y_p, user_train, item_train, uvs, ivs, user_features, item_features, maxcount=20):
    """Render predictions next to their user/item rows as an HTML table.

    Each row shows: the prediction, a placeholder ``y`` column, the first three
    user columns, the first three item columns, and the element-wise product of
    the user and item vector parts (``user_train[i, uvs:] * item_train[i, ivs:]``).

    Args:
        y_p: 2-D array of predictions; column 0 is displayed.
        user_train, item_train: 2-D arrays aligned row-by-row with ``y_p``.
        uvs, ivs: index of the first vector column in the user / item matrix.
            The row layout hard-codes three leading columns for each, so the
            header only lines up with the data when both are 3.
        user_features, item_features: column names of the two matrices.
        maxcount: maximum number of data rows to show.

    Returns:
        The HTML string produced by ``tabulate.tabulate``.
    """
    n_vector_cols = user_train.shape[1] - uvs
    # One format per displayed column, in column order.  There are eight fixed
    # columns; listing only seven shifts every later format one column left,
    # so the user's rating average would be printed without decimals.
    flist = [
        ".1f", ".1f",          # y_p, y
        ".0f", ".0f", ".1f",   # user id, rating count, rating ave
        ".0f", ".0f", ".1f",   # movie id, year, ave rating
    ] + [".1f"] * n_vector_cols

    user_head = user_features[:uvs]
    genres = user_features[uvs:]
    item_head = item_features[:ivs]
    hdr = ["y_p", "y"] + user_head + item_head + genres
    disp = [split_str(hdr, 5)]

    for i in range(min(maxcount, y_p.shape[0])):
        a = user_train[i, uvs:]
        b = item_train[i, ivs:]
        c = np.multiply(a, b)

        disp.append([
            y_p[i, 0], 0,  # the true target is not passed in, so "y" is a constant 0
            user_train[i, 0].astype(int),
            user_train[i, 1].astype(int),
            user_train[i, 2].astype(float),
            item_train[i, 0].astype(int),
            item_train[i, 1].astype(int),
            item_train[i, 2].astype(float),
            *c
        ])

    table = tabulate.tabulate(disp, tablefmt='html', headers="firstrow", floatfmt=flist, numalign='center')
    return table


def split_str(ifeatures, smax):
    """Insert a space in the middle of long single-word labels.

    A label is split only when it contains no space and is longer than
    ``smax`` characters; the split point is ``len(label) // 2``.  All other
    labels are passed through unchanged.

    Args:
        ifeatures: iterable of label strings.
        smax: longest length a single-word label may have before it is split.

    Returns:
        A new list of labels in the same order.
    """
    def _maybe_split(label):
        if ' ' in label or len(label) <= smax:
            return label
        half = len(label) // 2
        return label[:half] + " " + label[half:]

    return [_maybe_split(label) for label in ifeatures]


def gen_user_vecs(user_vec, num_items):
    """Repeat ``user_vec`` ``num_items`` times along the first axis.

    Uses ``np.tile`` with repetitions ``(num_items, 1)``, so a ``(1, n)`` or
    ``(n,)`` input yields a ``(num_items, n)`` array: one copy of the user
    vector per item to score.
    """
    repetitions = (num_items, 1)
    return np.tile(user_vec, repetitions)


def predict_uservec(user_vecs, item_vecs, model, u_s, i_s, scaler, ScalerUser, ScalerItem, scaledata=False):
    """Score every (user row, item row) pair and return them best-first.

    Args:
        user_vecs, item_vecs: 2-D arrays with one row per item to score.
        model: object whose ``predict([user_part, item_part])`` returns one
            prediction per row.
        u_s, i_s: first column of the user / item matrix that is fed to the
            model; earlier columns are kept only for reporting.
        scaler: target scaler; its ``inverse_transform`` maps predictions back
            to the original target scale.
        ScalerUser, ScalerItem: feature scalers, applied only when
            ``scaledata`` is True.
        scaledata: whether to scale the inputs before predicting.

    Returns:
        ``(sorted_index, sorted_ypu, sorted_items, sorted_user)``: the row order
        as a list (highest prediction first) and the unscaled predictions,
        item rows and user rows re-ordered accordingly.  The returned item and
        user rows are the unscaled inputs.
    """
    model_user, model_item = user_vecs, item_vecs
    if scaledata:
        model_user = ScalerUser.transform(user_vecs)
        model_item = ScalerItem.transform(item_vecs)

    raw_pred = model.predict([model_user[:, u_s:], model_item[:, i_s:]])
    y_pu = scaler.inverse_transform(raw_pred)

    if np.any(y_pu < 0):
        print("Error, expected all positive predictions")

    # Negating turns argsort's ascending order into a descending ranking.
    ranking = np.argsort(-y_pu, axis=0).ravel().tolist()
    return ranking, y_pu[ranking], item_vecs[ranking], user_vecs[ranking]


def get_user_vecs(user_id, user_train, item_vecs, user_to_genre):
    """Build the per-item user matrix and known ratings for an existing user.

    Args:
        user_id: id to look up; ``str(user_id)`` must be a key of
            ``user_to_genre`` and the id must appear in column 0 of
            ``user_train``.
        user_train: 2-D array of user rows.
        item_vecs: 2-D array of item rows; column 0 is the movie id.
        user_to_genre: mapping ``str(user_id) -> {'movies': {str(movie_id): rating}}``.

    Returns:
        ``(user_vecs, y)`` where ``user_vecs`` repeats the user's first matching
        row once per item and ``y`` holds the user's rating for each item
        (0 where the movie is not rated).  Returns ``None`` after printing a
        message when the user is unknown or has no row in ``user_train``.
    """
    user_key = str(user_id)
    if user_key not in user_to_genre:
        print("error: unknown user id")
        return None

    # First training row that belongs to this user, if any.
    user_vec = next((row for row in user_train if row[0] == user_id), None)
    if user_vec is None:
        print("error in get_user_vecs, did not find uid in user_train")
        return None

    num_items = len(item_vecs)
    user_vecs = np.tile(user_vec, (num_items, 1))

    y = np.zeros(num_items)
    for pos, item_row in enumerate(item_vecs):
        movie_key = str(int(item_row[0]))
        y[pos] = user_to_genre[user_key]['movies'].get(movie_key, 0)

    return user_vecs, y


def print_pred_movies(y_p, user, item, movie_dict, maxcount=10):
    """Render the leading predictions as an HTML table, one row per movie.

    Rows are taken in the order given; a movie id that has already been listed
    is skipped, and at most ``maxcount`` rows are produced.

    Args:
        y_p: 2-D array of predictions; column 0 is displayed.
        user: unused by this function.
        item: 2-D array aligned with ``y_p``; column 0 is the movie id and
            column 2 is shown as "rating ave".
        movie_dict: mapping ``movie_id -> {'title': ..., 'genres': ...}``.
        maxcount: maximum number of movies to list.

    Returns:
        The HTML string produced by ``tabulate.tabulate``.
    """
    header = ["y_p", "movie id", "rating ave", "title", "genres"]
    rows = []
    seen_ids = set()

    for i in range(y_p.shape[0]):
        if len(rows) >= maxcount:
            break

        movie_id = int(item[i, 0])
        if movie_id in seen_ids:
            continue

        score = y_p[i, 0]
        rating_ave = item[i, 2].astype(float)
        title = movie_dict[movie_id]['title']
        genres = movie_dict[movie_id]['genres']

        rows.append([score, movie_id, rating_ave, title, genres])
        seen_ids.add(movie_id)

    return tabulate.tabulate([header] + rows, tablefmt='html', headers="firstrow")


In [19]:
# Load data and set config variables
(item_train, user_train, y_train,
 item_features, user_features,
 item_vecs, movie_dict, user_to_genre) = load_data()

uvs = 3  # user matrix: index of the first genre-vector column (display helpers)
ivs = 3  # item matrix: index of the first genre-vector column (display helpers)
u_s = 3  # user matrix: first column that is fed to the model
i_s = 1  # item matrix: first column that is fed to the model

# Width of the model inputs = all columns from the start index onwards.
num_user_features = user_train.shape[1] - u_s
num_item_features = item_train.shape[1] - i_s

scaledata = True  # applies the standard scaler to data if true
print(f"No. of training vectors: {len(item_train)}")

No. of training vectors: 58187


In [20]:
# Data visualization

# User features: first rows of the user training matrix.  pprint_train wraps
# the first u_s header names in [brackets]; those are the columns that are
# later sliced off before the data is fed to the model.
pprint_train(user_train, user_features, uvs,  u_s, maxcount=5)

[user id],[rating count],[rating ave],Act ion,Adve nture,Anim ation,Chil dren,Com edy,Crime,Docum entary,Drama,Fan tasy,Hor ror,Mys tery,Rom ance,Sci -Fi,Thri ller
2,16,4.1,3.9,5.0,0.0,0.0,4.0,4.2,4.0,4.0,0.0,3.0,4.0,0.0,4.2,3.9
2,16,4.1,3.9,5.0,0.0,0.0,4.0,4.2,4.0,4.0,0.0,3.0,4.0,0.0,4.2,3.9
2,16,4.1,3.9,5.0,0.0,0.0,4.0,4.2,4.0,4.0,0.0,3.0,4.0,0.0,4.2,3.9
2,16,4.1,3.9,5.0,0.0,0.0,4.0,4.2,4.0,4.0,0.0,3.0,4.0,0.0,4.2,3.9
2,16,4.1,3.9,5.0,0.0,0.0,4.0,4.2,4.0,4.0,0.0,3.0,4.0,0.0,4.2,3.9


In [21]:
# Movie (item) features: first rows of the item training matrix.
# user=False makes pprint_train format the genre columns without decimals.
pprint_train(item_train, item_features, ivs, i_s, maxcount=5, user=False)

[movie id],year,ave rating,Act ion,Adve nture,Anim ation,Chil dren,Com edy,Crime,Docum entary,Drama,Fan tasy,Hor ror,Mys tery,Rom ance,Sci -Fi,Thri ller
6874,2003,4.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
6874,2003,4.0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
6874,2003,4.0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
8798,2004,3.8,1,0,0,0,0,0,0,0,0,0,0,0,0,0
8798,2004,3.8,0,0,0,0,0,1,0,0,0,0,0,0,0,0


In [22]:
# Number of target values; expected to equal the number of training vectors.
len(y_train)

58187

In [23]:
# Training data scaling: standardize every column of the item and user
# matrices.  The fitted scalers are kept so the same transform can be applied
# to new user/item vectors at prediction time.
# NOTE(review): both scalers are fitted on the full data set, before the
# train/test split that follows, so test rows contribute to the statistics —
# confirm this is acceptable for the evaluation.
scalerItem = StandardScaler().fit(item_train)
item_train = scalerItem.transform(item_train)

scalerUser = StandardScaler().fit(user_train)
user_train = scalerUser.transform(user_train)


In [24]:
# Split data into training and test sets.
# A single call applies one shuffle to all three arrays, so item rows, user
# rows and targets stay aligned by construction (and a length mismatch raises
# instead of silently mis-pairing rows).  Separate calls only stay aligned
# while every array has the same length and the same random_state.
(item_train, item_test,
 user_train, user_test,
 y_train, y_test) = train_test_split(
    item_train, user_train, y_train,
    train_size=0.80, shuffle=True, random_state=1,
)

# Min-max scaling of targets to be between -1 and 1, the range of the model
# output (a dot product of two L2-normalized vectors).  The scaler is fitted
# on the training targets only and then applied to both partitions.
scaler = MinMaxScaler((-1, 1))
scaler.fit(y_train.reshape(-1, 1))
ynorm_train = scaler.transform(y_train.reshape(-1, 1))
ynorm_test = scaler.transform(y_test.reshape(-1, 1))

In [25]:
# Shapes of the item train/test partitions after the split.
item_train.shape, item_test.shape

((46549, 17), (11638, 17))

In [26]:
# Shapes of the user train/test partitions; row counts should match the item partitions.
user_train.shape, user_test.shape

((46549, 17), (11638, 17))

In [27]:
# Model: two networks (one for users, one for items) that each map their
# features to a num_outputs-dimensional vector; the prediction is the dot
# product of the two L2-normalized vectors.
# (tf and Model are already imported in the first cell; re-importing is harmless.)
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Lambda, Dot

# Size of the vector produced by each network (must be equal for the dot product)
num_outputs = 32

tf.random.set_seed(1)

# User neural network
user_NN = tf.keras.models.Sequential([
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(num_outputs, activation='linear'),
])

# Item neural network (same architecture as the user network, separate weights)
item_NN = tf.keras.models.Sequential([
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(num_outputs, activation='linear'),
])

# Define inputs; the widths match the column slices [:, u_s:] and [:, i_s:]
# that are passed to fit/evaluate/predict.
input_user = Input(shape=(num_user_features,))
input_item = Input(shape=(num_item_features,))

# Base networks + normalization using Lambda layers: each output row is scaled
# to unit L2 norm (axis=1 is the feature axis).
vu = Lambda(lambda x: tf.linalg.l2_normalize(x, axis=1))(user_NN(input_user))
vm = Lambda(lambda x: tf.linalg.l2_normalize(x, axis=1))(item_NN(input_item))

# Dot product of the two unit vectors, so the output lies in [-1, 1]
output = Dot(axes=1)([vu, vm])

# Build model taking [user features, item features] and returning one score per row
model = Model(inputs=[input_user, input_item], outputs=output)

# Show model summary
model.summary()


In [28]:
# Loss and optimizer: mean squared error against the scaled targets,
# minimized with Adam at learning rate 0.01.
tf.random.set_seed(1)
Loss = tf.keras.losses.MeanSquaredError()
opt = tf.keras.optimizers.Adam(learning_rate=0.01)
model.compile(optimizer = opt, loss = Loss)

In [29]:
# Train for 30 epochs on the scaled training data.  The leading u_s user
# columns and i_s item columns are sliced off so they are not model inputs.
tf.random.set_seed(1)
model.fit([user_train[:, u_s:], item_train[:, i_s:]], ynorm_train, epochs=30)

Epoch 1/30
[1m1455/1455[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 3ms/step - loss: 0.1290
Epoch 2/30
[1m1455/1455[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - loss: 0.1170
Epoch 3/30
[1m1455/1455[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - loss: 0.1146
Epoch 4/30
[1m1455/1455[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - loss: 0.1132
Epoch 5/30
[1m1455/1455[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - loss: 0.1119
Epoch 6/30
[1m1455/1455[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - loss: 0.1106
Epoch 7/30
[1m1455/1455[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - loss: 0.1092
Epoch 8/30
[1m1455/1455[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - loss: 0.1081
Epoch 9/30
[1m1455/1455[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - loss: 0.1072
Epoch 10/30
[1m1455/1455[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m

<keras.src.callbacks.history.History at 0x1d8437ebb60>

In [30]:
# Loss on the held-out test partition, using the same column slices as training.
model.evaluate([user_test[:, u_s:], item_test[:, i_s:]], ynorm_test)

[1m364/364[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 0.1063    


0.10620453953742981

In [31]:
# The evaluation loss is similar to the training loss, indicating that the model is not overfitted to the training data.

In [35]:
# Testing our hypothesis (code was generated with ChatGPT)

# Helper function
# NOTE: tables are rendered with ``tabulate.tabulate`` from the ``tabulate``
# module imported in the first cell.  Importing the function under the bare
# name ``tabulate`` here would rebind the module name and break every
# ``tabulate.tabulate(...)`` call in pprint_train, pprint_data_tab and
# print_pred_movies when those helpers are run afterwards.

def create_user_vec(user_id, rating_count, rating_ave, genres):
    """Build a single-row user matrix.

    Args:
        user_id, rating_count, rating_ave: the three leading user columns.
        genres: iterable of per-genre values (list, tuple or 1-D array).

    Returns:
        Array of shape ``(1, 3 + len(genres))``.
    """
    # Unpacking (instead of list + genres) also accepts tuples and ndarrays.
    return np.array([[user_id, rating_count, rating_ave, *genres]])

def predict_for_new_user(user_id, rating_count, rating_ave, genres, top_n=10):
    """Print the highest-scoring movies for a hypothetical user.

    Args:
        user_id, rating_count, rating_ave: the three leading user columns.
        genres: per-genre values; entries beyond the number of columns in
            ``user_train`` are dropped.
        top_n: number of recommendations to print (fewer if there are fewer
            items).

    Raises:
        ValueError: if the user vector has fewer columns than ``user_train``.
    """
    user_vec = create_user_vec(user_id, rating_count, rating_ave, genres)
    expected_cols = user_train.shape[1]
    if user_vec.shape[1] < expected_cols:
        raise ValueError(
            f"user vector has {user_vec.shape[1]} columns, expected at least {expected_cols}"
        )
    user_vec_trimmed = user_vec[:, :expected_cols]  # ensure feature count matches training
    user_vecs = gen_user_vecs(user_vec_trimmed, len(item_vecs))

    _, sorted_ypu, sorted_items, _ = predict_uservec(
        user_vecs, item_vecs, model, u_s, i_s, scaler, scalerUser, scalerItem, scaledata=scaledata
    )

    print(f"\n🔹 Top {top_n} Recommendations for User {user_id}:")
    predictions = []
    # Never index past the number of scored items.
    for i in range(min(top_n, len(sorted_items))):
        movie_id = int(sorted_items[i, 0])
        title = movie_dict[movie_id]['title']
        genres_str = movie_dict[movie_id]['genres']
        rating = round(float(sorted_ypu[i, 0]), 2)
        avg_rating = round(float(sorted_items[i, 2]), 2)
        predictions.append([movie_id, title, genres_str, rating, avg_rating])

    print(tabulate.tabulate(predictions, headers=["Movie ID", "Title", "Genres", "Predicted Rating", "Avg Rating"], tablefmt="fancy_grid"))


# Hypothetical new users.
# Each genre vector needs exactly one entry per genre column the model was
# trained on (num_user_features); extra trailing entries would be silently
# trimmed by predict_for_new_user.  A seeded generator keeps the randomly
# drawn users reproducible between runs.
rng = np.random.default_rng(1)

#  User 1
new_user_id = 5000
new_rating_count = 3
new_rating_ave = 1.0
genres_user1 = [1.0, 1, 1, 1, 5, 1, 1, 1, 1, 1, 1, 1, 5, 5]
predict_for_new_user(new_user_id, new_rating_count, new_rating_ave, genres_user1)

# User 2
new_user_id = 5001
new_rating_count = int(rng.integers(2, 10))
new_rating_ave = round(float(rng.uniform(2.0, 4.5)), 1)
genres_user2 = rng.integers(0, 6, size=num_user_features).tolist()
predict_for_new_user(new_user_id, new_rating_count, new_rating_ave, genres_user2)

#  User 3
new_user_id = 5002
new_rating_count = int(rng.integers(1, 5))
new_rating_ave = round(float(rng.uniform(1.5, 4.0)), 1)
genres_user3 = rng.integers(0, 6, size=num_user_features).tolist()
predict_for_new_user(new_user_id, new_rating_count, new_rating_ave, genres_user3)

# User 4
new_user_id = 5003
new_rating_count = int(rng.integers(3, 8))
new_rating_ave = round(float(rng.uniform(2.5, 4.8)), 1)
genres_user4 = rng.choice([0, 1, 5], size=num_user_features).tolist()
predict_for_new_user(new_user_id, new_rating_count, new_rating_ave, genres_user4)


[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step 

🔹 Top 10 Recommendations for User 5000:
╒════════════╤══════════════════════════════╤════════════════════════════════════════════╤════════════════════╤══════════════╕
│   Movie ID │ Title                        │ Genres                                     │   Predicted Rating │   Avg Rating │
╞════════════╪══════════════════════════════╪════════════════════════════════════════════╪════════════════════╪══════════════╡
│      63859 │ Bolt (2008)                  │ Action|Adventure|Animation|Children|Comedy │               4.75 │         3.39 │
├────────────┼──────────────────────────────┼────────────────────────────────────────────┼────────────────────┼──────────────┤
│      45517 │ Cars (2006)                  │ Animation|Children|Comedy                  │               4.75 │         3.38 │
├────────────┼──────────────────────────────┼────────────────────────────────────────────┼────────────────────┼────────────

In [None]:
# The top ten recommendations for these 4 new users, who have rated movies similarly, appear to consist mostly of movies from the 2004-2007 time frame, hence proving our claim.