In [30]:
import numpy as np
import numpy.ma as ma
from numpy import genfromtxt
from collections import defaultdict
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
import tabulate
import time
pd.set_option("display.precision", 1)

In [31]:
# Read the course catalogue, the user profiles and the user ratings workbooks,
# then pull the rating column out as the raw regression target.
courses, users, ratings = (
    pd.read_excel(workbook)
    for workbook in ('Rated_Courses.xlsx', 'User_Profiles.xlsx', 'User_Ratings.xlsx')
)
y_train = ratings['rating'].values

## Cleaning

In [32]:
# Drop the index column that the Excel export left behind.
# errors='ignore' keeps this cell re-runnable: on a second execution the
# column is already gone and a plain drop() would raise KeyError.
users = users.drop(columns='Unnamed: 0', errors='ignore')
ratings = ratings.drop(columns='Unnamed: 0', errors='ignore')
courses = courses.drop(columns='Unnamed: 0', errors='ignore')

# Course ids are used as merge keys later on, so force an integer dtype.
courses['id'] = courses['id'].astype(int)

## Feature Extraction

### User Data

In [33]:
# Merge the course subcategory into each rating and one-hot encode it.
df_merged = pd.merge(ratings,courses[['id','subcategory']], left_on='courseId', right_on='id').drop_duplicates()
df_subcategory = pd.get_dummies(df_merged['subcategory'])

# Each user rating with the one-hot encoding of its course subcategory.
# The dummies are indexed like df_merged (post-merge, post-dedup), so the rating
# columns are taken from df_merged as well; concatenating the original `ratings`
# frame would pair rows by an unrelated index whenever the merge drops,
# duplicates or reorders rows.
df_ratings = pd.concat(
    [df_merged.drop(columns=['id', 'subcategory'], errors='ignore'), df_subcategory],
    axis=1,
)

In [34]:
# Average rating per (user, subcategory), reshaped so that every subcategory
# becomes a column and every user a row; userId is restored as a regular column.
df_user_subcategory_rating = (
    df_merged
    .groupby(["userId", "subcategory"])["rating"]
    .mean()
    .reset_index()
    .pivot(index="userId", columns="subcategory", values="rating")
    .reset_index()
)

# Drop the "subcategory" label the pivot leaves on the column axis
df_user_subcategory_rating.columns.name = None

# Subcategories a user never rated come out as NaN; treat them as 0
df_user_average_ratings = df_user_subcategory_rating.replace(np.nan, 0)

# Everything except the leading userId column; preview the result
user_input = df_user_average_ratings.iloc[:, 1:]
user_input.head()

Unnamed: 0,3D & Animation,Accounting & Bookkeeping,Affiliate Marketing,Apple,Architectural Design,Arts & Crafts,Beauty & Makeup,Branding,Business Analytics & Intelligence,Business Law,...,Teacher Training,Test Prep,Travel,User Experience Design,Video & Mobile Marketing,Video Design,Vocal,Web Design,Web Development,Yoga
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,4.3,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0


### Courses Data

In [35]:
# One-hot encode the course subcategory. For a Series input get_dummies already
# names the columns after the category values, so no prefix has to be added and
# stripped again (stripping '_' would also mangle names that contain one).
encoded_cols = pd.get_dummies(courses['subcategory'])
df_courses = pd.concat([courses, encoded_cols], axis=1)

In [36]:
# Item features: course id, average rating and the one-hot subcategory columns.
# Selected by name rather than by position so the cell keeps working if columns
# are added to or reordered in the courses workbook.
courses_input = df_courses[['id', 'avg_rating'] + list(encoded_cols.columns)]
courses_input.head()

Unnamed: 0,id,avg_rating,3D & Animation,Accounting & Bookkeeping,Affiliate Marketing,Apple,Architectural Design,Arts & Crafts,Beauty & Makeup,Branding,...,Teacher Training,Test Prep,Travel,User Experience Design,Video & Mobile Marketing,Video Design,Vocal,Web Design,Web Development,Yoga
0,638418,4.2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,3640438,4.9,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,4439592,4.5,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,451966,4.3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1578238,3.9,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [37]:
# Group by userid and category, and calculate the mean rating for each group
# df_new2 = df_merged.groupby(["userId", "subcategory"])["rating"].mean().reset_index()

# Pivot the table to have category columns with average rating values


In [38]:
# One row per rating: subcategories spread into columns holding that rating,
# with the rating's userId and timestamp placed in front as columns 0 and 1.
df_pivot = df_merged.pivot(columns="subcategory", values="rating")
for position, key in enumerate(('userId', 'timestamp')):
    df_pivot.insert(position, key, df_merged[key])

In [39]:
# Every column after userId and timestamp is a subcategory rating column.
cols_to_mean = df_pivot.columns[2:]
df_p = df_pivot.copy()

# Replace each rating with that user's mean rating for the subcategory.
# A single grouped transform over all columns replaces one groupby per column.
df_p[list(cols_to_mean)] = df_p.groupby('userId')[list(cols_to_mean)].transform('mean')

# Subcategories a user never rated are NaN after the mean; encode them as 0.
df_p = df_p.fillna(0)

# sort_values returns a new frame, so later edits to user_input leave df_p intact.
user_input = df_p.sort_values(by='userId', ascending=True)

In [40]:
# Convert the rating timestamps to whole seconds since the Unix epoch.
unix_epoch = pd.Timestamp("1970-01-01")
dates = pd.to_datetime(user_input['timestamp'])
user_input['timestamp'] = (dates - unix_epoch) // pd.Timedelta(seconds=1)

In [41]:
# Network input widths: the leading id column of each frame is sliced off
# ([:, 1:]) when the arrays are fed to the model, hence the "- 1".
num_user_features = user_input.shape[1] - 1  # excludes the userId column
num_item_features = courses_input.shape[1] -1 # excludes the course id column
scaledata = True  # applies the standard scaler to the data if true

In [61]:
# Unscaled item features. `item_train_save` is only assigned in the scaling cell
# further down, so in a top-to-bottom run it does not exist yet; at this point
# `courses_input` is the same, still unscaled, frame.
courses_input

Unnamed: 0,id,avg_rating,3D & Animation,Accounting & Bookkeeping,Affiliate Marketing,Apple,Architectural Design,Arts & Crafts,Beauty & Makeup,Branding,...,Teacher Training,Test Prep,Travel,User Experience Design,Video & Mobile Marketing,Video Design,Vocal,Web Design,Web Development,Yoga
0,638418,4.2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,3640438,4.9,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,4439592,4.5,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,451966,4.3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1578238,3.9,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34921,4045274,0.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
34922,4813246,0.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
34923,1550138,4.3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
34924,1919624,3.6,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [65]:
# NOTE(review): out-of-order cell. `scalerItem` is created in the scaling cell
# further down (In [42]); on Restart & Run All this line raises NameError.
# Move it below the scaling cell or delete it.
scalerItem.inverse_transform(courses_input)

array([[ 6.38418000e+05,  4.25000000e+00,  0.00000000e+00, ...,
         0.00000000e+00, -3.46944695e-18,  0.00000000e+00],
       [ 3.64043800e+06,  4.91666650e+00,  0.00000000e+00, ...,
         0.00000000e+00, -3.46944695e-18,  0.00000000e+00],
       [ 4.43959200e+06,  4.50000000e+00,  0.00000000e+00, ...,
         0.00000000e+00, -3.46944695e-18,  0.00000000e+00],
       ...,
       [ 1.55013800e+06,  4.35000000e+00,  0.00000000e+00, ...,
         0.00000000e+00, -3.46944695e-18,  0.00000000e+00],
       [ 1.91962400e+06,  3.55555560e+00,  0.00000000e+00, ...,
         0.00000000e+00, -3.46944695e-18,  0.00000000e+00],
       [ 4.46793800e+06,  4.00000000e+00,  0.00000000e+00, ...,
         0.00000000e+00, -3.46944695e-18,  0.00000000e+00]])

In [64]:
# NOTE(review): out-of-order cell. `item_train_save` and `scalerItem` are only
# defined in the scaling cell further down (In [42]), which already prints this
# same round-trip check; on Restart & Run All this line raises NameError.
print(np.allclose(item_train_save, scalerItem.inverse_transform(courses_input)))

True


In [42]:
# Keep the unscaled frames around; later cells use them for column names and
# for reporting. They are captured only while the inputs are still DataFrames,
# so re-running this cell (when courses_input / user_input have already been
# replaced by scaled ndarrays) does not overwrite them with scaled data.
if isinstance(courses_input, pd.DataFrame):
    item_train_save = courses_input
if isinstance(user_input, pd.DataFrame):
    user_train_save = user_input

if scaledata:
    # Always fit on the saved, unscaled frames so the cell is idempotent.
    scalerItem = StandardScaler()
    scalerItem.fit(item_train_save)
    courses_input = scalerItem.transform(item_train_save)

    scalerUser = StandardScaler()
    scalerUser.fit(user_train_save)
    user_input = scalerUser.transform(user_train_save)

    # Sanity check: inverting the scaling must give back the original values.
    print(np.allclose(item_train_save, scalerItem.inverse_transform(courses_input)))
    print(np.allclose(user_train_save, scalerUser.inverse_transform(user_input)))

True
True


In [43]:
# Split items, users and targets in ONE call so all three share the same shuffled
# row indices (and a length mismatch fails loudly here instead of at fit time).
# The targets are taken from `ratings` rather than from `y_train` itself, so
# re-running this cell does not split an already-split array.
# train_test_split is imported in the imports cell at the top of the notebook.
# NOTE(review): rows are paired purely by position. courses_input has one row per
# course, user_input was sorted by userId, and the targets follow the ratings
# order, so row i of the three arrays may not describe the same rating event.
# Confirm the intended alignment.
item_train, item_test, user_train, user_test, y_train, y_test = train_test_split(
    courses_input,
    user_input,
    ratings.rating.values,
    train_size=0.80,
    shuffle=True,
    random_state=1,
)
print(f"movie/item training data shape: {item_train.shape}")
print(f"movie/item test  data shape: {item_test.shape}")

movie/item training data shape: (27940, 132)
movie/item test  data shape: (6986, 132)


In [44]:
# Map the ratings into [-1, 1]; the range is learned from the training targets
# only and then applied unchanged to the test targets.
scaler = MinMaxScaler(feature_range=(-1, 1))
train_targets = y_train.reshape(-1, 1)
test_targets = y_test.reshape(-1, 1)
ynorm_train = scaler.fit_transform(train_targets)
ynorm_test = scaler.transform(test_targets)
print(ynorm_train.shape, ynorm_test.shape)

(27940, 1) (6986, 1)


In [None]:
def evaluate_model(num_outputs, rate, epochs=50):
    """Train a two-tower (user / item) network and return its test loss.

    Each tower is a small dense network that maps its input features to a
    `num_outputs`-dimensional vector. Both vectors are L2-normalised and the
    model output is their dot product, trained with mean squared error.

    Reads these notebook-level names: num_user_features, num_item_features,
    user_train, item_train, ynorm_train, user_test, item_test, ynorm_test.
    The first column of every user/item array is dropped before it is fed
    to the network.

    Parameters
    ----------
    num_outputs : int
        Size of the vector produced by each tower.
    rate : float
        Learning rate for the Adam optimizer.
    epochs : int, default 50
        Number of training epochs.

    Returns
    -------
    The value returned by `model.evaluate` on the test arrays (the MSE loss,
    since no extra metrics are compiled).
    """
    tf.random.set_seed(1)
    user_NN = tf.keras.models.Sequential([
        tf.keras.layers.Dense(256, activation='relu'),
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.Dense(num_outputs),
    ])

    item_NN = tf.keras.models.Sequential([
        tf.keras.layers.Dense(256, activation='relu'),
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.Dense(num_outputs),
    ])

    # create the user input and point to the base network
    # (shape must be a tuple; `(n)` without the trailing comma is just an int)
    input_user = tf.keras.layers.Input(shape=(num_user_features,))
    vu = user_NN(input_user)
    vu = tf.linalg.l2_normalize(vu, axis=1)

    # create the item input and point to the base network
    input_item = tf.keras.layers.Input(shape=(num_item_features,))
    vm = item_NN(input_item)
    vm = tf.linalg.l2_normalize(vm, axis=1)

    # compute the dot product of the two vectors vu and vm
    output = tf.keras.layers.Dot(axes=1)([vu, vm])

    # specify the inputs and output of the model
    model = tf.keras.Model([input_user, input_item], output)

    # re-seed before training, as the original cell did, so runs stay comparable
    tf.random.set_seed(1)
    cost_fn = tf.keras.losses.MeanSquaredError()
    opt = keras.optimizers.Adam(learning_rate=rate)
    model.compile(optimizer=opt, loss=cost_fn)

    model.fit([user_train[:, 1:], item_train[:, 1:]], ynorm_train, epochs=epochs)

    loss = model.evaluate([user_test[:, 1:], item_test[:, 1:]], ynorm_test)
    return loss

In [None]:
import itertools

# Candidate embedding sizes (number of factors) and learning rates
num_factors_range = [16, 32, 64, 128, 256]
lr_list = [0.001, 0.01, 0.1, 1.0]

# Every (num_factors, learning_rate) combination
param_grid = list(itertools.product(num_factors_range, lr_list))

best_mse = np.inf
best_params = None
for params in param_grid:
    num_factors, rate = params
    mse = evaluate_model(num_factors, rate)
    print(f"num_factors: {num_factors}, learning_rate: {rate}, mse: {mse}")
    # Compare the value just computed; the original compared an undefined `mae`.
    if mse < best_mse:
        best_mse = mse
        best_params = params

if best_params is None:
    # Happens only if the grid is empty or every run produced a non-finite loss.
    print("No parameter combination produced a finite MSE")
else:
    print(
        f"Best parameters: num_factors={best_params[0]}, "
        f"learning_rate={best_params[1]}, MSE={best_mse:.4f}"
    )

Epoch 1/2
Epoch 2/2
num_factors: (16, 0.001), mse: 0.2111767679452896
Epoch 1/2
Epoch 2/2
num_factors: (16, 0.01), mse: 0.21129857003688812
Epoch 1/2
Epoch 2/2
num_factors: (16, 0.1), mse: 0.21113206446170807
Epoch 1/2
Epoch 2/2
num_factors: (16, 1.0), mse: 0.21110525727272034
Epoch 1/2
152/874 [====>.........................] - ETA: 1s - loss: 0.2265

In [None]:
def gen_user_vecs(user_vec, num_items):
    """Stack `num_items` copies of `user_vec` on top of each other.

    The result has one copy of the user row(s) per item, so it can be fed to
    the model side by side with an item matrix of `num_items` rows.
    """
    repeats = (num_items, 1)  # repeat along rows only, never along columns
    tiled = np.tile(user_vec, repeats)
    return tiled

In [None]:
# Build a synthetic user row with the same layout as the training user frame:
# column 0 = userId, column 1 = timestamp, remaining columns = subcategory ratings.
# The width is taken from the saved frame instead of a hard-coded 132, so it
# cannot drift from the real column count.
num_user_columns = len(user_train_save.columns)
new_user = [1] * num_user_columns
new_user[0] = 10000                 # userId for the synthetic user
new_user[1] = int(time.time())      # "now", as seconds since the epoch
new_user[5] = 4.5                   # rating for the subcategory at column position 5
user_vec = np.array([new_user])

# One copy of the user row per item so both model inputs have equal length
user_vecs = gen_user_vecs(user_vec, len(item_train))
pd.DataFrame(user_vec, columns=user_train_save.columns)

In [None]:
def predict_uservec(user_vecs, item_vecs, model, scaler, ScalerUser, ScalerItem, scaledata=False):
    """Predict a rating for every (user row, item row) pair.

    Parameters
    ----------
    user_vecs : 2-D array
        User rows, one per item row. When `scaledata` is true they are passed
        through `ScalerUser.transform` first. The first column is dropped
        before prediction.
    item_vecs : 2-D array
        Item rows. They are NOT transformed here; only the first column is
        dropped before prediction.
    model : object with `predict([user_array, item_array])`
    scaler : object with `inverse_transform`
        Maps model outputs back to the rating scale.
    ScalerUser : object with `transform`
        Applied to `user_vecs` when `scaledata` is true.
    ScalerItem :
        Unused; kept so existing call sites keep working.
    scaledata : bool, default False
        Whether to scale `user_vecs` before prediction.

    Returns
    -------
    y_pu : array
        Predictions on the rating scale, in the same row order as `item_vecs`.
        (The original body also built sorted copies after an early `return`,
        but that code was unreachable; callers rely on the unsorted result.)
    """
    if scaledata:
        user_features = ScalerUser.transform(user_vecs)
    else:
        user_features = user_vecs

    y_p = model.predict([user_features[:, 1:], item_vecs[:, 1:]])
    y_pu = scaler.inverse_transform(y_p)

    if np.any(y_pu < 0):
        print("Error, expected all positive predictions")
    return y_pu

In [None]:
# Predict the synthetic user's rating for every training item (unsorted, in item_train order).
# NOTE(review): no visible cell assigns a notebook-level `model`; evaluate_model builds
# its model locally and returns only the loss. Confirm where `model` comes from,
# otherwise this raises NameError on Restart & Run All.
y_pu = predict_uservec(user_vecs, item_train, model, scaler, scalerUser, scalerItem, scaledata=scaledata)

In [None]:
# Undo the item scaling and attach the predicted rating to every course.
# Column names come from the saved unscaled item frame, i.e. exactly the columns
# the scaler was fitted on. The ratings-side dummies (df_subcategory) can have a
# different set of subcategories and would then not match the array width.
pred_courses = pd.DataFrame(
    scalerItem.inverse_transform(item_train),
    columns=list(item_train_save.columns),
)
pred_courses['id'] = pred_courses['id'].astype(int)
# predictions arrive as an (n, 1) column; flatten to one value per row
pred_courses['pred_rating'] = np.ravel(y_pu)

In [None]:
# Courses ordered from highest to lowest predicted rating, with the readable
# subcategory label joined back on from the course catalogue.
ranked_courses = pred_courses.sort_values(by='pred_rating', ascending=False)
ranked_courses.merge(courses[['id', 'subcategory']], on='id')

In [None]:
# pred_courses.reindex(sorted_index)

In [None]:
# movie_dict = defaultdict(dict,courses.to_dict(orient='index')) 

In [None]:
# def print_pred_movies(y_p, item, movie_dict, maxcount=10):
#     """ print results of prediction of a new user. inputs are expected to be in
#         sorted order, unscaled. """
#     count = 0
#     movies_listed = defaultdict(int)
#     disp = [["y_p", "id", "title", "subcategory"]]
# #     item = scalerItem.inverse_transform(item)
#     print(item[0])
#     for i in range(0, y_p.shape[0]):
#         if count == maxcount:
#             break
#         count += 1
#         movie_id = item[i, 0].astype(int)
#         if movie_id in movies_listed:
#             continue
#         movies_listed[movie_id] = 1
#         disp.append([y_p[i, 0], movie_id, item[i, 2].astype(float),
#                     movie_dict[movie_id]['title'], movie_dict[movie_id]['subcategory']])
#     table = tabulate.tabulate(disp, tablefmt='html', headers="firstrow")
#     return (table)

In [None]:
# print_pred_movies(sorted_ypu, sorted_items, movie_dict, maxcount = 10)