# Model Preparation

### Import Data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [None]:
# Load the raw ratings file. header=None makes pandas treat the first line as
# data and label the columns with integers; names are assigned after loading.
data = pd.read_csv(
    "ratings_Electronics.csv",
    header=None,
)

In [None]:
# Keep a 5% random subset of the ratings to make the experiments tractable.
# random_state pins the subset: without it the sample is drawn from an
# unseeded generator (the only seed in this notebook is set later, for the
# train/validation split), so every run would train on different rows.
data = data.sample(frac=0.05, random_state=4)

In [None]:
# Name the four positional columns, then replace the raw user / product
# identifiers with dense integer codes (first occurrence -> 0, next -> 1, ...).
data.columns = ['UserID', 'ProductID', 'Rating', 'Timestamp']
data['UserID'] = pd.factorize(data['UserID'])[0]
data['ProductID'] = pd.factorize(data['ProductID'])[0]
print(data.head())

         UserID  ProductID  Rating   Timestamp
4200237       0          0     5.0  1376524800
6005683       1          1     5.0  1354060800
4313893       2          2     5.0  1371686400
1921031       3          3     1.0  1326326400
6930892       4          4     5.0  1401235200


In [None]:
# Seed NumPy's global generator so the split below is the same on every run.
np.random.seed(4)

# Separate the data into a training set (rows whose uniform draw is < 0.75)
# and a validation set (all remaining rows).
mask = np.random.rand(data.shape[0]) < 0.75
train, val = data.loc[mask].copy(), data.loc[~mask].copy()

In [None]:
def proc_col(col, train_col=None):
    """Encode the values of ``col`` as contiguous integer ids.

    The vocabulary is the unique values of ``train_col`` when it is given,
    otherwise the unique values of ``col`` itself, numbered in order of first
    appearance. Values of ``col`` absent from the vocabulary are encoded as -1.

    Returns:
        A ``(name2idx, encoded, n_unique)`` tuple: the value -> id dict, a
        NumPy array with one id per element of ``col``, and the vocabulary size.
    """
    vocab_source = col if train_col is None else train_col
    uniq = vocab_source.unique()
    name2idx = dict(zip(uniq, range(len(uniq))))
    encoded = np.array([name2idx.get(value, -1) for value in col])
    return name2idx, encoded, len(uniq)

In [None]:
def encode_data(df, train=None):
    """Return a copy of ``df`` with ``UserID`` / ``ProductID`` re-encoded as ids.

    Each of the two columns is passed through ``proc_col``. When ``train`` is
    given, ids come from the vocabulary of the matching ``train`` column and
    rows whose value is not in that vocabulary (encoded as -1) are dropped;
    otherwise the vocabulary is built from ``df`` itself.

    Args:
        df: frame holding ``UserID`` and ``ProductID`` columns; not modified.
        train: optional frame whose columns define the id vocabulary.

    Returns:
        A new DataFrame with the two columns replaced by their integer ids.
    """
    df = df.copy()
    for col_name in ["UserID", "ProductID"]:
        train_col = train[col_name] if train is not None else None
        _, col, _ = proc_col(df[col_name], train_col)
        df[col_name] = col
        # Take an explicit copy of the filtered rows: the next iteration
        # assigns a column into this frame, and doing that on the direct
        # result of a boolean filter triggers pandas' SettingWithCopyWarning.
        df = df[df[col_name] >= 0].copy()
    return df

In [None]:

# Training ids come from the training rows themselves; validation rows are
# encoded against the training frame passed as `train`.
df_train = encode_data(df=train)
df_val = encode_data(df=val, train=train)


# GMF

## Constructing GMF

In [None]:
class GMF(nn.Module):
    """Matrix-factorisation model: prediction = dot(user embedding, item embedding).

    Args:
        num_users: number of rows in the user embedding table.
        num_items: number of rows in the item embedding table.
        emb_size: dimensionality of both embeddings.
        init_max: upper bound of the uniform(0, init_max) weight
            initialisation. Exposed as a parameter so a different range can be
            tried without redefining the class.
    """

    def __init__(self, num_users, num_items, emb_size=100, init_max=0.05):
        super().__init__()
        self.user_emb = nn.Embedding(num_users, emb_size)
        self.item_emb = nn.Embedding(num_items, emb_size)
        nn.init.uniform_(self.user_emb.weight, 0, init_max)
        nn.init.uniform_(self.item_emb.weight, 0, init_max)

    def forward(self, u, v):
        """Return one score per (user id, item id) pair of the index tensors."""
        u = self.user_emb(u)
        v = self.item_emb(v)
        # Element-wise product summed over the embedding axis = row-wise dot product.
        return (u * v).sum(1)

In [None]:
def train_epocs(model, epochs=10, lr=0.01, wd=0.0):
    """Train ``model`` full-batch on ``df_train`` and return the validation loss.

    Every epoch is a single Adam step on the MSE between the model's
    predictions and the ratings of the whole training frame (no mini-batches).

    Args:
        model: module called as ``model(users, items)`` that returns one
            prediction per row.
        epochs: number of full-batch optimisation steps.
        lr: Adam learning rate.
        wd: Adam ``weight_decay``.

    Returns:
        The value returned by ``test_loss(model)`` after training, as a float.
    """
    # Adam is used here; torch.optim.SGD takes the same lr / weight_decay
    # arguments if a plain-SGD comparison is wanted.
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=wd)

    # The training data does not change between epochs, so convert it to
    # tensors once instead of rebuilding the tensors on every iteration.
    users = torch.LongTensor(df_train.UserID.values)
    items = torch.LongTensor(df_train.ProductID.values)
    ratings = torch.FloatTensor(df_train.Rating.values)

    model.train()
    for _ in range(epochs):
        y_hat = model(users, items)
        loss = F.mse_loss(y_hat, ratings)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    loss = test_loss(model)
    return loss.item()

In [None]:
def test_loss(model):
    """Print and return the MSE of ``model`` on the validation frame ``df_val``.

    The model is switched to evaluation mode and is left in that mode.

    Returns:
        The loss as a 0-dim tensor (call ``.item()`` for a Python float).
    """
    model.eval()
    users = torch.LongTensor(df_val.UserID.values)
    items = torch.LongTensor(df_val.ProductID.values)
    ratings = torch.FloatTensor(df_val.Rating.values)
    # Evaluation needs no gradients: skip building the autograd graph.
    with torch.no_grad():
        y_hat = model(users, items)
        loss = F.mse_loss(y_hat, ratings)
    print("test loss %.3f " % loss.item())
    return loss

In [None]:
# Embedding-table sizes: the number of distinct encoded ids in the training frame.
num_users = df_train["UserID"].nunique()
num_items = df_train["ProductID"].nunique()

In [None]:
# Report the vocabulary sizes used to dimension the embedding tables.
print(f"There are {num_users} users")
print(f"There are {num_items} items")

There are 273226 users
There are 95768 items


Test Optimal GMF Model

In [None]:
class GMF(nn.Module):
    """Matrix-factorisation model: prediction = dot(user embedding, item embedding).

    Args:
        num_users: number of rows in the user embedding table.
        num_items: number of rows in the item embedding table.
        emb_size: dimensionality of both embeddings.
        init_max: upper bound of the uniform(0, init_max) weight
            initialisation. Exposed as a parameter so a different range can be
            tried without redefining the class.
    """

    def __init__(self, num_users, num_items, emb_size=100, init_max=0.03):
        super().__init__()
        self.user_emb = nn.Embedding(num_users, emb_size)
        self.item_emb = nn.Embedding(num_items, emb_size)
        nn.init.uniform_(self.user_emb.weight, 0, init_max)
        nn.init.uniform_(self.item_emb.weight, 0, init_max)

    def forward(self, u, v):
        """Return one score per (user id, item id) pair of the index tensors."""
        u = self.user_emb(u)
        v = self.item_emb(v)
        # Element-wise product summed over the embedding axis = row-wise dot product.
        return (u * v).sum(1)

In [None]:
# Fit GMF with the hyper-parameters chosen for this section; train_epocs
# prints the validation MSE and returns it.
model = GMF(num_users, num_items, emb_size=108)
loss = train_epocs(model, epochs=54, lr=0.09, wd=4e-5)

test loss 11.557 


## Construct MLP

In [None]:
class MLP(nn.Module):
    """Rating model that feeds concatenated user/item embeddings to a small MLP.

    Args:
        num_users: number of rows in the user embedding table.
        num_items: number of rows in the item embedding table.
        emb_size: dimensionality of each embedding.
        hidden_size: width of the single hidden layer.
    """

    def __init__(self, num_users, num_items, emb_size=100, hidden_size=10):
        super().__init__()
        self.user_emb = nn.Embedding(num_users, emb_size)
        self.item_emb = nn.Embedding(num_items, emb_size)
        nn.init.uniform_(self.user_emb.weight, 0, 0.05)
        nn.init.uniform_(self.item_emb.weight, 0, 0.05)
        self.fc1 = nn.Linear(emb_size * 2, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, 1)
        self.drop = nn.Dropout(0.1)

    def forward(self, u, v):
        """Return one prediction per (user id, item id) pair of the index tensors."""
        u = self.user_emb(u)
        v = self.item_emb(v)
        x = torch.cat([u, v], dim=1)
        # Dropout is applied to the concatenated embeddings, before the first layer.
        x = self.drop(x)
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        # Drop only the trailing size-1 axis. A bare squeeze() would also
        # remove the batch axis for a single pair and return a 0-dim tensor
        # that no longer matches a target of shape (1,).
        return x.squeeze(-1)

Test Optimal MLP Model

In [None]:
# Fit the MLP with the hyper-parameters chosen for this section; train_epocs
# prints the validation MSE and returns it.
model = MLP(num_users, num_items, emb_size=47, hidden_size=30)
loss = train_epocs(model, epochs=45, lr=0.09, wd=6e-6)

test loss 2.455 


# NeuMF

## Construct NeuMF

In [None]:
class NeuMF(nn.Module):
    """Two-branch rating model fusing an MLP branch with a GMF branch.

    The MLP branch maps the concatenated user/item embeddings to a
    ``final_size`` vector; the GMF branch is the element-wise product of a
    second pair of embeddings. Both outputs are concatenated and mapped to a
    single prediction by a linear layer.

    Args:
        num_users: number of rows in each user embedding table.
        num_items: number of rows in each item embedding table.
        emb_size: dimensionality of every embedding (shared by both branches).
        hidden_size: width of the MLP branch's hidden layer.
        final_size: output width of the MLP branch.
    """

    def __init__(self, num_users, num_items, emb_size=100, hidden_size=10, final_size = 5):
        super().__init__()
        # NOTE(review): the GMF-branch embeddings keep nn.Embedding's default
        # initialisation, while the MLP-branch ones are re-initialised with
        # uniform(0, 0.05) below. Confirm this asymmetry is intentional.
        self.user_emb_GMF = nn.Embedding(num_users, emb_size)
        self.item_emb_GMF = nn.Embedding(num_items, emb_size)
        self.user_emb_MLP = nn.Embedding(num_users, emb_size)
        self.item_emb_MLP = nn.Embedding(num_items, emb_size)
        nn.init.uniform_(self.user_emb_MLP.weight, 0, 0.05)
        nn.init.uniform_(self.item_emb_MLP.weight, 0, 0.05)
        self.fc1 = nn.Linear(emb_size * 2, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, final_size)
        self.drop = nn.Dropout(0.1)  # drop out neurons to prevent overfitting
        self.prediction = nn.Linear(emb_size + final_size, 1)

    def forward(self, u, v):
        """Return one prediction per (user id, item id) pair of the index tensors."""
        # MLP branch: concatenated embeddings -> dropout -> fc1 -> ReLU -> fc2.
        u_M = self.user_emb_MLP(u)
        v_M = self.item_emb_MLP(v)
        x_M = torch.cat([u_M, v_M], dim=1)
        x_M = self.drop(x_M)
        x_M = self.fc1(x_M)
        x_M = self.relu(x_M)
        x_M = self.fc2(x_M)

        # GMF branch: element-wise product, kept as a vector (not summed).
        u_G = self.user_emb_GMF(u)
        v_G = self.item_emb_GMF(v)
        x_G = u_G * v_G

        x = torch.cat([x_M, x_G], dim=1)
        x = self.prediction(x)
        # Drop only the trailing size-1 axis. A bare squeeze() would also
        # remove the batch axis for a single pair and return a 0-dim tensor
        # that no longer matches a target of shape (1,).
        return x.squeeze(-1)

Test Optimal NeuMF Model

In [None]:
# Fit NeuMF with the hyper-parameters chosen for this section; train_epocs
# prints the validation MSE and returns it.
model = NeuMF(num_users, num_items, emb_size=250, hidden_size=85, final_size=7)
loss = train_epocs(model, epochs=40, lr=0.03, wd=5e-5)

test loss 2.198 


In [None]:
# Load the MovieLens ratings. header=None makes pandas treat the first line
# as data and label the columns with integers.
mov = pd.read_csv(
    "MovieLens.csv",
    header=None,
)

In [None]:
# Number of rows read from the MovieLens file.
print(len(mov))

100837
