In [None]:
import pandas as pd
from sklearn import model_selection, metrics, preprocessing
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np

In [None]:
# here is a handy function modified from fast.ai
def proc_col(col, train_col=None):
    """Map the values of a pandas column onto contiguous integer ids.

    The vocabulary is taken from ``train_col`` when it is given, otherwise
    from ``col`` itself. Ids follow the order in which ``unique()`` reports
    the values; any value of ``col`` that is missing from the vocabulary is
    encoded as -1.

    Returns a tuple ``(name2idx, codes, n_unique)``: the value -> id dict,
    a numpy array holding one id per entry of ``col``, and the vocabulary
    size.
    """
    vocab_source = col if train_col is None else train_col
    uniq = vocab_source.unique()
    name2idx = dict(zip(uniq, range(len(uniq))))
    codes = [name2idx.get(value, -1) for value in col]
    return name2idx, np.array(codes), len(uniq)

In [None]:
def encode_data(df, train=None):
    """Encode rating data with continuous user and movie ids.

    Args:
        df: frame with ``user_id`` and ``item_id`` columns; it is not
            modified, a re-encoded copy is returned.
        train: optional reference frame. When provided, ``df`` is encoded
            with the vocabulary of ``train`` instead of its own.

    Returns:
        A new DataFrame whose ``user_id`` / ``item_id`` columns hold the
        integer codes produced by ``proc_col``. Rows whose id is absent from
        the reference vocabulary (code -1) are dropped.
    """
    encoded = df.copy()
    for col_name in ["user_id", "item_id"]:
        train_col = train[col_name] if train is not None else None
        _, codes, _ = proc_col(encoded[col_name], train_col)
        encoded[col_name] = codes
        # Copy after filtering so the next column assignment writes to an
        # independent frame rather than to a slice of the previous one
        # (which triggers pandas' SettingWithCopyWarning).
        encoded = encoded[encoded[col_name] >= 0].copy()
    return encoded

In [None]:
# Column labels passed to read_csv through `names=`.
header = [
    'user_id',
    'item_id',
    'rating',
    'timestamp',
]
# Tab-separated ratings file, read relative to the working directory.
df = pd.read_csv(
    'ml-100k/u.data',
    names=header,
    sep='\t',
)

In [None]:
# 90/10 split, stratified on the rating value, with a fixed seed for reproducibility.
train, val = model_selection.train_test_split(
    df,
    test_size=0.1,
    random_state=4,
    stratify=df["rating"].values,
)

In [None]:
# Training ids define the vocabulary.
df_train = encode_data(train)
# Encode the validation rows with the training vocabulary: encode_data takes
# the frame to encode first and the reference frame second. Users/items that
# never appear in `train` are dropped from the validation set.
df_val = encode_data(val, train)

In [None]:
# Preview the first 50 encoded training rows.
df_train.head(50)

Unnamed: 0,user_id,item_id,rating,timestamp
53270,0,0,5,888519206
76926,1,1,3,881605093
86402,2,2,5,891447157
42887,3,3,3,885807173
72708,4,4,5,882814571
26958,5,5,2,883876573
13925,6,6,2,887249889
25588,7,7,3,875154535
24946,8,8,4,884496985
98732,9,9,3,891353801


In [None]:
class net(nn.Module):
    """Embedding model: concatenated user/movie vectors fed to one linear layer.

    Every user and every movie gets a 32-dimensional embedding; the two
    vectors are joined along dim 1 into 64 features and projected to a
    single output value.
    """

    def __init__(self, num_users, num_movies):
        super().__init__()

        self.user_embed = nn.Embedding(num_users, 32)
        self.movie_embed = nn.Embedding(num_movies, 32)
        # 64 input features = 32 (user) + 32 (movie).
        self.out = nn.Linear(64, 1)

    def forward(self, users, movies):
        """Return the linear layer's output for the given user/movie indices.

        For 1-D index tensors of length ``batch`` the result has shape
        ``(batch, 1)``.
        """
        joined = torch.cat(
            [self.user_embed(users), self.movie_embed(movies)],
            dim=1,
        )
        return self.out(joined)

class MF(nn.Module):
    """Matrix-factorisation model: prediction is the dot product of a user
    embedding and an item embedding.

    Args:
        num_users: number of rows in the user embedding table.
        num_items: number of rows in the item embedding table.
        emb_size: dimensionality of both embeddings (default 100).
    """

    def __init__(self, num_users, num_items, emb_size=100):
        super(MF, self).__init__()
        # Honour emb_size instead of a hard-coded width so callers can
        # actually change the embedding dimension.
        self.user_emb = nn.Embedding(num_users, emb_size)
        self.item_emb = nn.Embedding(num_items, emb_size)
        # Replace the default initialisation with small non-negative values
        # drawn uniformly from [0, 0.05).
        self.user_emb.weight.data.uniform_(0, 0.05)
        self.item_emb.weight.data.uniform_(0, 0.05)

    def forward(self, u, v):
        """Return the per-pair dot product of user and item embeddings.

        ``u`` and ``v`` are index tensors; for 1-D inputs of length ``batch``
        the result has shape ``(batch,)``.
        """
        u = self.user_emb(u)
        v = self.item_emb(v)
        return (u * v).sum(1)

In [None]:
# Sizes of the embedding tables: number of distinct encoded ids in the training frame.
num_unique_users = df_train["user_id"].nunique()
num_unique_movies = df_train["item_id"].nunique()

In [None]:
# Matrix-factorisation model sized to the training vocabulary.
model = MF(
    num_users=num_unique_users,
    num_items=num_unique_movies,
)

In [None]:
# NOTE(review): `opt` is not referenced by any other visible cell;
# train_epocs builds its own Adam optimizer.
opt = optim.Adam(params=model.parameters(), lr=1e-3)

In [None]:
# Squared-error criterion; reduction='none' keeps one loss value per element.
# NOTE(review): not referenced by any other visible cell (F.mse_loss is used instead).
loss_function = torch.nn.MSELoss(reduction='none')

In [None]:
def test_loss(model, unsqueeze=False):
    model.eval()
    users = torch.LongTensor(df_val.user_id.values) #.cuda()
    items = torch.LongTensor(df_val.item_id.values) #.cuda()
    ratings = torch.FloatTensor(df_val.rating.values) #.cuda()

    y_hat = model(users, items)
    
    count = 0
    for i in range(len(y_hat)):
        
        diff = np.abs(y_hat[i].detach().numpy() - ratings[i].detach().numpy())
        #print("diff: ", diff)
        if round(diff) == 0 or round(diff) == 1:
            count += 1
    print("Accuracy: ", count / len(y_hat))

    loss = F.mse_loss(y_hat, ratings)
    print("test loss %.3f " % loss.item())

In [None]:
def train_epocs(model, epochs=10, lr=0.01, wd=0.0, unsqueeze=False):
    """Train ``model`` full-batch on ``df_train`` with Adam, then evaluate.

    Args:
        model: module called as ``model(users, items)``.
        epochs: number of full-batch gradient steps.
        lr: Adam learning rate.
        wd: Adam weight decay.
        unsqueeze: when True the ratings are reshaped from ``(N,)`` to
            ``(N, 1)``, for models whose output has a trailing singleton
            dimension. The flag is forwarded to ``test_loss``.

    Prints the training MSE after every epoch and finally calls
    ``test_loss`` to report validation metrics.
    """
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=wd)
    model.train()

    # The whole training set is used as a single batch, so the tensors are
    # identical for every epoch and are built once.
    users = torch.as_tensor(df_train.user_id.values, dtype=torch.long)
    items = torch.as_tensor(df_train.item_id.values, dtype=torch.long)
    ratings = torch.as_tensor(df_train.rating.values, dtype=torch.float32)
    if unsqueeze:
        ratings = ratings.unsqueeze(1)

    for _ in range(epochs):
        y_hat = model(users, items)
        loss = F.mse_loss(y_hat, ratings)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        print(loss.item())

    # `unsqueeze` is now a real parameter; it was previously an undefined
    # name here, which raised NameError after the last epoch.
    test_loss(model, unsqueeze)

In [None]:
# 15 full-batch epochs at learning rate 0.01.
train_epocs(
    model,
    epochs=15,
    lr=0.01,
)

new
torch.Size([90000, 100])
tensor([3.4546, 3.8965, 3.8118,  ..., 2.8735, 4.5192, 3.5110],
       grad_fn=<SumBackward1>)

y_hat: tensor([3.4546, 3.8965, 3.8118,  ..., 2.8735, 4.5192, 3.5110],
       grad_fn=<SumBackward1>) torch.Size([90000])
0.8639264702796936
new
torch.Size([90000, 100])
tensor([3.8367, 4.2528, 4.2126,  ..., 2.8541, 4.1034, 3.8985],
       grad_fn=<SumBackward1>)

y_hat: tensor([3.8367, 4.2528, 4.2126,  ..., 2.8541, 4.1034, 3.8985],
       grad_fn=<SumBackward1>) torch.Size([90000])
0.8554012179374695
new
torch.Size([90000, 100])
tensor([3.6990, 4.0487, 4.1684,  ..., 2.7618, 4.0058, 3.9359],
       grad_fn=<SumBackward1>)

y_hat: tensor([3.6990, 4.0487, 4.1684,  ..., 2.7618, 4.0058, 3.9359],
       grad_fn=<SumBackward1>) torch.Size([90000])
0.84673672914505
new
torch.Size([90000, 100])
tensor([3.5183, 3.8249, 4.0350,  ..., 2.7745, 4.1310, 3.8424],
       grad_fn=<SumBackward1>)

y_hat: tensor([3.5183, 3.8249, 4.0350,  ..., 2.7745, 4.1310, 3.8424],
       grad_fn=<

NameError: name 'unsqueeze' is not defined