In [1]:
import numpy as np
from scipy.sparse import rand as sprand
from scipy.sparse import lil_matrix
import torch
from torch.autograd import Variable
import pandas as pd

In [2]:
# Column layout of the '::'-delimited MovieLens rating files.
names = ['user_id', 'item_id', 'rating', 'timestamp']

print("Loading Data")
# The two-character separator is why the python parsing engine is requested.
df_train = pd.read_csv(
    'ml-10M100K/r3.train',
    sep='::',
    names=names,
    engine='python',
)
print("Loaded Train")
df_test = pd.read_csv(
    'ml-10M100K/r3.test',
    sep='::',
    names=names,
    engine='python',
)
print("Loaded Test")

Loading Data
Loaded Train
Loaded Test


In [3]:
# Print the dimensions first and leave head() as the cell's last expression:
# a notebook only renders the value of the final expression, so the preview
# was silently discarded when head() came before the print.
print(df_train.shape)
df_train.head()

(8000044, 4)


In [4]:
def get_movielens_ratings(df, n_users=None, n_items=None):
    """Build a sparse user x item rating matrix from a ratings DataFrame.

    Ids are treated as 1-based: a row with ``user_id == u`` and
    ``item_id == i`` is stored at position ``[u - 1, i - 1]``.

    Args:
        df: DataFrame exposing ``user_id`` and ``item_id`` columns. Rows are
            read positionally, so the first three columns must be user id,
            item id and rating (in that order).
        n_users: Number of rows of the result. Defaults to the largest
            ``user_id`` found in ``df``. Pass it explicitly when several
            frames (e.g. train and test) must share one shape.
        n_items: Number of columns of the result. Defaults to the largest
            ``item_id`` found in ``df``.

    Returns:
        A ``scipy.sparse.lil_matrix`` of dtype float with shape
        ``(n_users, n_items)``. If the same (user, item) pair occurs more
        than once, the last occurrence wins.

    Raises:
        ValueError: If ``df`` is empty and a dimension was not supplied.
        IndexError: If an id does not fit in the requested shape.
    """
    if (n_users is None or n_items is None) and len(df) == 0:
        raise ValueError(
            "cannot infer the matrix shape from an empty DataFrame; "
            "pass n_users and n_items explicitly"
        )
    if n_users is None:
        n_users = int(df.user_id.max())
    if n_items is None:
        n_items = int(df.item_id.max())

    interactions = lil_matrix((n_users, n_items), dtype=float)
    # itertuples() puts the index at position 0, so columns start at 1.
    for row in df.itertuples():
        interactions[row[1] - 1, row[2] - 1] = row[3]
    return interactions

In [5]:
# Convert the training ratings frame into a sparse user x item matrix and
# report its dimensions.
print("Preparing Training Data")
ratings = get_movielens_ratings(df_train)
print(ratings.shape)

Preparing Training Data
(71567, 65133)


In [6]:
# Convert the test ratings frame into a sparse user x item matrix.
# NOTE(review): the helper sizes the matrix from the largest ids present in
# the frame it is given, so the recorded test shape (42788 rows) differs from
# the training shape (71567 rows). Row indices still mean "user_id - 1" in both.
print("Preparing Testing Data")
test_ratings = get_movielens_ratings(df_test)
print(test_ratings.shape)

Preparing Testing Data
(42788, 65133)


In [64]:
class MatrixFactorization(torch.nn.Module):
    """Matrix-factorization recommender trained with mini-batch SGD.

    Each user and each item gets an ``n_factors``-dimensional embedding; a
    rating is predicted as the dot product of the two embeddings.

    When ``useBias`` is true, per-user and per-item bias embeddings are
    created as well. They are registered as parameters but are not yet added
    to the output of ``predict``/``forward`` (TODO: fit an overall mean plus
    user and item biases).
    """

    # Class-level defaults so both attributes exist on every instance.
    currentLoss = 2   # placeholder returned by getLoss() until a batch has been run
    useCUDA = False   # switched on per instance in __init__ when CUDA is available

    def __init__(self, n_users, n_items, n_factors=5, useBias = False):
        """Create the embedding tables and move them to the GPU if available.

        Args:
            n_users: Number of rows of the user embedding table.
            n_items: Number of rows of the item embedding table.
            n_factors: Size of each latent vector.
            useBias: Also create ``user_bias`` / ``item_bias`` embeddings.
        """
        super().__init__()

        if torch.cuda.is_available():
            self.useCUDA = True
            print("CUDA is being used")
        else:
            print("CUDA not available, reverting to CPU")

        self.user_factors = torch.nn.Embedding(n_users, n_factors, sparse=False)
        self.item_factors = torch.nn.Embedding(n_items, n_factors, sparse=False)

        if useBias:
            self.user_bias = torch.nn.Embedding(n_users, 1, sparse=False)
            self.item_bias = torch.nn.Embedding(n_items, 1, sparse=False)

        # Must run after every parameter is registered: Module.cuda() only
        # moves parameters that already exist on the module.
        if self.useCUDA:
            self.cuda()

    def getLoss(self):
        """Return the loss of the most recent batch (``2`` before training)."""
        return self.currentLoss

    def _as_variable(self, array, dtype):
        """Wrap an array-like in a Variable on the device this model uses."""
        tensor = torch.from_numpy(np.ascontiguousarray(array, dtype=dtype))
        if self.useCUDA:
            tensor = tensor.cuda()
        return Variable(tensor)

    def predict(self, user, item):
        """Predict ratings for aligned (user, item) index pairs.

        ``user`` and ``item`` are index tensors of equal length; element ``k``
        of the result is the dot product of the embeddings of ``user[k]`` and
        ``item[k]``.
        """
        return (self.user_factors(user) * self.item_factors(item)).sum(1)

    def forward(self, users, items):
        """Predict the full ``len(users) x len(items)`` rating block.

        This is the batch operator used for training: every listed user is
        scored against every listed item with a single matrix product.
        """
        return torch.mm(self.user_factors(users),
                        torch.transpose(self.item_factors(items), 0, 1))

    def get_batch(self,batch_size,ratings):
        """Yield shuffled arrays of row indices that together cover ``ratings``.

        Every row index appears exactly once per call; the final batch may be
        shorter than ``batch_size``. Nothing is yielded for a matrix with no
        rows.

        Raises:
            ValueError: If ``batch_size`` is not positive (raised when the
                generator is first advanced).
        """
        if batch_size <= 0:
            raise ValueError("batch_size must be a positive integer")

        n_rows = ratings.shape[0]
        shuffled = np.random.permutation(n_rows)
        for start in range(0, n_rows, batch_size):
            # Slicing the permutation keeps the trailing batch shuffled too.
            yield shuffled[start:start + batch_size]

    def run_epoch(self,batch_size, ratings):
        """Run one pass over ``ratings``, taking one SGD step per batch.

        Requires ``train`` to have set ``self.loss_func`` and
        ``self.reg_loss_func`` (the optimizer) beforehand.
        """
        # Every batch is scored against all items, so build the index once.
        cols = self._as_variable(np.arange(ratings.shape[1]), np.int64)

        for batch in self.get_batch(batch_size, ratings):
            # Set gradients to zero
            self.reg_loss_func.zero_grad()

            # Densify only the rows of this batch.
            interactions = self._as_variable(ratings[batch, :].toarray(), np.float32)
            rows = self._as_variable(batch, np.int64)

            # Predict and calculate loss
            predictions = self(rows, cols)
            self.currentLoss = self.loss_func(predictions, interactions)

            # Backpropagate
            self.currentLoss.backward()

            # Update the parameters
            self.reg_loss_func.step()

    def train(self, numEpochs=True, batch_size=None, ratings=None, learningRate=None,
              weight_decay=1e-3):
        """Fit the model, or toggle training mode like ``nn.Module.train``.

        This method shadows ``torch.nn.Module.train``, which ``eval()`` calls
        internally. To keep ``model.eval()`` / ``model.train()`` working, a
        call without ``ratings`` is forwarded to the base class with
        ``numEpochs`` interpreted as the boolean ``mode``.

        Args:
            numEpochs: Number of passes over ``ratings``.
            batch_size: Number of user rows per SGD step.
            ratings: Sparse user x item rating matrix to fit.
            learningRate: SGD learning rate.
            weight_decay: L2 penalty handed to the optimizer.

        Returns:
            ``None`` after fitting; ``self`` when used as the mode toggle.

        Raises:
            TypeError: If ``ratings`` is omitted and ``numEpochs`` is not a bool.
        """
        if ratings is None:
            if not isinstance(numEpochs, bool):
                raise TypeError(
                    "train() needs ratings to fit the model; without ratings "
                    "the first argument must be a boolean training mode"
                )
            return super().train(numEpochs)

        self.loss_func = torch.nn.MSELoss()
        # Despite its name this attribute holds the optimizer.
        self.reg_loss_func = torch.optim.SGD(self.parameters(), lr=learningRate,
                                             weight_decay=weight_decay)
        for i in range(numEpochs):
            print(i)
            self.run_epoch(batch_size,ratings)

    def convertLillMatrixToVariable(self,lillMatrix,transpose = False):
        """Densify a sparse matrix into an integer (long) Variable.

        Args:
            lillMatrix: Sparse matrix providing ``toarray()``.
            transpose: Transpose the matrix before converting, e.g. to turn
                an ``n x 1`` column slice into a flat vector.

        Returns:
            A 1-D Variable when the (possibly transposed) matrix has a single
            row, otherwise a 2-D Variable. Values are cast to int64.
        """
        if transpose:
            lillMatrix = lillMatrix.transpose()

        dense = lillMatrix.toarray()
        if dense.shape[0] == 1:
            # A single row is returned as a flat vector.
            dense = dense[0]
        return self._as_variable(dense, np.int64)



In [65]:
print("Creating Model")
# The rating matrix is users x items, so its shape supplies both table sizes.
model = MatrixFactorization(*ratings.shape, n_factors=2, useBias=False)

Creating Model
CUDA not available, reverting to CPU


In [36]:
# Hyperparameters handed to model.train() in the training cell.
# EPOCH: number of passes over the training matrix.
EPOCH = 5
# BATCH_SIZE: number of user rows per optimizer step.
BATCH_SIZE = 1000 #50
# LR: learning rate given to the SGD optimizer.
LR = 0.001

In [None]:
print("Training Model")
# Keyword arguments make the role of each hyperparameter explicit.
model.train(
    numEpochs=EPOCH,
    batch_size=BATCH_SIZE,
    ratings=ratings,
    learningRate=LR,
)

In [None]:
# Report the loss stored by the last processed batch (the class default of 2
# is returned if no batch has been run yet).
print("Model Loss: {}".format(model.getLoss()))

In [53]:
# Densify one row (index 28665) and one column (index 0) of the test matrix.
# NOTE(review): the converted tensors hold the stored *rating values* cast to
# integers, yet the next cell passes them to predict(), which uses its
# arguments as embedding indices - confirm this is intended.
# NOTE(review): the helper only flattens inputs with a single row, so the
# column slice stays 2-D (n_rows x 1) while the row slice becomes 1-D.
rows = model.convertLillMatrixToVariable(test_ratings[28665,:])
cols = model.convertLillMatrixToVariable(test_ratings[:,0])
# cols = Variable(torch.LongTensor(test_ratings[:,0].T.toarray()[0]))

In [66]:
# Element-wise prediction for the index tensors built in the previous cell.
# NOTE(review): the recorded run of this cell failed with a RuntimeError
# because the two inputs differ in shape ([65133 x 2] vs [42788 x 1 x 2]
# after the embedding lookup); predict() needs aligned inputs of equal length.
predictionVar = model.predict(rows, cols)

RuntimeError: inconsistent tensor size, expected tensor [65133 x 2] and src [42788 x 1 x 2] to have the same number of elements, but got 130266 and 85576 elements respectively at /Users/soumith/minicondabuild3/conda-bld/pytorch_1518385717421/work/torch/lib/TH/generic/THTensorMath.c:656

In [59]:
# Display the prediction tensor.
# NOTE(review): the recorded output carries an earlier execution count than
# the failing predict cell above, so it presumably stems from a different
# version of that cell - re-run from a fresh kernel to confirm.
predictionVar

Variable containing:
1.00000e+05 *
 0.6861 -0.5846
 0.6861 -0.5846
 0.6861 -0.5846
       ⋮        
 0.6861 -0.5846
 0.6861 -0.5846
 0.6861 -0.5846
[torch.FloatTensor of size 42788x2]

In [None]:
# Summary statistics of the predictions. The tensor is detached and copied to
# a NumPy array first: pandas cannot reliably build a frame from a tensor
# that is attached to the autograd graph (or lives on the GPU).
pd.DataFrame(predictionVar.data.cpu().numpy()).describe()

In [39]:
# Dimensions of the sparse test matrix as (rows, columns).
test_ratings.shape

(42788, 65133)

In [None]:
# model.convertLillMatrixToVariable(test_ratings)

In [None]:
# Show a one-line summary of the sparse test matrix; print() would list every
# stored entry.
print(repr(test_ratings))

# Score one shuffled batch of test users against every item. `batch` used to
# be an undefined name here, so take the first batch from the model's own
# batching generator.
batch = next(model.get_batch(BATCH_SIZE, test_ratings))
rows = Variable(torch.LongTensor(batch))
cols = Variable(torch.LongTensor(np.arange(test_ratings.shape[1])))

# A full users x items block comes from the module's forward pass; the bare
# name `predict` was never defined at notebook level.
predictions = model(rows, cols)
print(predictions.data.cpu().numpy().shape)

In [None]:
# Random ordering of the test matrix's row indices (the same call get_batch
# uses to shuffle rows).
np.random.permutation(test_ratings.shape[0])

In [None]:
# Find the first row of the test matrix that holds at least one rating.
# Index exactly one row per step: the previous `test_ratings[i:,]` selected
# every row from i onwards and densified all of them, twice per iteration.
for i in range(test_ratings.shape[0]):
    if test_ratings[i, :].toarray().max() > 0:
        print(i)
        break

In [None]:
# Find the first column of the test matrix that holds at least one rating.
# Columns are bounded by shape[1]; the loop used to iterate over the row
# count and densified each column twice.
for i in range(test_ratings.shape[1]):
    if test_ratings[:, i].toarray().max() > 0:
        print(i)
        break

In [None]:
# Sparse slice containing column 0 of the test matrix.
test_ratings[:,0]

In [None]:
# Row 28665 of the test matrix, densified and flattened to a 1-D array.
test_ratings[28665].toarray()[0]

In [None]:
# Column 0 of the test matrix, transposed into a single row, densified and
# flattened to a 1-D array.
test_ratings[:,0].T.toarray()[0]