In [1]:
import numpy as np
from scipy.sparse import rand as sprand
from scipy.sparse import lil_matrix
import torch
from torch.autograd import Variable
import pandas as pd

In [2]:
names = ['user_id', 'item_id', 'rating', 'timestamp']
print("Loading Data")
df_train = pd.read_csv('ml-10M100K/r3.train', sep='::', names=names,engine='python')
print("Loaded Train")
df_test = pd.read_csv('ml-10M100K/r3.test', sep='::', names=names,engine='python')
print("Loaded Test")

Loading Data
Loaded Train
Loaded Test


In [3]:
df_train.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,1,122,5.0,838985046
1,1,185,5.0,838983525
2,1,231,5.0,838983392
3,1,292,5.0,838983421
4,1,316,5.0,838983392


In [4]:
df_train.head()
print(df_train.shape)

(8000044, 4)


In [5]:
def get_movielens_ratings(df):
    n_users = max(df.user_id.unique())
    n_items = max(df.item_id.unique())

    interactions = lil_matrix( (n_users,n_items), dtype=float) #np.zeros((n_users, n_items))
    for row in df.itertuples():
        interactions[row[1] - 1, row[2] - 1] = row[3]
    return interactions

In [6]:
print("Preparing Training Data")
ratings = get_movielens_ratings(df_train)
print(ratings.shape)

Preparing Training Data
(71567, 65133)


In [7]:
print("Preparing Testing Data")
test_ratings = get_movielens_ratings(df_test)
print(test_ratings.shape)

Preparing Testing Data
(42788, 65133)


In [20]:
class MatrixFactorization(torch.nn.Module):


    def __init__(self, n_users, n_items, n_factors=5, useBias = False):
        super().__init__()
        
        if torch.cuda.is_available(): # CHECK FOR CUDA AVAILABILITY
            self.useCUDA = True
            print("CUDA is being used")
        else:
            print("CUDA not available, reverting to CPU")
            
        if self.useCUDA: # IF IT IS AVAILABLE, USE IT
            self.cuda() 
        
        self.user_factors = torch.nn.Embedding(n_users, 
                                               n_factors,
                                               sparse=False)
        self.item_factors = torch.nn.Embedding(n_items, 
                                               n_factors,
                                               sparse=False)
        # Also should consider fitting overall bias (self.mu term) and both user and item bias vectors
        
        ## Incorporation of Bias Term
        if useBias:
            self.user_bias = torch.nn.Embedding(n_users, 1, sparse = False)
            self.item_bias = torch.nn.Embedding(n_itms, 1, sparse = False)
        
        
        # Mu is 1x1, user_bias is 1xn_users. item_bias is 1xn_items
        
    def getLoss(self):
        return self.currentLoss


    currentLoss = 2
    useCUDA = False
    
    interactions = False
#     loss_func = torch.nn.MSELoss()
#     reg_loss_func = torch.optim.SGD(self.model.parameters(), lr=1e-6, weight_decay=1e-3)
    
    # For convenience when we want to predict a single user-item pair. 
    def predict(self, user, item):
        # Need to fit bias factorsx
        print("Predicting")
        
        print(self.user_factors(user))
        print(torch.transpose(self.item_factors(item),0,1))

#         return torch.mm(self.user_factors(user)[0],torch.transpose(self.item_factors(item),0,1)[0])
        return (self.user_factors(user) * self.item_factors(item)).sum(1)
    
    
#         return torch.dot(self.user_factors(user),self.item_factors(item))
    
    # Much more efficient batch operator. This should be used for training purposes
    def forward(self, users, items):
        # Need to fit bias factors
#         print("Forward")
        return torch.mm(self.user_factors(users),torch.transpose(self.item_factors(items),0,1))
    
    def get_batch(self,batch_size,ratings):
        # Sort our data and scramble it
        rows, cols = ratings.shape
        p = np.random.permutation(rows)

        # create batches
        sindex = 0
        eindex = batch_size
        while eindex < rows:
            batch = p[sindex:eindex]
            temp = eindex
            eindex = eindex + batch_size
            sindex = temp
            yield batch

        if eindex >= rows:
            batch = range(sindex,rows)
            yield batch 
    
    def run_test(self,batch_size,ratings_test):
        predictionsArray = []
        losses = []
        for i,batch in enumerate(model.get_batch(batch_size,ratings_test)):
            
            if self.useCUDA:
                interactions = Variable(torch.cuda.FloatTensor(ratings_test[batch, :].toarray()))
                rows = Variable(torch.cuda.LongTensor(batch))
                cols = Variable(torch.cuda.LongTensor(np.arange(ratings_test.shape[1])))               
            else:
                interactions = Variable(torch.FloatTensor(ratings_test[batch, :].toarray()))
                rows = Variable(torch.LongTensor(batch))
                cols = Variable(torch.LongTensor(np.arange(ratings_test.shape[1])))

            # Predict and calculate loss
            predictions = model(rows, cols)
#             print(type(predictions))
            predictionsArray.append(predictions.data.cpu().numpy())
            losses.append(self.loss_func(predictions, interactions))
        return predictionsArray, losses
    
    def run_epoch(self,batch_size, ratings):
        for i,batch in enumerate(self.get_batch(batch_size, ratings)):
            # Set gradients to zero
            self.reg_loss_func.zero_grad()

#             print(type(batch))
            # Turn data into variables
            if self.useCUDA:
#                 print("using cuda")
                interactions = Variable(torch.cuda.FloatTensor(ratings[batch, :].toarray()))
                rows = Variable(torch.cuda.LongTensor(batch))
                cols = Variable(torch.cuda.LongTensor(np.arange(ratings.shape[1])))               
            else:
                interactions = Variable(torch.FloatTensor(ratings[batch, :].toarray()))
                rows = Variable(torch.LongTensor(batch))
                cols = Variable(torch.LongTensor(np.arange(ratings.shape[1])))

#             print(type(rows))
#             print(type(cols))
            # Predict and calculate loss
            predictions = model(rows, cols)
#             print(predictions)
            self.currentLoss = self.loss_func(predictions, interactions)

            # Backpropagate
            self.currentLoss.backward()

            # Update the parameters
            self.reg_loss_func.step()
    
    def train(self, numEpochs, batch_size, ratings,learningRate):
        self.loss_func = torch.nn.MSELoss()
        self.reg_loss_func = torch.optim.SGD(model.parameters(), lr=learningRate, weight_decay=1e-3)
        for i in range(numEpochs):
            print("Epoch {}".format(i))
            self.run_epoch(batch_size,ratings)
            
    def convertArryToVariable(self, array):
        if self.useCUDA:
            return Variable(torch.cuda.LongTensor(array))
        else:
            return Variable(torch.LongTensor(array))


    
    def convertLillMatrixToVariable(self,lillMatrix):
        if self.useCUDA:
            if lillMatrix.shape[0] == 1:
                # we have a single matrix
                return Variable(torch.cuda.LongTensor(lillMatrix.toarray()))
            else:
                return Variable(torch.cuda.LongTensor(lillMatrix.toarray()))
        else:
            if lillMatrix.shape[0] == 1:
                # we have a single matrix
                return Variable(torch.LongTensor(lillMatrix.toarray()))
            else:
                return Variable(torch.LongTensor(lillMatrix.toarray()))



In [21]:
print("Creating Model")
model = MatrixFactorization(ratings.shape[0], ratings.shape[1], n_factors=2,useBias = False)
if torch.cuda.is_available():
    model.cuda()

Creating Model
CUDA is being used


In [22]:
EPOCH = 1
BATCH_SIZE = 1000 #50
LR = 0.001

In [23]:
print("Training Model")
model.train(EPOCH,BATCH_SIZE,ratings,LR)

Training Model
0


In [24]:
print("Model Loss: {}".format(model.getLoss()))

Model Loss: Variable containing:
 1.9848
[torch.cuda.FloatTensor of size 1 (GPU 0)]



In [25]:
model.run_test(10,test_ratings)

RuntimeError: cuda runtime error (2) : out of memory at c:\anaconda2\conda-bld\pytorch_1519496000060\work\torch\lib\thc\generic/THCStorage.c:82

In [None]:
model.convertLillMatrixToVariable(test_ratings)

In [None]:
model.convertArryToVariable(np.arange(test_ratings.shape[1]))

In [None]:
cols.shape

In [None]:
prediction = model.predict(rows,cols)

In [None]:
prediction

In [None]:
pd.DataFrame(prediction.sum(1)).describe()

In [None]:
Variable(torch.LongTensor(test_ratings[28665,:].toarray()))

In [None]:
Variable(torch.LongTensor(test_ratings[:,0].toarray()))

In [None]:
# predictionVar = model.predict(Variable(torch.LongTensor(test_ratings[28665,:].toarray())), Variable(torch.LongTensor(test_ratings[:,0].toarray())))

In [None]:
predictionVar.data.cpu().numpy()

In [None]:
pd.DataFrame(predictionVar.data.cpu().numpy()).describe()

In [None]:
test_ratings.shape

In [None]:
# model.convertLillMatrixToVariable(test_ratings)

In [None]:
# for i,batch in enumerate(get_batch(BATCH_SIZE,test_ratings)):
print(test_ratings)
rows = Variable(torch.LongTensor(batch))
cols = Variable(torch.LongTensor(np.arange(test_ratings.shape[1])))
predictions = predict(rows, cols)
print(predictions.data.cpu().numpy().shape)

In [None]:
np.random.permutation(test_ratings.shape[0])

In [None]:
for i in range(test_ratings.shape[0]):
    test_ratings[i:,].todense().max()
    if (test_ratings[i:,].todense().max() > 0):
        print(i)
        break

In [None]:
for i in range(test_ratings.shape[0]):
    test_ratings[:,i].todense().max()
    if (test_ratings[:,i].todense().max() > 0):
        print(i)
        break

In [None]:
test_ratings[:,0]

In [None]:
test_ratings[28665].toarray()[0]

In [None]:
test_ratings[:,0].T.toarray()[0]