The function load_data below takes the index of the movie whose ratings we want to predict and the minimum number of ratings per column (i.e., per movie), and returns the cleaned-up rating matrix together with the new index of that movie.

In [1]:
import numpy as np
import os
import zipfile # To handle zip files
import torch as torch

def load_data(movie, min_ratings):
    """Load the ml-100k ratings and return a cleaned-up user x movie matrix.

    The archive ``ml-100k.zip`` is expected in the current working directory;
    it is extracted there and ``ml-100k/u.data`` is parsed. Each line of that
    file is tab-separated and starts with ``userID``, ``movieID``, ``rating``
    (all integers, IDs starting at 1).

    Parameters
    ----------
    movie : int
        Zero-based column index (``movieID - 1``) of the movie of interest in
        the raw rating matrix.
    min_ratings : int
        Movies (columns) with fewer than this many ratings are removed.

    Returns
    -------
    X : numpy.ndarray
        2-D float matrix (users x kept movies); 0 means "not rated". Users
        left without any rating after the column clean-up are removed.
    idxMovie : numpy integer
        Number of kept columns located before column ``movie``, i.e. the new
        column index of ``movie`` in ``X``. If ``movie`` itself does not reach
        ``min_ratings`` it is dropped, and this value does not refer to it.
    """

    # Extract all from zip file (the context manager closes it even on error)
    dataDir = os.getcwd()
    with zipfile.ZipFile(os.path.join(dataDir, 'ml-100k.zip')) as zipObject:
        zipObject.extractall(dataDir)

    rawDataFilename = os.path.join(dataDir, 'ml-100k', 'u.data')

    # From each row of u.data, extract userID, movieID and rating
    records = []
    maxUserID = 0
    maxMovieID = 0
    with open(rawDataFilename, 'r') as rawData:
        for dataLine in rawData:
            if not dataLine.strip():
                # Tolerate blank lines (e.g. a trailing newline at the end)
                continue
            dataLineSplit = dataLine.rstrip('\n').split('\t')
            userID = int(dataLineSplit[0])
            movieID = int(dataLineSplit[1])
            rating = int(dataLineSplit[2])
            records.append((userID, movieID, rating))
            maxUserID = max(maxUserID, userID)
            maxMovieID = max(maxMovieID, movieID)

    # Allocate the rating matrix once, instead of re-growing (and copying) it
    # every time a larger ID shows up
    rawMatrix = np.zeros([maxUserID, maxMovieID])

    # Assign ratings in file order, so a later line overrides an earlier one
    for userID, movieID, rating in records:
        rawMatrix[userID - 1, movieID - 1] = rating

    # Count number of ratings per column, i.e., per movie
    nbRatingsCols = np.sum(rawMatrix > 0, axis=0)

    # Mask to identify movies with at least min_ratings
    mask = nbRatingsCols >= min_ratings

    # Save new index of the input argument "movie"
    idxMovie = np.sum(mask[0:movie])

    # Remove matrix columns. Boolean indexing keeps X two-dimensional even
    # when a single column survives (an integer index obtained through
    # squeeze() would collapse it to 1-D).
    X = rawMatrix[:, mask]

    # Make sure there are no rows of all zeros
    nbRatingsRows = np.sum(X > 0, axis=1)
    X = X[nbRatingsRows > 0, :]

    # Return cleaned-up X and new index of input argument "movie"
    return X, idxMovie

# Build the rating matrix, keeping only the movies with at least 150 ratings.
# idxContact is the column of raw movie index 257 after the clean-up
# (presumably the movie "Contact", judging by the variable name).
X, idxContact = load_data(257, 150)
print(X.shape, idxContact)

(943, 203) 109


The input arguments of the function create_graph are:
X, the rating matrix;
idxTrain, the indices of the users in the training set;
knn, the number of neighbors to keep when sparsifying the graph.

In [2]:
def create_graph(X, idxTrain, knn):
    """Build a sparsified, normalized movie-similarity graph.

    Parameters
    ----------
    X : numpy.ndarray
        Rating matrix with one row per user and one column per movie; an
        entry equal to 0 is treated as "not rated".
    idxTrain : array-like of int
        Row indices of X (users) used to estimate the graph.
    knn : int
        Number of largest entries kept in each row when sparsifying.

    Returns
    -------
    numpy.ndarray
        N x N weight matrix (N = number of movies) with zero diagonal,
        divided by the magnitude of its largest eigenvalue. If the graph has
        no edges (spectral radius below the zero tolerance) it is returned
        unscaled instead of being divided by zero.
    """

    # Everything below 1e-9 is considered zero
    zeroTolerance = 1e-9

    # Isolating users used for training (rows of XTrain are movies)
    XTrain = np.transpose(X[idxTrain, :])

    # Calculating correlation matrix. Entry (i, j) only uses the users that
    # rated both movie i and movie j.
    # NOTE(review): entry (i, j) is the second moment of movie i's ratings
    # over those users minus its squared mean, i.e. it does not use the
    # ratings of movie j, and the matrix is not symmetric in general --
    # confirm this is the intended similarity measure.
    binaryTemplate = (XTrain > 0).astype(XTrain.dtype)
    sumMatrix = XTrain.dot(binaryTemplate.T)
    countMatrix = binaryTemplate.dot(binaryTemplate.T)
    countMatrix[countMatrix == 0] = 1  # avoid dividing by zero below
    avgMatrix = sumMatrix / countMatrix
    sqSumMatrix = (XTrain ** 2).dot(binaryTemplate.T)
    correlationMatrix = sqSumMatrix / countMatrix - avgMatrix ** 2

    # Normalizing by diagonal weights; movies whose diagonal entry is zero
    # get a zero scaling factor, so their row and column vanish
    sqrtDiagonal = np.sqrt(np.diag(correlationMatrix))
    nonzeroSqrtDiagonalIndex = (sqrtDiagonal > zeroTolerance)\
                                                 .astype(sqrtDiagonal.dtype)
    sqrtDiagonal[sqrtDiagonal < zeroTolerance] = 1.
    invSqrtDiagonal = 1/sqrtDiagonal
    invSqrtDiagonal = invSqrtDiagonal * nonzeroSqrtDiagonalIndex
    normalizationMatrix = np.diag(invSqrtDiagonal)
    normalizedMatrix = normalizationMatrix.dot(
                            correlationMatrix.dot(normalizationMatrix))

    # Zero-ing the diagonal. Setting it explicitly (rather than subtracting
    # the identity) also covers movies whose normalized diagonal is 0, which
    # would otherwise end up with a self-loop of weight -1.
    np.fill_diagonal(normalizedMatrix, 0.)

    # Keeping only edges with weights above the zero tolerance
    normalizedMatrix[np.abs(normalizedMatrix) < zeroTolerance] = 0.
    W = normalizedMatrix

    # Sparsifying the graph: in each row, drop everything smaller than the
    # knn-th largest entry of that row
    WSorted = np.sort(W, axis=1)
    threshold = WSorted[:, -knn]
    W[W < threshold[:, np.newaxis]] = 0

    # Normalizing by eigenvalue with largest magnitude (only the eigenvalues
    # are needed, not the eigenvectors)
    spectralRadius = np.max(np.abs(np.linalg.eigvals(W)))
    if spectralRadius > zeroTolerance:
        W = W / spectralRadius

    return W

Use the function above to generate the graph, using a training set with 90% of the users selected at random and keeping the 40 nearest neighbors per node:

In [3]:
# Random 90/10 split of the users, then build the movie graph from the
# training users only

nTotal = X.shape[0]  # total number of users (samples)
permutation = np.random.permutation(nTotal)

# Sizes of the two sets
nTrain = int(np.ceil(0.9 * nTotal))
nTest = nTotal - nTrain

# The first nTrain shuffled users are for training, the remaining for testing
idxTrain, idxTest = permutation[:nTrain], permutation[nTrain:]

# Graph keeping the 40 largest weights per row
W = create_graph(X, idxTrain, 40)
print(W.shape)

(203, 203)


In [6]:
def _build_samples(X, idxUsers, idxMovie):
    """Return (x, y) tensors for the users in idxUsers that rated idxMovie."""
    N = X.shape[1]

    samples = X[idxUsers, :]

    # Positions of the users that rated the target movie. flatnonzero always
    # yields a 1-D index, so the selection below stays 2-D (and is a copy of
    # the data) even when only one user, or none, qualifies.
    rated = np.flatnonzero(samples[:, idxMovie] > 0)
    inputs = samples[rated, :]

    # The label only carries the rating of the target movie...
    labels = np.zeros(inputs.shape)
    labels[:, idxMovie] = inputs[:, idxMovie]
    # ...which is hidden from the input
    inputs[:, idxMovie] = 0

    nSamples = inputs.shape[0]
    x = torch.tensor(inputs).reshape([nSamples, 1, N])
    y = torch.tensor(labels).reshape([nSamples, 1, N])
    return x, y


def split_data(X, idxTrain, idxTest, idxMovie):
    """Build training and test samples for predicting one movie's rating.

    Only the users that rated movie ``idxMovie`` are kept in each set. For
    every kept user, the input is the user's rating row with the target
    rating zeroed out, and the label is an all-zero row that holds the true
    rating at position ``idxMovie``. X itself is not modified.

    Parameters
    ----------
    X : numpy.ndarray
        Rating matrix (users x movies); 0 means "not rated".
    idxTrain, idxTest : array-like of int
        Row indices of X for the training and the test users.
    idxMovie : int
        Column index of the movie whose rating is to be predicted.

    Returns
    -------
    xTrain, yTrain, xTest, yTest : torch.Tensor
        Tensors of shape (number of samples, 1, number of movies).
    """
    xTrain, yTrain = _build_samples(X, idxTrain, idxMovie)
    xTest, yTest = _build_samples(X, idxTest, idxMovie)

    return xTrain, yTrain, xTest, yTest


# Build the samples for the target movie; only users that rated it are kept,
# so the set sizes are recomputed from the resulting tensors
xTrain, yTrain, xTest, yTest = split_data(
    X=X, idxTrain=idxTrain, idxTest=idxTest, idxMovie=idxContact)
nTrain, nTest = xTrain.shape[0], xTest.shape[0]

print(nTrain, nTest)

468 41
