# Deep Learning Course - HW3 - Autoencoders

# Initial processing of the data
We load the CSV and store, in a dictionary keyed by user, the list of all the movies that user watched.
We hold out one random item from each user for validation.

In [29]:
from torch.utils.data import DataLoader, Dataset
import numpy as np
from numpy.random import choice
import pandas as pd
from collections import Counter
import random as random
from numpy.random import rand
import torch as torch
from torch import nn,optim
import pickle
from tqdm import tqdm
# Column positions of the user id and the item id inside one row of the CSV.
USER_IND = 0
ITEM_IND = 1
# Number of item slots pre-seeded in the popularity table
# (item indices 0 .. INPUT_SIZE-1).
INPUT_SIZE = 3706
# File names of the training CSV and of the two test CSVs.
TRAIN_DATA_PATH = "Train.csv"
TEST_DATA_RANDOM_PATH = "RandomTest.csv"
TEST_DATA_POPULAR_PATH = "PopularityTest.csv"

In [30]:
def initialProcessData(path):
    """Load the ratings CSV and split it into training and validation data.

    Rows are read positionally: column ``USER_IND`` holds a 1-based user id
    and column ``ITEM_IND`` a 1-based item id. Both are shifted to 0-based
    indices before being stored.

    Args:
        path: Path of a comma-separated CSV file whose first line is a header.

    Returns:
        A ``(train, validation, popularity)`` tuple where

        * ``train`` maps user index -> list of item indices kept for training,
        * ``validation`` maps user index -> ``[held_out_item]``, or ``[]``
          when the user has a single interaction and nothing can be held out,
        * ``popularity`` maps every item index in ``range(INPUT_SIZE)`` to the
          number of rows mentioning it (held-out rows are counted as well).

    Raises:
        KeyError: If a row references an item index outside
            ``range(INPUT_SIZE)``.
    """
    rows = pd.read_csv(path, sep=',', header=0).to_numpy()

    train = {}
    popularity = dict.fromkeys(range(INPUT_SIZE), 0)
    for row in rows:
        user = row[USER_IND] - 1
        item = row[ITEM_IND] - 1
        train.setdefault(user, []).append(item)
        popularity[item] += 1

    validation = {}
    for user, items in train.items():
        if len(items) > 1:
            held_out = random.choice(items)
            items.remove(held_out)
            validation[user] = [held_out]
        else:
            # Keep an explicit empty entry: the evaluation code looks every
            # user up in this dict and skips those with an empty list.
            validation[user] = []
    return train, validation, popularity

In [31]:
    # Split Train.csv into per-user training lists (d1), the held-out
    # validation items and the per-item popularity counts.
    d1, validation_data, popularity = initialProcessData(
        TRAIN_DATA_PATH)

# Dataloader class
We use the DataLoader_RecSys class to manage all the data, with helper functions such as userSeenItems, which returns the list of items that the user has watched,
and userBinaryVector, which returns a binary vector the size of the input with 1 in the positions of the movies the user has already watched.

In [32]:
class DataLoader_RecSys(Dataset):
    """Dataset exposing every user's interactions as a binary item vector.

    Indexing the object with a user index returns that user's binary vector,
    so iterating over it walks users ``0 .. max_user_index`` in order.

    Attributes:
        dataset: Mapping user index -> list of item indices the user has seen.
        popularity: Mapping item index -> interaction count.
        popularity_prob: ``popularity`` values divided by their sum, in the
            iteration order of ``popularity``.
        users: List of the user indices present in ``dataset``.
        items: ``Counter`` of how often each item occurs across ``dataset``.
        max_item_index: Largest item index occurring in ``dataset``.
        max_user_index: Largest user index occurring in ``dataset``.
    """

    def __init__(self, dataset, popularity):
        """Store the data and pre-compute item counts and index bounds.

        Args:
            dataset: Mapping user index -> list of seen item indices.
            popularity: Mapping item index -> interaction count.

        Raises:
            ValueError: If ``dataset`` contains no users or no items
                (``max`` of an empty collection).
        """
        self.dataset = dataset
        self.popularity = popularity
        counts = np.array(list(popularity.values()))
        self.popularity_prob = counts / counts.sum()
        self.users = list(dataset.keys())
        # Feed the counter one user at a time; concatenating all item lists
        # first would re-copy the growing list for every user.
        self.items = Counter()
        for user in self.users:
            self.items.update(dataset[user])
        self.max_item_index = max(self.items)
        self.max_user_index = max(self.users)

    def userSeenItems(self, user):
        """Return the list of item indices stored for ``user``."""
        return self.dataset[user]

    def userBinaryVector(self, user):
        """Return a float vector with 1 at every item ``user`` has seen.

        The vector has ``max_item_index + 1`` entries.
        """
        userVector = np.zeros(self.max_item_index + 1)
        for item in self.userSeenItems(user):
            userVector[item] = 1
        return userVector

    def userUnseenItems(self, user):
        """Return the known items that are not in ``user``'s seen list."""
        return list(set(self.items).difference(set(self.userSeenItems(user))))

    def numOfUsers(self):
        """Return ``max_user_index + 1``."""
        return self.max_user_index + 1

    def numOfItems(self):
        """Return ``max_item_index + 1``."""
        return self.max_item_index + 1

    def drawUnseenItem(self, user):
        """Return one uniformly drawn item that ``user`` has not seen."""
        return random.choice(self.userUnseenItems(user))

    def __getitem__(self, ind):
        """Return the binary vector of user ``ind``.

        Raises:
            IndexError: If ``ind`` is not smaller than ``len(self)``; this is
                also what ends plain ``for`` iteration over the object.
        """
        if ind >= len(self):
            raise IndexError
        return self.userBinaryVector(ind)

    def __len__(self):
        """Return ``max_user_index + 1``."""
        return self.max_user_index + 1

In [33]:

class AutoEncoderArgs:
    """Hyper-parameters read by the autoencoder models and training loops."""

    # Network shape: width of the input/output layer and of the hidden code.
    input_size: int = 3706
    hidden_size: int = 130

    # Optimiser settings.
    num_epochs: int = 100
    lr: float = 1e-4
    weight_decay: float = 1e-7

    # Scale applied to the item-popularity probabilities when the
    # popularity training loop builds its loss mask.
    popularity_multiplyer: int = 200


In [34]:
# Wrap the training dictionary and the popularity counts in the dataset
# object that the training and evaluation code iterates over.
train_dataloader = DataLoader_RecSys(d1, popularity)

# AutoEncoder Class and training loop for randomDataset
AutoEncoder class with 1 hidden layer for encoder and 1 hidden layer for decoder using sigmoid activation for non linearity.
We saw that using dropout as regularization gets us better results on validation.

In [35]:

class Autoencoder(nn.Module):
    """Single-hidden-layer autoencoder with dropout applied to its input.

    The encoder is Dropout(0.5) -> Linear -> Sigmoid and the decoder is
    Linear -> Sigmoid, so reconstructed values lie in (0, 1).

    Args:
        args: Object providing ``input_size`` and ``hidden_size``; it is kept
            on the instance as ``self.args``.
    """

    def __init__(self, args=None):
        super().__init__()
        self.args = args
        n_in, n_hidden = args.input_size, args.hidden_size

        encoder_layers = [
            nn.Dropout(0.5),
            nn.Linear(n_in, n_hidden, bias=True),
            nn.Sigmoid(),
        ]
        self.encoder = nn.Sequential(*encoder_layers)

        decoder_layers = [
            nn.Linear(n_hidden, n_in, bias=True),
            nn.Sigmoid(),
        ]
        self.decoder = nn.Sequential(*decoder_layers)

    def forward(self, x):
        """Encode ``x`` to the hidden code and decode it back."""
        return self.decoder(self.encoder(x))


def infer(dataloader, validation, model):
    """Estimate pairwise ranking accuracy on the held-out validation items.

    For each user the model scores all items from the user's input vector.
    The user counts as a hit when the held-out item scores strictly higher
    than one item drawn with ``dataloader.drawUnseenItem``.

    Args:
        dataloader: Object that yields one input vector per user index when
            iterated, and provides ``numOfUsers()`` and
            ``drawUnseenItem(user)``.
        validation: Container indexed by user index whose entries are lists
            with the held-out item first. Users that are missing or map to
            an empty list are skipped.
        model: Torch module mapping a float vector to per-item scores. It is
            switched to eval mode and left there.

    Returns:
        Hits divided by ``numOfUsers()`` minus the skipped users, or ``0.0``
        when no user could be evaluated.
    """
    hits = 0
    skipped = 0
    model.eval()
    with torch.no_grad():
        for user, userVec in enumerate(dataloader):
            try:
                held_out_items = validation[user]
            except (KeyError, IndexError):
                # No validation entry at all is treated like an empty one.
                held_out_items = []
            if len(held_out_items) == 0:
                skipped += 1
                continue
            held_out = held_out_items[0]

            scores = model(torch.tensor(userVec).float())

            # The held-out item is itself "unseen" by the training data, so
            # redraw until the negative differs from it.
            # NOTE(review): this never terminates if drawUnseenItem can only
            # return the held-out item for this user.
            negative = dataloader.drawUnseenItem(user)
            while negative == held_out:
                negative = dataloader.drawUnseenItem(user)

            if scores[held_out].item() > scores[negative].item():
                hits += 1

    evaluated = dataloader.numOfUsers() - skipped
    if evaluated <= 0:
        return 0.0
    return hits / evaluated


def training_loop(args,
                  model,
                  tr_dataloader=None,
                  validation=None,
                  criterion_func=nn.MSELoss,
                  ):
    """Train ``model`` to reconstruct user vectors, then label RandomTest.csv.

    Training runs one optimiser step per user vector with Adam. After every
    epoch the validation accuracy from ``infer`` is recorded and printed, and
    training stops early once it exceeds 0.932. Afterwards every row of the
    random test CSV is labelled and the result is written to
    ``random_205592652_312425036.csv``.

    Args:
        args: Object providing ``num_epochs``, ``lr`` and ``weight_decay``.
        model: Torch module mapping a user vector to per-item scores.
        tr_dataloader: Indexable/iterable source of per-user input vectors;
            also passed to ``infer``.
        validation: Held-out items per user, forwarded to ``infer``.
        criterion_func: Zero-argument factory for the loss module.

    Returns:
        List with the validation accuracy measured after each epoch.
    """
    accuracy_by_epoch = []
    criterion = criterion_func()
    optimizer = optim.Adam(model.parameters(), lr=args.lr,
                           weight_decay=args.weight_decay)

    for epoch in range(args.num_epochs):
        model.train()
        for userVec in tqdm(tr_dataloader):
            userVec = torch.tensor(userVec).float()
            # ===================forward=====================
            output = model(userVec)
            loss = criterion(output, userVec)
            # ===================backward====================
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        # ===================log========================
        currentAccuracy = infer(tr_dataloader, validation, model)
        accuracy_by_epoch.append(currentAccuracy)
        print(
            f" epoch: { epoch+1} validation accuracy: {currentAccuracy}")
        if currentAccuracy > 0.932:
            break

    # ===================predict====================
    # Force eval mode so stochastic layers are disabled even when the epoch
    # loop above did not run at all.
    model.eval()
    predict = pd.read_csv(TEST_DATA_RANDOM_PATH)
    data = predict.values
    with torch.no_grad():
        for x in data:
            # Ids in the CSV are 1-based; the model works with 0-based ones.
            user = x[0]-1
            item1 = x[1]-1
            item2 = x[2]-1
            output = model(torch.tensor(tr_dataloader[user]).float())
            # Column 3 receives 0 when item1 scores at least as high as
            # item2 and 1 when item2 scores higher.
            if output[item1] >= output[item2]:
                x[3] = 0
            elif output[item1] < output[item2]:
                x[3] = 1
    df = pd.DataFrame(
        data, columns=['UserID', 'Item1', 'Item2', 'bitClassification'])
    df.to_csv(r'random_205592652_312425036.csv', index=False)

    return accuracy_by_epoch


Train the model and write the randomDataset results to CSV

In [36]:

# Build the default hyper-parameters and the dropout autoencoder, then train
# it; the call's return value (accuracy per epoch) is the cell's output.
args = AutoEncoderArgs()
model = Autoencoder(args=args)
training_loop(args,
            model,
            tr_dataloader=train_dataloader,
            validation=validation_data,
            criterion_func=nn.MSELoss)

100%|██████████| 6040/6040 [02:18<00:00, 43.76it/s]
  0%|          | 5/6040 [00:00<02:21, 42.75it/s] epoch: 1 validation accuracy: 0.8529801324503311
100%|██████████| 6040/6040 [02:06<00:00, 47.71it/s]
  0%|          | 5/6040 [00:00<02:24, 41.76it/s] epoch: 2 validation accuracy: 0.848841059602649
100%|██████████| 6040/6040 [02:06<00:00, 47.57it/s]
  0%|          | 5/6040 [00:00<02:17, 43.74it/s] epoch: 3 validation accuracy: 0.8600993377483444
100%|██████████| 6040/6040 [02:06<00:00, 47.68it/s]
  0%|          | 5/6040 [00:00<02:15, 44.58it/s] epoch: 4 validation accuracy: 0.8524834437086093
100%|██████████| 6040/6040 [02:13<00:00, 45.40it/s]
  0%|          | 5/6040 [00:00<02:16, 44.21it/s] epoch: 5 validation accuracy: 0.8622516556291391
100%|██████████| 6040/6040 [02:08<00:00, 47.08it/s]
  0%|          | 5/6040 [00:00<02:24, 41.78it/s] epoch: 6 validation accuracy: 0.8539735099337749
100%|██████████| 6040/6040 [02:27<00:00, 40.86it/s]
  0%|          | 4/6040 [00:00<02:45, 36.47it/s] 

[0.8529801324503311,
 0.848841059602649,
 0.8600993377483444,
 0.8524834437086093,
 0.8622516556291391,
 0.8539735099337749,
 0.8584437086092715,
 0.8605960264900663,
 0.8647350993377484,
 0.8682119205298013,
 0.8731788079470199,
 0.8759933774834437,
 0.8766556291390728,
 0.8768211920529801,
 0.8774834437086093,
 0.8864238410596027,
 0.8855960264900662,
 0.894205298013245,
 0.8981788079470199,
 0.8978476821192053,
 0.895364238410596,
 0.9018211920529802,
 0.9033112582781457,
 0.9087748344370861,
 0.9076158940397351,
 0.909271523178808,
 0.9135761589403973,
 0.9175496688741722,
 0.9208609271523179,
 0.9201986754966888,
 0.9206953642384106,
 0.9231788079470199,
 0.9256622516556291,
 0.921523178807947,
 0.9288079470198676,
 0.9206953642384106,
 0.9254966887417219,
 0.9291390728476822,
 0.9240066225165563,
 0.930794701986755,
 0.930794701986755,
 0.9286423841059602,
 0.93658940397351]

# AutoEncoder and training loop for PopularityDataset 
We used the AutoencoderPopular class to find a latent representation of the user's binary vector.
It is very similar to the Autoencoder class. The main change is the loss function. In each iteration we create a binary mask for the user in which all the seen movies are 1 and some of the unseen movies are also 1. We draw the unseen movies according to the popularity distribution of the movies. We calculate the MSELoss on the Autoencoder's output*mask to focus on the seen movies and on the unseen popular movies, which are the most likely to appear in the Popularity test file.

In [37]:

class AutoencoderPopular(nn.Module):
    """Single-hidden-layer autoencoder used by the popularity training loop.

    The encoder is Linear -> Sigmoid and the decoder is Linear -> Sigmoid,
    so reconstructed values lie in (0, 1).

    Args:
        args: Object providing ``input_size`` and ``hidden_size``; it is kept
            on the instance as ``self.args``.
    """

    def __init__(self, args=None):
        super().__init__()
        self.args = args
        n_in, n_hidden = args.input_size, args.hidden_size

        to_code = nn.Linear(n_in, n_hidden, bias=True)
        self.encoder = nn.Sequential(to_code, nn.Sigmoid())

        from_code = nn.Linear(n_hidden, n_in, bias=True)
        self.decoder = nn.Sequential(from_code, nn.Sigmoid())

    def forward(self, x):
        """Encode ``x`` to the hidden code and decode it back."""
        code = self.encoder(x)
        return self.decoder(code)


def infer(dataloader, validation, model):
    """Estimate ranking accuracy against popularity-weighted negatives.

    For each user one negative item is drawn with probability proportional
    to ``dataloader.popularity_prob``, excluding the items the user has seen
    and the held-out item itself. The user counts as a hit when the held-out
    item scores strictly higher than that negative.

    Args:
        dataloader: Object that yields one input vector per user index when
            iterated, and provides ``popularity_prob``, ``userSeenItems(user)``
            and ``numOfUsers()``.
        validation: Container indexed by user index whose entries are lists
            with the held-out item first. Users that are missing or map to
            an empty list are skipped.
        model: Torch module with ``args.input_size`` that maps a float vector
            to per-item scores. It is switched to eval mode and left there.

    Returns:
        Hits divided by ``numOfUsers()`` minus the skipped users, or ``0.0``
        when no user could be evaluated.
    """
    hits = 0
    skipped = 0
    model.eval()
    popProb = dataloader.popularity_prob
    candidates = range(model.args.input_size)
    with torch.no_grad():
        for user, userVec in enumerate(tqdm(dataloader)):
            try:
                held_out_items = validation[user]
            except (KeyError, IndexError):
                # No validation entry at all is treated like an empty one.
                held_out_items = []
            if len(held_out_items) == 0:
                skipped += 1
                continue
            held_out = held_out_items[0]

            # Sampling weights: item popularity, with every item that must
            # not be drawn set to zero. Zeroing the held-out item replaces a
            # redraw loop whose list-vs-int comparison could never be true.
            weights = popProb.copy()
            for item in dataloader.userSeenItems(user):
                weights[item] = 0
            weights[held_out] = 0
            if not weights.any():
                # Nothing left to draw a negative from.
                skipped += 1
                continue

            scores = model(torch.tensor(userVec).float())
            # random.choices returns a list even for k=1; unpack the item.
            negative = random.choices(candidates, weights=weights, k=1)[0]

            if scores[held_out].item() > scores[negative].item():
                hits += 1

    evaluated = dataloader.numOfUsers() - skipped
    if evaluated <= 0:
        return 0.0
    return hits / evaluated


def training_loop_pop(args,
                  model,
                  tr_dataloader=None,
                  validation=None,
                  criterion_func=nn.MSELoss,
                  ):
    """Train ``model`` with a popularity-masked loss, then label PopularityTest.csv.

    For every user a random binary mask is drawn: seen items are always 1,
    and any other item is 1 with probability
    ``popularity_prob * args.popularity_multiplyer``. The loss compares
    ``output * mask`` with the user vector. After every epoch the validation
    accuracy from ``infer`` is recorded and printed, and training stops once
    it exceeds 0.87. Afterwards every row of the popularity test CSV is
    labelled and written to ``popularity_205592652_312425036.csv``.

    Args:
        args: Object providing ``num_epochs``, ``lr``, ``weight_decay``,
            ``input_size`` and ``popularity_multiplyer``. ``args.lr`` is
            modified in place (see the note in the body).
        model: Torch module mapping a user vector to per-item scores.
        tr_dataloader: Source of per-user input vectors that also provides
            ``popularity_prob`` and ``userSeenItems(user)``; passed to
            ``infer`` as well.
        validation: Held-out items per user, forwarded to ``infer``.
        criterion_func: Zero-argument factory for the loss module.

    Returns:
        List with the validation accuracy measured after each epoch.
    """
    accuracy_by_epoch = []
    criterion = criterion_func()
    optimizer = optim.Adam(model.parameters(), lr=args.lr,
                           weight_decay=args.weight_decay)
    popProb = tr_dataloader.popularity_prob

    for epoch in range(args.num_epochs):
        model.train()
        for index, userVec in enumerate(tqdm(tr_dataloader)):
            # Per-item probability of entering the mask; seen items get 1 so
            # they are always kept (the uniform draw below is < 1).
            userPopProb = popProb * args.popularity_multiplyer
            for item in tr_dataloader.userSeenItems(index):
                userPopProb[item] = 1
            # One vectorised comparison instead of a Python loop per item.
            mask = (rand(args.input_size) < userPopProb).astype(np.float64)
            userVec = torch.tensor(userVec).float()
            # ===================forward=====================
            output = model(userVec)
            new_output = output*torch.tensor(mask)
            loss = criterion(new_output, userVec.double())
            # ===================backward====================
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        # ===================log========================
        currentAccuracy = infer(tr_dataloader, validation, model)
        accuracy_by_epoch.append(currentAccuracy)
        print(
            f" epoch: { epoch+1} tr_loss: {loss} validation accuracy: {currentAccuracy} ")
        if epoch % 8 == 0 and epoch != 0:
            # NOTE(review): this only rewrites args.lr. The optimizer above
            # was already built and is never given the new value, so the
            # step size used for training does not change. Kept as-is to
            # preserve the existing training behaviour - confirm intent.
            args.lr = args.lr*0.1
        if currentAccuracy > 0.87:
            break

    # ===================predict====================
    # Force eval mode so the labelling pass does not depend on whether the
    # epoch loop above ran.
    model.eval()
    predict = pd.read_csv(TEST_DATA_POPULAR_PATH)
    data = predict.values
    with torch.no_grad():
        for x in data:
            # Ids in the CSV are 1-based; the model works with 0-based ones.
            user = x[0]-1
            item1 = x[1]-1
            item2 = x[2]-1
            output = model(torch.tensor(tr_dataloader[user]).float())
            # Column 3 receives 0 when item1 scores at least as high as
            # item2 and 1 when item2 scores higher.
            if output[item1] >= output[item2]:
                x[3] = 0
            elif output[item1] < output[item2]:
                x[3] = 1
    df = pd.DataFrame(
        data, columns=['UserID', 'Item1', 'Item2', 'bitClassification'])
    df.to_csv(r'popularity_205592652_312425036.csv', index=False)
    return accuracy_by_epoch


In [28]:
    # Override two hyper-parameters on the existing `args` object, build the
    # popularity autoencoder from it and train; the call's return value
    # (accuracy per epoch) is the cell's output.
    args.hidden_size = 80
    args.popularity_multiplyer = 150
    model = AutoencoderPopular(args=args)
    training_loop_pop(args,
            model,
            tr_dataloader=train_dataloader,
            validation=validation_data,
            criterion_func=nn.MSELoss)

100%|██████████| 6040/6040 [02:18<00:00, 43.75it/s]
100%|██████████| 6040/6040 [00:16<00:00, 364.27it/s]
  0%|          | 5/6040 [00:00<02:09, 46.75it/s] epoch: 1 tr_loss: 0.014138277833690085 validation accuracy: 0.659933774834437 
100%|██████████| 6040/6040 [01:53<00:00, 53.08it/s]
100%|██████████| 6040/6040 [00:13<00:00, 432.38it/s]
  0%|          | 6/6040 [00:00<01:47, 56.23it/s] epoch: 2 tr_loss: 0.013880058924617095 validation accuracy: 0.7327814569536424 
100%|██████████| 6040/6040 [01:48<00:00, 55.51it/s]
100%|██████████| 6040/6040 [00:14<00:00, 419.46it/s]
  0%|          | 6/6040 [00:00<01:58, 50.73it/s] epoch: 3 tr_loss: 0.012870070469262185 validation accuracy: 0.7667218543046358 
100%|██████████| 6040/6040 [01:49<00:00, 55.19it/s]
100%|██████████| 6040/6040 [00:15<00:00, 394.57it/s]
  0%|          | 6/6040 [00:00<01:44, 57.85it/s] epoch: 4 tr_loss: 0.011612717386435127 validation accuracy: 0.7793046357615894 
100%|██████████| 6040/6040 [01:49<00:00, 55.03it/s]
100%|████████

[0.659933774834437,
 0.7327814569536424,
 0.7667218543046358,
 0.7793046357615894,
 0.8028145695364238,
 0.81158940397351,
 0.8210264900662252,
 0.8326158940397351,
 0.8339403973509933,
 0.8408940397350994,
 0.8445364238410596,
 0.847682119205298,
 0.8456953642384106,
 0.851158940397351,
 0.847682119205298,
 0.8481788079470198,
 0.85,
 0.8605960264900663,
 0.8571192052980132,
 0.8523178807947019,
 0.8577814569536424,
 0.8549668874172185,
 0.8574503311258278,
 0.8607615894039735,
 0.8615894039735099,
 0.8642384105960265,
 0.8604304635761589,
 0.8655629139072848,
 0.8652317880794702,
 0.8677152317880795,
 0.8678807947019868,
 0.8612582781456953,
 0.8614238410596027,
 0.8630794701986755,
 0.8635761589403973,
 0.8649006622516556,
 0.8619205298013245,
 0.8710264900662251]