# Imports

In [1]:
!pip install torch_geometric
!pip install pyg_lib torch_scatter torch_sparse torch_cluster torch_spline_conv -f https://data.pyg.org/whl/torch-2.1.0+cpu.html
!pip install torch_sparse

Collecting torch_geometric
  Downloading torch_geometric-2.4.0-py3-none-any.whl (1.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: torch_geometric
Successfully installed torch_geometric-2.4.0
Looking in links: https://data.pyg.org/whl/torch-2.1.0+cpu.html
Collecting pyg_lib
  Downloading https://data.pyg.org/whl/torch-2.1.0%2Bcpu/pyg_lib-0.3.1%2Bpt21cpu-cp310-cp310-linux_x86_64.whl (1.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torch_scatter
  Downloading https://data.pyg.org/whl/torch-2.1.0%2Bcpu/torch_scatter-2.1.2%2Bpt21cpu-cp310-cp310-linux_x86_64.whl (497 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m497.3/497.3 kB[0m [31m905.5 kB/s[0m eta [36m0:00:00[0m
[?25hCollecting torch_sparse
  Downloading https://data.pyg.org/whl/torch-2.1.0%2Bcpu/torch_sparse-

In [2]:
# import required modules
import random
from tqdm.notebook import tqdm
import pandas as pd
import numpy as np
import ctypes
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import model_selection, metrics, preprocessing
import copy
from torch_geometric.utils import degree

import torch
from torch import nn, optim, Tensor

from torch_sparse import SparseTensor, matmul

from torch_geometric.utils import structured_negative_sampling
from torch_geometric.data import download_url, extract_zip
from torch_geometric.nn.conv.gcn_conv import gcn_norm
from torch_geometric.nn.conv import MessagePassing
from torch_geometric.typing import Adj

# Load Dataset

In [22]:
# Ratings table: read as tab-separated text with no header row,
# so the four column names are supplied explicitly.
data = pd.read_csv(
    "../data/raw/ml-100k/u.data",
    sep='\t',
    names=['user_id', 'movie_id', 'rating', 'timestamp'],
    header=None,
)
data

Unnamed: 0,user_id,movie_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596
...,...,...,...,...
99995,880,476,3,880175444
99996,716,204,5,879795543
99997,276,1090,1,874795795
99998,13,225,2,882399156


In [23]:
# Movie metadata: read as pipe-separated text with no header row, decoded as ISO-8859-1.
# The video release date and IMDb URL columns are discarded straight after loading.
items = pd.read_csv(
    "../data/raw/ml-100k/u.item",
    sep='|',
    header=None,
    encoding='ISO-8859-1',
    names=[
        "movie_id", "movie_title", "release_date", "video_release_date", "IMDb_URL",
        "unknown", "Action", "Adventure", "Animation", "Children's", "Comedy",
        "Crime", "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror",
        "Musical", "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western",
    ],
).drop(columns=['video_release_date', 'IMDb_URL'])
items

Unnamed: 0,movie_id,movie_title,release_date,unknown,Action,Adventure,Animation,Children's,Comedy,Crime,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,0,0,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1677,1678,Mat' i syn (1997),06-Feb-1998,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1678,1679,B. Monkey (1998),06-Feb-1998,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
1679,1680,Sliding Doors (1998),01-Jan-1998,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1680,1681,You So Crazy (1994),01-Jan-1994,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [24]:
# Genre flag columns, in the order they are named when the movie table is loaded.
GENRES = [
    "unknown",
    "Action",
    "Adventure",
    "Animation",
    "Children's",
    "Comedy",
    "Crime",
    "Documentary",
    "Drama",
    "Fantasy",
    "Film-Noir",
    "Horror",
    "Musical",
    "Mystery",
    "Romance",
    "Sci-Fi",
    "Thriller",
    "War",
    "Western",
]

# Occupation labels; a label's position in this list is what the user encoding is derived from.
OCCUPATIONS = [
    "administrator",
    "artist",
    "doctor",
    "educator",
    "engineer",
    "entertainment",
    "executive",
    "healthcare",
    "homemaker",
    "lawyer",
    "librarian",
    "marketing",
    "none",
    "other",
    "programmer",
    "retired",
    "salesman",
    "scientist",
    "student",
    "technician",
    "writer",
]

In [25]:
# User demographics: read as pipe-separated text with no header row, decoded as ISO-8859-1.
users = pd.read_csv(
    "../data/raw/ml-100k/u.user",
    sep='|',
    header=None,
    names=["user_id", "age", "gender", "occupation", "zip_code"],
    encoding='ISO-8859-1',
)
users

Unnamed: 0,user_id,age,gender,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213
...,...,...,...,...,...
938,939,26,F,student,33319
939,940,32,M,administrator,02215
940,941,20,M,student,97229
941,942,48,F,librarian,78209


# Data Exploration

In [13]:
# Column dtypes of the ratings frame.
data.dtypes

user_id      int64
movie_id     int64
rating       int64
timestamp    int64
dtype: object

In [14]:
# Summary statistics (count, mean, std, min, quartiles, max) of the ratings columns.
data.describe()

Unnamed: 0,user_id,movie_id,rating,timestamp
count,100000.0,100000.0,100000.0,100000.0
mean,462.48475,425.53013,3.52986,883528900.0
std,266.61442,330.798356,1.125674,5343856.0
min,1.0,1.0,1.0,874724700.0
25%,254.0,175.0,3.0,879448700.0
50%,447.0,322.0,4.0,882826900.0
75%,682.0,631.0,4.0,888260000.0
max,943.0,1682.0,5.0,893286600.0


In [15]:
# Column dtypes of the movie metadata frame.
items.dtypes

movie_id         int64
movie_title     object
release_date    object
unknown          int64
Action           int64
Adventure        int64
Animation        int64
Children's       int64
Comedy           int64
Crime            int64
Documentary      int64
Drama            int64
Fantasy          int64
Film-Noir        int64
Horror           int64
Musical          int64
Mystery          int64
Romance          int64
Sci-Fi           int64
Thriller         int64
War              int64
Western          int64
dtype: object

In [16]:
# Summary statistics of the numeric movie columns (the movie id and the genre flag columns).
items.describe()

Unnamed: 0,movie_id,unknown,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
count,1682.0,1682.0,1682.0,1682.0,1682.0,1682.0,1682.0,1682.0,1682.0,1682.0,1682.0,1682.0,1682.0,1682.0,1682.0,1682.0,1682.0,1682.0,1682.0,1682.0
mean,841.5,0.001189,0.149227,0.080262,0.02497,0.072533,0.300238,0.064804,0.029727,0.431034,0.01308,0.014269,0.054697,0.033294,0.036266,0.146849,0.060048,0.149227,0.042212,0.016052
std,485.695893,0.034473,0.356418,0.271779,0.156081,0.259445,0.458498,0.246253,0.169882,0.495368,0.11365,0.118632,0.227455,0.179456,0.187008,0.354061,0.237646,0.356418,0.201131,0.125714
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,421.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,841.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1261.75,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1682.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [17]:
# Column dtypes of the user demographics frame.
# NOTE(review): the recorded output lists numeric dtypes for gender, occupation and zip_code,
# which matches the frame returned by `encode_users` rather than the raw strings read from
# u.user — it looks like this cell was last executed after `users` had been overwritten.
# Re-run on a fresh kernel to confirm what this cell shows in top-to-bottom order.
users.dtypes

user_id         int64
age           float64
gender          int64
occupation    float64
zip_code      float64
dtype: object

In [18]:
# Summary statistics of the user demographics frame.
# NOTE(review): the recorded output has age, gender, occupation and zip_code all within [0, 1],
# i.e. it appears to describe the encoded frame, not the raw one loaded from u.user.
# Re-run on a fresh kernel to confirm what this cell shows in top-to-bottom order.
users.describe()

Unnamed: 0,user_id,age,gender,occupation,zip_code
count,943.0,943.0,943.0,943.0,943.0
mean,472.0,0.34052,0.710498,0.515881,0.512868
std,272.364951,0.121927,0.453772,0.317089,0.292426
min,1.0,0.07,0.0,0.0,0.003464
25%,236.5,0.25,0.0,0.190476,0.262649
50%,472.0,0.31,1.0,0.619048,0.494531
75%,707.5,0.43,1.0,0.857143,0.772773
max,943.0,0.73,1.0,0.952381,1.0


In [26]:
# normalization function for user demographic information
def my_hash(x):
  """Map a value to a stable, non-negative integer code.

  The built-in ``hash`` is salted per interpreter process for ``str`` inputs
  (see ``PYTHONHASHSEED``), so any encoding derived from it changes on every
  kernel restart. A CRC-32 of the value's text form is used instead, so the
  same input always produces the same code across runs and machines.

  Args:
    x: value to hash; it is converted with ``str`` and encoded as UTF-8.

  Returns:
    int in the range ``[0, 2**32 - 1]``.
  """
  import zlib  # function-scope import keeps this cell runnable on its own

  return zlib.crc32(str(x).encode("utf-8"))

def users_encode(users):
  """Return a copy of ``users`` whose demographic columns are turned into numbers.

  * ``age`` is divided by 100.
  * ``gender`` becomes 1 where the value equals ``'M'`` and 0 otherwise.
  * ``zip_code`` is passed through ``my_hash`` and divided by the largest hash in the column.
  * ``occupation`` becomes its position in ``OCCUPATIONS`` divided by ``len(OCCUPATIONS)``;
    a value that is not in that list raises ``ValueError`` (from ``list.index``).

  The frame passed in is not modified.
  """
  encoded = users.copy()

  encoded["age"] = users["age"] / 100
  encoded["gender"] = users["gender"].eq("M").astype(int)

  # Scale the hashed zip codes by the column maximum.
  zip_hashes = users["zip_code"].apply(my_hash)
  encoded["zip_code"] = zip_hashes
  encoded["zip_code"] = encoded["zip_code"] / encoded["zip_code"].max()

  occupation_rank = users["occupation"].apply(OCCUPATIONS.index)
  encoded["occupation"] = occupation_rank / len(OCCUPATIONS)

  return encoded

def encode_users(users, data, items):
  """Encode the user demographics frame by delegating to ``users_encode``.

  ``data`` and ``items`` are accepted but not used by this function.
  """
  return users_encode(users)

In [27]:
# Replace the raw demographics frame with its numeric encoding and preview the first rows.
# NOTE(review): this rebinds `users`, so the cell cannot be re-run without reloading u.user
# first — a second run would feed already-encoded values back into `encode_users`.
users = encode_users(users, data, items)
users.head()

Unnamed: 0,user_id,age,gender,occupation,zip_code
0,1,0.24,1,0.904762,0.174231
1,2,0.53,0,0.619048,0.824408
2,3,0.23,1,0.952381,0.968539
3,4,0.24,1,0.904762,0.737644
4,5,0.33,0,0.619048,0.010483


Merge the ratings data with the encoded user demographic data on `user_id`.

In [28]:
# Attach the user columns to each rating row by joining on the shared user_id key.
df = data.merge(users, on='user_id')

In [29]:
# Write the merged frame to the interim data folder as CSV, without the index column.
df.to_csv('../data/interim/output1.csv', index=False)