In [None]:
import pandas as pd
import torch
from tqdm import tqdm
import math
from urllib.request import urlretrieve
from zipfile import ZipFile
import os
import torch.nn as nn
import numpy as np
from math import sqrt

In [None]:
# Mount Google Drive; later cells write their CSV outputs under this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = "/content/drive"
# force_remount=True requests a fresh mount even if the drive is already mounted.
drive.mount(DRIVE_MOUNT_POINT, force_remount=True)

Mounted at /content/drive


In [None]:
# Download the MovieLens 1M archive into the working directory.
urlretrieve("http://files.grouplens.org/datasets/movielens/ml-1m.zip", "movielens.zip")

# Extract inside a context manager so the archive's file handle is closed
# as soon as extraction finishes (the bare ZipFile(...) call never closed it).
with ZipFile("movielens.zip", "r") as archive:
    archive.extractall()

Load the MovieLens 1M users, ratings, and movies files.

In [None]:
# The .dat files use "::" as the field separator. A multi-character separator is
# only supported by pandas' Python parsing engine, so request it explicitly
# instead of letting pandas fall back to it with a ParserWarning.
users = pd.read_csv(
    "ml-1m/users.dat",
    sep="::",
    names=["user_id", "sex", "age_group", "occupation", "zip_code"],
    engine="python",
)

ratings = pd.read_csv(
    "ml-1m/ratings.dat",
    sep="::",
    names=["user_id", "movie_id", "rating", "unix_timestamp"],
    engine="python",
)

# movies.dat is read as ISO-8859-1 rather than the default encoding.
movies = pd.read_csv(
    "ml-1m/movies.dat",
    sep="::",
    names=["movie_id", "title", "genres"],
    encoding="ISO-8859-1",
    engine="python",
)

  users = pd.read_csv(
  ratings = pd.read_csv(
  movies = pd.read_csv(


In [None]:
## Movies
# Take the four characters that precede the title's last character
# (presumably the release year in a trailing "(YYYY)") and replace them
# with integer category codes.
movies["year"] = movies["title"].str.slice(-5, -1)
movies["year"] = movies["year"].astype("category").cat.codes

## Users
# Replace every demographic column with the integer code of its category.
for column in ("sex", "age_group", "occupation", "zip_code"):
    users[column] = users[column].astype("category").cat.codes


In [None]:
# Number of distinct zip-code codes among the users.
users["zip_code"].nunique()

3439

In [None]:
# Genre labels that each get their own 0/1 indicator column on `movies`.
genres = [
    "Action", "Adventure", "Animation", "Children's", "Comedy", "Crime",
    "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror", "Musical",
    "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western",
]

# Split each movie's pipe-delimited genre string once and reuse it for every genre.
movie_genre_lists = movies["genres"].apply(lambda joined: joined.split("|"))

for genre in genres:
    # 1 when the movie lists this genre, 0 otherwise.
    movies[genre] = movie_genre_lists.apply(lambda tags: int(genre in tags))


In [None]:
def create_ratings_df(ratings, users_df=None, sequence_length=8, step_size=1):
  """Turn a ratings table into fixed-length, per-user rating sequences.

  Ratings are sorted by ``unix_timestamp`` and grouped by ``user_id``. Each
  user's ordered movie ids and ratings are cut into sliding windows of
  ``sequence_length`` items, advancing ``step_size`` items at a time. Every
  window becomes one output row, joined with that user's columns from
  ``users_df``.

  Args:
    ratings: DataFrame with ``user_id``, ``movie_id``, ``rating`` and
      ``unix_timestamp`` columns.
    users_df: DataFrame with a ``user_id`` column and a ``zip_code`` column
      (which is dropped from the result). Defaults to the notebook-level
      ``users`` frame.
    sequence_length: Number of ratings per window.
    step_size: Offset between the starts of consecutive windows.

  Returns:
    DataFrame with ``user_id``, ``sequence_movie_ids``, ``sequence_ratings``
    and the remaining ``users_df`` columns. Rows containing any missing value
    are dropped, which removes users with fewer than ``sequence_length``
    ratings and users absent from ``users_df``.
  """
  if users_df is None:
    # Fall back to the notebook-level frame so existing one-argument calls keep working.
    users_df = users

  def sliding_windows(values):
    # The last valid window starts at len(values) - sequence_length. The bound is
    # inclusive so the window that ends on the final element is kept; the former
    # `end_index < len(values)` test silently discarded it.
    last_start = len(values) - sequence_length
    return [
        values[start:start + sequence_length]
        for start in range(0, last_start + 1, step_size)
    ]

  grouped = ratings.sort_values(by=["unix_timestamp"]).groupby("user_id")

  # Build both list columns from the same grouped object and take user_id from
  # their shared index, so ids and sequences cannot get out of step.
  sequence_frame = pd.DataFrame(
      {
          "movie_ids": grouped["movie_id"].apply(list),
          "ratings": grouped["rating"].apply(list),
      }
  ).reset_index()

  sequence_frame["movie_ids"] = sequence_frame["movie_ids"].apply(sliding_windows)
  sequence_frame["ratings"] = sequence_frame["ratings"].apply(sliding_windows)

  # Both columns hold the same number of windows per user, so exploding them
  # separately with a fresh index keeps the rows aligned for the concat.
  movie_windows = sequence_frame[["user_id", "movie_ids"]].explode(
      "movie_ids", ignore_index=True
  )
  rating_windows = sequence_frame[["ratings"]].explode("ratings", ignore_index=True)
  combined = pd.concat([movie_windows, rating_windows], axis=1)

  # Attach the user columns, discard zip_code, and give the sequence columns
  # their final names.
  combined = combined.join(users_df.set_index("user_id"), on="user_id")
  combined = combined.drop(columns=["zip_code"])
  combined = combined.rename(
      columns={"movie_ids": "sequence_movie_ids", "ratings": "sequence_ratings"}
  )

  # Users without a full window explode to a missing-value row; drop those.
  return combined.dropna()


In [None]:
# Timestamp below which 80% of all ratings fall.
ratings.loc[:, "unix_timestamp"].quantile(q=0.8)

975768738.0

In [None]:
# Create the train, validation and test masks with a chronological split:
# the earliest 80% of ratings for training, the next 10% for validation and
# the most recent 10% for testing.
timestamps = ratings["unix_timestamp"]

# Derive the cutoffs from the data instead of hard-coding them. The previous
# literals were 975768738.0 (the 0.8 quantile) and 978133376.4 (presumably the
# 0.9 quantile - confirm on re-run).
train_cutoff = timestamps.quantile(0.8)
val_cutoff = timestamps.quantile(0.9)

# Upper bounds are inclusive so a rating stamped exactly at a cutoff lands in one
# split; strict comparisons on both sides dropped such rows from every split.
train_mask = timestamps <= train_cutoff
val_mask = (timestamps > train_cutoff) & (timestamps <= val_cutoff)
test_mask = timestamps > val_cutoff

In [None]:
# Select the rows of each split using the boolean masks.
train_ratings = ratings.loc[train_mask]
val_ratings = ratings.loc[val_mask]
test_ratings = ratings.loc[test_mask]

In [None]:
# Build the per-user sequence frame for each split, in train/val/test order.
train_data, val_data, test_data = (
    create_ratings_df(split_ratings)
    for split_ratings in (train_ratings, val_ratings, test_ratings)
)

In [None]:
# Report how many sequence rows each split produced.
for label, frame in (
    ("Train ratings: ", train_data),
    ("Val ratings: ", val_data),
    ("Test ratings: ", test_data),
):
    print(label, len(frame))

Train ratings:  757077
Val ratings:  92113
Test ratings:  91185


In [None]:
# Persist the three sequence splits to Google Drive.
split_output_dir = "/content/drive/MyDrive/WSTM_latest/data"

# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(split_output_dir, exist_ok=True)

train_data.to_csv(f"{split_output_dir}/train.csv", index=False)
val_data.to_csv(f"{split_output_dir}/validation.csv", index=False)
test_data.to_csv(f"{split_output_dir}/test.csv", index=False)

In [None]:
# Persist the encoded users, the movies (with year and genre columns) and the
# raw ratings next to the split files.
lookup_output_dir = "/content/drive/MyDrive/WSTM_latest/data"

# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(lookup_output_dir, exist_ok=True)

users.to_csv(f"{lookup_output_dir}/users.csv", index=False)
movies.to_csv(f"{lookup_output_dir}/movies.csv", index=False)
ratings.to_csv(f"{lookup_output_dir}/ratings.csv", index=False)