In [None]:
from pyspark.sql import SparkSession
import pyspark.sql.types as T
import pyspark.sql.functions as F

from itertools import chain
import pickle
import os

# Reuse the currently active SparkSession if there is one; otherwise create a
# new session with the builder's default configuration.
spark = SparkSession.builder.getOrCreate()

In [None]:
# Root folder of the anime2020 dataset. Set the ANIME2020_DATA_DIR environment
# variable to run the notebook on another machine; the default is the original
# hard-coded location, so existing setups behave exactly as before.
DATA_ROOT = os.environ.get("ANIME2020_DATA_DIR", "/mnt/d/datasets/anime2020")

# NOTE: despite its name, `input_dir` is the path handed to the CSV reader
# (a `.csv` path), not a directory of inputs.
input_dir = os.path.join(DATA_ROOT, "animelist.csv")
# Target of the parquet write; the pickled lookup tables are stored under it too.
output_dir = os.path.join(DATA_ROOT, "animelist_400+")

In [None]:
# Explicit read schema: all five columns are declared as nullable integers,
# in the order listed here.
_INT_COLUMNS = (
    "user_id",
    "anime_id",
    "rating",
    "watching_status",
    "watched_episodes",
)
schema = T.StructType(
    [T.StructField(column_name, T.IntegerType(), True) for column_name in _INT_COLUMNS]
)

In [None]:
# Load the CSV with the explicit schema (no type inference); the first line of
# the file is treated as a header row.
df_animelist = (
    spark.read
    .schema(schema)
    .option("header", True)
    .csv(input_dir)
)

In [None]:
# Sanity check: print the first rows of the freshly loaded frame
# (show() prints 20 rows by default).
df_animelist.show()

# Last expression of the cell, so the notebook displays it: the list of
# (column name, type string) pairs of the frame.
df_animelist.dtypes

In [None]:
# Number of rows per user. Every row counts, regardless of its rating or
# watching status.
entries_per_user = df_animelist.groupBy("user_id").count()

# Rating divided by 10 and stored as a float column.
rating_scaled = (F.col("rating") / F.lit(10.0)).cast(T.FloatType())

# Attach each user's row count and keep only users with at least 400 rows.
# NOTE: `df_animelist` is rebound here, so this cell is not idempotent --
# re-running it without reloading the CSV rescales `rating` a second time and
# joins in a second `count` column.
df_animelist = (
    df_animelist
    .withColumn("rating", rating_scaled)
    .join(entries_per_user, on="user_id", how="inner")
    .filter(F.col("count") >= 400)
)

In [None]:
# Preview the filtered frame; it now also carries the per-user `count` column
# added by the join.
df_animelist.show()

In [None]:
# Encoding categorical data: map raw user/anime ids to contiguous integer
# codes 0..n-1, assigned in ascending id order.


def _sorted_distinct_ids(df, id_col):
    """Return the distinct non-null values of `id_col` as an ascending Python list.

    Null ids are skipped on purpose: the list is turned into the keys of a
    Spark map literal, and Spark does not allow null map keys, so a single
    null id would make the lookup fail. Rows with a null id are not dropped
    from `df`; they should simply end up with a null code.

    The result is collected to the driver, so it must fit in driver memory.
    """
    id_rows = (
        df.select(F.col(id_col))
        .where(F.col(id_col).isNotNull())
        .distinct()
        .orderBy(F.col(id_col).asc())
        .collect()
    )
    return [row[id_col] for row in id_rows]


def _literal_map(mapping):
    """Build a Spark map-literal column from a Python dict.

    `create_map` expects its arguments flattened as key1, value1, key2,
    value2, ..., which is what chaining the dict items produces.

    NOTE(review): every id becomes a literal in the query plan; for very large
    id sets a join against a small mapping DataFrame may scale better.
    """
    return F.create_map([F.lit(x) for x in chain(*mapping.items())])


# --- users ---
user_ids = _sorted_distinct_ids(df_animelist, "user_id")
user2user_encoded = {x: i for i, x in enumerate(user_ids)}  # raw id -> code
user_encoded2user = {i: x for i, x in enumerate(user_ids)}  # code -> raw id
mapping_expr = _literal_map(user2user_encoded)
df_animelist = df_animelist.withColumn(
    "user", mapping_expr[F.col("user_id")].cast(T.IntegerType())
)
n_users = len(user2user_encoded)

# --- animes ---
anime_ids = _sorted_distinct_ids(df_animelist, "anime_id")
anime2anime_encoded = {x: i for i, x in enumerate(anime_ids)}  # raw id -> code
anime_encoded2anime = {i: x for i, x in enumerate(anime_ids)}  # code -> raw id
mapping_expr2 = _literal_map(anime2anime_encoded)
df_animelist = df_animelist.withColumn(
    "anime", mapping_expr2[F.col("anime_id")].cast(T.IntegerType())
)
n_animes = len(anime2anime_encoded)

print("Num of users: {}, Num of animes: {}".format(n_users, n_animes))

In [None]:
# Shuffle the rows by sorting on a random key. The seed is fixed so the random
# column is repeatable for the same input and partitioning; without it every
# run produced a different, unrepeatable order.
SHUFFLE_SEED = 42
df_animelist = df_animelist.orderBy(F.rand(seed=SHUFFLE_SEED))

In [None]:
# df_animelist.show()

In [None]:
# Columns kept in the exported dataset: raw ids, scaled rating, encoded ids.
export_columns = ["user_id", "anime_id", "rating", "user", "anime"]

# Collapse to a single partition before writing, and replace whatever is
# already stored at `output_dir`.
export_df = df_animelist.select(*export_columns).coalesce(1)
export_df.write.parquet(output_dir, mode="overwrite")

In [None]:
# Persist the id <-> code lookup tables as pickle files inside `output_dir`.
# NOTE(review): this is the same directory the parquet write replaces with
# mode "overwrite", so re-running that write presumably removes these files,
# and readers that scan the whole directory will also see them -- confirm.
lookup_tables = {
    "user2user_encoded.pickle": user2user_encoded,
    "user_encoded2user.pickle": user_encoded2user,
    "anime2anime_encoded.pickle": anime2anime_encoded,
    "anime_encoded2anime.pickle": anime_encoded2anime,
}

for file_name, table in lookup_tables.items():
    with open(os.path.join(output_dir, file_name), "wb") as f:
        pickle.dump(table, f)
