In [None]:
# Environment bootstrap: findspark locates the local Spark installation so that
# the `pyspark` import below resolves. It has to run before `import pyspark`.
import findspark
findspark.init()
findspark.find()

import pyspark

from pyspark.sql import DataFrame, SparkSession
from typing import List
import pyspark.sql.types as T
import pyspark.sql.functions as F
import sparknlp

# Earlier alternative, kept for reference: a plain SparkSession without Spark NLP.
# spark= SparkSession \
#        .builder \
#        .appName("Content Based Recommendation") \
#        .getOrCreate()

# Start the session through Spark NLP instead of the plain builder above.
spark = sparknlp.start()

# Bare expression: displays the session summary as the cell output.
spark

In [None]:
import pandas as pd

In [None]:
# Load the anime catalogue with pandas, then hand it to Spark.
# NOTE(review): "/content/..." is an absolute, environment-specific path
# (looks like a Colab working directory) - confirm before running elsewhere.
df = pd.read_csv("/content/anime-dataset-2023.csv")
# Column types of the Spark DataFrame are inferred from the pandas frame.
df_anime = spark.createDataFrame(df)
# Preview the first rows.
df_anime.show()

+--------+--------------------+--------------------+------------------------------+-----+--------------------+--------------------+-----+--------+--------------------+-----------+----------------+--------------------+--------------------+----------------+-----------+-------------+--------------------+------+----------+---------+---------+-------+--------------------+
|anime_id|                Name|        English name|                    Other name|Score|              Genres|            Synopsis| Type|Episodes|               Aired|  Premiered|          Status|           Producers|           Licensors|         Studios|     Source|     Duration|              Rating|  Rank|Popularity|Favorites|Scored By|Members|           Image URL|
+--------+--------------------+--------------------+------------------------------+-----+--------------------+--------------------+-----+--------+--------------------+-----------+----------------+--------------------+--------------------+----------------+-----

# Preprocesamiento

In [None]:
from pyspark.sql.functions import col, explode, split, when, mean

# Turn the "UNKNOWN" placeholder in "Genres" into NULL.
# A `when` without `otherwise` yields NULL for every row that does not satisfy
# the condition, i.e. for the placeholder itself and for values already NULL.
genres_or_null = when(col("Genres") != "UNKNOWN", col("Genres"))
df_anime = df_anime.withColumn("Genres", genres_or_null)

In [None]:
# Impute missing scores with the median and convert "Score" to float.
#
# "Score" arrives as text. Casting it to float turns every non-numeric entry
# (the dataset uses the "UNKNOWN" placeholder) into NULL under Spark's default,
# non-ANSI cast behaviour.
score_as_float = col("Score").cast("float")

# Rows whose score is numeric; kept under its original name for later cells.
df_numeric_scores = df_anime.filter(score_as_float.isNotNull())

# Exact median, computed inside Spark instead of collecting the whole column to
# the driver and sorting it in Python. `percentile(x, 0.5)` interpolates
# linearly, so for an even number of rows it returns the mean of the two middle
# values. It returns NULL when there are no rows.
median_score = (
    df_numeric_scores
    .select(score_as_float.alias("score"))
    .agg(F.expr("percentile(score, 0.5)"))
    .first()[0]
)

if median_score is None:
    # No numeric score at all: there is nothing to impute with, so only convert
    # the column type and leave the NULLs in place.
    df_anime = df_anime.withColumn("Score", score_as_float)
else:
    # Fill every unparsable score (not only the literal "UNKNOWN") with the
    # median; the result is a float column.
    df_anime = df_anime.withColumn(
        "Score",
        F.coalesce(score_as_float, F.lit(median_score).cast("float")),
    )

In [None]:
# Convert "Episodes" to float. Under Spark's default (non-ANSI) cast, entries
# that are not numeric become NULL.
episodes_as_float = col("Episodes").cast("float")
df_anime = df_anime.withColumn("Episodes", episodes_as_float)

# Rows that carry a numeric episode count.
df_numeric_episodes = df_anime.where(col("Episodes").isNotNull())

# Average episode count over those rows.
mean_episodes = df_numeric_episodes.agg(mean("Episodes")).first()[0]

# Keep known episode counts and substitute the average where the value is missing.
episodes_filled = when(col("Episodes").isNotNull(), col("Episodes")).otherwise(mean_episodes)
df_anime = df_anime.withColumn("Episodes", episodes_filled)

In [None]:
# Turn the "UNKNOWN" placeholder in "Type" into NULL: the value is kept only
# when it differs from the placeholder, every other row becomes NULL.
type_or_null = when(col("Type") != "UNKNOWN", col("Type"))
df_anime = df_anime.withColumn("Type", type_or_null)

In [None]:
# Turn the "unknown" placeholders of "Duration" and "Rating" into NULL.
# The two columns spell the placeholder differently, hence the explicit pairs.
for column_name, placeholder in (("Duration", "Unknown"), ("Rating", "UNKNOWN")):
    # `when` without `otherwise` gives NULL wherever the condition does not hold.
    df_anime = df_anime.withColumn(
        column_name,
        when(col(column_name) != placeholder, col(column_name)),
    )


In [None]:
# Build a new DataFrame that keeps only the identifier and the descriptive
# columns: 'anime_id', 'Name', 'Genres', 'Synopsis', 'Type', 'Episodes',
# 'Aired', 'Status', 'Source', 'Duration' and 'Rating'.
data = df_anime.select("anime_id", "Name", "Genres", "Synopsis", "Type", "Episodes","Aired",
                       "Status", "Source", "Duration", "Rating")
data.show()

+--------+--------------------+--------------------+--------------------+-----+-----------------+--------------------+----------------+-----------+-------------+--------------------+
|anime_id|                Name|              Genres|            Synopsis| Type|         Episodes|               Aired|          Status|     Source|     Duration|              Rating|
+--------+--------------------+--------------------+--------------------+-----+-----------------+--------------------+----------------+-----------+-------------+--------------------+
|       1|        Cowboy Bebop|Action, Award Win...|Crime is timeless...|   TV|             26.0|Apr 3, 1998 to Ap...| Finished Airing|   Original|24 min per ep|R - 17+ (violence...|
|       5|Cowboy Bebop: Ten...|      Action, Sci-Fi|Another day, anot...|Movie|              1.0|         Sep 1, 2001| Finished Airing|   Original|  1 hr 55 min|R - 17+ (violence...|
|       6|              Trigun|Action, Adventure...|Vash the Stampede...|   TV|      

In [None]:
from pyspark.sql.functions import concat_ws,regexp_replace

# Columns that are merged, in this order, into one free-text field.
text_columns = ["Name", "Genres", "Synopsis", "Type", "Episodes", "Aired",
                "Status", "Source", "Duration", "Rating"]

# Join the columns with a single space as separator.
df_with_concatenation = data.withColumn(
    "concatenated_text", concat_ws(" ", *text_columns)
)

# Keep the identifier, the name and the merged text (every merged column except
# "Name" is dropped), then strip all commas from the merged text.
df_final = (
    df_with_concatenation
    .drop(*text_columns[1:])
    .withColumn("concatenated_text", regexp_replace("concatenated_text", ",", ""))
)

# Show the result without truncating the long text column.
df_final.show(truncate=False)

+--------+-------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

# Embeddings

In [None]:
from sparknlp.base import DocumentAssembler
from sparknlp.annotator import Tokenizer, DistilBertEmbeddings
from pyspark.ml import Pipeline

# Document assembler: wraps the raw "Synopsis" text into a "document" annotation.
# NOTE(review): the "concatenated_text" column prepared earlier in the notebook
# is not used here; only the synopsis is embedded - confirm this is intended.
documentAssembler = DocumentAssembler() \
    .setInputCol("Synopsis") \
    .setOutputCol("document")

# Tokenizer: splits each document into "token" annotations.
tokenizer = Tokenizer() \
    .setInputCols(["document"]) \
    .setOutputCol("token")

# Pretrained DistilBERT embeddings for the tokens.
# The model is the *uncased* variant, so case sensitivity is switched off:
# the input must be lower-cased to match the model's vocabulary.
embeddings = DistilBertEmbeddings.pretrained("distilbert_base_uncased", "en") \
    .setInputCols(["document", "token"]) \
    .setOutputCol("distilbert_embeddings") \
    .setCaseSensitive(False) \
    .setMaxSentenceLength(512)

# Assemble the three stages into one pipeline.
pipeline = Pipeline(stages=[documentAssembler, tokenizer, embeddings])

distilbert_base_uncased download started this may take some time.
Approximate size to download 235.8 MB
[OK!]


In [None]:
# Fit the pipeline on the cleaned catalogue and annotate the same DataFrame.
# `result` is `df_anime` plus the pipeline's output columns.
# NOTE(review): presumably the heavy DistilBERT work only runs when an action
# (show / write) is triggered on `result` - confirm.
model = pipeline.fit(df_anime)
result = model.transform(df_anime)

In [None]:
from pyspark.sql import functions as F

# "distilbert_embeddings.embeddings" collects the vector of every annotation in
# the row; only the first vector is kept as the anime's embedding.
# NOTE(review): presumably there is one annotation per token, so this is the
# vector of the first token only rather than a pooled representation - confirm
# that this is the intended document embedding.
first_vector = F.col("distilbert_embeddings.embeddings").getItem(0)

# Attach the embedding and remove the helper columns that are no longer needed.
df_with_embeddings = (
    result
    .withColumn("anime_embedding", first_vector)
    .drop("document", "distilbert_embeddings", "concatenated_text")
)


In [None]:
# Preview the first rows, including the extracted embedding column.
df_with_embeddings.show()

+--------+--------------------+--------------------+
|anime_id|                Name|     anime_embedding|
+--------+--------------------+--------------------+
|       1|        Cowboy Bebop|[-0.38382754, -0....|
|       5|Cowboy Bebop: Ten...|[-0.5310759, 0.29...|
|       6|              Trigun|[-0.51092297, 0.1...|
|       7|  Witch Hunter Robin|[-0.4380115, 0.19...|
|       8|      Bouken Ou Beet|[-0.5964433, -0.0...|
|      15|        Eyeshield 21|[-0.5027993, 0.43...|
|      16|Hachimitsu to Clover|[-0.622882, -0.07...|
|      17|Hungry Heart: Wil...|[-0.6489423, 0.24...|
|      18|Initial D Fourth ...|[-0.58195895, 0.5...|
|      19|             Monster|[-0.47485933, 0.4...|
|      20|              Naruto|[-0.7312896, -0.2...|
|      21|           One Piece|[-0.61763495, -0....|
|      22| Tennis no Ouji-sama|[-0.6668438, 0.32...|
|      23|    Ring ni Kakero 1|[-0.7685425, -0.0...|
|      24|       School Rumble|[-0.44775146, -0....|
|      25|           Sunabouzu|[-0.49340913, -

In [None]:
# Inspect which columns (and types) the annotated DataFrame currently carries.
df_with_embeddings.printSchema()

root
 |-- anime_id: long (nullable = true)
 |-- Name: string (nullable = true)
 |-- English name: string (nullable = true)
 |-- Other name: string (nullable = true)
 |-- Score: string (nullable = true)
 |-- Genres: string (nullable = true)
 |-- Synopsis: string (nullable = true)
 |-- Type: string (nullable = true)
 |-- Episodes: string (nullable = true)
 |-- Aired: string (nullable = true)
 |-- Premiered: string (nullable = true)
 |-- Status: string (nullable = true)
 |-- Producers: string (nullable = true)
 |-- Licensors: string (nullable = true)
 |-- Studios: string (nullable = true)
 |-- Source: string (nullable = true)
 |-- Duration: string (nullable = true)
 |-- Rating: string (nullable = true)
 |-- Rank: string (nullable = true)
 |-- Popularity: long (nullable = true)
 |-- Favorites: long (nullable = true)
 |-- Scored By: string (nullable = true)
 |-- Members: long (nullable = true)
 |-- Image URL: string (nullable = true)
 |-- token: array (nullable = true)
 |    |-- element: st

In [None]:
# Keep only the identifier, the title and the embedding vector.
df_with_embeddings = df_with_embeddings.select("anime_id", "Name", "anime_embedding")
df_with_embeddings.show()

+--------+--------------------+--------------------+
|anime_id|                Name|     anime_embedding|
+--------+--------------------+--------------------+
|       1|        Cowboy Bebop|[-1.0371023, 0.23...|
|       5|Cowboy Bebop: Ten...|[-0.75202537, 0.0...|
|       6|              Trigun|[-1.1225483, 0.22...|
|       7|  Witch Hunter Robin|[-0.93941563, -0....|
|       8|      Bouken Ou Beet|[-0.9127989, 0.05...|
|      15|        Eyeshield 21|[-0.6559154, 0.23...|
|      16|Hachimitsu to Clover|[-1.1258018, -0.1...|
|      17|Hungry Heart: Wil...|[-0.9605565, 0.39...|
|      18|Initial D Fourth ...|[-0.91625136, 0.0...|
|      19|             Monster|[-0.6514933, -0.0...|
|      20|              Naruto|[-0.8714514, 0.08...|
|      21|           One Piece|[-0.85837144, 0.2...|
|      22| Tennis no Ouji-sama|[-0.88153434, 0.4...|
|      23|    Ring ni Kakero 1|[-0.6822726, 0.24...|
|      24|       School Rumble|[-0.78813845, 0.4...|
|      25|           Sunabouzu|[-0.8186758, 0.

In [None]:
from pyspark.sql.functions import col
import json

# Store the embedding as text: cast the array column to a string and discard
# the original array column.
# NOTE(review): this cell is not re-runnable on its own, because
# "anime_embedding" no longer exists after the first run.
df_with_embeddings = (
    df_with_embeddings
    .withColumn("anime_embedding_str", col("anime_embedding").cast("string"))
    .drop("anime_embedding")
)


In [None]:
# Persist the embeddings DataFrame as Parquet.
# "overwrite" makes the cell re-runnable: without an explicit mode the write
# fails when the target path already exists, for example after an earlier run
# or an interrupted write that left a partial directory behind.
df_with_embeddings.write.mode("overwrite").parquet("anime_embeddings.parquet")

ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/usr/local/lib/python3.10/dist-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/usr/lib/python3.10/socket.py", line 705, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: 