In [1]:
!sudo apt update
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
#Check this site for the latest download link https://www.apache.org/dyn/closer.lua/spark/spark-3.2.1/spark-3.2.1-bin-hadoop3.2.tgz
!wget -q https://dlcdn.apache.org/spark/spark-3.2.1/spark-3.2.1-bin-hadoop3.2.tgz
!tar xf spark-3.2.1-bin-hadoop3.2.tgz
!pip install -q findspark
!pip install pyspark
!pip install py4j

import os
import sys
# os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
# os.environ["SPARK_HOME"] = "/content/spark-3.2.1-bin-hadoop3.2"


import findspark
findspark.init()
findspark.find()

import pyspark

from pyspark.sql import DataFrame, SparkSession
from typing import List
import pyspark.sql.types as T
import pyspark.sql.functions as F

spark= SparkSession \
       .builder \
       .appName("Content Based Recommendation") \
       .getOrCreate()

spark

[33m0% [Working][0m            Hit:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
[33m0% [Connecting to archive.ubuntu.com (185.125.190.81)] [Waiting for headers] [C[0m                                                                               Hit:2 http://security.ubuntu.com/ubuntu jammy-security InRelease
Hit:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Hit:4 http://archive.ubuntu.com/ubuntu jammy InRelease
Hit:5 https://r2u.stat.illinois.edu/ubuntu jammy InRelease
Hit:6 http://archive.ubuntu.com/ubuntu jammy-updates InRelease
Hit:7 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Hit:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:9 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:10 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Reading package lists... Done
Building dependency tree... Done
Reading state inform

In [2]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [3]:
import pandas as pd

In [4]:
# Leer primero el Archivo en un DataFrame y pasar a un RDD
# Al abrirlo directamente con Pyspark no toma las columnas correctamente
df = pd.read_csv("/content/anime-dataset-2023.csv")
df_anime = spark.createDataFrame(df)
df_anime.show()

+--------+--------------------+--------------------+------------------------------+-----+--------------------+--------------------+-----+--------+--------------------+-----------+----------------+--------------------+--------------------+----------------+-----------+-------------+--------------------+------+----------+---------+---------+-------+--------------------+
|anime_id|                Name|        English name|                    Other name|Score|              Genres|            Synopsis| Type|Episodes|               Aired|  Premiered|          Status|           Producers|           Licensors|         Studios|     Source|     Duration|              Rating|  Rank|Popularity|Favorites|Scored By|Members|           Image URL|
+--------+--------------------+--------------------+------------------------------+-----+--------------------+--------------------+-----+--------+--------------------+-----------+----------------+--------------------+--------------------+----------------+-----

In [5]:
# Mostrar el esquema del DataFrame
df_anime.printSchema()

root
 |-- anime_id: long (nullable = true)
 |-- Name: string (nullable = true)
 |-- English name: string (nullable = true)
 |-- Other name: string (nullable = true)
 |-- Score: string (nullable = true)
 |-- Genres: string (nullable = true)
 |-- Synopsis: string (nullable = true)
 |-- Type: string (nullable = true)
 |-- Episodes: string (nullable = true)
 |-- Aired: string (nullable = true)
 |-- Premiered: string (nullable = true)
 |-- Status: string (nullable = true)
 |-- Producers: string (nullable = true)
 |-- Licensors: string (nullable = true)
 |-- Studios: string (nullable = true)
 |-- Source: string (nullable = true)
 |-- Duration: string (nullable = true)
 |-- Rating: string (nullable = true)
 |-- Rank: string (nullable = true)
 |-- Popularity: long (nullable = true)
 |-- Favorites: long (nullable = true)
 |-- Scored By: string (nullable = true)
 |-- Members: long (nullable = true)
 |-- Image URL: string (nullable = true)



In [6]:
# Número de filas y columnas
num_rows = df_anime.count()
num_cols = len(df_anime.columns)
print(f"Número de filas: {num_rows}, Número de columnas: {num_cols}")

Número de filas: 24905, Número de columnas: 24


In [7]:
df_anime.describe().show()

+-------+------------------+---------------------+--------------------+-----------------+-----------------+-------+---------------------+-------+-----------------+------------------+-----------+----------------+----------+-------------------+--------------------+------------+--------+------------+-----------------+------------------+-----------------+------------------+-----------------+--------------------+
|summary|          anime_id|                 Name|        English name|       Other name|            Score| Genres|             Synopsis|   Type|         Episodes|             Aired|  Premiered|          Status| Producers|          Licensors|             Studios|      Source|Duration|      Rating|             Rank|        Popularity|        Favorites|         Scored By|          Members|           Image URL|
+-------+------------------+---------------------+--------------------+-----------------+-----------------+-------+---------------------+-------+-----------------+-------------

In [8]:
# Lista de columnas y sus tipos
for col, dtype in df_anime.dtypes:
    print(f"Columna: {col}, Tipo: {dtype}")

Columna: anime_id, Tipo: bigint
Columna: Name, Tipo: string
Columna: English name, Tipo: string
Columna: Other name, Tipo: string
Columna: Score, Tipo: string
Columna: Genres, Tipo: string
Columna: Synopsis, Tipo: string
Columna: Type, Tipo: string
Columna: Episodes, Tipo: string
Columna: Aired, Tipo: string
Columna: Premiered, Tipo: string
Columna: Status, Tipo: string
Columna: Producers, Tipo: string
Columna: Licensors, Tipo: string
Columna: Studios, Tipo: string
Columna: Source, Tipo: string
Columna: Duration, Tipo: string
Columna: Rating, Tipo: string
Columna: Rank, Tipo: string
Columna: Popularity, Tipo: bigint
Columna: Favorites, Tipo: bigint
Columna: Scored By, Tipo: string
Columna: Members, Tipo: bigint
Columna: Image URL, Tipo: string


In [9]:
#Simple Group by Function
df_type=df_anime.groupBy("Type").count().orderBy(F.desc("count"))
df_type.show()

+-------+-----+
|   Type|count|
+-------+-----+
|     TV| 7597|
|  Movie| 4381|
|    OVA| 4076|
|    ONA| 3533|
|  Music| 2686|
|Special| 2558|
|UNKNOWN|   74|
+-------+-----+



In [10]:
from pyspark.sql.functions import col, explode, split, when, mean

# Reemplazar "UNKNOWN" por NULL (vacío) en la columna "Genres"
df_anime = df_anime.withColumn("Genres", when(col("Genres") == "UNKNOWN", None).otherwise(col("Genres")))

In [11]:
# Dividir la columna "Genres" en una lista de géneros
df_genres = df_anime.withColumn("GenreArray", split(col("Genres"), ", "))
# Explode: Expandir los géneros en filas individuales
df_exploded = df_genres.withColumn("Genre", explode(col("GenreArray")))
# Contar las ocurrencias de cada género
genre_counts = df_exploded.groupBy("Genre").count().orderBy(col("count").desc())
# Mostrar el resultado
genre_counts.show()

+-------------+-----+
|        Genre|count|
+-------------+-----+
|       Comedy| 7142|
|      Fantasy| 5306|
|       Action| 4730|
|    Adventure| 3842|
|       Sci-Fi| 3091|
|        Drama| 2836|
|      Romance| 2063|
|Slice of Life| 1755|
| Supernatural| 1494|
|       Hentai| 1486|
|      Mystery|  847|
|  Avant Garde|  804|
|        Ecchi|  795|
|       Sports|  771|
|       Horror|  534|
|     Suspense|  242|
|Award Winning|  241|
|    Boys Love|  169|
|      Gourmet|  146|
|   Girls Love|  113|
+-------------+-----+
only showing top 20 rows



In [12]:
# Filtrar las filas donde la columna Score tiene el valor "UNKNOWN"
unknown_count = df_anime.filter(col("Score") == "UNKNOWN").count()
print(f"Número de animes con Score 'UNKNOWN': {unknown_count}")

Número de animes con Score 'UNKNOWN': 9213


In [13]:
# 1. Filtrar las filas que tienen valores numéricos en la columna "Score"
df_numeric_scores = df_anime.filter(col("Score").cast("float").isNotNull())

# Extraer las calificaciones numéricas en una lista ordenada
numeric_scores = df_numeric_scores.select(col("Score").cast("float")).rdd.flatMap(lambda x: x).collect()
numeric_scores.sort()

# Calcular la mediana
n = len(numeric_scores)
median_score = numeric_scores[n // 2] if n % 2 != 0 else (numeric_scores[n // 2 - 1] + numeric_scores[n // 2]) / 2

# Reemplazar "UNKNOWN" por la mediana calculada
df_anime = df_anime.withColumn("Score", when(col("Score") == "UNKNOWN", median_score).otherwise(col("Score")))

# Cambiar "Score" a tipo float
df_anime = df_anime.withColumn("Score", col("Score").cast("float"))

# Ordenar por Score en orden descendente, seleccionar columnas y limitar a 15 resultados
top_15_animes = (df_anime.orderBy(col("Score").desc()).select("Name", "Score").limit(15))

# Mostrar el resultado
top_15_animes.show(truncate=False)

+---------------------------------------------------+-----+
|Name                                               |Score|
+---------------------------------------------------+-----+
|Fullmetal Alchemist: Brotherhood                   |9.1  |
|Bleach: Sennen Kessen-hen                          |9.07 |
|Steins;Gate                                        |9.07 |
|Gintama°                                           |9.06 |
|Shingeki no Kyojin Season 3 Part 2                 |9.05 |
|Kaguya-sama wa Kokurasetai: Ultra Romantic         |9.05 |
|Shingeki no Kyojin: The Final Season - Kanketsu-hen|9.05 |
|Gintama'                                           |9.04 |
|Gintama: The Final                                 |9.04 |
|Hunter x Hunter (2011)                             |9.04 |
|Gintama': Enchousen                                |9.03 |
|Ginga Eiyuu Densetsu                               |9.02 |
|Fruits Basket: The Final                           |9.0  |
|Gintama.                               

In [14]:
# Filtrar las filas donde la columna Episodes tiene el valor "UNKNOWN"
unknown_count = df_anime.filter(col("Episodes") == "UNKNOWN").count()
print(f"Número de animes con Episodes 'UNKNOWN': {unknown_count}")

Número de animes con Episodes 'UNKNOWN': 611


In [15]:
# Asegurarte de que "Episodes" sea tratado como tipo float
df_anime = df_anime.withColumn("Episodes", col("Episodes").cast("float"))

# Filtrar las filas que tienen valores numéricos en la columna "Episodes"
df_numeric_episodes = df_anime.filter(col("Episodes").isNotNull())

# Calcular la media de los episodios
mean_episodes = df_numeric_episodes.select(mean(col("Episodes"))).collect()[0][0]

# Reemplazar los valores "UNKNOWN" por la media calculada
df_anime = df_anime.withColumn("Episodes", when(col("Episodes").isNull(), mean_episodes).otherwise(col("Episodes")))

# Ordenar por Score en orden descendente, seleccionar columnas y limitar a 15 resultados
top_15_animes = (df_anime.orderBy(col("Episodes").desc()).select("Name", "Episodes").limit(15))

# Mostrar el resultado
top_15_animes.show(truncate=False)

+-------------------------------------+--------+
|Name                                 |Episodes|
+-------------------------------------+--------+
|Lan Mao                              |3057.0  |
|Oyako Club                           |1818.0  |
|Doraemon (1979)                      |1787.0  |
|Daomei Xiansheng                     |1664.0  |
|Kirin Monoshiri Yakata               |1565.0  |
|Manga Nippon Mukashibanashi (1976)   |1471.0  |
|Hoka Hoka Kazoku                     |1428.0  |
|Kirin Ashita no Calendar             |1306.0  |
|Monoshiri Daigaku: Ashita no Calendar|1274.0  |
|Sekai Monoshiri Ryoko                |1006.0  |
|Kaixin Hanzi                         |1000.0  |
|Shuimu Baobao Kan Shijie             |800.0   |
|Kotowaza House                       |773.0   |
|Asa da yo! Kaishain                  |744.0   |
|Shima Shima Tora no Shimajirou       |726.0   |
+-------------------------------------+--------+



In [16]:
# Filtrar las filas donde la columna Type tiene el valor "UNKNOWN"
unknown_count = df_anime.filter(col("Type") == "UNKNOWN").count()
print(f"Número de animes con Type 'UNKNOWN': {unknown_count}")

# Reemplazar "UNKNOWN" por NULL (vacío) en la columna "Type"
df_anime = df_anime.withColumn("Type", when(col("Type") == "UNKNOWN", None).otherwise(col("Type")))

Número de animes con Type 'UNKNOWN': 74


In [17]:
# Filtrar las filas donde la columna Aired tiene el valor "UNKNOWN"
unknown_count = df_anime.filter(col("Aired") == "UNKNOWN").count()
print(f"Número de animes con Aired 'UNKNOWN': {unknown_count}")

Número de animes con Aired 'UNKNOWN': 0


In [18]:
# Filtrar las filas donde la columna X tiene el valor "Unknown"
# Para Status
unknown_count = df_anime.filter(col("Status") == "UNKNOWN").count()
print(f"Número de animes con Status 'UNKNOWN': {unknown_count}")
# Para Studios
unknown_count = df_anime.filter(col("Studios") == "UNKNOWN").count()
print(f"Número de animes con Studios 'UNKNOWN': {unknown_count}")
# Para Source
unknown_count = df_anime.filter(col("Source") == "UNKNOWN").count()
print(f"Número de animes con Source 'UNKNOWN': {unknown_count}")
# Para Duration
unknown_count = df_anime.filter(col("Duration") == "Unknown").count()
print(f"Número de animes con Duration 'UNKNOWN': {unknown_count}")
# Para Rating
unknown_count = df_anime.filter(col("Rating") == "UNKNOWN").count()
print(f"Número de animes con Rating 'UNKNOWN': {unknown_count}")
# Para Popularity
unknown_count = df_anime.filter(col("Popularity") == "Unknown").count()
print(f"Número de animes con Popularity 'UNKNOWN': {unknown_count}")
# Para Members
unknown_count = df_anime.filter(col("Members") == "UNKNOWN").count()
print(f"Número de animes con Members 'UNKNOWN': {unknown_count}")

Número de animes con Status 'UNKNOWN': 0
Número de animes con Studios 'UNKNOWN': 10526
Número de animes con Source 'UNKNOWN': 0
Número de animes con Duration 'UNKNOWN': 663
Número de animes con Rating 'UNKNOWN': 669
Número de animes con Popularity 'UNKNOWN': 0
Número de animes con Members 'UNKNOWN': 0


In [19]:
# Remplazar Unknown por vacios
df_anime = df_anime.withColumn("Duration", when(col("Duration") == "Unknown", None).otherwise(col("Duration")))
df_anime = df_anime.withColumn("Rating", when(col("Rating") == "UNKNOWN", None).otherwise(col("Rating")))

# Crear un dataframe buevo con columnas 'Name', 'Genres', 'Synopsis', 'Type', 'Episodes', 'Score',
# 'Aired', 'Status', 'Source', 'Duration', 'Rating', 'Popularity', 'Members'
data = df_anime.select("anime_id", "Name", "Genres", "Synopsis", "Type", "Episodes", "Score", "Aired",
                       "Status", "Source", "Duration", "Rating", "Popularity", "Members")
data.show()

+--------+--------------------+--------------------+--------------------+-----+-----------------+-----+--------------------+----------------+-----------+-------------+--------------------+----------+-------+
|anime_id|                Name|              Genres|            Synopsis| Type|         Episodes|Score|               Aired|          Status|     Source|     Duration|              Rating|Popularity|Members|
+--------+--------------------+--------------------+--------------------+-----+-----------------+-----+--------------------+----------------+-----------+-------------+--------------------+----------+-------+
|       1|        Cowboy Bebop|Action, Award Win...|Crime is timeless...|   TV|             26.0| 8.75|Apr 3, 1998 to Ap...| Finished Airing|   Original|24 min per ep|R - 17+ (violence...|        43|1771505|
|       5|Cowboy Bebop: Ten...|      Action, Sci-Fi|Another day, anot...|Movie|              1.0| 8.38|         Sep 1, 2001| Finished Airing|   Original|  1 hr 55 min|R

In [20]:
for col, dtype in data.dtypes:
    print(f"Columna: {col}, Tipo: {dtype}")

Columna: anime_id, Tipo: bigint
Columna: Name, Tipo: string
Columna: Genres, Tipo: string
Columna: Synopsis, Tipo: string
Columna: Type, Tipo: string
Columna: Episodes, Tipo: double
Columna: Score, Tipo: float
Columna: Aired, Tipo: string
Columna: Status, Tipo: string
Columna: Source, Tipo: string
Columna: Duration, Tipo: string
Columna: Rating, Tipo: string
Columna: Popularity, Tipo: bigint
Columna: Members, Tipo: bigint


In [21]:
from pyspark.ml.feature import MinMaxScaler, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.sql.functions import col

# Asegurar que las columnas sean de tipo numérico
columns_to_normalize = ["Episodes", "Score", "Popularity", "Members"]
for column in columns_to_normalize:
    data = data.withColumn(column, col(column).cast("float"))

In [22]:
# Vectorizar columnas numéricas
assembler = VectorAssembler(inputCols=columns_to_normalize, outputCol="features_vector")
data = assembler.transform(data)

In [23]:
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType
from pyspark.ml.linalg import Vector

# Función UDF para obtener la longitud de un vector
def get_vector_size(v):
    return len(v)

# Registrar la UDF
get_vector_size_udf = udf(get_vector_size, IntegerType())

# Aplicar la UDF para obtener el tamaño del vector
data = data.withColumn("vector_size", get_vector_size_udf("features_vector"))

# Verificar los resultados
data.select("features_vector", "vector_size").show(truncate=False)

# Eliminar la columna de verificación
data = data.drop("vector_size")

+---------------------------------------------------+-----------+
|features_vector                                    |vector_size|
+---------------------------------------------------+-----------+
|[26.0,8.75,43.0,1771505.0]                         |4          |
|[1.0,8.380000114440918,602.0,360978.0]             |4          |
|[26.0,8.220000267028809,246.0,727252.0]            |4          |
|[26.0,7.25,1795.0,111931.0]                        |4          |
|[52.0,6.940000057220459,5126.0,15001.0]            |4          |
|[145.0,7.920000076293945,1252.0,177688.0]          |4          |
|[24.0,8.0,862.0,260166.0]                          |4          |
|[52.0,7.550000190734863,4212.0,24172.0]            |4          |
|[24.0,8.15999984741211,1273.0,173710.0]            |4          |
|[74.0,8.869999885559082,142.0,1013100.0]           |4          |
|[220.0,7.989999771118164,8.0,2717330.0]            |4          |
|[14.913517951965332,8.6899995803833,20.0,2168904.0]|4          |
|[178.0,7.

In [24]:
# Aplicar MinMaxScaler
scaler = MinMaxScaler(inputCol="features_vector", outputCol="scaled_features")
scaler_model = scaler.fit(data)
data = scaler_model.transform(data)

In [25]:
# Tokenizar las columnas tipo string

from pyspark.ml.feature import Word2Vec
from pyspark.ml import Pipeline
from pyspark.sql.functions import col

# Paso 1: Tokenizar cada columna de texto
def tokenize_column(df, column_name):
    from pyspark.ml.feature import Tokenizer
    tokenizer = Tokenizer(inputCol=column_name, outputCol=f"{column_name}_tokens")
    df = tokenizer.transform(df)
    return df

# Paso 2: Generar los embeddings para cada columna
def generate_word2vec_embeddings(df, column_name, vector_size):
    word2Vec = Word2Vec(vectorSize=vector_size, minCount=1, inputCol=f"{column_name}_tokens", outputCol=f"{column_name}_embeddings")
    model = word2Vec.fit(df)
    df = model.transform(df)
    return df

In [26]:
# Asignación de dimensión de losembeddings

columns_embeddings = {
    'Genres': 25,
    'Synopsis': 100,
    'Type': 25,
    'Aired': 25,
    'Status': 25,
    'Source': 25,
    'Duration': 25,
    'Rating': 25}

# Reemplazar valores nulos o vacíos por una cadena vacía en las columnas relevantes
for column in columns_embeddings.keys():
    data = data.withColumn(column, when(col(column).isNull() | (col(column) == ''), '').otherwise(col(column)))

# Tokenizar todas las columnas
for column in columns_embeddings.keys():
    data = tokenize_column(data, column)

In [27]:
# Generar embeddings para todas las columnas
for column, vector_size in columns_embeddings.items():
    data = generate_word2vec_embeddings(data, column, vector_size)

In [28]:
# Concatenar todos los embeddings en una sola columna
embedding_columns = [f"{col}_embeddings" for col in columns_embeddings.keys()]
assembler = VectorAssembler(inputCols=embedding_columns, outputCol="final_embeddings")
data = assembler.transform(data)

# Mostrar el DataFrame con los embeddings concatenados
data.select("final_embeddings").show(truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [29]:
# Concatenar 'final_embeddings' y 'features_vector'
assembler = VectorAssembler(inputCols=["final_embeddings", "features_vector"], outputCol="anime_embedding")
data = assembler.transform(data)

# Mostrar el DataFrame con la nueva columna 'anime_embedding'
data.select("anime_embedding").show(truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [30]:
from pyspark.ml.linalg import DenseVector
from pyspark.ml.stat import Summarizer
from pyspark.sql import functions as F
import numpy as np
from pyspark.sql.functions import col

# Función para calcular la similitud coseno entre dos vectores
def cosine_similarity(vec1, vec2):
    # Convertir los vectores DenseVector a numpy arrays
    vec1 = np.array(vec1.toArray())
    vec2 = np.array(vec2.toArray())
    # Calcular la similitud del coseno
    dot_product = np.dot(vec1, vec2)
    norm1 = np.linalg.norm(vec1)
    norm2 = np.linalg.norm(vec2)
    return dot_product / (norm1 * norm2)

# Función para obtener recomendaciones de animes similares
def get_similar_animes(data, anime_name, top_n=5):
    # Filtrar el anime de referencia (por ejemplo, "One Piece")
    anime = data.filter(F.col("Name") == anime_name).select("anime_id", "Name", "anime_embedding").collect()[0]

    # Crear una lista para almacenar las similitudes
    similarities = []

    # Iterar sobre todos los animes en el dataset
    for row in data.collect():
        if row["Name"] != anime_name:  # Excluir el anime de referencia
            # Calcular la similitud del coseno entre el anime de referencia y el anime actual
            sim = cosine_similarity(anime["anime_embedding"], row["anime_embedding"])
            similarities.append((row["Name"], sim))

    # Ordenar los resultados por similitud descendente y seleccionar los más similares
    similar_animes = sorted(similarities, key=lambda x: x[1], reverse=True)[:top_n]

    return similar_animes

In [31]:
# Obtener recomendaciones
recommendations = get_similar_animes(data, "One Punch Man", top_n=10)

# Mostrar las recomendaciones
print("Recomendaciones para Anime de entrada:")
for anime, sim in recommendations:
    print(f"{anime} - Similarity: {sim}")

Recomendaciones para Anime de entrada:
Boku no Hero Academia - Similarity: 0.9999999999981873
Tokyo Ghoul - Similarity: 0.9999999999972885
Shingeki no Kyojin - Similarity: 0.9999999999952809
Shingeki no Kyojin Season 2 - Similarity: 0.9999999999932869
Sword Art Online - Similarity: 0.9999999999884164
Kimi no Na wa. - Similarity: 0.9999999999879635
Kimetsu no Yaiba - Similarity: 0.9999999999845907
No Game No Life - Similarity: 0.9999999999815673
Death Note - Similarity: 0.9999999999809759
Shingeki no Kyojin Season 3 - Similarity: 0.999999999974056


In [32]:
data.show()

+--------+--------------------+--------------------+--------------------+-----+---------+-----+--------------------+----------------+-----------+-------------+--------------------+----------+---------+--------------------+--------------------+--------------------+--------------------+-----------+--------------------+-------------------+--------------+------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|anime_id|                Name|              Genres|            Synopsis| Type| Episodes|Score|               Aired|          Status|     Source|     Duration|              Rating|Popularity|  Members|     features_vector|     scaled_features|       Genres_tokens|     Synopsis_tokens|Type_tokens|        Aired_tokens|      Status_tokens| Source_tokens|   Duration_tokens|       Rating_tokens|   

In [33]:
# Crear un dataframe con la frase "Hello World"
data = spark.createDataFrame([("Hello World",)], ["text"])

# Tokenizar la columna de texto en palabras
from pyspark.ml.feature import Tokenizer

tokenizer = Tokenizer(inputCol="text", outputCol="words")
tokenized = tokenizer.transform(data)

# Mostrar las palabras tokenizadas
tokenized.select("words").show(truncate=False)

# Configurar Word2Vec
word2Vec = Word2Vec(vectorSize=100, minCount=1, inputCol="words", outputCol="result")

# Crear y ajustar el modelo
model = word2Vec.fit(tokenized)
result = model.transform(tokenized)

# Mostrar el resultado del embedding
result.select("text", "result").show(truncate=False)

+--------------+
|words         |
+--------------+
|[hello, world]|
+--------------+

+-----------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------