In [None]:
%%configure -f
{ "conf":{
          "spark.pyspark.python": "python3",
          "spark.pyspark.virtualenv.enabled": "true",
          "spark.pyspark.virtualenv.type":"native",
          "spark.pyspark.virtualenv.bin.path":"/usr/bin/virtualenv"
         }
}

In [None]:
# Install the Python packages this notebook imports into the session's environment.
for package_name in ("pandas", "numpy"):
    sc.install_pypi_package(package_name)

In [None]:
from pyspark.sql import SparkSession

# Obtain (or create) the Spark session used by the rest of the notebook.
spark = (
    SparkSession.builder
    .appName("Content_Based_Recommendation")
    .getOrCreate()
)

In [None]:
import pyspark.sql.functions as F
from pyspark.sql.functions import col, explode, split, when, mean, udf
from pyspark.sql.types import IntegerType
from pyspark.ml.linalg import Vector
from pyspark.ml.feature import MinMaxScaler, VectorAssembler
from pyspark.ml import Pipeline

In [None]:
# Install the filesystem packages; presumably needed by the pandas read of an
# s3:// URL in the next cell -- confirm against the cluster image.
for package_name in ("fsspec", "s3fs"):
    sc.install_pypi_package(package_name)

In [None]:
import pandas as pd
# Read the CSV with pandas, then convert the result into a Spark DataFrame.
df_pandas = pd.read_csv(filepath_or_buffer="s3://proyectommds/anime-dataset-2023.csv")
df_anime = spark.createDataFrame(data=df_pandas)
# Preview the first rows.
df_anime.show()

In [None]:
# Show the DataFrame schema
df_anime.printSchema()

In [None]:
# Dimensions of the dataset: column count from the schema, row count via a Spark job.
num_cols = len(df_anime.columns)
num_rows = df_anime.count()
print(f"Número de filas: {num_rows}, Número de columnas: {num_cols}")

In [None]:
# Display summary statistics for the DataFrame's columns.
df_anime.describe().show()

In [None]:
# List every column together with its Spark type.
# The loop variable is intentionally NOT named `col`: doing so would rebind the
# `pyspark.sql.functions.col` helper imported earlier, and every later
# `col(...)` call would fail until the function was imported again.
for column_name, dtype in df_anime.dtypes:
    print(f"Columna: {column_name}, Tipo: {dtype}")

In [None]:
# Number of titles per "Type", most frequent type first.
type_counts = df_anime.groupBy("Type").count()
df_type = type_counts.orderBy("count", ascending=False)
df_type.show()

In [None]:
from pyspark.sql.functions import col, when, split, explode
# Treat the "UNKNOWN" placeholder in "Genres" as a missing value (NULL).
genres_or_null = when(col("Genres") == "UNKNOWN", None).otherwise(col("Genres"))
df_anime = df_anime.withColumn("Genres", genres_or_null)

# Turn the ", "-separated genre string into an array, then emit one row per genre.
df_genres = df_anime.withColumn("GenreArray", split("Genres", ", "))
df_exploded = df_genres.withColumn("Genre", explode("GenreArray"))

# Occurrences of each genre, most common first.
genre_counts = df_exploded.groupBy("Genre").count().orderBy("count", ascending=False)
genre_counts.show()

In [None]:
def _report_unknown(column_name, sentinel="UNKNOWN"):
    """Count the rows of ``df_anime`` whose ``column_name`` equals ``sentinel``.

    Prints the tally and returns it. The printed label always reads 'UNKNOWN',
    even for the columns whose placeholder is spelled "Unknown".
    """
    total = df_anime.filter(col(column_name) == sentinel).count()
    print(f"Número de animes con {column_name} 'UNKNOWN': {total}")
    return total


# Tally the placeholder value in each column of interest.
unknown_score = _report_unknown("Score")
unknown_episodes = _report_unknown("Episodes")
unknown_type = _report_unknown("Type")
unknown_aired = _report_unknown("Aired")
unknown_status = _report_unknown("Status")
unknown_studios = _report_unknown("Studios")
unknown_source = _report_unknown("Source")
# "Duration" and "Popularity" are checked against the capitalised spelling "Unknown".
unknown_duration = _report_unknown("Duration", sentinel="Unknown")
unknown_rating = _report_unknown("Rating")
unknown_pop = _report_unknown("Popularity", sentinel="Unknown")
unknown_mem = _report_unknown("Members")

In [None]:
# Cast "Episodes" to float.
df_anime = df_anime.withColumn("Episodes", col("Episodes").cast("float"))

# Mean episode count over the rows where the cast produced a value.
df_numeric_episodes = df_anime.where(col("Episodes").isNotNull())
mean_episodes = df_numeric_episodes.agg(mean("Episodes")).first()[0]

# Fill the rows left NULL by the cast with that mean.
episodes_filled = when(col("Episodes").isNotNull(), col("Episodes")).otherwise(mean_episodes)
df_anime = df_anime.withColumn("Episodes", episodes_filled)

# The 15 titles with the highest episode count (sorted by Episodes, descending).
top_15_animes = (
    df_anime
    .select("Name", "Episodes")
    .orderBy(col("Episodes").desc())
    .limit(15)
)
top_15_animes.show(truncate=False)

In [None]:
# Replace each column's "missing" placeholder with NULL.
# Note that "Duration" uses a differently-cased marker than the other two columns.
for column_name, placeholder in (("Type", "UNKNOWN"), ("Duration", "Unknown"), ("Rating", "UNKNOWN")):
    df_anime = df_anime.withColumn(
        column_name,
        when(col(column_name) == placeholder, None).otherwise(col(column_name)),
    )

In [None]:
# DataFrame restricted to the columns used from here on.
data = df_anime.select("anime_id", "Name", "Genres", "Synopsis", "Type", "Episodes", "Aired", 
                       "Status", "Source", "Duration", "Rating", "Popularity", "Members")

# Print each selected column with its type.
# The loop variable is intentionally NOT named `col`: that would rebind the
# `pyspark.sql.functions.col` helper and break later `col(...)` calls unless
# the function is imported again.
for column_name, dtype in data.dtypes:
    print(f"Columna: {column_name}, Tipo: {dtype}")

In [None]:
from pyspark.sql.functions import col, udf
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.types import IntegerType

# Numeric columns that will be assembled into one feature vector.
columns_to_normalize = ["Episodes", "Popularity", "Members"]
# Cast each of them to float, one column at a time.
for numeric_name in columns_to_normalize:
    data = data.withColumn(numeric_name, data[numeric_name].cast("float"))

# Pack the numeric columns into a single vector column named "features_vector".
assembler = VectorAssembler(outputCol="features_vector", inputCols=columns_to_normalize)
data = assembler.transform(data)

# Helper (wrapped as a UDF below) that returns the length of a vector.
def get_vector_size(v):
    """Return the number of elements in ``v``, or ``None`` when ``v`` is ``None``.

    Spark passes ``None`` to a Python UDF for NULL cells; returning ``None``
    (a NULL integer) keeps a single NULL row from raising ``TypeError`` and
    failing the whole job.
    """
    if v is None:
        return None
    return len(v)
# Wrap the helper as a Spark UDF that returns an integer.
get_vector_size_udf = udf(get_vector_size, returnType=IntegerType())

# Sanity check: attach each row's vector length and display it next to the vector.
data = data.withColumn("vector_size", get_vector_size_udf(col("features_vector")))
data.select("features_vector", "vector_size").show(truncate=False)

# The length column was only needed for the check above; remove it again.
data = data.drop("vector_size")

In [None]:
# Fit a min-max scaler on the assembled vector and append the rescaled copy
# as "scaled_features" (target range written out explicitly: the library defaults 0.0-1.0).
scaler = MinMaxScaler(min=0.0, max=1.0, inputCol="features_vector", outputCol="scaled_features")
scaler_model = scaler.fit(data)
data = scaler_model.transform(data)

## TF-IDF 

In [None]:
from pyspark.sql.functions import concat_ws

# Join the textual columns into one space-separated string per title.
text_columns = ["Name", "Genres", "Synopsis", "Type",
                "Aired", "Status", "Source", "Duration", "Rating"]
data = data.withColumn("combined_text", concat_ws(" ", *text_columns))

In [None]:
from pyspark.sql.functions import lower, regexp_replace, split
from pyspark.ml.feature import StopWordsRemover

# Lower-case the text and strip every character that is not a letter or whitespace.
data = data.withColumn("processed_text", lower(col("combined_text")))
data = data.withColumn("processed_text", regexp_replace(col("processed_text"), "[^a-zA-Z\\s]", ""))

# Tokenise on runs of ANY whitespace. The cleanup regex above keeps newlines and
# tabs, so splitting on a single space would leave words glued together across
# line breaks and would emit empty-string tokens wherever two spaces meet.
# array_remove drops the empty tokens that can still appear at the string edges.
data = data.withColumn(
    "words",
    F.array_remove(split(col("processed_text"), "\\s+"), ""),
)

# Remove stop words (StopWordsRemover's default word list).
remover = StopWordsRemover(inputCol="words", outputCol="filtered_text")
data = remover.transform(data)

data.select("processed_text", "words", "filtered_text").show(truncate=False)

In [None]:
from pyspark.ml.feature import HashingTF, IDF
from pyspark.ml import Pipeline

# Term frequencies via feature hashing into 2000 buckets.
hashingTF = HashingTF(numFeatures=2000, inputCol="filtered_text", outputCol="raw_features")

# Inverse-document-frequency re-weighting of those term frequencies.
idf = IDF(outputCol="features", inputCol="raw_features")

# Chain both stages, fit them on the data, and append the TF-IDF column.
pipeline = Pipeline().setStages([hashingTF, idf])
model = pipeline.fit(data)
data = model.transform(data)

data.select("anime_id", "features").show(truncate=False)

In [None]:
from pyspark.ml.linalg import Vectors
from pyspark.sql.functions import col, udf
from pyspark.sql.types import DoubleType
import numpy as np

# Helper (wrapped as a UDF below) that computes the cosine similarity of two vectors.
def cosine_similarity(vec1, vec2):
    """Return the cosine similarity of two vectors as a Python float.

    Both arguments must expose ``toArray()``. The result is ``0.0`` when
    either vector has zero norm.
    """
    # Work on dense arrays.
    left = np.asarray(vec1.toArray())
    right = np.asarray(vec2.toArray())

    numerator = left.dot(right)
    left_norm = np.linalg.norm(left)
    right_norm = np.linalg.norm(right)

    # A zero-length vector has no direction; define its similarity as 0.
    if not (left_norm and right_norm):
        return 0.0
    return float(numerator / (left_norm * right_norm))

# Expose the cosine-similarity helper as a Spark UDF that returns a double.
cosine_udf = udf(cosine_similarity, returnType=DoubleType())

# Similarity "matrix": pair every row with every row that has a different
# anime_id and score each ordered pair.
# NOTE(review): the pair count grows quadratically with the number of rows.
left_side = data.alias("df1")
right_side = data.alias("df2")
different_titles = col("df1.anime_id") != col("df2.anime_id")
cosine_sim_matrix = left_side.join(right_side, on=different_titles).withColumn(
    "cosine_sim", cosine_udf(col("df1.features"), col("df2.features"))
)

cosine_sim_matrix.select("df1.anime_id", "df2.anime_id", "cosine_sim").show(truncate=False)

In [None]:
from pyspark.sql.functions import col, desc

# Look up recommendations for a title from the pairwise cosine similarities.
def get_recommendations(anime_name, cosine_sim_matrix, df, top_n=10):
    """Return the ``top_n`` titles most similar to ``anime_name``.

    Args:
        anime_name: Exact value of the "Name" column to look up in ``df``.
        cosine_sim_matrix: Pairwise DataFrame whose two sides are aliased
            "df1" and "df2" and which carries a "cosine_sim" column.
        df: DataFrame with "Name" and "anime_id" columns.
        top_n: Maximum number of rows to return.

    Returns:
        A DataFrame with the columns df2.anime_id, df2.Name and cosine_sim,
        ordered by descending similarity; or the string "Anime not found"
        when no row of ``df`` has that name.
    """
    # Resolve the name to its anime_id (the first match wins).
    match = df.where(col("Name") == anime_name).select("anime_id").first()
    if match is None:
        return "Anime not found"
    target_id = match["anime_id"]

    # Keep only the pairs whose left side is the requested title, best match first.
    pairs_for_title = cosine_sim_matrix.where(col("df1.anime_id") == target_id)
    ranked = pairs_for_title.orderBy(col("cosine_sim").desc())

    return ranked.select("df2.anime_id", "df2.Name", "cosine_sim").limit(top_n)

In [None]:
# Example: the 20 titles most similar to "Tensei shitara Slime Datta Ken".
recommended_animes = get_recommendations(anime_name="Tensei shitara Slime Datta Ken", cosine_sim_matrix=cosine_sim_matrix, df=data, top_n=20)
# get_recommendations returns the plain string "Anime not found" when the title
# is missing; calling .show() on a str would raise AttributeError, so print the
# message instead in that case.
if isinstance(recommended_animes, str):
    print(recommended_animes)
else:
    recommended_animes.show(truncate=False)