In [2]:

# importamos las librerías que necesitamos

# Tratamiento de datos
# -----------------------------------------------------------------------
import pandas as pd
import numpy as np

# Visualización
# ------------------------------------------------------------------------------
import matplotlib.pyplot as plt
import seaborn as sns

# Evaluar linealidad de las relaciones entre las variables
# y la distribución de las variables
# ------------------------------------------------------------------------------
import scipy.stats as stats
from scipy.stats import chi2_contingency, ttest_ind

# Imputación de nulos usando métodos avanzados estadísticos
# -----------------------------------------------------------------------
from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import KNNImputer

# Configuration
# -----------------------------------------------------------------------
pd.set_option('display.max_columns', None) # show every column when a DataFrame is displayed

# Warning handling
# -----------------------------------------------------------------------
# NOTE(review): the "ignore" filter below silences every warning for the rest
# of the session, deprecation notices included.
import warnings
warnings.filterwarnings("ignore")


# christmas-movies ✨

In [3]:
# Read the first Christmas-movies CSV into the working DataFrame
df = pd.read_csv(
    "archivos/christmas_movies_01.csv",
)

# Preview the first five rows
display(df.head(n=5))

Unnamed: 0,title,rating,runtime,imdb_rating,meta_score,genre,release_year,description,director,stars,votes,gross,img_src,type
0,Love Actually,R,135.0,7.6,55.0,"Comedy, Drama, Romance",2003.0,Follows the lives of eight very different coup...,Richard Curtis,"Hugh Grant, Martine McCutcheon, Liam Neeson, L...",517283,$59.70M,https://m.media-amazon.com/images/M/MV5BNThkNj...,Movie
1,Home Alone,PG,103.0,7.7,63.0,"Comedy, Family",1990.0,"An eight-year-old troublemaker, mistakenly lef...",Chris Columbus,"Macaulay Culkin, Joe Pesci, Daniel Stern, John...",629713,$285.76M,https://m.media-amazon.com/images/M/MV5BMzFkM2...,Movie
2,National Lampoon's Christmas Vacation,PG-13,97.0,7.5,49.0,Comedy,1989.0,The Griswold family's plans for a big family C...,Jeremiah S. Chechik,"Chevy Chase, Beverly D'Angelo, Juliette Lewis,...",213196,$71.32M,https://m.media-amazon.com/images/M/MV5BMGZkMW...,Movie
3,Elf,PG,97.0,7.1,66.0,"Adventure, Comedy, Family",2003.0,"Raised as an oversized elf, Buddy travels from...",Jon Favreau,"Will Ferrell, James Caan, Bob Newhart, Zooey D...",300546,$173.40M,https://m.media-amazon.com/images/M/MV5BMzUxNz...,Movie
4,How the Grinch Stole Christmas,PG,104.0,6.3,46.0,"Comedy, Family, Fantasy",2000.0,"On the outskirts of Whoville lives a green, re...",Ron Howard,"Jim Carrey, Taylor Momsen, Kelley, Jeffrey Tam...",280898,$260.04M,https://m.media-amazon.com/images/M/MV5BNWNiNT...,Movie


In [4]:
def exploracion(df):
    """Print and display an exploratory summary of a DataFrame.

    The report covers the shape, the number and percentage of fully
    duplicated rows, which columns contain nulls and which do not, and the
    ``describe()`` statistics of the object-dtype and non-object columns.

    Args:
        df (pd.DataFrame): DataFrame to explore.

    Returns:
        pd.DataFrame: One row per column of ``df`` with the columns
        "% nulos", "% no_nulos", "tipo_dato" and "num_valores_unicos".
    """
    n_filas, n_columnas = df.shape

    df_info = pd.DataFrame()
    df_info["% nulos"] = round(df.isna().sum()/n_filas*100, 2).astype(str)+"%"
    df_info["% no_nulos"] = round(df.notna().sum()/n_filas*100, 2).astype(str)+"%"
    df_info["tipo_dato"] = df.dtypes
    df_info["num_valores_unicos"] = df.nunique()

    # Classify columns from the real null counts, not from the rounded "0.0%"
    # label: a column with very few nulls rounds to 0.0 and would be misreported.
    tiene_nulos = df.isna().any()
    columnas_con_nulos = list(tiene_nulos[tiene_nulos].index)
    columnas_sin_nulos = list(tiene_nulos[~tiene_nulos].index)

    # Compute the duplicate count once and express it as a true percentage
    # (the ratio has to be multiplied by 100 before being labelled with "%").
    n_duplicados = df.duplicated().sum()
    pct_duplicados = round(n_duplicados/n_filas*100, 2) if n_filas else 0.0

    print(f"""El DataFrame tiene {n_filas} filas y {n_columnas} columnas.
Tiene {n_duplicados} datos duplicados, lo que supone un porcentaje de {pct_duplicados}% de los datos.
Hay {len(columnas_con_nulos)} columnas con datos nulos, y son:
{columnas_con_nulos}
y sin nulos hay {len(columnas_sin_nulos)} columnas y son:
{columnas_sin_nulos}
A continuación tienes un detalle sobre los datos nulos y los tipos y número de datos:""")
    display(df_info.head())

    # describe() raises ValueError when no column matches the include/exclude
    # filter, so each section is only rendered when such columns exist.
    print("Principales estadísticos de las columnas categóricas:")
    if df.select_dtypes(include="O").shape[1]:
        display(df.describe(include="O").T)
    else:
        print("No hay columnas categóricas.")

    print("Principales estadísticos de las columnas numéricas:")
    if df.select_dtypes(exclude="O").shape[1]:
        display(df.describe(exclude="O").T)
    else:
        print("No hay columnas numéricas.")

    return df_info

# Run the exploratory report on `df`; being the last expression of the cell,
# the returned summary table is rendered below the printed report as well.
exploracion(df)

El DataFrame tiene 873 filas y 14 columnas.
Tiene 0 datos duplicados, lo que supone un porcentaje de 0.0% de los datos.
Hay 10 columnas con datos nulos, y son:
['rating', 'runtime', 'imdb_rating', 'meta_score', 'genre', 'release_year', 'director', 'stars', 'votes', 'gross']
y sin nulos hay 4 columnas y son:
['title', 'description', 'img_src', 'type']
A continuación tienes un detalle sobre los datos nulos y los tipos y número de datos:


Unnamed: 0,% nulos,% no_nulos,tipo_dato,num_valores_unicos
title,0.0%,100.0%,object,848
rating,24.4%,75.6%,object,13
runtime,4.7%,95.3%,float64,95
imdb_rating,3.89%,96.11%,float64,61
meta_score,88.55%,11.45%,float64,54


Principales estadísticos de las columnas categóricas:


Unnamed: 0,count,unique,top,freq
title,873,848,A Christmas Carol,6
rating,660,13,TV-G,262
genre,872,126,"Comedy, Drama, Romance",109
description,873,858,Add a Plot,16
director,868,546,Peter Sullivan,13
stars,862,858,"Rose McIver, Ben Lamb, Alice Krige, Honor Knea...",3
votes,839,763,992,4
gross,79,79,$16.60M,1
img_src,873,854,https://m.media-amazon.com/images/S/sash/i-t32...,20
type,873,2,Movie,864


Principales estadísticos de las columnas numéricas:


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
runtime,832.0,86.596154,18.152062,1.0,84.0,87.0,91.0,199.0
imdb_rating,839.0,6.100477,0.966392,1.3,5.6,6.2,6.7,9.2
meta_score,100.0,57.65,17.901174,18.0,46.75,56.0,69.0,96.0
release_year,862.0,2009.49884,20.274696,1898.0,2010.0,2017.0,2020.0,2023.0


Unnamed: 0,% nulos,% no_nulos,tipo_dato,num_valores_unicos
title,0.0%,100.0%,object,848
rating,24.4%,75.6%,object,13
runtime,4.7%,95.3%,float64,95
imdb_rating,3.89%,96.11%,float64,61
meta_score,88.55%,11.45%,float64,54
genre,0.11%,99.89%,object,126
release_year,1.26%,98.74%,float64,82
description,0.0%,100.0%,object,858
director,0.57%,99.43%,object,546
stars,1.26%,98.74%,object,858


In [5]:
# General information about the DataFrame.
# DataFrame.info() writes its report straight to stdout and returns None, so it
# is called directly: wrapping it in print()/an f-string only appended a
# meaningless "Informacion df: None" line after the report.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 873 entries, 0 to 872
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   title         873 non-null    object 
 1   rating        660 non-null    object 
 2   runtime       832 non-null    float64
 3   imdb_rating   839 non-null    float64
 4   meta_score    100 non-null    float64
 5   genre         872 non-null    object 
 6   release_year  862 non-null    float64
 7   description   873 non-null    object 
 8   director      868 non-null    object 
 9   stars         862 non-null    object 
 10  votes         839 non-null    object 
 11  gross         79 non-null     object 
 12  img_src       873 non-null    object 
 13  type          873 non-null    object 
dtypes: float64(4), object(10)
memory usage: 95.6+ KB
Informacion df: None


# Valores únicos de las variables categóricas ✨

In [6]:
# Gather the names of the categorical (object-dtype) columns
columnas = list(df.select_dtypes(include='object').columns)
print(columnas)
# For each categorical column, report its unique values and how often each one occurs
for columna in columnas:
    print(
        f" \n----------- ESTAMOS ANALIZANDO LA COLUMNA: '{columna.upper()}' -----------\n",
        f"Sus valores únicos son: {df[columna].unique()}\n",
        f"Las frecuencias de los valores únicos de las categorías son: {df[columna].value_counts()} ",
        sep="\n",
    )

['title', 'rating', 'genre', 'description', 'director', 'stars', 'votes', 'gross', 'img_src', 'type']
 
----------- ESTAMOS ANALIZANDO LA COLUMNA: 'TITLE' -----------

Sus valores únicos son: ['Love Actually' 'Home Alone' "National Lampoon's Christmas Vacation"
 'Elf' 'How the Grinch Stole Christmas' 'The Grinch' 'Die Hard'
 'Home Alone 2: Lost in New York' 'The Polar Express'
 "It's a Wonderful Life" 'The Santa Clause' 'White Christmas' 'Scrooged'
 'Spirited' 'Miracle on 34th Street' 'Jingle All the Way' 'Klaus'
 'A Christmas Carol' 'Last Christmas' 'Christmas with the Kranks'
 'The Santa Clause 2' 'Four Christmases' 'Home Alone 3'
 'The Muppet Christmas Carol' 'Office Christmas Party'
 'Feast of the Seven Fishes' 'Bad Santa' 'The Christmas Chronicles'
 'The Nightmare Before Christmas' 'Deck the Halls'
 'Best. Christmas. Ever!' "It's a Wonderful Knife"
 'A Christmas Story Christmas' "Santa's Slay"
 'Rudolph the Red-Nosed Reindeer' 'How the Grinch Stole Christmas!'
 'The Naughty Nine' 

# DUPLICADOS

In [7]:
def get_duplicate_rows(df):
    """
    Return every row of a DataFrame that is repeated in full.

    A row counts as duplicated when all of its column values match another
    row; every occurrence is returned, the first one included.

    Args:
        df (pd.DataFrame): The input DataFrame.

    Returns:
        pd.DataFrame: The subset of ``df`` made of the fully duplicated rows.
    """
    # keep=False flags all members of each duplicated group, not only the repeats
    is_repeated = df.duplicated(keep=False)
    return df.loc[is_repeated]

# Show the fully duplicated rows of `df` (the cell renders the returned DataFrame)
get_duplicate_rows(df)

Unnamed: 0,title,rating,runtime,imdb_rating,meta_score,genre,release_year,description,director,stars,votes,gross,img_src,type


# imdb-christmas-movies-from-20162022 ✨

In [11]:
# Read the second Christmas-movies CSV, decoding the file as latin1;
# note that this rebinds the notebook-level name `df`

df = pd.read_csv(
    'archivos/christmas_movies_02.csv',
    encoding='latin1',
)

# Preview the first five rows
display(df.head(n=5))

Unnamed: 0,movie_title,Title_URL,image_url,movie_url,release_year,certificate,time,genre,stars,Score,description,director_url,director_name,cast_url1,cast1,cast_url2,cast2,cast_url3,cast3,cast_url4,cast4,votes
0,Shazam!,https://www.imdb.com/title/tt0448115/?ref_=kw_...,https://m.media-amazon.com/images/M/MV5BOWZhZj...,https://www.imdb.com/title/tt0448115/?ref_=kw_...,(2019),PG-13,132 min,"Action, Adventure, Comedy",7.0,71 \n Metascore,A newly fostered young boy in search of his mo...,https://www.imdb.com/name/nm2497546/?ref_=kw_l...,David F. Sandberg,https://www.imdb.com/name/nm1157048/?ref_=kw_l...,Zachary Levi,https://www.imdb.com/name/nm0835016/?ref_=kw_l...,Mark Strong,https://www.imdb.com/name/nm4755508/?ref_=kw_l...,Asher Angel,https://www.imdb.com/name/nm6244013/?ref_=kw_l...,Jack Dylan Grazer,335718
1,Little Women,https://www.imdb.com/title/tt3281548/?ref_=kw_...,https://m.media-amazon.com/images/S/sash/4Fyxw...,https://www.imdb.com/title/tt3281548/?ref_=kw_...,(2019),PG,135 min,"Drama, Romance",7.8,91 \n Metascore,"Jo March reflects back and forth on her life, ...",https://www.imdb.com/name/nm1950086/?ref_=kw_l...,Greta Gerwig,https://www.imdb.com/name/nm1519680/?ref_=kw_l...,Saoirse Ronan,https://www.imdb.com/name/nm0914612/?ref_=kw_l...,Emma Watson,https://www.imdb.com/name/nm6073955/?ref_=kw_l...,Florence Pugh,https://www.imdb.com/name/nm7340546/?ref_=kw_l...,Eliza Scanlen,203580
2,A Bad Moms Christmas,https://www.imdb.com/title/tt6359956/?ref_=kw_...,https://m.media-amazon.com/images/S/sash/4Fyxw...,https://www.imdb.com/title/tt6359956/?ref_=kw_...,(2017),R,104 min,Comedy,5.6,42 \n Metascore,"As their own mothers drop in unexpectedly, our...",https://www.imdb.com/name/nm0524190/?ref_=kw_l...,Jon Lucas,https://www.imdb.com/name/nm0601859/?ref_=kw_l...,Scott Moore,https://www.imdb.com/name/nm0005109/?ref_=kw_l...,Mila Kunis,https://www.imdb.com/name/nm0068338/?ref_=kw_l...,Kristen Bell,https://www.imdb.com/name/nm1063517/?ref_=kw_l...,Kathryn Hahn,51131
3,The Grinch,https://www.imdb.com/title/tt2709692/?ref_=kw_...,https://m.media-amazon.com/images/S/sash/4Fyxw...,https://www.imdb.com/title/tt2709692/?ref_=kw_...,(2018),PG,85 min,"Animation, Comedy, Family",6.3,51 \n Metascore,A grumpy Grinch plots to ruin Christmas for th...,https://www.imdb.com/name/nm0155528/?ref_=kw_l...,Yarrow Cheney,https://www.imdb.com/name/nm0608714/?ref_=kw_l...,Scott Mosier,https://www.imdb.com/name/nm1212722/?ref_=kw_l...,Benedict Cumberbatch,https://www.imdb.com/name/nm7372981/?ref_=kw_l...,Cameron Seely,https://www.imdb.com/name/nm0429069/?ref_=kw_l...,Rashida Jones,70679
4,Last Christmas,https://www.imdb.com/title/tt8623904/?ref_=kw_...,https://m.media-amazon.com/images/S/sash/4Fyxw...,https://www.imdb.com/title/tt8623904/?ref_=kw_...,(2019),PG-13,103 min,"Comedy, Drama, Fantasy",6.5,50 \n Metascore,Kate is a young woman subscribed to bad decisi...,https://www.imdb.com/name/nm0082450/?ref_=kw_l...,Paul Feig,https://www.imdb.com/name/nm10916154/?ref_=kw_...,Madison Ingoldsby,https://www.imdb.com/name/nm0000668/?ref_=kw_l...,Emma Thompson,https://www.imdb.com/name/nm0410667/?ref_=kw_l...,Boris Isakovic,https://www.imdb.com/name/nm8838405/?ref_=kw_l...,Lucy Miller,78112


In [12]:
# Run the exploratory report on the DataFrame currently bound to `df`
exploracion(df)

El DataFrame tiene 278 filas y 22 columnas.
Tiene 0 datos duplicados, lo que supone un porcentaje de 0.0% de los datos.
Hay 4 columnas con datos nulos, y son:
['certificate', 'Score', 'cast_url4', 'cast4']
y sin nulos hay 18 columnas y son:
['movie_title', 'Title_URL', 'image_url', 'movie_url', 'release_year', 'time', 'genre', 'stars', 'description', 'director_url', 'director_name', 'cast_url1', 'cast1', 'cast_url2', 'cast2', 'cast_url3', 'cast3', 'votes']
A continuación tienes un detalle sobre los datos nulos y los tipos y número de datos:


Unnamed: 0,% nulos,% no_nulos,tipo_dato,num_valores_unicos
movie_title,0.0%,100.0%,object,278
Title_URL,0.0%,100.0%,object,278
image_url,0.0%,100.0%,object,4
movie_url,0.0%,100.0%,object,54
release_year,0.0%,100.0%,object,20


Principales estadísticos de las columnas categóricas:


Unnamed: 0,count,unique,top,freq
movie_title,278,278,re: View,1
Title_URL,278,278,https://www.imdb.com/title/tt5763098/?ref_=kw_...,1
image_url,278,4,https://m.media-amazon.com/images/S/sash/4Fyxw...,275
movie_url,278,54,https://www.imdb.com/search/keyword/?keywords=...,50
release_year,278,20,(2020 TV Movie),48
certificate,233,11,TV-G,124
time,278,49,84 min,72
genre,278,50,"Drama, Romance",55
Score,26,22,51 \n Metascore,3
description,278,278,Jay Bauman and Josh Davis take a look at Jay's...,1


Principales estadísticos de las columnas numéricas:


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
stars,278.0,6.066906,0.765491,3.2,5.7,6.2,6.5,8.1


Unnamed: 0,% nulos,% no_nulos,tipo_dato,num_valores_unicos
movie_title,0.0%,100.0%,object,278
Title_URL,0.0%,100.0%,object,278
image_url,0.0%,100.0%,object,4
movie_url,0.0%,100.0%,object,54
release_year,0.0%,100.0%,object,20
certificate,16.19%,83.81%,object,11
time,0.0%,100.0%,object,49
genre,0.0%,100.0%,object,50
stars,0.0%,100.0%,float64,40
Score,90.65%,9.35%,object,22


In [13]:
# Gather the names of the categorical (object-dtype) columns
columnas = list(df.select_dtypes(include='object').columns)
print(columnas)
# For each categorical column, report its unique values and how often each one occurs
for columna in columnas:
    print(
        f" \n----------- ESTAMOS ANALIZANDO LA COLUMNA: '{columna.upper()}' -----------\n",
        f"Sus valores únicos son: {df[columna].unique()}\n",
        f"Las frecuencias de los valores únicos de las categorías son: {df[columna].value_counts()} ",
        sep="\n",
    )

['movie_title', 'Title_URL', 'image_url', 'movie_url', 'release_year', 'certificate', 'time', 'genre', 'Score', 'description', 'director_url', 'director_name', 'cast_url1', 'cast1', 'cast_url2', 'cast2', 'cast_url3', 'cast3', 'cast_url4', 'cast4', 'votes']
 
----------- ESTAMOS ANALIZANDO LA COLUMNA: 'MOVIE_TITLE' -----------

Sus valores únicos son: ['Shazam!' 'Little Women' 'A Bad Moms Christmas' 'The Grinch'
 'Last Christmas' 'Happiest Season' 'Holidate' 'Klaus'
 'Christmas at Castle Hart' 'The Christmas Train' 'Instant Family'
 'The Christmas Chronicles' 'A Christmas Wish' 'Fatman'
 'Next Stop, Christmas' 'Christmas on My Mind' 'One Royal Holiday'
 'The Nine Kittens of Christmas' 'A Timeless Christmas'
 'My Christmas Family Tree' 'The Christmas Ring' 'Office Christmas Party'
 'The Christmas Chronicles: Part Two'
 'Time for Me to Come Home for Christmas' '8-Bit Christmas'
 'The Nutcracker and the Four Realms' 'Noelle' 'Switched for Christmas'
 'Christmas by Starlight' 'Chateau Chris