# Transformaciones en el Dataset de Películas

Este notebook contiene las transformaciones realizadas en el dataset de películas para preparar los datos para la implementación de una API.

In [2]:

import pandas as pd
import ast

def extract_names_from_list(x):
    """Flatten a stringified list of dicts into a comma-separated string of names.

    Parameters
    ----------
    x : str
        Python-literal text such as ``"[{'id': 16, 'name': 'Animation'}]"``.
        Any other value (NaN, None, already-parsed objects) yields ``None``.

    Returns
    -------
    str or None
        The ``'name'`` values joined with ``', '`` (an empty string for an
        empty list), or ``None`` when ``x`` cannot be parsed, is not a list,
        or contains an entry without a usable ``'name'``.
    """
    try:
        parsed = ast.literal_eval(x)
        if isinstance(parsed, list):
            return ', '.join([i['name'] for i in parsed])
        return None
    except (ValueError, SyntaxError, TypeError, KeyError):
        # KeyError: an entry has no 'name' key. The other types cover input
        # that is not a valid literal and entries/names of the wrong type.
        return None

# Load the dataset. low_memory=False makes pandas infer each column's dtype
# from the whole file instead of chunk by chunk, which avoids the mixed-type
# DtypeWarning that read_csv can emit and keeps values in a column consistently typed.
file_path = '../data/movies_dataset.csv'
movies_df = pd.read_csv(file_path, low_memory=False)

# Show the first rows of the original dataset
movies_df.head()


  movies_df = pd.read_csv(file_path)


Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


## Transformaciones aplicadas

### Desanidación de belongs_to_collection

In [3]:

def extract_collection_name(x):
    """Return the ``'name'`` entry of a stringified collection dict.

    Parameters
    ----------
    x : str or missing value
        Python-literal text such as
        ``"{'id': 10194, 'name': 'Toy Story Collection'}"``, or NaN/None.

    Returns
    -------
    object or None
        The value stored under ``'name'``, or ``None`` when ``x`` is missing,
        cannot be parsed, is not a dict, or has no ``'name'`` key.
    """
    try:
        return ast.literal_eval(x).get('name') if pd.notnull(x) else None
    except (ValueError, SyntaxError, TypeError, AttributeError):
        # TypeError: literal_eval raises it for some malformed literals (for
        # example a dict with an unhashable key). AttributeError: the parsed
        # value is not a dict and therefore has no .get().
        return None

# Keep only the collection's name; values that cannot be parsed become None.
movies_df['belongs_to_collection'] = movies_df['belongs_to_collection'].map(extract_collection_name)
movies_df[['belongs_to_collection']].head()


Unnamed: 0,belongs_to_collection
0,Toy Story Collection
1,
2,Grumpy Old Men Collection
3,
4,Father of the Bride Collection


### Desanidación de production_companies

In [4]:

def extract_production_companies_safe(x):
    """Turn a stringified list of company dicts into a comma-separated string.

    Uses the same parsing approach as ``extract_names_from_list``.

    Parameters
    ----------
    x : str
        Python-literal text such as
        ``"[{'name': 'Pixar Animation Studios', 'id': 3}]"``.

    Returns
    -------
    str or None
        The company names joined with ``', '`` (an empty string for an empty
        list), or ``None`` when ``x`` cannot be parsed, is not a list, or
        contains an entry without a usable ``'name'``.
    """
    try:
        companies = ast.literal_eval(x)
        if not isinstance(companies, list):
            return None
        return ', '.join(company['name'] for company in companies)
    except (ValueError, SyntaxError, TypeError, KeyError):
        # KeyError: a company entry has no 'name' key; without catching it a
        # single malformed row would abort the whole column transformation.
        return None

# Replace each stringified list of company dicts with a comma-separated string of names.
movies_df['production_companies'] = movies_df['production_companies'].map(extract_production_companies_safe)
movies_df[['production_companies']].head()


Unnamed: 0,production_companies
0,Pixar Animation Studios
1,"TriStar Pictures, Teitler Film, Interscope Com..."
2,"Warner Bros., Lancaster Gate"
3,Twentieth Century Fox Film Corporation
4,"Sandollar Productions, Touchstone Pictures"


### Desanidación de genres

In [5]:

# Replace each stringified list of genre dicts with a comma-separated string of genre names.
movies_df['genres'] = movies_df['genres'].map(extract_names_from_list)
movies_df[['genres']].head()


Unnamed: 0,genres
0,"Animation, Comedy, Family"
1,"Adventure, Fantasy, Family"
2,"Romance, Comedy"
3,"Comedy, Drama, Romance"
4,Comedy


### Desanidación de spoken_languages

In [6]:

# Replace each stringified list of language dicts with a comma-separated string of language names.
movies_df['spoken_languages'] = movies_df['spoken_languages'].map(extract_names_from_list)
movies_df[['spoken_languages']].head()


Unnamed: 0,spoken_languages
0,English
1,"English, Français"
2,English
3,English
4,English


### Desanidación de production_countries

In [7]:

# Replace each stringified list of country dicts with a comma-separated string of country names.
movies_df['production_countries'] = movies_df['production_countries'].map(extract_names_from_list)
movies_df[['production_countries']].head()


Unnamed: 0,production_countries
0,United States of America
1,United States of America
2,United States of America
3,United States of America
4,United States of America


### Relleno de valores nulos en revenue y budget

In [8]:

# Coerce both money columns to numbers: entries that cannot be converted turn
# into NaN (errors='coerce') and every NaN is then replaced with 0.
for money_col in ('revenue', 'budget'):
    movies_df[money_col] = pd.to_numeric(movies_df[money_col], errors='coerce').fillna(0)
movies_df[['revenue', 'budget']].head()


Unnamed: 0,revenue,budget
0,373554033.0,30000000.0
1,262797249.0,65000000.0
2,0.0,0.0
3,81452156.0,16000000.0
4,76578911.0,0.0


### Eliminación de valores nulos en release_date

In [9]:

# Drop every row whose release_date is missing. inplace=True mutates
# movies_df itself rather than returning a new frame.
movies_df.dropna(subset=['release_date'], inplace=True)
# Preview the remaining release_date values (still raw strings at this point).
movies_df['release_date'].head()


0    1995-10-30
1    1995-12-15
2    1995-12-22
3    1995-12-22
4    1995-02-10
Name: release_date, dtype: object

### Formateo de fecha y creación de columna release_year

In [10]:

# Parse the dates once and derive both columns from that single result
# (the column was previously parsed twice). Values that cannot be parsed
# become NaT (errors='coerce') and therefore NaN in both derived columns.
parsed_release_dates = pd.to_datetime(movies_df['release_date'], errors='coerce')
movies_df['release_date'] = parsed_release_dates.dt.strftime('%Y-%m-%d')
# NOTE(review): release_year shows up as float (1995.0) in the output, which
# suggests some dates fail to parse; consider .astype('Int64') if downstream
# consumers accept nullable integers.
movies_df['release_year'] = parsed_release_dates.dt.year
movies_df[['release_date', 'release_year']].head()


Unnamed: 0,release_date,release_year
0,1995-10-30,1995.0
1,1995-12-15,1995.0
2,1995-12-22,1995.0
3,1995-12-22,1995.0
4,1995-02-10,1995.0


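### Creación de la columna return (revenue / budget)

Cuando `budget` es 0 (o no está disponible), `return` se fija en 0 para evitar la división por cero.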

In [11]:

# return = revenue / budget, computed column-wise instead of with a per-row
# Python lambda. Rows whose budget is not greater than 0 get 0, which also
# masks the inf/NaN that dividing by a zero budget would otherwise leave behind.
movies_df['return'] = (movies_df['revenue'] / movies_df['budget']).where(movies_df['budget'] > 0, 0)
movies_df[['revenue', 'budget', 'return']].head()


Unnamed: 0,revenue,budget,return
0,373554033.0,30000000.0,12.451801
1,262797249.0,65000000.0,4.043035
2,0.0,0.0,0.0
3,81452156.0,16000000.0,5.09076
4,76578911.0,0.0,0.0


### Eliminación de columnas innecesarias

In [12]:

# Columns removed from the frame before it is handed to the API.
unneeded_columns = ['video', 'imdb_id', 'adult', 'original_title', 'poster_path', 'homepage']
movies_df.drop(columns=unneeded_columns, inplace=True)
movies_df.head()


Unnamed: 0,belongs_to_collection,budget,genres,id,original_language,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,release_year,return
0,Toy Story Collection,30000000.0,"Animation, Comedy, Family",862,en,"Led by Woody, Andy's toys live happily in his ...",21.946943,Pixar Animation Studios,United States of America,1995-10-30,373554033.0,81.0,English,Released,,Toy Story,7.7,5415.0,1995.0,12.451801
1,,65000000.0,"Adventure, Fantasy, Family",8844,en,When siblings Judy and Peter discover an encha...,17.015539,"TriStar Pictures, Teitler Film, Interscope Com...",United States of America,1995-12-15,262797249.0,104.0,"English, Français",Released,Roll the dice and unleash the excitement!,Jumanji,6.9,2413.0,1995.0,4.043035
2,Grumpy Old Men Collection,0.0,"Romance, Comedy",15602,en,A family wedding reignites the ancient feud be...,11.7129,"Warner Bros., Lancaster Gate",United States of America,1995-12-22,0.0,101.0,English,Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,6.5,92.0,1995.0,0.0
3,,16000000.0,"Comedy, Drama, Romance",31357,en,"Cheated on, mistreated and stepped on, the wom...",3.859495,Twentieth Century Fox Film Corporation,United States of America,1995-12-22,81452156.0,127.0,English,Released,Friends are the people who let you be yourself...,Waiting to Exhale,6.1,34.0,1995.0,5.09076
4,Father of the Bride Collection,0.0,Comedy,11862,en,Just when George Banks has recovered from his ...,8.387519,"Sandollar Productions, Touchstone Pictures",United States of America,1995-02-10,76578911.0,106.0,English,Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,5.7,173.0,1995.0,0.0
