# Transformación de Datos

## 1. Importar las Librerías:

In [1]:
import pandas as pd
import numpy as np
import os
import warnings
import re
import ast

warnings.filterwarnings('ignore')

## 2. Cargar el Dataset:

In [2]:
# Definir la ruta al directorio de datasets
ruta_datasets = '../Datasets'

# Cargar el archivo credits.csv
archivo_credits = os.path.join(ruta_datasets, 'credits.csv')
df_credits = pd.read_csv(archivo_credits)

# Cargar el archivo movies_dataset.csv
archivo_movies = os.path.join(ruta_datasets, 'movies_dataset.csv')
df_movies = pd.read_csv(archivo_movies)

# Mostrar las primeras filas de cada dataset para verificar la carga
print("Datos de credits.csv:")
display(df_credits.head(1))

print("\nDatos de movies_dataset.csv:")
display(df_movies.head(2))

Datos de credits.csv:


Unnamed: 0,cast,crew,id
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862



Datos de movies_dataset.csv:


Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0


## 3. Limpieza de Datos

### 3.1 Limpieza de credits.csv


#### 3.1.1 Columna 'CAST'

Desanidación de datos, filtrado de columnas y guardado en formato CSV

In [3]:
# Lista para almacenar los resultados
cast_data = []

# Iterar sobre cada fila del DataFrame original
for index, row in df_credits.iterrows():
    cast_list = row['cast']
    # Extraer el ID de la película
    movie_id = row['id']  
    
    # Validar si cast_list es una lista
    if isinstance(cast_list, str):
        try:
            cast_list = eval(cast_list)
        except:
            print('No se puede convertir: ', row['id'])

    if isinstance(cast_list, list):
        for cast_member in cast_list:
            # Asegurarse de que es un diccionario
            if isinstance(cast_member, dict):  
                # Extraer las columnas deseadas
                cast_data.append({
                    'ordenReparto': cast_member.get('order'),
                    'id': cast_member.get('id'),
                    'nombre': cast_member.get('name'),
                    'genero': cast_member.get('gender'),

                    # Añadir el ID de la película
                    'idPelicula': movie_id  
                })

# Convertir la lista de diccionarios en un DataFrame
cast_selected = pd.DataFrame(cast_data)

# Dummies para gender

# Mapear los valores de 'gender'
cast_selected['genero'] = cast_selected['genero'].map({0: 'No especificado', 1: 'Mujer', 2: 'Hombre'})

# Crear variables dummies para la columna 'gender'
gender_dummies = pd.get_dummies(cast_selected['genero'])

# Unir las columnas dummies con el DataFrame original
cast_selected = pd.concat([cast_selected, gender_dummies], axis=1)

# Eliminar fila genero
cast_selected = cast_selected.drop(columns='genero')

cast_selected.head()

Unnamed: 0,ordenReparto,id,nombre,idPelicula,Hombre,Mujer,No especificado
0,0,31,Tom Hanks,862,True,False,False
1,1,12898,Tim Allen,862,True,False,False
2,2,7167,Don Rickles,862,True,False,False
3,3,12899,Jim Varney,862,True,False,False
4,4,12900,Wallace Shawn,862,True,False,False


Exportar Dataset

In [4]:
#cast_selected.to_csv('../Datasets/Datasets Limpios/cast.csv')
#cast_selected.to_parquet('../Datasets/Datasets_Limpios/Parquet/cast.parquet', engine='pyarrow')

#### 3.1.2 Columna 'CREW'

Desanidación de datos y guardado en formato CSV

In [5]:
# Lista para almacenar los resultados
crew_data = []

# Iterar sobre cada fila del DataFrame original
for index, row in df_credits.iterrows():
    crew_list = row['crew']
    # Extraer el ID de la película
    movie_id = row['id']  
    
    # Validar si cast_list es una cadena de texto
    if isinstance(crew_list, str):
        try:
            crew_list = eval(crew_list)
        except:
            print('No se puede convertir: ', row['id'])
            
    if isinstance(crew_list, list):
        for crew_member in crew_list:
            # Asegurarse de que es un diccionario
            if isinstance(crew_member, dict):  
                # Extraer las columnas deseadas
                crew_data.append({
                    'id': crew_member.get('id'),
                    'nombre': crew_member.get('name'),
                    'departamento': crew_member.get('department'),
                    'cargo': crew_member.get('job'),
                    'genero': crew_member.get('gender'),
                    
                    # Añadir el ID de la película
                    'idPelicula': movie_id  
                })

# Convertir la lista de diccionarios en un DataFrame
crew_selected = pd.DataFrame(crew_data)

# Dummies para gender

# Mapear los valores de 'gender'
crew_selected['genero'] = crew_selected['genero'].map({0: 'No especificado', 1: 'Mujer', 2: 'Hombre'})

# Crear variables dummies para la columna 'gender'
gender_dummies = pd.get_dummies(crew_selected['genero'])

# Unir las columnas dummies con el DataFrame original
crew_selected = pd.concat([crew_selected, gender_dummies], axis=1)

# Eliminar fila genero
crew_selected = crew_selected.drop(columns='genero')

crew_selected.head()


Unnamed: 0,id,nombre,departamento,cargo,idPelicula,Hombre,Mujer,No especificado
0,7879,John Lasseter,Directing,Director,862,True,False,False
1,12891,Joss Whedon,Writing,Screenplay,862,True,False,False
2,7,Andrew Stanton,Writing,Screenplay,862,True,False,False
3,12892,Joel Cohen,Writing,Screenplay,862,True,False,False
4,12893,Alec Sokolow,Writing,Screenplay,862,False,False,True


Exportar Dataset

In [6]:
#crew_selected.to_csv('../Datasets/Datasets Limpios/crew.csv')
#crew_selected.to_parquet('../Datasets/Datasets_Limpios/Parquet/crew.parquet', engine='pyarrow')

### 3.2 Limpieza de movies_dataset.csv

#### 3.2.1 Selección y filtrado de columnas

Limpieza de datos

In [7]:
# Obtener las columnas del Dataframe para filtrar a partir del diccionario
print(f"Columnas del dataframe original: {df_movies.columns}")

# Seleccionar y ordenar las columnas a utilizar
columns = [
    'id', 'title', 'genres', 'overview', 'tagline', 'runtime', 'release_date', 'production_companies', 'belongs_to_collection',
    'original_language', 'spoken_languages', 'budget', 'revenue', 'status', 'production_countries',
    'popularity', 'vote_average', 'vote_count'
]

df_movies_new = df_movies[columns]

Columnas del dataframe original: Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')


In [8]:
df_movies_new.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 18 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   id                     45466 non-null  object 
 1   title                  45460 non-null  object 
 2   genres                 45466 non-null  object 
 3   overview               44512 non-null  object 
 4   tagline                20412 non-null  object 
 5   runtime                45203 non-null  float64
 6   release_date           45379 non-null  object 
 7   production_companies   45463 non-null  object 
 8   belongs_to_collection  4494 non-null   object 
 9   original_language      45455 non-null  object 
 10  spoken_languages       45460 non-null  object 
 11  budget                 45466 non-null  object 
 12  revenue                45460 non-null  float64
 13  status                 45379 non-null  object 
 14  production_countries   45463 non-null  object 
 15  po

- bugdet: 
    - cambiar tipo de dato 
    - valores nulos cambiados por cero

In [9]:
# Cambiar valores de 'budget' a int
df_movies_new['budget'] = pd.to_numeric(df_movies_new['budget'], errors='coerce')

# Utilizar 0 para reemplazar valores nulos o faltantes
df_movies_new['budget'].replace(np.nan, 0, inplace=True)

- revenue: 
    - cambiar valores nulos por 0

In [10]:
# Cambiar valores de 'revenue' a número
df_movies_new['revenue'] = pd.to_numeric(df_movies_new['revenue'], errors='coerce')

# Utilizar 0 para reemplazar valores nulos o faltantes
df_movies_new['revenue'].replace(np.nan, 0, inplace=True)

- release_date:
    - eliminar datos faltantes
    - cambiar el tipo de dato
    - crear columna 'release_year'
    

In [11]:
# Eliminar datos faltantes de la columna 'release_date'
df_movies_new.dropna(subset=['release_date'], inplace=True)

# Convertir la columna 'release_date' a datetime, ignorando errores
df_movies_new['release_date'] = pd.to_datetime(df_movies_new['release_date'], errors='coerce')

# Crear una nueva columna 'release_year' para extraer el año de la fecha de lanzamiento
# Convertimos a 'Int64' para permitir valores nulos (NaN) y enteros
df_movies_new['release_year'] = df_movies_new['release_date'].dt.year.astype('Int64')

# Mostrar las primeras 2 filas del DataFrame
df_movies_new.head(2)


Unnamed: 0,id,title,genres,overview,tagline,runtime,release_date,production_companies,belongs_to_collection,original_language,spoken_languages,budget,revenue,status,production_countries,popularity,vote_average,vote_count,release_year
0,862,Toy Story,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...","Led by Woody, Andy's toys live happily in his ...",,81.0,1995-10-30,"[{'name': 'Pixar Animation Studios', 'id': 3}]","{'id': 10194, 'name': 'Toy Story Collection', ...",en,"[{'iso_639_1': 'en', 'name': 'English'}]",30000000.0,373554033.0,Released,"[{'iso_3166_1': 'US', 'name': 'United States o...",21.946943,7.7,5415.0,1995
1,8844,Jumanji,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",When siblings Judy and Peter discover an encha...,Roll the dice and unleash the excitement!,104.0,1995-12-15,"[{'name': 'TriStar Pictures', 'id': 559}, {'na...",,en,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",65000000.0,262797249.0,Released,"[{'iso_3166_1': 'US', 'name': 'United States o...",17.015539,6.9,2413.0,1995


Crear la columna con el retorno de inversión, llamada return

In [12]:
# Crear columna 'return' - valores nulos cambiados por 0
df_movies_new['return'] = df_movies_new.apply(lambda row: row['revenue'] / row['budget'] if row['budget'] > 0 and row['revenue'] > 0 else 0, axis=1)

#### 3.2.2 Columna 'genres'

Creacion del dataset de generos

In [13]:
# Desanidar datos de la columna 'genres'
df_genres = pd.json_normalize(
    df_movies_new['genres'].apply(ast.literal_eval).explode()

    # Eliminar id's duplicados
).dropna().drop_duplicates().astype({'id': 'int'}).reset_index(drop=True)

# Crear una nueva fila con id=0 y name="No Data"
nueva_fila = pd.DataFrame({'id': [0], 'name': ["No Data"]})

# Concatenar la nueva fila al DataFrame df_genres
df_genres = pd.concat([df_genres, nueva_fila], ignore_index=True)

# Ordenar el DataFrame por la columna 'id'
df_genres = df_genres.sort_values('id').reset_index(drop=True)

df_genres.head(5)

Unnamed: 0,id,name
0,0,No Data
1,12,Adventure
2,14,Fantasy
3,16,Animation
4,18,Drama


Exportar Dataset

In [14]:
#df_genres.to_csv('../Datasets/Datasets_Limpios/genres.csv')
#df_genres.to_parquet('../Datasets/Datasets_Limpios/Parquet/genres.parquet', engine='pyarrow')

Cambiar datos de 'genres' para mantener solamente los ids

In [15]:
id_genre_list = []

# Itera sobre cada fila del DataFrame
for idx, row in df_movies_new.iterrows():
    id_genre = []  # Lista temporal para los IDs de la fila actual
    
    # Convierte el valor de 'genres' en un objeto Python si es una cadena
    if isinstance(row['genres'], str):
        row['genres'] = eval(row['genres'])
    
    # Extrae los IDs de género si existen y los agrega a id_genre
    for item in row['genres']:
        if isinstance(item, dict) and 'id' in item:
            id_genre.append(item['id'])
    
    id_genre_list.append(id_genre)

# Asigna la lista de IDs a la columna 'genres' del DataFrame
df_movies_new['genres'] = id_genre_list

In [16]:
df_movies_new.head(2)

Unnamed: 0,id,title,genres,overview,tagline,runtime,release_date,production_companies,belongs_to_collection,original_language,spoken_languages,budget,revenue,status,production_countries,popularity,vote_average,vote_count,release_year,return
0,862,Toy Story,"[16, 35, 10751]","Led by Woody, Andy's toys live happily in his ...",,81.0,1995-10-30,"[{'name': 'Pixar Animation Studios', 'id': 3}]","{'id': 10194, 'name': 'Toy Story Collection', ...",en,"[{'iso_639_1': 'en', 'name': 'English'}]",30000000.0,373554033.0,Released,"[{'iso_3166_1': 'US', 'name': 'United States o...",21.946943,7.7,5415.0,1995,12.451801
1,8844,Jumanji,"[12, 14, 10751]",When siblings Judy and Peter discover an encha...,Roll the dice and unleash the excitement!,104.0,1995-12-15,"[{'name': 'TriStar Pictures', 'id': 559}, {'na...",,en,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",65000000.0,262797249.0,Released,"[{'iso_3166_1': 'US', 'name': 'United States o...",17.015539,6.9,2413.0,1995,4.043035


#### 3.2.3 Columna 'production_companies'

Creación del dataset de compañías productoras

In [17]:
# Desanidar datos de la columna 'production_companies'

df_prodcompanies = pd.json_normalize(
    df_movies_new['production_companies'].apply(ast.literal_eval).explode()
    
    #Eliminar id's duplicados
).dropna().drop_duplicates().astype({'id': 'int'}).reset_index(drop=True)

# Crear una nueva fila con id=0 y name="No Data"
nueva_fila = pd.DataFrame({'id': [0], 'name': ["No Data"]})

# Concatenar la nueva fila al DataFrame df_genres
df_genres = pd.concat([df_genres, nueva_fila], ignore_index=True).sort_values('id')

df_prodcompanies.head()

Unnamed: 0,name,id
0,Pixar Animation Studios,3
1,TriStar Pictures,559
2,Teitler Film,2550
3,Interscope Communications,10201
4,Warner Bros.,6194


Exportar Dataset

In [18]:
#df_prodcompanies.to_csv('../Datasets/Datasets_Limpios/prodcompanies.csv')
#df_prodcompanies.to_parquet('../Datasets/Datasets_Limpios/Parquet/prodcompanies.parquet', engine='pyarrow')

Limpiar valores de 'production_companies'

In [19]:
aux_droplist = []

# Revisión de todas las compañías de la columna
for idx, i in enumerate(df_movies_new['production_companies']):
    if isinstance(i, str):
        try:
            i = eval(i)
        except:
            aux_droplist.append(idx)
            print(f"Error al evaluar el dato en la fila {idx}: {i}")
            continue
        if not isinstance(i, list):
            aux_droplist.append(idx)
            print(f"El dato no es una lista en la fila {idx}: {i}")

df_movies_new = df_movies_new.drop(df_movies_new.index[aux_droplist])

El dato no es una lista en la fila 19714: False
El dato no es una lista en la fila 29472: False
El dato no es una lista en la fila 35543: False


Cambiar datos de 'production_companies' para mantener solamente los ids

In [20]:
id_prodcompany_list = []

# Itera sobre cada fila del DataFrame
for idx, row in df_movies_new.iterrows():
    id_prodcompany = []  # Lista temporal para los IDs de la fila actual
    
    # Convierte el valor de 'production_companies' en un objeto Python si es una cadena
    if isinstance(row['production_companies'], str):
        row['production_companies'] = eval(row['production_companies'])
    
    # Extrae los IDs de género si existen y los agrega a id_prodcompany
    for item in row['production_companies']:
        if isinstance(item, dict) and 'id' in item:
            id_prodcompany.append(item['id'])
    
    id_prodcompany_list.append(id_prodcompany)

# Asigna la lista de IDs a la columna 'production_companies' del DataFrame
df_movies_new['production_companies'] = id_prodcompany_list

In [21]:
df_movies_new.head(2)

Unnamed: 0,id,title,genres,overview,tagline,runtime,release_date,production_companies,belongs_to_collection,original_language,spoken_languages,budget,revenue,status,production_countries,popularity,vote_average,vote_count,release_year,return
0,862,Toy Story,"[16, 35, 10751]","Led by Woody, Andy's toys live happily in his ...",,81.0,1995-10-30,[3],"{'id': 10194, 'name': 'Toy Story Collection', ...",en,"[{'iso_639_1': 'en', 'name': 'English'}]",30000000.0,373554033.0,Released,"[{'iso_3166_1': 'US', 'name': 'United States o...",21.946943,7.7,5415.0,1995,12.451801
1,8844,Jumanji,"[12, 14, 10751]",When siblings Judy and Peter discover an encha...,Roll the dice and unleash the excitement!,104.0,1995-12-15,"[559, 2550, 10201]",,en,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",65000000.0,262797249.0,Released,"[{'iso_3166_1': 'US', 'name': 'United States o...",17.015539,6.9,2413.0,1995,4.043035


#### 3.2.4 Columna 'belongs_to_collection'

In [22]:
# Desanidar datos de la columna 'belongs_to_collection'

df_collections = pd.json_normalize(
    df_movies_new['belongs_to_collection']
    .apply(lambda x: ast.literal_eval(x) if pd.notna(x) else None)  # Evalúa solo si no es nulo  

    #Eliminar id's duplicados
).dropna().drop_duplicates().astype({'id': 'int'}).sort_values('id').reset_index(drop=True)

#Eliminar columnas innecesarias
df_collections.drop(columns=['poster_path', 'backdrop_path'], inplace=True)


df_collections.head(10)

Unnamed: 0,id,name
0,10,Star Wars Collection
1,84,Indiana Jones Collection
2,119,The Lord of the Rings Collection
3,131,Three Colors Collection
4,151,Star Trek: The Original Series Collection
5,230,The Godfather Collection
6,263,The Dark Knight Collection
7,264,Back to the Future Collection
8,295,Pirates of the Caribbean Collection
9,304,Ocean's Collection


Cambiar datos de 'belongs_to_collection' para mantener solamente los ids

In [23]:
id_collection_list = []

# Itera sobre todas las filas de 'belongs_to_collection'
for item in df_movies_new['belongs_to_collection']:
    
    # Verifica si el valor es una cadena (representando un diccionario)
    if isinstance(item, str):
        # Convierte la cadena a diccionario
        try:
            item = ast.literal_eval(item)  
        except ValueError:
            print('No se pudo convertir:', item)
            id_collection_list.append(None)  # En caso de error, agrega None
            continue

    # Verifica si el valor es un diccionario y tiene la clave 'id'
    if isinstance(item, dict) and 'id' in item:
        id_collection_list.append(item['id'])  # Añade el valor del ID
    else:
        # Si no tiene 'id' o no es un diccionario, agrega None
        id_collection_list.append(None)

# Asigna la lista de IDs o None a la columna 'belongs_to_collection'
df_movies_new['belongs_to_collection'] = pd.Series(id_collection_list, dtype='Int64')


In [24]:
df_movies_new.head(2)

Unnamed: 0,id,title,genres,overview,tagline,runtime,release_date,production_companies,belongs_to_collection,original_language,spoken_languages,budget,revenue,status,production_countries,popularity,vote_average,vote_count,release_year,return
0,862,Toy Story,"[16, 35, 10751]","Led by Woody, Andy's toys live happily in his ...",,81.0,1995-10-30,[3],10194.0,en,"[{'iso_639_1': 'en', 'name': 'English'}]",30000000.0,373554033.0,Released,"[{'iso_3166_1': 'US', 'name': 'United States o...",21.946943,7.7,5415.0,1995,12.451801
1,8844,Jumanji,"[12, 14, 10751]",When siblings Judy and Peter discover an encha...,Roll the dice and unleash the excitement!,104.0,1995-12-15,"[559, 2550, 10201]",,en,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",65000000.0,262797249.0,Released,"[{'iso_3166_1': 'US', 'name': 'United States o...",17.015539,6.9,2413.0,1995,4.043035


#### 3.2.5 Columna 'spoken_languages'


Cargar dataset iso_639_1

In [25]:
df_spokenlang = pd.read_csv('../Datasets/ISO 639 1 Language Codes.csv')

df_spokenlang.head(2)

Unnamed: 0,ISO_code,Language
0,aa,Afar
1,ab,Abkhaz


Cambiar datos de la columna 'spoken_languages'

In [26]:
id_spokenlang_list = []

# Itera sobre cada fila del DataFrame
for idx, row in df_movies_new.iterrows():
    id_spokenlang = []  # Lista temporal para los IDs de la fila actual
    
    # Convierte el valor de 'spoken_languages' en un objeto Python si es una cadena
    if isinstance(row['spoken_languages'], str):
        row['spoken_languages'] = eval(row['spoken_languages'])
    
    # Extrae los IDs de género si existen y los agrega a id_spokenlang
    for item in row['spoken_languages']:
        if isinstance(item, dict) and 'iso_639_1' in item:
            id_spokenlang.append(item['iso_639_1'])
    
    id_spokenlang_list.append(id_spokenlang)

# Asigna la lista de IDs a la columna 'spoken_languages' del DataFrame
df_movies_new['spoken_languages'] = id_spokenlang_list

In [27]:
df_movies_new.head(2)

Unnamed: 0,id,title,genres,overview,tagline,runtime,release_date,production_companies,belongs_to_collection,original_language,spoken_languages,budget,revenue,status,production_countries,popularity,vote_average,vote_count,release_year,return
0,862,Toy Story,"[16, 35, 10751]","Led by Woody, Andy's toys live happily in his ...",,81.0,1995-10-30,[3],10194.0,en,[en],30000000.0,373554033.0,Released,"[{'iso_3166_1': 'US', 'name': 'United States o...",21.946943,7.7,5415.0,1995,12.451801
1,8844,Jumanji,"[12, 14, 10751]",When siblings Judy and Peter discover an encha...,Roll the dice and unleash the excitement!,104.0,1995-12-15,"[559, 2550, 10201]",,en,"[en, fr]",65000000.0,262797249.0,Released,"[{'iso_3166_1': 'US', 'name': 'United States o...",17.015539,6.9,2413.0,1995,4.043035


#### 3.2.6 Columna 'production_countries'


Crear dataset a partir de la columna 'production_countries'

In [28]:
df_prodcountries = pd.json_normalize(
    df_movies_new['production_countries'].apply(ast.literal_eval).explode()
    
    # Eliminar id's duplicados
).dropna().drop_duplicates().sort_values('iso_3166_1').reset_index(drop=True)

df_prodcountries.head(2)

Unnamed: 0,iso_3166_1,name
0,AE,United Arab Emirates
1,AF,Afghanistan


Exportar Dataset

In [29]:
#df_prodcountries.to_csv('../Datasets/Datasets_Limpios/csv/prodcountries.csv')
#df_prodcountries.to_parquet('../Datasets/Datasets_Limpios/Parquet/prodcountries.parquet', engine='pyarrow')

Cambiar datos de la columna 'production_countries'

In [30]:
id_prodcountry_list = []

# Itera sobre cada fila del DataFrame
for idx, row in df_movies_new.iterrows():
    id_prodcountry = []  # Lista temporal para los IDs de la fila actual
    
    # Convierte el valor de 'production_countries' en un objeto Python si es una cadena
    if isinstance(row['production_countries'], str):
        row['production_countries'] = eval(row['production_countries'])
    
    # Extrae los IDs de pais si existen y los agrega a id_prodcountry
    for item in row['production_countries']:
        if isinstance(item, dict) and 'iso_3166_1' in item:
            id_prodcountry.append(item['iso_3166_1'])
    
    # Si la lista está vacía, agregar 'No Data'
    if len(id_prodcountry) == 0:
        id_prodcountry.append('No Data')
    
    id_prodcountry_list.append(id_prodcountry)

# Asigna la lista de IDs a la columna 'production_countries' del DataFrame
df_movies_new['production_countries'] = id_prodcountry_list

In [31]:
df_movies_new.head(2)

Unnamed: 0,id,title,genres,overview,tagline,runtime,release_date,production_companies,belongs_to_collection,original_language,spoken_languages,budget,revenue,status,production_countries,popularity,vote_average,vote_count,release_year,return
0,862,Toy Story,"[16, 35, 10751]","Led by Woody, Andy's toys live happily in his ...",,81.0,1995-10-30,[3],10194.0,en,[en],30000000.0,373554033.0,Released,[US],21.946943,7.7,5415.0,1995,12.451801
1,8844,Jumanji,"[12, 14, 10751]",When siblings Judy and Peter discover an encha...,Roll the dice and unleash the excitement!,104.0,1995-12-15,"[559, 2550, 10201]",,en,"[en, fr]",65000000.0,262797249.0,Released,[US],17.015539,6.9,2413.0,1995,4.043035


#### 3.2.7 Columna 'id'

Convertir columna 'id' a int64

In [32]:
df_movies_new['id'] = df_movies_new['id'].astype(int)

## 4. Exportar dataset df_movies_new

In [33]:
df_movies_new['popularity'] = df_movies_new['popularity'].astype(float)

In [34]:
df_movies_new.to_parquet('../Datasets/Datasets_Limpios/Parquet/movies.parquet', engine='pyarrow')