In [None]:
import os
import time

import pandas as pd
import requests

# TMDB API configuration.
# The key is taken from the TMDB_API_KEY environment variable when it is set, so
# the credential does not have to live inside the notebook.
# NOTE(review): the literal fallback is a key hardcoded in source; it is kept
# only so the notebook behaves exactly as before when the variable is unset.
# Rotate the key and remove the fallback once the environment is configured.
API_KEY = os.environ.get("TMDB_API_KEY", "b65533ac79f01d8fc205f6db406ccacc")
BASE_URL = "https://api.themoviedb.org/3"

# Look up a movie's TMDB ID by title (and, optionally, release year)
def get_movie_id(title, year=None, timeout=10):
    """Return the TMDB ID of the first search result for ``title``.

    Args:
        title: Movie title sent as the ``query`` parameter.
        year: Optional release year. Skipped when it is None, NaN/NA or 0.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The ``id`` of the first search result, or ``None`` when the request
        fails, times out, answers with a non-200 status, returns malformed
        JSON, or yields no results.
    """
    params = {"api_key": API_KEY, "query": title}
    # Check for missing values first: NaN is truthy and bool(pd.NA) raises.
    if not pd.isna(year) and year:
        params["year"] = int(year)
    try:
        # Without a timeout a stalled connection would block the caller forever.
        response = requests.get(
            f"{BASE_URL}/search/movie", params=params, timeout=timeout
        )
        if response.status_code != 200:
            return None
        results = response.json().get("results", [])
    except (requests.RequestException, ValueError):
        # Network errors and malformed JSON are handled like any other failed
        # lookup, so one bad request does not abort the caller's whole loop.
        return None
    if results:
        return results[0].get("id")  # take the most likely match
    return None

# Fetch budget and revenue for a TMDB movie
def get_movie_details(movie_id, timeout=10):
    """Return ``(budget, revenue)`` from the TMDB movie-details endpoint.

    Args:
        movie_id: TMDB movie identifier placed in the request path.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        A ``(budget, revenue)`` tuple read from the response JSON; a field is
        ``None`` when absent. ``(None, None)`` is returned when the request
        fails, times out, answers with a non-200 status, or the body is not
        valid JSON.
    """
    try:
        # Without a timeout a stalled connection would block the caller forever.
        response = requests.get(
            f"{BASE_URL}/movie/{movie_id}",
            params={"api_key": API_KEY},
            timeout=timeout,
        )
        if response.status_code != 200:
            return None, None
        data = response.json()
    except (requests.RequestException, ValueError):
        # Same result as a non-200 answer, so one bad request does not abort
        # the caller's loop.
        return None, None
    return data.get("budget"), data.get("revenue")

# Load the original dataset
df = pd.read_csv("/content/movie_metadata.csv")

# Working copies of budget and gross; only these copies receive API values
df["budget_filled"] = df["budget"]
df["gross_filled"] = df["gross"]

# Visit only the rows where at least one of the two amounts is missing
missing_amount = df["gross"].isna() | df["budget"].isna()
for row_label, record in df[missing_amount].iterrows():
    tmdb_id = get_movie_id(str(record["movie_title"]).strip(), record["title_year"])
    if tmdb_id:
        api_budget, api_revenue = get_movie_details(tmdb_id)

        # (original column, filled column, value fetched from the API)
        fill_plan = (
            ("budget", "budget_filled", api_budget),
            ("gross", "gross_filled", api_revenue),
        )
        for source_col, target_col, api_value in fill_plan:
            # Fill only what was missing, and only with a truthy API value
            # (None and 0 are both skipped).
            if pd.isna(record[source_col]) and api_value:
                df.at[row_label, target_col] = api_value

    # Pause after every row to avoid flooding the API
    time.sleep(0.25)

# Save the enriched dataset
df.to_csv("movie_metadata_enriched.csv", index=False)
print("guardado")

guardado


In [None]:
# Reload the enriched dataset from the same relative path the enrichment cell
# saves it to, so this cell reads the file that was just written regardless of
# the current working directory.
df_limpio = pd.read_csv("movie_metadata_enriched.csv")
df_limpio.head()

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes,budget_filled,gross_filled
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000,237000000.0,760505800.0
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0,300000000.0,309404200.0
2,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000,245000000.0,200074200.0
3,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000,250000000.0,448130600.0
4,,Doug Walker,,,131.0,,Rob Walker,131.0,,Documentary,...,,,,,12.0,7.1,,0,245000000.0,2068224000.0


In [None]:
# Show the (rows, columns) shape of df_limpio
df_limpio.shape

(5043, 30)

In [None]:
# Remove fully duplicated rows, then show the resulting (rows, columns) shape
df_limpio = df_limpio.drop_duplicates()
df_limpio.shape

(4998, 30)

In [None]:
# Drop columns that are no longer needed ("budget" and "gross" have
# "budget_filled" / "gross_filled" counterparts).
# errors="ignore" keeps the cell re-runnable: executing it a second time would
# otherwise raise KeyError because the columns are already gone.
df_limpio = df_limpio.drop(columns=["color", "budget", "gross"], errors="ignore")

In [None]:
# Preview the first rows of df_limpio
df_limpio.head()

Unnamed: 0,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,genres,actor_1_name,movie_title,...,language,country,content_rating,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes,budget_filled,gross_filled
0,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,Action|Adventure|Fantasy|Sci-Fi,CCH Pounder,Avatar,...,English,USA,PG-13,2009.0,936.0,7.9,1.78,33000,237000000.0,760505800.0
1,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,Action|Adventure|Fantasy,Johnny Depp,Pirates of the Caribbean: At World's End,...,English,USA,PG-13,2007.0,5000.0,7.1,2.35,0,300000000.0,309404200.0
2,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,Action|Adventure|Thriller,Christoph Waltz,Spectre,...,English,UK,PG-13,2015.0,393.0,6.8,2.35,85000,245000000.0,200074200.0
3,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,Action|Thriller,Tom Hardy,The Dark Knight Rises,...,English,USA,PG-13,2012.0,23000.0,8.5,2.35,164000,250000000.0,448130600.0
4,Doug Walker,,,131.0,,Rob Walker,131.0,Documentary,Doug Walker,Star Wars: Episode VII - The Force Awakens ...,...,,,,,12.0,7.1,,0,245000000.0,2068224000.0


In [None]:
# Count the nulls in each column.
# Although gross and budget still contain nulls, there are significantly fewer
# of them than in the original data.

df_limpio.isnull().sum()

Unnamed: 0,0
director_name,103
num_critic_for_reviews,49
duration,15
director_facebook_likes,103
actor_3_facebook_likes,23
actor_2_name,13
actor_1_facebook_likes,7
genres,0
actor_1_name,7
movie_title,0


Eliminar las filas con valores nulos en budget y gross, ya que son columnas fundamentales para la predicción.

In [None]:
# Share of nulls left in budget and gross after the API enrichment.
# Measured on df_limpio (the de-duplicated frame whose null rows are dropped
# afterwards) instead of df, which still holds duplicates and only exists if
# the API enrichment cell was run in the current kernel session.
proporcion_budget = df_limpio["budget_filled"].isna().mean()
print("El porcentaje de nulos de budget es de", proporcion_budget*100)

proporcion_gross = df_limpio["gross_filled"].isna().mean()
print("El porcentaje de nulos de gross es de", proporcion_gross*100)

El porcentaje de nulos de budget es de 5.096172912948642
El porcentaje de nulos de gross es de 9.855244893912355


In [None]:
# Keep only the rows that have both budget_filled and gross_filled, then show
# the resulting (rows, columns) shape
df_limpio = df_limpio.dropna(subset=["budget_filled", "gross_filled"])
df_limpio.shape

(4364, 27)

In [None]:
# Count the nulls in each column of df_limpio.
# NOTE(review): the saved output of this cell lists "color" and "gross", which
# an earlier cell drops — the output looks stale; re-run top to bottom to confirm.
df_limpio.isnull().sum()

Unnamed: 0,0
color,19
director_name,104
num_critic_for_reviews,50
duration,15
director_facebook_likes,104
actor_3_facebook_likes,23
actor_2_name,13
actor_1_facebook_likes,7
gross,0
genres,0


In [None]:
# Write the cleaned dataset to CSV without the index column
df_limpio.to_csv("movies_FINAL.csv", index=False)
print("guardado")

guardado
