In [15]:
import pandas as pd
df = pd.read_excel('items_inicial.xlsx', header = None, names = ['id', 'title', 'day', 'month', 'year', 'IMDB_url', 'unknown', 'Action', 'Adventure', 'Animation', 'Childrens', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western'])

In [16]:
# Lista de preposiciones o artículos que suelen ir al principio
preposiciones = ['The ', 'A ', 'An ', "L' ", 'Le ', 'La ', 'Il ']

def corregir_titulo(titulo):
    partes = str(titulo).split(', ')
    if len(partes) == 2 and partes[1] in preposiciones:
        return f"{partes[1]}{partes[0]}"
    return str(titulo)

# Aplicar la corrección
df['title'] = df['title'].apply(corregir_titulo)

In [17]:
import requests
import time

# Define la clave API de OMDb
OMDB_API_KEY = '19d426e6'

In [18]:
def get_movie_info_omdb(title, api_key):
    """Consulta los datos de una película por su título usando la API de OMDb."""
    url = f"http://www.omdbapi.com/?t={title}&apikey={api_key}"
    response = requests.get(url)
    
    if response.status_code == 200:
        data = response.json()
        if data['Response'] == 'True':
            return data
        else:
            return None
    else:
        return None

In [19]:
def enrich_movies_with_omdb(df, api_key):
    """Enriquece un DataFrame con información obtenida desde OMDb."""
    # Crear nuevas columnas en el DataFrame para almacenar la información adicional
    df.loc[:, 'OMDb_imdbRating'] = None
    df.loc[:, 'OMDb_Year'] = None
    df.loc[:, 'OMDb_Genre'] = None
    df.loc[:, 'OMDb_Rated'] = None
    df.loc[:, 'OMDb_Runtime'] = None
    df.loc[:, 'OMDb_Plot'] = None
    df.loc[:, 'OMDb_Ratings'] = None
    df.loc[:, 'OMDb_BoxOffice'] = None
    df.loc[:, 'OMDb_Metascore'] = None
    df.loc[:, 'OMDb_imdbVotes'] = None
    df.loc[:, 'OMDb_Released'] = None
    df.loc[:, 'OMDb_Poster'] = None

    

    for index, row in df.iterrows():
        title = row['title']
        
        # Obtener la información de la película desde OMDb
        movie_info = get_movie_info_omdb(title, api_key)
        
        if movie_info:
            df.at[index, 'OMDb_Rating'] = movie_info.get('imdbRating')
            df.at[index, 'OMDb_Year'] = movie_info.get('Year')
            df.at[index, 'OMDb_Genre'] = movie_info.get('Genre')
            df.at[index, 'OMDb_Rated'] = movie_info.get('Rated')
            df.at[index, 'OMDb_Runtime'] = movie_info.get('Runtime')
            df.at[index, 'OMDb_Plot'] = movie_info.get('Plot')
            df.at[index, 'OMDb_Ratings'] = movie_info.get('Ratings')
            df.at[index, 'OMDb_BoxOffice'] = movie_info.get('BoxOffice')
            df.at[index, 'OMDb_Metascore'] = movie_info.get('Metascore')
            df.at[index, 'OMDb_imdbVotes'] = movie_info.get('imdbVotes')
            df.at[index, 'OMDb_Released'] = movie_info.get('Released')
            df.at[index, 'OMDb_Poster'] = movie_info.get('Poster')
        
        # OMDb tiene un límite de 1 llamada por segundo en su versión gratuita
        time.sleep(1)
    
    return df

In [20]:
data1 = df.iloc[:1011]

# Llamar a la función para enriquecer el DataFrame con la información de OMDb
df1_enriched = enrich_movies_with_omdb(data1, OMDB_API_KEY)

# Ver los primeros registros con la nueva información
print(df1_enriched.head())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'OMDb_imdbRating'] = None
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'OMDb_Year'] = None
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'OMDb_Genre'] = None
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] =

   id        title  day  month  year  \
0   1   Toy Story     1      1  1995   
1   2   GoldenEye     1      1  1995   
2   3  Four Rooms     1      1  1995   
3   4  Get Shorty     1      1  1995   
4   5     Copycat     1      1  1995   

                                            IMDB_url  unknown  Action  \
0  http://us.imdb.com/M/title/exact?Toy%20Story%2...        0       0   
1  http://us.imdb.com/M/title/exact?GoldenEye%20(...        0       1   
2  http://us.imdb.com/M/title/exact?Four%20Rooms%...        0       0   
3  http://us.imdb.com/M/title/exact?Get%20Shorty%...        0       1   
4  http://us.imdb.com/M/title/exact?Copycat%20(1995)        0       0   

   Adventure  Animation  ...  OMDb_Rated  OMDb_Runtime  \
0          0          1  ...           G        81 min   
1          1          0  ...       PG-13       130 min   
2          0          0  ...           R        98 min   
3          0          0  ...           R       105 min   
4          0          0  ...  

In [23]:
data2 = df.iloc[1011:]

# Llamar a la función para enriquecer el DataFrame con la información de OMDb
df2_enriched = enrich_movies_with_omdb(data2, OMDB_API_KEY)

# Ver los primeros registros con la nueva información
print(df2_enriched.head())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'OMDb_imdbRating'] = None
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'OMDb_Year'] = None
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'OMDb_Genre'] = None
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] =

        id                                    title  day  month  year  \
1011  1013                                Anaconda    11      4  1997   
1012  1014  Romy and Michele's High School Reunion    25      4  1997   
1013  1015                                  Shiloh    23      5  1997   
1014  1016                                 Con Air     6      6  1997   
1015  1017                            Trees Lounge    11     10  1996   

                                               IMDB_url  unknown  Action  \
1011  http://us.imdb.com/M/title/exact?Anaconda%20%2...        0       1   
1012  http://us.imdb.com/M/title/exact?Romy%20and%20...        0       0   
1013  http://us.imdb.com/M/title/exact?Shiloh%20%281...        0       0   
1014  http://us.imdb.com/M/title/exact?Con%20Air%20%...        0       1   
1015  http://us.imdb.com/M/title/exact?Trees%20Loung...        0       0   

      Adventure  Animation  ...  OMDb_Rated  OMDb_Runtime  \
1011          1          0  ...       PG-13

In [30]:
df_final = pd.concat([df1_enriched, df2_enriched], ignore_index=True)
df_final.to_csv('items_enriched.csv', index = False)

In [None]:
#tuve que hacerlo en 2 veces y concatenar, 1 request por segundo, 1000 requests al dia