In [3]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import re

In [8]:
def _strip_parenthetical(text):
    """Return ``text`` cut at its first '(' with surrounding whitespace removed."""
    return text.split('(')[0].strip()


def _parse_ceremony_year(text):
    """Return the last standalone four-digit number in ``text`` as an int, or None if absent."""
    matches = re.findall(r"(?<!\d)\d{4}(?!\d)", text)
    return int(matches[-1]) if matches else None


def extract_oscar_table():
    """Scrape the first 'wikitable' of the Spanish Wikipedia Oscars page into a DataFrame.

    Rows are collected starting from the first row whose first ``<td>`` text
    starts with "72". For every collected row:

    * the 'ceremonia' cell is reduced to its four-digit year (as ``int``);
    * the 'mejor director', 'mejor actor' and 'mejor actriz' cells are cut at
      their first '(' so that only the text before the parenthesis remains.

    Rows that are too short to contain all desired columns, or whose
    'ceremonia' cell contains no four-digit year, are skipped.

    Returns:
        A 2-tuple ``(df, message)``. On success ``df`` is a ``pandas.DataFrame``
        with the columns 'ceremonia', 'mejor película', 'mejor director',
        'mejor actor', 'mejor actriz' and ``message`` is ``"Success"``.
        On any failure ``df`` is ``None`` and ``message`` describes the problem
        (the same problem is also printed).
    """
    url_oscars = "https://es.wikipedia.org/wiki/Premios_%C3%93scar"
    desired_columns = ['ceremonia', 'mejor película', 'mejor director', 'mejor actor', 'mejor actriz']
    first_ceremony = "72"  # prefix of the first cell of the first row to keep

    # A timeout prevents the cell from hanging forever on a stalled connection;
    # network errors are reported through the same (None, message) channel as
    # every other failure instead of surfacing as a raw traceback.
    try:
        res_oscar = requests.get(url_oscars, timeout=10)
    except requests.RequestException as exc:
        print(f"Error: Request failed: {exc}")
        return (None, f"Request failed: {exc}")

    if res_oscar.status_code != 200:
        print(f"Error: Request failed with status code {res_oscar.status_code}")
        return (None, f"Request failed with status code {res_oscar.status_code}")

    soup_oscar = BeautifulSoup(res_oscar.content, 'html.parser')

    tables = soup_oscar.find_all("table", {"class": "wikitable"})
    if not tables:
        print("Error: No tables found with class 'wikitable'.")
        return (None, "No tables found with class 'wikitable'")

    # Select the first table (adjust if necessary)
    table_oscar = tables[0]

    # Lower-cased text of every <th> in the table, in document order.
    columns = [th.getText().strip().lower() for th in table_oscar.find_all("th")]

    # Every desired header must exist: the cleaning below addresses cells by
    # their position in desired_columns, and the DataFrame is built with all
    # of desired_columns as its column labels.
    missing = [col for col in desired_columns if col not in columns]
    if missing:
        message = f"Missing expected columns: {missing}"
        print(f"Error: {message}")
        return (None, message)

    column_indices = [columns.index(col) for col in desired_columns]
    # A row is usable only if it reaches the right-most desired column.
    min_cells = max(column_indices) + 1

    data = []
    ceremony_found = False
    for row in table_oscar.find_all('tr'):
        cells = [td.getText().strip() for td in row.find_all("td")]

        # Start collecting once the row for the first wanted ceremony appears.
        if cells and cells[0].startswith(first_ceremony):
            ceremony_found = True

        if not ceremony_found or len(cells) < min_cells:
            continue

        filtered_cells = [cells[idx] for idx in column_indices]

        # e.g. a cell ending in "... 2000" becomes the integer 2000.
        year = _parse_ceremony_year(filtered_cells[0])
        if year is None:
            continue
        filtered_cells[0] = year

        # Positions 2-4 are director, actor and actress (see desired_columns):
        # drop the parenthesised text that follows each name.
        for position in (2, 3, 4):
            filtered_cells[position] = _strip_parenthetical(filtered_cells[position])

        data.append(tuple(filtered_cells))

    if not data:
        print("No data found starting from ceremony number 72.")
        return (None, "No data found starting from ceremony number 72.")

    df = pd.DataFrame(data, columns=desired_columns)
    return (df, "Success")

# Run the scraper; the result is expected to be a (DataFrame-or-None, message) pair.
oscar_results = extract_oscar_table()

# Dump the raw result for inspection.
print(f"Result type: {type(oscar_results)}")
print("----------------------------------")
print(f"Result content: {oscar_results}")

# Reject anything that is not a 2-tuple before unpacking it.
if not isinstance(oscar_results, tuple) or len(oscar_results) != 2:
    print("Error: The returned result is not a tuple with exactly two elements.")
else:
    df_oscar, status = oscar_results

    # A missing DataFrame means the extraction failed; report the status message.
    if df_oscar is None:
        print(f"Error extracting table: {status}")
    else:
        # Preview the first rows of the extracted table.
        print(df_oscar.head())

Result type: <class 'tuple'>
----------------------------------
Result content: (    ceremonia                                   mejor película  \
0        2000                                  American Beauty   
1        2001                                        Gladiator   
2        2002                                 A Beautiful Mind   
3        2003                                          Chicago   
4        2004    The Lord of the Rings: The Return of the King   
5        2005                              Million Dollar Baby   
6        2006                                            Crash   
7        2007                                     The Departed   
8        2008                           No Country for Old Men   
9        2009                              Slumdog Millionaire   
10       2010                                  The Hurt Locker   
11       2011                                The King's Speech   
12       2012                                       The Artis

In [9]:
# Table: show df_oscar (unpacked from the extract_oscar_table() result in the
# previous cell) as this cell's output.
df_oscar

Unnamed: 0,ceremonia,mejor película,mejor director,mejor actor,mejor actriz
0,2000,American Beauty,S. Mendes,K. Spacey,H. Swank
1,2001,Gladiator,S. Soderbergh,R. Crowe,J. Roberts
2,2002,A Beautiful Mind,R. Howard,D. Washington,H. Berry
3,2003,Chicago,R. Polanski,A. Brody,N. Kidman
4,2004,The Lord of the Rings: The Return of the King,P. Jackson,S. Penn,C. Theron
5,2005,Million Dollar Baby,C. Eastwood,J. Foxx,H. Swank
6,2006,Crash,A. Lee,P. S. Hoffman,R. Witherspoon
7,2007,The Departed,M. Scorsese,F. Whitaker,H. Mirren
8,2008,No Country for Old Men,J. Coen E. Coen,D. Day-Lewis,M. Cotillard
9,2009,Slumdog Millionaire,D. Boyle,S. Penn,K. Winslet
