In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import re

In [8]:
def _year_from_cell(text):
    """Return the last standalone four-digit number in ``text`` as an int, or None."""
    # Searching for the digits (instead of slicing the last four characters)
    # keeps working when the cell ends with a footnote marker or other suffix.
    years = re.findall(r"(?<!\d)\d{4}(?!\d)", text)
    return int(years[-1]) if years else None


def _drop_parenthetical(text):
    """Return ``text`` up to its first '(' with surrounding whitespace removed."""
    return text.split('(')[0].strip()


def extract_oscar_table(url_oscars="https://es.wikipedia.org/wiki/Premios_%C3%93scar",
                        start_ceremony=72, timeout=10):
    """Scrape the first 'wikitable' of the Oscars page into a DataFrame.

    Rows are collected from the first row whose leading ``<td>`` text starts
    with ``str(start_ceremony)`` onwards. For each kept row the ceremony cell
    is reduced to its year, and the director / actor / actress cells are cut
    at the first '(' so only the name remains.

    Parameters
    ----------
    url_oscars : str
        Page to download. Defaults to the Spanish Wikipedia article.
    start_ceremony : int or str
        Ceremony number at which extraction starts (prefix match on the first
        cell of each row). Defaults to 72.
    timeout : float
        Seconds passed to ``requests.get`` so a stalled connection cannot hang
        the notebook.

    Returns
    -------
    tuple
        ``(DataFrame, "Success")`` with ``ceremony_year`` as index and the
        columns ``best_movie``, ``best_director``, ``best_actor``,
        ``best_actress``; or ``(None, message)`` describing why nothing could
        be extracted (network error, non-200 status, no table, missing
        columns, no matching rows).
    """
    # (header text on the page, column name in the resulting DataFrame)
    wanted = (
        ('ceremonia', 'ceremony_year'),
        ('mejor película', 'best_movie'),
        ('mejor director', 'best_director'),
        ('mejor actor', 'best_actor'),
        ('mejor actriz', 'best_actress'),
    )

    # Network problems are reported through the same (None, message) channel
    # as every other failure instead of escaping as an exception.
    try:
        res_oscar = requests.get(url_oscars, timeout=timeout)
    except requests.RequestException as exc:
        message = f"Request failed: {exc}"
        print(f"Error: {message}")
        return (None, message)

    if res_oscar.status_code != 200:
        print(f"Error: Request failed with status code {res_oscar.status_code}")
        return (None, f"Request failed with status code {res_oscar.status_code}")

    soup_oscar = BeautifulSoup(res_oscar.content, 'html.parser')
    tables = soup_oscar.find_all("table", {"class": "wikitable"})
    if not tables:
        print("Error: No tables found with class 'wikitable'.")
        return (None, "No tables found with class 'wikitable'")

    # Only the first wikitable is inspected.
    table_oscar = tables[0]
    columns = [th.get_text().strip().lower() for th in table_oscar.find_all("th")]

    # Every wanted header must be present: the row handling below relies on
    # all five positions, so a partial match cannot be processed safely.
    missing = [header for header, _ in wanted if header not in columns]
    if missing:
        message = f"Expected columns not found in table: {missing}"
        print(f"Error: {message}")
        return (None, message)

    column_indices = [columns.index(header) for header, _ in wanted]
    highest_index = max(column_indices)
    start_prefix = str(start_ceremony)

    data = []
    ceremony_found = False
    for row in table_oscar.find_all('tr'):
        cells = [td.get_text().strip() for td in row.find_all("td")]

        # Once the starting ceremony has been seen, every later row qualifies.
        if cells and cells[0].startswith(start_prefix):
            ceremony_found = True
        if not ceremony_found:
            continue

        # Skip rows too short to contain the right-most wanted column.
        if len(cells) <= highest_index:
            continue

        ceremony, movie, *people = (cells[idx] for idx in column_indices)

        # Rows whose ceremony cell carries no year are skipped rather than
        # aborting the whole extraction.
        ceremony_year = _year_from_cell(ceremony)
        if ceremony_year is None:
            continue

        data.append((ceremony_year, movie, *map(_drop_parenthetical, people)))

    if not data:
        message = f"No data found starting from ceremony number {start_ceremony}."
        print(message)
        return (None, message)

    df = pd.DataFrame(data, columns=[name for _, name in wanted])
    return (df.set_index('ceremony_year'), "Success")

# Run the scraper and keep whatever it hands back (expected: a 2-tuple).
oscar_results = extract_oscar_table()

# Report the raw result before attempting to unpack it.
print(f"Result type: {type(oscar_results)}")
print("----------------------------------")
print(f"Result content: {oscar_results}")

# Guard first: anything other than a two-element tuple cannot be unpacked.
if not (isinstance(oscar_results, tuple) and len(oscar_results) == 2):
    print("Error: The returned result is not a tuple with exactly two elements.")
else:
    # Split the pair into the table (or None) and its status message.
    df_oscar, status = oscar_results

    if df_oscar is None:
        # Extraction failed; the status message says why.
        print(f"Error extracting table: {status}")
    else:
        # Preview the first rows of the extracted table.
        print(df_oscar.head())

Result type: <class 'tuple'>
----------------------------------
Result content: (                                                    best_movie  \
ceremony_year                                                    
2000                                           American Beauty   
2001                                                 Gladiator   
2002                                          A Beautiful Mind   
2003                                                   Chicago   
2004             The Lord of the Rings: The Return of the King   
2005                                       Million Dollar Baby   
2006                                                     Crash   
2007                                              The Departed   
2008                                    No Country for Old Men   
2009                                       Slumdog Millionaire   
2010                                           The Hurt Locker   
2011                                         The King's Speec

In [9]:
# Show the Oscars table unpacked into df_oscar in the previous cell; as the
# last expression of the cell it is rendered as the cell's output.
df_oscar

Unnamed: 0_level_0,best_movie,best_director,best_actor,best_actress
ceremony_year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2000,American Beauty,S. Mendes,K. Spacey,H. Swank
2001,Gladiator,S. Soderbergh,R. Crowe,J. Roberts
2002,A Beautiful Mind,R. Howard,D. Washington,H. Berry
2003,Chicago,R. Polanski,A. Brody,N. Kidman
2004,The Lord of the Rings: The Return of the King,P. Jackson,S. Penn,C. Theron
2005,Million Dollar Baby,C. Eastwood,J. Foxx,H. Swank
2006,Crash,A. Lee,P. S. Hoffman,R. Witherspoon
2007,The Departed,M. Scorsese,F. Whitaker,H. Mirren
2008,No Country for Old Men,J. Coen E. Coen,D. Day-Lewis,M. Cotillard
2009,Slumdog Millionaire,D. Boyle,S. Penn,K. Winslet


In [10]:
# Write the Oscars table to data/list_oscars_BS.csv (relative path). With the
# to_csv defaults the ceremony_year index is written as the first column.
# NOTE(review): assumes the data/ directory already exists and that df_oscar
# is a DataFrame rather than None — confirm before running on a fresh checkout.
df_oscar.to_csv("data/list_oscars_BS.csv")