In [None]:
# Install BeautifulSoup (imported below as `bs4`) into the notebook environment.
!pip install beautifulsoup4



Scrape Nominations

In [None]:
import re
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import pandas as pd
import time

# Root of the site being scraped.
base_url = "https://en.wikipedia.org"

# Scrape results: ceremony number -> {category name -> {"nominations": [...], "winners": [...]}}
awards = {}

# Headers sent with every request. The value is built from adjacent string
# literals: a backslash continuation inside a single literal would embed the
# next line's leading spaces in the header value.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36"
    )
}

def get_suffix(number):
    """Return the English ordinal suffix ("st", "nd", "rd" or "th") for `number`."""
    last_two_digits = number % 100
    # 10-20 (which covers 11th, 12th, 13th) always take "th".
    if last_two_digits >= 10 and last_two_digits <= 20:
        return "th"
    last_digit = number % 10
    if last_digit == 1:
        return "st"
    if last_digit == 2:
        return "nd"
    if last_digit == 3:
        return "rd"
    return "th"

# Bounds for the fetch-with-retry below, so one bad page cannot stall the run.
MAX_FETCH_ATTEMPTS = 5
RETRY_DELAY_SECONDS = 5
REQUEST_TIMEOUT_SECONDS = 30

# Visit the ceremony pages numbered 10 through 97 and fill `awards[i]` with
# {category name: {"nominations": [...], "winners": [...]}} read from the first
# "wikitable" on each page.
for i in range(10, 98):
    suffix = get_suffix(i)
    url = f"https://en.wikipedia.org/wiki/{i}{suffix}_Academy_Awards"
    try:
        response = None
        for attempt in range(1, MAX_FETCH_ATTEMPTS + 1):
            # Send the same headers on every attempt, retries included.
            response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT_SECONDS)
            print(f"Fetching: {url} - Status: {response.status_code}")
            if response.status_code == 200:
                break
            if attempt < MAX_FETCH_ATTEMPTS:
                time.sleep(RETRY_DELAY_SECONDS)
        if response is None or response.status_code != 200:
            print(f"Giving up on {url} after {MAX_FETCH_ATTEMPTS} attempts.")
            continue

        soup = BeautifulSoup(response.content, "html.parser")
        table = soup.find("table", class_="wikitable")
        if not table:
            print("No table found on page.")
            continue

        tds = table.find_all("td")
        awards[i] = {}
        categories = table.find_all("th")
        for index, td in enumerate(tds):
            # Category name comes from the index-matched <th> when the table has
            # header cells, otherwise from the first <b> inside the cell.
            if categories:
                category = categories[index] if index < len(categories) else None
            else:
                category = td.find('b')
            if category is None:
                # Skip just this cell instead of abandoning the whole page.
                print(f"No category heading for cell {index} on {url}; skipping cell.")
                continue

            category_name = category.text
            print(category_name)
            print("-" * 40)
            entry = {
                "nominations": [],
                "winners": []
            }
            awards[i][category_name] = entry

            nominee_list = td.find("ul")
            if not nominee_list:
                continue
            for line in nominee_list.get_text().strip().split('\n'):
                # A double dagger on the line marks the winner.
                is_winner = '‡' in line
                line = line.replace('‡', '').strip()
                if not line:
                    # Blank lines between list items are not nominees.
                    continue
                entry["winners" if is_winner else "nominations"].append(line)
    except Exception as e:
        print(f"Error processing {url}: {e}")
# Flatten the nested `awards` dict into one row per nominee:
# [ceremony number, category, nominee text, won flag].
# Winners are listed after the other nominees of their category and get Won = 1.
awards_data = [
    [ceremony, category_name, nominee, 1 if nominee in details["winners"] else 0]
    for ceremony, per_category in awards.items()
    for category_name, details in per_category.items()
    for nominee in details["nominations"] + details["winners"]
]

# Tabular view of the rows above.
awards_df = pd.DataFrame(awards_data, columns=["iteration", "Category", "Nomination", "Won"])

# Show the result.
print(awards_df)

Fetching: https://en.wikipedia.org/wiki/10th_Academy_Awards - Status: 200
Outstanding Production
----------------------------------------
Best Directing
----------------------------------------
Best Actor
----------------------------------------
Best Actress
----------------------------------------
Best Actor in a Supporting Role
----------------------------------------
Best Actress in a Supporting Role
----------------------------------------
Best Writing (Original Story)
----------------------------------------
Best Writing (Screenplay)
----------------------------------------
Best Short Subject (One-Reel)
----------------------------------------
Best Short Subject (Two-Reel)
----------------------------------------
Best Short Subject (Color)
----------------------------------------
Best Short Subject (Cartoon)
----------------------------------------
Best Music (Scoring)
----------------------------------------
Best Music (Song)
----------------------------------------
Best Sound Re

In [None]:
# Dump the raw `awards` dict for inspection.
print(awards)






```
# This is formatted as code
```

**Scrape Actors and Movies**

In [None]:

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import re
from datetime import datetime

# Root used to turn relative hrefs into absolute URLs.
base_url = "https://en.wikipedia.org"

# Headers sent with every request. The value is built from adjacent string
# literals: a backslash continuation inside a single literal would embed the
# next line's leading spaces in the header value.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36"
    )
}


# Links that have already been processed, so each page is fetched only once.
visited_actors = set()
visited_movies = set()

# Accumulated result dicts, one per successfully parsed page.
actors = []
movies = []


def _month_number(month_str):
    """Return the month number for a full or abbreviated month name, or None."""
    for fmt in ("%B", "%b"):
        try:
            return datetime.strptime(month_str, fmt).month
        except ValueError:
            continue
    return None


def generate_date(text):
    """Extract a date from `text` and return it as a "YYYY-MM-DD" string.

    Formats are tried in this order:
      1. an ISO date already present in the text (YYYY-MM-DD);
      2. "Month D, YYYY" (e.g. "March 28, 1930");
      3. "D Month YYYY" (e.g. "22 June 1932").

    Words that are not month names are skipped rather than raising, so stray
    text such as "age 92, 1932" cannot abort the caller.

    Returns:
        The normalised date string, or `text` unchanged when no date is found.
    """
    haystack = str(text)

    # An ISO date needs no conversion.
    iso_match = re.search(r'(\d{4}-\d{2}-\d{2})', haystack)
    if iso_match:
        return iso_match.group(1)

    # "Month D, YYYY" - take the first candidate whose word is a real month.
    for match in re.finditer(r'([A-Za-z]+)\s*(\d{1,2})\s*,\s*(\d{4})', haystack):
        month_str, day, year = match.groups()
        month_num = _month_number(month_str)
        if month_num is not None:
            return f"{year}-{month_num:02d}-{int(day):02d}"

    # "D Month YYYY" - the lookbehind keeps the day from being the tail of a longer number.
    for match in re.finditer(r'(?<!\d)(\d{1,2})\s+([A-Za-z]+)\s+(\d{4})', haystack):
        day, month_str, year = match.groups()
        month_num = _month_number(month_str)
        if month_num is not None:
            return f"{year}-{month_num:02d}-{int(day):02d}"

    return text




def get_suffix(number):
    """Return the ordinal suffix for `number`: "st", "nd", "rd" or "th"."""
    # Anything ending in 10-20 takes "th" regardless of its last digit.
    ends_in_ten_to_twenty = 10 <= number % 100 <= 20
    by_last_digit = {1: "st", 2: "nd", 3: "rd"}
    return "th" if ends_in_ten_to_twenty else by_last_digit.get(number % 10, "th")


def fetch_page(url, timeout=10):
    """Download `url` and return it parsed with BeautifulSoup.

    Args:
        url: Absolute URL to fetch.
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot hang the crawl.

    Returns:
        A BeautifulSoup document when the response status is 200, otherwise
        None (a non-200 status or any exception is printed and swallowed).
    """
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        if response.status_code == 200:
           print(f"Fetched: {url}")
           return BeautifulSoup(response.content, "html.parser")
        print(f"Failed to fetch {url} - Status: {response.status_code}")
    except Exception as e:
        # Best-effort crawl: report the failure and let the caller skip this page.
        print(f"Error fetching {url}: {e}")
    return None


def extract_actor_info(actor_name, actor_link):
    """Fetch a person's page and pull birth/death details from its infobox.

    The link is recorded in `visited_actors` before fetching, so each link is
    attempted only once. On success the result dict is also appended to the
    module-level `actors` list.

    Args:
        actor_name: Display name taken from the link text.
        actor_link: Absolute URL of the page.

    Returns:
        A dict with keys "name", "DOB", "Birth Country" and "DOD" (each
        defaulting to "N/A"), or None when the link was already visited, the
        page could not be fetched, or the page has no infobox table.
    """
    if actor_link in visited_actors:
        print (f"Skipping {actor_name}- {actor_link} (already visited) ")
        return None
    visited_actors.add(actor_link)
    actor_soup = fetch_page(actor_link)
    if not actor_soup:
        return None
    bio = actor_soup.find("table", class_="infobox")
    if not bio:
      print(f"infobox not found for {actor_name} - {actor_link}")
      return None
    actor_info = {"name": actor_name, "DOB": "N/A", "Birth Country": "N/A", "DOD": "N/A"}
    for field in ["Born", "Died"]:
        row = bio.find("th", string=field)
        if not row:
            continue
        data = row.find_next_sibling("td")
        if not data:
            continue
        # Drop footnote markers so they do not end up in the date or place text
        # (the movie extractor cleans its cells the same way).
        for sup in data.find_all("sup"):
            sup.decompose()
        text = data.get_text(separator=" ", strip=True)
        date = generate_date(text)
        if field == "Born":
            if re.search(r'\d{4}', text):  # A four-digit year means the cell carries a date
                actor_info["DOB"] = date
            # The birthplace, when present, is the last comma-separated piece.
            # A piece containing digits is part of the date (e.g. "22 June 1932
            # (age 92)" or the "1932" of "June 22, 1932"), not a place.
            place = text.split(", ")[-1].strip()
            if place and not re.search(r'\d', place):
                actor_info["Birth Country"] = place
        elif field == "Died":
            actor_info["DOD"] = date
    actors.append(actor_info)
    return actor_info


def extract_movie_info(movie_name, movie_link):
    """Fetch a film's page and pull release details from its infobox.

    The link is recorded in `visited_movies` before fetching, so each link is
    attempted only once. On success the result dict is also appended to the
    module-level `movies` list.

    Args:
        movie_name: Display title taken from the link text.
        movie_link: Absolute URL of the page.

    Returns:
        A dict with keys "Name", "Release date", "Distributed by", "Language"
        and "Running time" (each defaulting to "N/A"; the running time is an
        int number of minutes when it could be parsed), or None when the link
        was already visited, the page could not be fetched, or the page has no
        infobox table.
    """
    if movie_link in visited_movies:
        print (f"Skipping {movie_name}- {movie_link} (already visited) ")
        return None
    visited_movies.add(movie_link)
    movie_soup = fetch_page(movie_link)
    if not movie_soup:
        return None
    info = movie_soup.find("table", class_="infobox")
    if not info:
      print(f"infobox not found for {movie_name} - {movie_link}")
      return None
    movie_data = {"Name": movie_name, "Release date": "N/A", "Distributed by": "N/A", "Language": "N/A", "Running time": "N/A"}
    fields = ["Distributed by", "Release date", "Release dates", "Language", "Running time"]
    for field in fields:
        row = info.find("th", string=field)
        if not row:
            continue
        data = row.find_next_sibling("td")
        if not data:
            continue
        # Remove footnote markers before reading any text from the cell.
        for sup in data.find_all("sup"):
            sup.decompose()
        # When the cell holds a list, only its first item is used.
        list_item = data.find("li")
        text = list_item.get_text(separator=" ", strip=True) if list_item else data.get_text(separator=" ", strip=True)

        if field == "Release dates" or field == "Release date":
            # Both label spellings feed the single "Release date" key.
            date_container = data.find("div", class_="film-date")
            first_date = date_container.find("li") if date_container else None
            if first_date:
                # Prefer an ISO date found anywhere in the item's markup.
                date_match = re.search(r'(\d{4}-\d{2}-\d{2})', str(first_date))
                if date_match:
                    movie_data["Release date"] = date_match.group(1)
                else:
                    movie_data["Release date"] = generate_date(first_date.get_text(separator=" ", strip=True))
            else:
                movie_data["Release date"] = generate_date(data.get_text(separator=" ", strip=True))
        elif field == "Running time":
            # "h:mm:ss" form, e.g. 1:30:00 - the seconds part is discarded.
            running_time_match = re.search(r'(\d{1,2}):(\d{2}):(\d{2})', text)
            if running_time_match:
                hours = int(running_time_match.group(1))
                minutes = int(running_time_match.group(2))
                movie_data["Running time"] = hours * 60 + minutes
            else:
                # Otherwise look for a plain minute count, e.g. "90 min".
                minutes_match = re.search(r'(\d+)\s*min', text)
                if minutes_match:
                    # Stored as int so both branches produce the same type.
                    movie_data["Running time"] = int(minutes_match.group(1))
        else:
            movie_data[field] = text
    movies.append(movie_data)
    return movie_data


if __name__ == "__main__":
    # Substrings of a link that mark it as not being a person's page.
    irrelevant_markers = ["production", "department", "artists", "prod.","(song)"]

    # Crawl the ceremony pages numbered 10 through 97.
    for ceremony in range(10, 98):
        url = f"https://en.wikipedia.org/wiki/{ceremony}{get_suffix(ceremony)}_Academy_Awards"
        soup = fetch_page(url)
        if not soup:
            continue
        table = soup.find("table", class_="wikitable")
        if not table:
            continue

        for td in table.find_all("td"):
            # Plain or bold links directly inside list items are treated as people.
            for anchor in td.select("ul > li > b > a, ul > li > a"):
                actor_name = anchor.text.strip()
                actor_link = urljoin(base_url, anchor["href"])
                lowered_link = actor_link.lower()
                if any(marker in lowered_link for marker in irrelevant_markers):
                    print(f"Skipping: {actor_name} (Irrelevant)")
                    continue
                actor_info = extract_actor_info(actor_name, actor_link)
                if actor_info:
                    print(f"Actor: {actor_info}")

            # Italicised links inside list items are treated as film titles.
            for anchor in td.select("ul > li > i > b > a, ul > li > i > a"):
                movie_name = anchor.text.strip()
                movie_link = urljoin(base_url, anchor["href"])
                movie_info = extract_movie_info(movie_name, movie_link)
                if movie_info:
                    print(f"Movie: {movie_info}")


Fetched: https://en.wikipedia.org/wiki/45th_Academy_Awards
Fetched: https://en.wikipedia.org/wiki/Albert_S._Ruddy
Actor: {'name': 'Albert S. Ruddy', 'DOB': '1930-03-28', 'Birth Country': 'Canada', 'DOD': '2024-05-25'}
Fetched: https://en.wikipedia.org/wiki/Cy_Feuer
Actor: {'name': 'Cy Feuer', 'DOB': '1911-01-15', 'Birth Country': 'U.S.', 'DOD': '2006-05-17'}
Fetched: https://en.wikipedia.org/wiki/John_Boorman
Actor: {'name': 'John Boorman', 'DOB': '1933-01-18', 'Birth Country': 'England', 'DOD': 'N/A'}
Fetched: https://en.wikipedia.org/wiki/Bengt_Forslund
Actor: {'name': 'Bengt Forslund', 'DOB': '1932-06-22', 'Birth Country': '( 1932-06-22 ) 22 June 1932 (age\xa092)', 'DOD': 'N/A'}
Fetched: https://en.wikipedia.org/wiki/Robert_B._Radnitz
infobox not found for Robert B. Radnitz - https://en.wikipedia.org/wiki/Robert_B._Radnitz
Fetched: https://en.wikipedia.org/wiki/Cabaret_(1972_film)
Movie: {'Name': 'Cabaret', 'Release date': '1972-02-13', 'Distributed by': 'Allied Artists', 'Language'

Create the pandas DataFrame for actors.

Remove problematic datapoints.

In [None]:
import pandas as pd

# Link texts collected during scraping that are to be excluded from the actors table.
names_to_remove = ["his play","the book","his novel","his book", "the novel", "the play","Belgium","Israel","20th Century Fox","United Kingdom"]

# Build the frame from the scraped records, drop the excluded names, and
# rename the 'name' column to 'Name'.
actors_df = (
    pd.DataFrame(actors)
    .loc[lambda frame: ~frame['name'].isin(names_to_remove)]
    .rename(columns={'name': 'Name'})
)

actors_df.to_csv('actors.csv', index=False)
print("Actors DataFrame:")
print(actors_df)


Actors DataFrame:
                            Name         DOB Birth Country         DOD
0                   Henry Blanke  1901-12-30       Germany  1981-05-28
1                   Warner Bros.         N/A           N/A         N/A
2                    Leo McCarey  1898-10-03          U.S.  1969-07-05
3                       Columbia         N/A           N/A         N/A
4               Louis D. Lighton  1895-11-25           USA  1963-02-01
...                          ...         ...           ...         ...
6906                 Theo Bialek         N/A           N/A         N/A
6907                 Alex Wuttke         N/A           N/A         N/A
6908                 Simone Coco         N/A           N/A         N/A
6909             Jeff Sutherland         N/A           N/A         N/A
6910  Luc-Ewen Martin-Fenouillet         N/A           N/A         N/A

[6762 rows x 4 columns]


Create the movies DataFrame

and rename its columns

In [None]:
# Build the movies frame from the scraped records and rename its columns.
column_renames = {
    'Name': 'Title',
    'Release date': 'releaseDate',
    'Running time': 'runTime',
    'Distributed by': 'productionCompany'
}
movies_df = pd.DataFrame(movies).rename(columns=column_renames)

print(movies_df)

# Write the renamed frame to CSV.
movies_df.to_csv('movies.csv', index=False)


                                              Title releaseDate  \
0                            The Life of Emile Zola  1937-08-11   
1                                   The Awful Truth  1937-10-21   
2                               Captains Courageous  1937-05-11   
3                                          Dead End  1937-08-27   
4                                    The Good Earth  1937-01-29   
...                                             ...         ...   
4112  Mission: Impossible – Dead Reckoning Part One  2023-06-19   
4113                                       Napoleon  2023-11-14   
4114                                       El Conde  2023-08-31   
4115                                          Golda  2023-02-20   
4116                 Guardians of the Galaxy Vol. 3  2023-04-22   

                                      productionCompany Language runTime  
0                                 Warner Bros. Pictures  English     116  
1                                     Columbi

 See the duplicate Movies

In [None]:
# Flag every row whose (Title, releaseDate) pair occurs more than once.
is_repeated_movie = movies_df.duplicated(subset=['Title', 'releaseDate'], keep=False)
duplicate_movies = movies_df[is_repeated_movie]

# Show the flagged rows.
print("Movies with the same name and release date:")
print(duplicate_movies)

Movies with the same name and release date:
                  Title releaseDate productionCompany Language runTime
1031  Blackboard Jungle  1955-03-20       Loew's Inc.  English     101
1032  Blackboard Jungle  1955-03-20       Loew's Inc.  English     101


Remove the duplicates and update the CSV

In [None]:
# Keep only the first row for each (Title, releaseDate) pair.
is_later_copy = movies_df.duplicated(subset=['Title', 'releaseDate'], keep='first')
movies_no_duplicates = movies_df[~is_later_copy]

# Overwrite movies.csv with the de-duplicated rows.
movies_no_duplicates.to_csv('movies.csv', index=False)

print("Movies without duplicates saved to: movies.csv")

Movies without duplicates saved to: movies.csv


See the duplicate Actors

In [None]:
# Flag every row whose (Name, DOB) pair occurs more than once.
is_repeated_actor = actors_df.duplicated(subset=['Name', 'DOB'], keep=False)
duplicate_actors = actors_df[is_repeated_actor]

# Show the flagged rows.
print("Duplicate actors based on name and DOB:")
print(duplicate_actors)

Duplicate actors based on name and DOB:
                        Name         DOB  Birth Country         DOD
16                 RKO Radio         N/A            N/A         N/A
64                 RKO Radio         N/A            N/A         N/A
94           Elmer A. Raguse  1901-05-09  United States  1972-03-02
178            Alfred Newman  1900-03-17           U.S.  1970-02-17
192               Bob Wright  1914-09-25           U.S.  2005-07-27
...                      ...         ...            ...         ...
6502  Bosnia and Herzegovina         N/A            N/A         N/A
6615         the short story         N/A            N/A         N/A
6680           Tristan Myles         N/A            N/A         N/A
6754            Hold My Hand         N/A            N/A         N/A
6877           Gary A. Rizzo  1972-01-31  United States         N/A

[116 rows x 4 columns]


Remove duplicates and update CSV

In [None]:
# Keep only the first row for each (Name, DOB) pair.
is_later_actor_copy = actors_df.duplicated(subset=['Name', 'DOB'], keep='first')
actors_no_duplicates = actors_df[~is_later_actor_copy]

# Overwrite actors.csv with the de-duplicated rows.
actors_no_duplicates.to_csv('actors.csv', index=False)



Create nominations_df
and the awards CSV


In [None]:
# nominations_df refers to the full frame (iteration, Category, Nomination, Won).
nominations_df = awards_df

# The two-column projection gets its own name. Rebinding `awards_df` here would
# strip the Nomination/Won columns that a later cell reads from it, and would
# make this cell produce a different result when run a second time.
awards_only_df = awards_df.iloc[:, :2]  # Select the first two columns

# Save the projection to CSV; awards_df itself is left untouched.
awards_only_df.to_csv("awards.csv", index=False)

Create the nominations.csv

In [None]:
# Write nominations_df to nominations.csv without the index column.
nominations_df.to_csv ("nominations.csv", index=False)

See the unique categories

In [None]:
# Distinct category names, in order of first appearance.
unique_categories = awards_df['Category'].drop_duplicates().to_numpy()
print(unique_categories)

['Outstanding Production' 'Best Directing' 'Best Actor' 'Best Actress'
 'Best Actor in a Supporting Role' 'Best Actress in a Supporting Role'
 'Best Writing (Original Story)' 'Best Writing (Screenplay)'
 'Best Short Subject (One-Reel)' 'Best Short Subject (Two-Reel)'
 'Best Short Subject (Color)' 'Best Short Subject (Cartoon)'
 'Best Music (Scoring)' 'Best Music (Song)' 'Best Sound Recording'
 'Best Art Direction' 'Best Cinematography' 'Best Film Editing'
 'Best Dance Direction' 'Best Assistant Director'
 'Best Music (Original Score)' 'Best Cinematography (Black-and-White)'
 'Best Cinematography (Color)' 'Best Special Effects'
 'Best Writing (Original Screenplay)'
 'Best Art Direction (Black-and-White)' 'Best Art Direction (Color)'
 'Outstanding Motion Picture' 'Best Documentary (Short Subject)'
 'Best Music (Music Score of a Dramatic Picture)'
 'Best Music (Scoring of a Musical Picture)'
 'Best Writing (Original Motion Picture Story)' 'Best Documentary'
 'Best Music (Music Score of a 

Design a mapping from each award category to a role

In [None]:
from ast import keyword
# Filled in below: category name -> role label.
category_role_mapping = {}

# Substrings that identify each role; a category matches a role when it
# contains any one of that role's keywords.
actor_keywords = ["Actor", "Actress", "Leading Role", "Supporting Role"]
# "Director" is needed so that a category named "Best Director" matches;
# it contains neither "Directing" nor "Assistant Director".
director_keywords = ["Directing", "Director", "Assistant Director"]
writer_keywords = ["Writing", "Screenplay", "Story"]
composer_keywords = ["Music", "Song", "Score", "Scoring"]
sound_visual_keywords = ["Sound", "Visual Effects", "Sound Editing", "Sound Mixing"]
short_doc_keywords = ["Short Film", "Short Subject", "Documentary"]
design_keywords = ["Production Design", "Costume Design", "Makeup", "Art Direction"]
cinematography_editing_keywords = ["Cinematography", "Editing"]
picture_keywords = ["Picture", "Motion Picture", "Production"]
international_keywords = ["Foreign Language Film", "International Feature Film"]
vfx_keywords = ["Special Effects"]

# List of all categories (add more if needed)


def assign_role(category):
    """Return the role label for an award category name.

    The keyword groups are checked in a fixed order and the first group with a
    keyword contained in `category` decides the role; "Unknown" is returned
    when no group matches.
    """
    # Order matters: earlier groups take precedence over later ones.
    role_rules = (
        (actor_keywords, "Actor/Actress"),
        (director_keywords, "Director"),
        (writer_keywords, "Writer"),
        (composer_keywords, "Composer"),
        (sound_visual_keywords, "Sound/Visual Engineer"),
        (short_doc_keywords, "Director/Producer"),
        (design_keywords, "Designer"),
        (cinematography_editing_keywords, "Cinematographer/Editor"),
        (picture_keywords, "Producer"),
        (vfx_keywords, "VFX artist"),
        (international_keywords, "Director/Producer"),
    )
    for group, role_label in role_rules:
        for word in group:
            if word in category:
                return role_label
    return "Unknown"

# Record the role of every distinct category.
for category in unique_categories:
    category_role_mapping[category] = assign_role(category)

# Print the resulting mapping, one "category: role" line per entry.
for category, role in category_role_mapping.items():
    print(f"{category}: {role}")


Outstanding Production: Producer
Best Directing: Director
Best Actor: Actor/Actress
Best Actress: Actor/Actress
Best Actor in a Supporting Role: Actor/Actress
Best Actress in a Supporting Role: Actor/Actress
Best Writing (Original Story): Writer
Best Writing (Screenplay): Writer
Best Short Subject (One-Reel): Director/Producer
Best Short Subject (Two-Reel): Director/Producer
Best Short Subject (Color): Director/Producer
Best Short Subject (Cartoon): Director/Producer
Best Music (Scoring): Composer
Best Music (Song): Composer
Best Sound Recording: Sound/Visual Engineer
Best Art Direction: Designer
Best Cinematography: Cinematographer/Editor
Best Film Editing: Cinematographer/Editor
Best Dance Direction: Unknown
Best Assistant Director: Director
Best Music (Original Score): Composer
Best Cinematography (Black-and-White): Cinematographer/Editor
Best Cinematography (Color): Cinematographer/Editor
Best Special Effects: VFX artist
Best Writing (Original Screenplay): Writer
Best Art Direction

Process the nominations and construct the new csv

In [None]:
# Function to split actors and clean the string
def extract_actors(nomination):
    """Split the people part of a nomination string into individual names.

    Everything from the first whitespace-delimited "for" onward is discarded,
    and the remainder is split on commas and on the word "and".

    Returns:
        A list of stripped, non-empty names. Empty pieces (such as the one
        between ", " and "and" in "A, B, and C") are dropped.
    """
    # Keep only the text before " for ".
    people_part = re.split(r"\s+for\s+", nomination)[0]

    # Split on commas or the standalone word "and", then discard blanks.
    pieces = re.split(r",|\band\b", people_part)
    return [piece.strip() for piece in pieces if piece.strip()]

# Example usage within the processing loop
# First-match lookup tables, built once instead of rescanning the frames for
# every nomination row.
release_date_by_title = (
    movies_df.drop_duplicates(subset="Title", keep="first")
    .set_index("Title")["releaseDate"]
    .to_dict()
)
dob_by_name = (
    actors_df.drop_duplicates(subset="Name", keep="first")
    .set_index("Name")["DOB"]
    .to_dict()
)

processed_nominations = []

# Iterate over nominations_df: it carries the Nomination and Won columns,
# whereas awards_df is reduced to its first two columns by an earlier cell.
for _, row in nominations_df.iterrows():
    nomination = row["Nomination"]
    category = row["Category"]

    # Split the nomination at an en dash followed by a space.
    parts = nomination.split("– ")
    if len(parts) < 2:
        print(f"Skipping invalid nomination format: {nomination}")
        continue

    part1 = parts[0].strip()
    part2 = parts[1].strip()

    # Heuristic: if the second part contains a digit it is taken to be the
    # movie; otherwise the first part is.
    # `nominee_names` is deliberately not called `actors`, which is the
    # module-level list filled by the scraping cell.
    if any(char.isdigit() for char in part2):
        movie_name = part2
        nominee_names = extract_actors(part1)
    else:
        movie_name = part1
        nominee_names = extract_actors(part2)

    release_date = release_date_by_title.get(movie_name, "N/A")
    role = category_role_mapping.get(category, "N/A")

    # One output row per person named in the nomination.
    for nominee in nominee_names:
        processed_nominations.append({
            "iteration": row["iteration"],
            "Category": category,
            "Nomination": nomination,
            "Won": row["Won"],
            "Movie": movie_name,
            "Movie Release Date": release_date,
            "Actor": nominee,
            "Actor DOB": dob_by_name.get(nominee, "N/A"),
            "Role": role
        })

# Convert to DataFrame
processed_nominations_df = pd.DataFrame(processed_nominations)

# Display the result
print(processed_nominations_df)
processed_nominations_df.to_csv("processed_nominations.csv",index=False)

Skipping invalid nomination format: The Last Bomb
Skipping invalid nomination format: The True Glory
Skipping invalid nomination format: Hitler Lives
Skipping invalid nomination format: Library of Congress
Skipping invalid nomination format: To the Shores of Iwo Jima
Skipping invalid nomination format: Donald's Crime - Walt Disney
Skipping invalid nomination format: Jasper and the Beanstalk  - George Pal
Skipping invalid nomination format: Life with Feathers - Edward Selzer
Skipping invalid nomination format: Mighty Mouse in Gypsy Life - Paul Terry
Skipping invalid nomination format: The Poet and Peasant - Walter Lantz
Skipping invalid nomination format: Rippling Romance - Ray Katz
Skipping invalid nomination format: Quiet Please! - Fred Quimby
Skipping invalid nomination format: Atomic Power
Skipping invalid nomination format: Life at the Zoo
Skipping invalid nomination format: Paramount News Issue #37
Skipping invalid nomination format: Traffic with the Devil
Skipping invalid nominat

**This last cell was used to find the longest values of selected attributes, so that the schema's column sizes could be kept as small as possible.**



In [None]:
# Find the largest actor name
largest_actor_name = actors_df['Birth Country'].apply(len).idxmax()
largest_actor_name_value = actors_df.loc[largest_actor_name, 'Birth Country']

# Find the largest movie title
largest_movie_name = movies_df['Language'].apply(len).idxmax()
largest_movie_name_value = movies_df.loc[largest_movie_name, 'Language']

len(largest_actor_name_value), len(largest_movie_name_value)

(78, 40)