<a href="https://colab.research.google.com/github/baizhankyzy/female-directors/blob/main/Project_new.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Our Goal

To discover trends in the role of female directors in cinema by analyzing their activity across different countries and examining movie databases for specific periods of study. This will include exploring the role of independent cinema as a pathway for women directors. The research will uncover patterns of representation, contributions, and shifts over time.

Converting JSON file from Wikidata to n-quads format graph

In [None]:
!pip install rdflib
from rdflib import ConjunctiveGraph, URIRef, Literal, Namespace
from rdflib.namespace import RDF, RDFS
from urllib.parse import quote
import requests
from google.colab import files

# Fetch the JSON data
url = "https://raw.githubusercontent.com/baizhankyzy/female-directors/refs/heads/main/dataset_final.json"
response = requests.get(url)
data = response.json()['results']['bindings']

# Initialize the graph
g = ConjunctiveGraph()

# Define namespaces
wd = Namespace("http://www.wikidata.org/entity/")
wdt = Namespace("http://www.wikidata.org/prop/direct/")
wikidata = URIRef("http://example.org/graph/wikidata")  # Explicitly set graph name

# Add data to the graph
for item in data:
    film = URIRef(item['film']['value'])
    director = URIRef(item['director']['value'])
    release_year = Literal(item['releaseYear']['value'], datatype=RDFS.Literal)
    film_label = Literal(item['filmLabel']['value'], lang='en')
    director_label = Literal(item['directorLabel']['value'], lang='en')

    genre_label = item.get('genreLabel', {}).get('value', "Unknown Genre")
    genre_uri = URIRef(f"http://www.wikidata.org/genre/{quote(genre_label)}")

    country_label = item.get('countryOfBirthLabel', {}).get('value', "Unknown Country")
    country_uri = URIRef(f"http://www.wikidata.org/country/{quote(country_label)}")

    # Avoid blank nodes by always ensuring explicit URIs or Literals
    if isinstance(film, URIRef) and isinstance(director, URIRef):
        # Add RDF quads with the graph name
        g.add((film, wdt.P31, wd.Q11424, wikidata))  # Film type
        g.add((film, RDFS.label, film_label, wikidata))
        g.add((film, wdt.P57, director, wikidata))
        g.add((film, wdt.P577, release_year, wikidata))
        g.add((film, wdt.P136, genre_uri, wikidata))
        g.add((film, wdt.P19, country_uri, wikidata))
        g.add((director, RDFS.label, director_label, wikidata))
        g.add((genre_uri, RDFS.label, Literal(genre_label, lang='en'), wikidata))
        g.add((country_uri, RDFS.label, Literal(country_label, lang='en'), wikidata))

# Serialize the graph to N-Quads
output_file = "wikidata_graph.nq"
g.serialize(output_file, format="nquads")

# Download the file
files.download(output_file)
print(f"Data saved as {output_file} and ready for download.")

  g = ConjunctiveGraph()


KeyboardInterrupt: 

# Loading the wikidata graph

Download wikidata graph from github to Google Colab

In [11]:
!pip install rdflib
from rdflib import ConjunctiveGraph, URIRef, Literal, Namespace
from rdflib.namespace import RDF, RDFS
import requests

# GitHub raw URL for the .nq file
github_raw_url = "https://raw.githubusercontent.com/baizhankyzy/female-directors/refs/heads/main/wikidata_graph.nq"

# Download the nquads file from GitHub
nquads_file = "wikidata_graph.nq"
response = requests.get(github_raw_url)

if response.status_code == 200:
    with open(nquads_file, 'wb') as file:
        file.write(response.content)
    print(f"File {nquads_file} downloaded successfully.")
else:
    print(f"Failed to download file: {response.status_code}")
    response.raise_for_status()

# Load the graph
g = ConjunctiveGraph()
try:
    g.parse(nquads_file, format="nquads")
    print("Graph loaded successfully.")
except Exception as e:
    print(f"Error loading graph: {e}")

# Now you can query the graph


Collecting rdflib
  Downloading rdflib-7.1.3-py3-none-any.whl.metadata (11 kB)
Downloading rdflib-7.1.3-py3-none-any.whl (564 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m564.9/564.9 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rdflib
Successfully installed rdflib-7.1.3
File wikidata_graph.nq downloaded successfully.



ConjunctiveGraph is deprecated, use Dataset instead.



Graph loaded successfully.


## Research question 1
### How has the number of films directed by women changed over the last 30 years?



**Filtering data for research question**

In [12]:
# Iterate through each year from 1994 to 2024 and count films
print("Year\tNumber of Films")
for year in range(1994, 2025):
    # Construct the query dynamically for each year
    query = f"""
    SELECT (COUNT(?film) AS ?film_count)
    WHERE {{
      ?film <http://www.wikidata.org/prop/direct/P31> <http://www.wikidata.org/entity/Q11424> .
      ?film <http://www.wikidata.org/prop/direct/P577> "{year}"^^<http://www.w3.org/2000/01/rdf-schema#Literal> .
    }}
    """

    # Execute the query
    results = g.query(query)

    # Print the result for the current year
    for row in results:
        print(f"{year}\t{row['film_count']}")

Year	Number of Films
1994	224
1995	247
1996	242
1997	255
1998	292
1999	305
2000	313
2001	364
2002	371
2003	389
2004	438
2005	502
2006	553
2007	586
2008	620
2009	697
2010	614
2011	707
2012	773
2013	827
2014	824
2015	844
2016	1448
2017	897
2018	886
2019	865
2020	627
2021	712
2022	788
2023	706
2024	468


Visualisation

In [13]:
# Placeholder for years and film counts
years = []
film_counts = []

# Iterate through each year from 1994 to 2024 and count films
print("Year\tNumber of Films")
for year in range(1994, 2025):
    # Construct the query dynamically for each year
    query = f"""
    SELECT (COUNT(?film) AS ?film_count)
    WHERE {{
      ?film <http://www.wikidata.org/prop/direct/P31> <http://www.wikidata.org/entity/Q11424> .
      ?film <http://www.wikidata.org/prop/direct/P577> "{year}"^^<http://www.w3.org/2000/01/rdf-schema#Literal> .
    }}
    """

    # Execute the query
    results = g.query(query)

    # Extract the results into the lists
    for row in results:
        years.append(year)
        film_counts.append(int(row['film_count']))

# Import the Plotly library for visualization
import plotly.graph_objects as go

# Create a line chart using Plotly
fig = go.Figure()

# Add a line trace with hover information
fig.add_trace(go.Scatter(
    x=years,
    y=film_counts,
    mode='lines+markers',  # Includes both lines and markers
    name='Number of Films',
    hovertemplate='<b>Year</b>: %{x}<br><b>Number of Films</b>: %{y}<extra></extra>'  # Custom hover text
))

# Customize the layout
fig.update_layout(
    title="Number of Films Released Each Year (1994–2024)",
    xaxis_title="Year",
    yaxis_title="Number of Films",
    template="plotly_white",  # Clean aesthetic
    hovermode="x",  # Hover aligned to x-axis
    xaxis=dict(showgrid=True, tickmode='linear', dtick=1),  # Show all years with gridlines
    yaxis=dict(showgrid=True),
)

# Enable zooming and panning
fig.update_layout(dragmode='pan')  # Default drag behavior for zoom/pan

# Display the figure
fig.show()

Year	Number of Films


Download the html file

In [14]:
# Save the chart as an HTML file
output_file = "number_of_films_chart_rq1.html"
fig.write_html(output_file)
print(f"Interactive chart saved as '{output_file}'")

# Download the HTML file to the local system
files.download(output_file)

Interactive chart saved as 'number_of_films_chart_rq1.html'


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

The most popular genres

In [18]:
def get_top_genres(year):
    query = f"""
    SELECT ?genre (COUNT(?film) AS ?film_count)
    WHERE {{
      ?film <http://www.wikidata.org/prop/direct/P31> <http://www.wikidata.org/entity/Q11424> . # Movies only
      ?film <http://www.wikidata.org/prop/direct/P136> ?genre . # Genre of the movie
      ?film <http://www.wikidata.org/prop/direct/P577> ?date . # Release date
      FILTER(STRSTARTS(STR(?date), "{year}")) # Match year as a prefix
    }}
    GROUP BY ?genre
    ORDER BY DESC(?film_count)
    LIMIT 5
    """

    # Execute the query
    results = g.query(query)

    # Extract genres and their counts
    genres = []
    counts = []
    for row in results:
        genres.append(str(row['genre'].split('/')[-1]))  # Extract genre name from URI
        counts.append(int(row['film_count']))

    return genres, counts


Show

In [19]:
# Fetch data for each year
genres_2015, counts_2015 = get_top_genres(2015)
print("Top 5 Genres for 2015:")
for genre, count in zip(genres_2015, counts_2015):
    print(f"Genre: {genre}, Count: {count}")

genres_2016, counts_2016 = get_top_genres(2016)
print("\nTop 5 Genres for 2016:")
for genre, count in zip(genres_2016, counts_2016):
    print(f"Genre: {genre}, Count: {count}")

genres_2017, counts_2017 = get_top_genres(2017)
print("\nTop 5 Genres for 2017:")
for genre, count in zip(genres_2017, counts_2017):
    print(f"Genre: {genre}, Count: {count}")

Top 5 Genres for 2015:
Genre: drama%20film, Count: 272
Genre: documentary%20film, Count: 251
Genre: Unknown%20Genre, Count: 118
Genre: comedy%20film, Count: 77
Genre: romance%20film, Count: 39

Top 5 Genres for 2016:
Genre: documentary%20film, Count: 845
Genre: drama%20film, Count: 317
Genre: Unknown%20Genre, Count: 133
Genre: comedy%20film, Count: 98
Genre: biographical%20film, Count: 93

Top 5 Genres for 2017:
Genre: drama%20film, Count: 306
Genre: documentary%20film, Count: 282
Genre: Unknown%20Genre, Count: 117
Genre: comedy%20film, Count: 97
Genre: LGBT-related%20film, Count: 37


Visualisation of top 2 genres

In [22]:
import plotly.graph_objects as go

# Fetch data for each year dynamically (only top 2 genres for each year)
genres_2015, counts_2015 = get_top_genres(2015)
genres_2016, counts_2016 = get_top_genres(2016)
genres_2017, counts_2017 = get_top_genres(2017)

# Select only the top 2 genres for each year
genres_2015, counts_2015 = genres_2015[:2], counts_2015[:2]
genres_2016, counts_2016 = genres_2016[:2], counts_2016[:2]
genres_2017, counts_2017 = genres_2017[:2], counts_2017[:2]

# Ensure genres are human-readable (replace %20 with spaces)
genres_2015 = [genre.replace('%20', ' ') for genre in genres_2015]
genres_2016 = [genre.replace('%20', ' ') for genre in genres_2016]
genres_2017 = [genre.replace('%20', ' ') for genre in genres_2017]

# Combine genres for consistent ordering across years
all_genres = list(set(genres_2015 + genres_2016 + genres_2017))

# Create a dictionary to map genres to counts for each year
data_2015 = {genre: count for genre, count in zip(genres_2015, counts_2015)}
data_2016 = {genre: count for genre, count in zip(genres_2016, counts_2016)}
data_2017 = {genre: count for genre, count in zip(genres_2017, counts_2017)}

# Fill in missing genres with count 0 for consistency
counts_2015 = [data_2015.get(genre, 0) for genre in all_genres]
counts_2016 = [data_2016.get(genre, 0) for genre in all_genres]
counts_2017 = [data_2017.get(genre, 0) for genre in all_genres]

# Create a grouped bar chart with Plotly
fig = go.Figure()

# Add bars for each genre (grouped by year)
for i, genre in enumerate(all_genres):
    fig.add_trace(go.Bar(
        x=['2015', '2016', '2017'],  # Years on the x-axis
        y=[counts_2015[i], counts_2016[i], counts_2017[i]],  # Counts for the genre
        name=genre,  # Genre as legend entry
    ))

# Customize layout
fig.update_layout(
    title="Top 2 Genres by Year (2015, 2016, 2017)",
    xaxis_title="Year",
    yaxis_title="Number of Movies",
    barmode='group',  # Grouped bar chart
    xaxis=dict(
        title_font=dict(size=14),
        tickfont=dict(size=12)
    ),
    yaxis=dict(
        title_font=dict(size=14),
        tickfont=dict(size=12)
    ),
    legend=dict(
        title="Genre",
        font=dict(size=12)
    ),
    plot_bgcolor='rgba(240,240,240,0.8)'  # Light background for better visibility
)

# Show the interactive chart
fig.show()


Download HTML

In [23]:
# Save the chart as an HTML file
output_file = "top_genres_rq1.html"
fig.write_html(output_file)
print(f"Interactive chart saved as '{output_file}'")

# Download the HTML file to the local system
files.download(output_file)

Interactive chart saved as 'top_genres_rq1.html'


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

#RQ2
**Which countries have the highest and lowest numbers of women directors ?**

Loading Wikidata graph

In [None]:
from rdflib import ConjunctiveGraph, URIRef, Literal, Namespace
from rdflib.namespace import RDF, RDFS
import requests

# GitHub raw URL for the .nq file
github_raw_url = "https://raw.githubusercontent.com/baizhankyzy/female-directors/refs/heads/main/wikidata_graph.nq"

# Download the nquads file from GitHub
nquads_file = "wikidata_graph.nq"
response = requests.get(github_raw_url)

if response.status_code == 200:
    with open(nquads_file, 'wb') as file:
        file.write(response.content)
    print(f"File {nquads_file} downloaded successfully.")
else:
    print(f"Failed to download file: {response.status_code}")
    response.raise_for_status()

# Load the graph
g = ConjunctiveGraph()
try:
    g.parse(nquads_file, format="nquads")
    print("Graph loaded successfully.")
except Exception as e:
    print(f"Error loading graph: {e}")



File wikidata_graph.nq downloaded successfully.
Graph loaded successfully.


**Filter**

In [24]:
from rdflib.plugins.sparql import prepareQuery

# Prepare the SPARQL query to count directors per country
query = prepareQuery("""
PREFIX wd: <http://www.wikidata.org/entity/>
PREFIX wdt: <http://www.wikidata.org/prop/direct/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>

SELECT ?countryLabel (COUNT(DISTINCT ?director) AS ?director_count)
WHERE {
  # Instance of film
  ?film wdt:P31 wd:Q11424 .

  # Directed by a director
  ?film wdt:P57 ?director .

  # Country of origin
  ?film wdt:P19 ?country .
  ?country rdfs:label ?countryLabel . # Get country label
  FILTER(LANG(?countryLabel) = "en") # English label
  FILTER(?countryLabel != "Unknown Country") # Exclude Unknown Country
}
GROUP BY ?countryLabel
ORDER BY DESC(?director_count)
""")

# Execute the query
results = g.query(query)

# Print the results
print("Country\tNumber of Women Directors")
for row in results:
    print(f"{row['countryLabel']}\t{row['director_count']}")




Country	Number of Women Directors
Unknown Country	4552
United States of America	833
Germany	630
France	424
Canada	249
Spain	227
United Kingdom	174
Italy	171
India	143
Sweden	126
Russia	118
Switzerland	107
Israel	103
Japan	101
Brazil	99
Poland	96
Argentina	91
Denmark	85
South Korea	75
Netherlands	74
People's Republic of China	71
Czech Republic	70
Austria	70
Australia	68
Iran	68
Finland	57
Turkey	55
Mexico	54
Norway	50
Hungary	48
Belgium	47
Ukraine	46
Portugal	40
Indonesia	38
Peru	37
New Zealand	33
Bulgaria	30
Nigeria	29
Lebanon	29
Serbia	28
Romania	27
Colombia	26
Bosnia and Herzegovina	25
Morocco	24
Chile	20
Taiwan	19
Pakistan	18
State of Palestine	18
Kenya	18
Bangladesh	18
Georgia	17
Greece	17
Estonia	17
Croatia	17
Jordan	16
Venezuela	16
Slovakia	15
Egypt	15
South Africa	15
Mandatory Palestine	15
Uruguay	15
Ireland	15
Kingdom of Denmark	14
Ayyubid dynasty	13
Hasmonean dynasty	13
Occupied Enemy Territory Administration	13
Roman Palestine	13
early Islamic period in Palestine	13
Kingdom o

Visualisation pie chart (without Unknown country)

In [33]:
import plotly.express as px
import pandas as pd

# Execute the query
results = g.query(query)

# Extract the data from the results, skipping the first row
countries = []
director_counts = []

# Use enumerate to skip the first row (index 0)
for idx, row in enumerate(results):
    if idx == 0:  # Skip the first row
        continue

    country = row['countryLabel']
    count = int(row['director_count'])

    countries.append(country)
    director_counts.append(count)

# Create a DataFrame for easier handling
data = pd.DataFrame({'Country': countries, 'Director Count': director_counts})

# Group countries with less than 70 directors as "Others"
threshold = 70
others_count = data.loc[data['Director Count'] < threshold, 'Director Count'].sum()
data = data.loc[data['Director Count'] >= threshold]

# Add the "Others" category using pd.concat
if others_count > 0:
    others_row = pd.DataFrame({'Country': ['Others(Countries with < 70 Directors)'], 'Director Count': [others_count]})
    data = pd.concat([data, others_row], ignore_index=True)

# Visualize the data as a pie chart
fig = px.pie(
    data,
    names='Country',
    values='Director Count',
    title="Distribution of Directors by Country",
    hole=0.4  # Donut-style chart
)

# Customize the chart
fig.update_traces(
    textinfo='percent+label',  # Show percentage and label
    pull=[0.1 if count == max(data['Director Count']) else 0 for count in data['Director Count']],  # Emphasize the largest slice
)
fig.update_layout(
    title_font=dict(size=24),
    legend_title="Country",
    legend=dict(font=dict(size=12)),
    width=1000,  # Increase chart width
    height=800   # Increase chart height
)

# Show the chart
fig.show()




Download the chart

In [34]:
# Save the chart as an HTML file
output_file = "pie_chart_rq2.html"
fig.write_html(output_file)
print(f"Interactive chart saved as '{output_file}'")

# Download the HTML file to the local system
files.download(output_file)

Interactive chart saved as 'pie_chart_rq2.html'


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Number of directors on the map

In [41]:
import plotly.express as px
import pandas as pd

# Execute the query
results = g.query(query)

# Extract the country names and director counts, starting from the second row to skip "Unknown Country"
countries = []
director_counts = []

# Use enumerate to start from the second row
for idx, row in enumerate(results):
    if idx == 0:  # Skip the first row ("Unknown Country")
        continue

    country = row['countryLabel']
    count = int(row['director_count'])

    countries.append(country)
    director_counts.append(count)

# Create a DataFrame for easier handling
data = pd.DataFrame({'Country': countries, 'Director Count': director_counts})

# Visualize the data as a choropleth map
fig = px.choropleth(
    data,
    locations='Country',
    locationmode='country names',  # Use country names for location matching
    color='Director Count',
    title="Distribution of Directors by Country",
    color_continuous_scale='Viridis',  # Color scale
)

# Customize the layout
fig.update_layout(
    geo=dict(showcoastlines=True, coastlinecolor="Black", projection_type="natural earth"),
    title_font=dict(size=24),
    width=1000,  # Increase chart width
    height=800   # Increase chart height
)

# Show the chart
fig.show()

Download

In [42]:
# Save the chart as an HTML file
output_file = "map_rq2.html"
fig.write_html(output_file)
print(f"Interactive chart saved as '{output_file}'")

# Download the HTML file to the local system
files.download(output_file)

Interactive chart saved as 'map_rq2.html'


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

### RQ3 What is the ratio of awards won to nominations across years?

1. Extracting all the names of the directors from the graph

In [None]:
# Load RDF data (replace 'data.rdf' with your RDF file or URL)
g.parse("wikidata_graph.nq")

# Define the SPARQL query with appropriate prefixes
query = """
PREFIX wdt: <http://www.wikidata.org/prop/direct/>
PREFIX wd: <http://www.wikidata.org/entity/>

SELECT DISTINCT (STR(?directorName) AS ?name)  # Use DISTINCT to avoid duplicates in the query results
WHERE {
  ?film wdt:P31 wd:Q11424 .  # Film
  ?film wdt:P57 ?director .  # Director
  ?director rdfs:label ?directorName .
}
"""

# Execute the query
results = g.query(query)

# Create a list of director names (allowing duplicates from query results initially)
director_names_list = [str(row.name) for row in results]

# Remove duplicates from the list by converting it to a set and back to a list
director_names_list = list(dict.fromkeys(director_names_list))  # Preserves order while removing duplicates

# Count and print the number of unique directors
num_directors = len(director_names_list)
print(f"Total number of unique directors: {num_directors}")

# Check for duplicates again
if len(director_names_list) == len(set(director_names_list)):
    print("No duplicates found in the list.")
else:
    print("Duplicates found in the list.")

# Print the list of directors (optional)
print("List of unique directors:")
print(director_names_list)

Total number of unique directors: 9258
No duplicates found in the list.
List of unique directors:
['Leslie Zemeckis', 'Mireia Gabilondo', 'Halitha shameem', 'Gigi Gaston', 'Yukiko Mishima', 'Caroline Sascha Cogez', 'Maria Essén', 'Bonnie Rotten', 'Marsia Tzivara', 'Rachel Reichman', 'Colombe Schneck', 'Kanika Verma', 'Linda-Maria Birbeck', 'Ghyslaine Côté', 'Susanne Müller', 'Ginger Gentile', 'Sandra Fernández Ferreira', 'Liu Jiayin', 'Maja Classen', 'Silja Somby', 'Pilar Palomero', 'Jennifer Elster', 'Sandrine Veysset', 'Antoinette Jadaone', 'Inés Toharia Terán', 'Ulrike Ottinger', 'Brigitte Maria Mayer', 'Sólveig Anspach', 'Mariana Chenillo', 'Houda Benyamina', 'Mandie Fletcher', 'Marie-Louise Asseu', 'Susana Nobre', 'Chiara Edmands', 'April Mullen', 'Alina Marazzi', 'Mélanie Carrier', 'Marta Ferencová', 'Emily Atef', 'Eva Dahr', 'İlksen Başarır', 'Natalia Bodiul', 'Gina Kim', 'Marisa Sistach', 'Sophie Fillières', 'Naomi Kawase', 'Isabel Kleefeld', 'Omoni Oboli', 'Maria Martinelli', 

2. Search these names in IMDb

In [None]:
import requests
from bs4 import BeautifulSoup
import csv
import os

# Headers to mimic a browser request
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}

# Function to search IMDb and extract the director's IMDb page
def search_imdb_for_director(director_name):
    try:
        search_url = f"https://www.imdb.com/find/?q={'+'.join(director_name.split())}&s=nm"
        response = requests.get(search_url, headers=headers, timeout=10)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, "html.parser")
        result = soup.find('li', class_='ipc-metadata-list-summary-item ipc-metadata-list-summary-item--click find-result-item find-name-result')
        if result:
            link = result.find('a', class_='ipc-metadata-list-summary-item__t')
            if link and 'href' in link.attrs:
                return {"Director": director_name, "IMDb Page": "https://www.imdb.com" + link['href']}
        return {"Director": director_name, "IMDb Page": "Not Found"}
    except requests.exceptions.RequestException as e:
        print(f"Network error for {director_name}: {e}")
        return {"Director": director_name, "IMDb Page": "Not Found"}

# Function to save results to a CSV file
def save_progress_to_csv(results, filename="directors_imdb_pages.csv"):
    # Check if the file already exists to append data
    file_exists = os.path.exists(filename)
    with open(filename, "a" if file_exists else "w", newline="", encoding="utf-8") as file:
        writer = csv.DictWriter(file, fieldnames=["Director", "IMDb Page"])
        if not file_exists:
            writer.writeheader()  # Write header only if the file is new
        writer.writerows(results)

# Function to process a chunk of directors
def process_chunk(director_chunk, filename="directors_imdb_pages.csv"):
    chunk_results = []
    for director in director_chunk:
        director = director.strip()  # Clean up any extra spaces or formatting
        imdb_page = search_imdb_for_director(director)
        result = {"Director": director, "IMDb Page": imdb_page if imdb_page else "Not Found"}
        chunk_results.append(result)
        print(f"Director: {result['Director']}, IMDb Page: {result['IMDb Page']}")
    save_progress_to_csv(chunk_results, filename)

# Example director list (replace with your actual data)
# director_names_list = [...]

# Split the list into smaller chunks (e.g., 50 names per chunk)
chunk_size = 50
chunks = [director_names_list[i:i + chunk_size] for i in range(0, len(director_names_list), chunk_size)]

# Process each chunk separately and save progress
csv_filename = "directors_imdb_pages.csv"
for i, chunk in enumerate(chunks):
    print(f"Processing chunk {i + 1} of {len(chunks)}...")
    process_chunk(chunk, csv_filename)
    print(f"Chunk {i + 1} saved to {csv_filename}")

[1;30;43mВыходные данные были обрезаны до нескольких последних строк (5000).[0m
Director: Andrea Blaugrund Nevins, IMDb Page: {'Director': 'Andrea Blaugrund Nevins', 'IMDb Page': 'https://www.imdb.com/name/nm0087895/?ref_=fn_nme_nme_1'}
Director: Chiu Li-Kwan, IMDb Page: {'Director': 'Chiu Li-Kwan', 'IMDb Page': 'https://www.imdb.com/name/nm15663885/?ref_=fn_nme_nme_1'}
Director: Marie Poledňáková, IMDb Page: {'Director': 'Marie Poledňáková', 'IMDb Page': 'https://www.imdb.com/name/nm0688949/?ref_=fn_nme_nme_1'}
Director: Michela Andreozzi, IMDb Page: {'Director': 'Michela Andreozzi', 'IMDb Page': 'https://www.imdb.com/name/nm2594938/?ref_=fn_nme_nme_1'}
Director: Stefanie Sycholt, IMDb Page: {'Director': 'Stefanie Sycholt', 'IMDb Page': 'https://www.imdb.com/name/nm0842955/?ref_=fn_nme_nme_1'}
Director: Lala Gomà Presas, IMDb Page: {'Director': 'Lala Gomà Presas', 'IMDb Page': 'Not Found'}
Director: Ewa Pięta, IMDb Page: {'Director': 'Ewa Pięta', 'IMDb Page': 'https://www.imdb.com/n

Download CSV

In [None]:
from google.colab import files

# Specify the file to download
filename = "directors_imdb_pages.csv"

# Download the file
files.download(filename)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Cleaning the file

In [None]:
import csv
import requests

# URL of the raw CSV file
url = "https://raw.githubusercontent.com/baizhankyzy/female-directors/refs/heads/main/directors_imdb_pages.csv"

# Download the CSV file
response = requests.get(url)
response.raise_for_status()  # Raise an exception if the download fails

# Decode the content into a string format
csv_content = response.content.decode("utf-8").splitlines()

# Prepare the output file name
output_file = "directors_imdb_pages_cleaned.csv"

# Open the CSV content and process it
with open(output_file, "w", newline="", encoding="utf-8") as outfile:
    reader = csv.DictReader(csv_content)
    writer = csv.writer(outfile)

    # Write the header
    writer.writerow(["Director", "IMDb Page"])

    for row in reader:
        # Parse the IMDb Page to extract only the URL
        imdb_page = eval(row["IMDb Page"])["IMDb Page"]  # Safely extract IMDb Page using `eval`

        # Write the cleaned row
        writer.writerow([row["Director"], imdb_page])

print(f"Cleaned data saved to '{output_file}'")

Cleaned data saved to 'directors_imdb_pages_cleaned.csv'


Print 10 rows

In [None]:
import csv

# Name of the output file
output_file = "directors_imdb_pages_cleaned.csv"

# Opening the file and reading the content
with open(output_file, "r", encoding="utf-8") as file:
    reader = csv.reader(file)

    # Read the header
    header = next(reader)
    print("Header:", header)

    # Print the first 10 rows
    print("\nFirst 10 rows:")
    for i, row in enumerate(reader):
        print(row)
        if i == 9:
            break



Header: ['Director', 'IMDb Page']

First 10 rows:
['Leslie Zemeckis', 'https://www.imdb.com/name/nm0366667/?ref_=fn_nme_nme_1']
['Mireia Gabilondo', 'https://www.imdb.com/name/nm1481127/?ref_=fn_nme_nme_1']
['Halitha shameem', 'https://www.imdb.com/name/nm6274646/?ref_=fn_nme_nme_1']
['Gigi Gaston', 'https://www.imdb.com/name/nm0309456/?ref_=fn_nme_nme_1']
['Yukiko Mishima', 'https://www.imdb.com/name/nm3446966/?ref_=fn_nme_nme_1']
['Caroline Sascha Cogez', 'https://www.imdb.com/name/nm0169032/?ref_=fn_nme_nme_1']
['Maria Essén', 'https://www.imdb.com/name/nm0261345/?ref_=fn_nme_nme_1']
['Bonnie Rotten', 'https://www.imdb.com/name/nm6720874/?ref_=fn_nme_nme_1']
['Marsia Tzivara', 'https://www.imdb.com/name/nm3103786/?ref_=fn_nme_nme_1']
['Rachel Reichman', 'https://www.imdb.com/name/nm0717116/?ref_=fn_nme_nme_1']


Download new csv

In [None]:
from google.colab import files

# Specify the file to download
filename = "directors_imdb_pages_cleaned.csv"

# Download the file
files.download(filename)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Upload cleaned file from github



In [None]:
import pandas as pd

# URL of the CSV file
url = "https://raw.githubusercontent.com/baizhankyzy/female-directors/refs/heads/main/directors_imdb_pages_cleaned.csv"

# Load the CSV file into a Pandas DataFrame
try:
    directors_data = pd.read_csv(url)
    print("File successfully loaded!")
    print(directors_data.head())  # Display the first 5 rows as a preview
except Exception as e:
    print(f"An error occurred: {e}")

File successfully loaded!
           Director                                          IMDb Page
0   Leslie Zemeckis  https://www.imdb.com/name/nm0366667/?ref_=fn_n...
1  Mireia Gabilondo  https://www.imdb.com/name/nm1481127/?ref_=fn_n...
2   Halitha shameem  https://www.imdb.com/name/nm6274646/?ref_=fn_n...
3       Gigi Gaston  https://www.imdb.com/name/nm0309456/?ref_=fn_n...
4    Yukiko Mishima  https://www.imdb.com/name/nm3446966/?ref_=fn_n...


Scrape awards of the directors

In [None]:
import os
import requests
import pandas as pd
import time
import random
from bs4 import BeautifulSoup

# File and URL details
csv_file = "directors_imdb_pages_cleaned.csv"
csv_url = "https://raw.githubusercontent.com/baizhankyzy/female-directors/refs/heads/main/directors_imdb_pages_cleaned.csv"

# Download the CSV file if it does not exist locally
if not os.path.exists(csv_file):
    print("Downloading the CSV file...")
    try:
        response = requests.get(csv_url, timeout=10)
        response.raise_for_status()  # Ensure the request was successful
        with open(csv_file, "wb") as file:
            file.write(response.content)
        print("File downloaded successfully.")
    except requests.exceptions.RequestException as e:
        print(f"Error downloading the CSV file: {e}")
        exit()

# Load the cleaned CSV file with director names and IMDb URLs
data = pd.read_csv(csv_file)

# Verify the structure of the CSV file
print("Loaded data:")
print(data.head())  # Should have 'Director' and 'IMDb Page' columns

# Prepare to collect awards data
output_file = "directors_awards_all.csv"
awards_data = []

# Headers for requests, including specifying the language as English
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Accept-Language": "en-US,en;q=0.9"
}

# Split the data into chunks of 100 directors each
chunk_size = 100
chunks = [data[i:i + chunk_size] for i in range(0, len(data), chunk_size)]

# Process each chunk
for chunk_idx, chunk in enumerate(chunks, start=1):
    print(f"Processing chunk {chunk_idx} of {len(chunks)}...")

    # Process each director in the chunk
    for _, row in chunk.iterrows():
        director_name = row['Director']
        imdb_url = row['IMDb Page']

        # Skip if the IMDb URL is "Not Found"
        if imdb_url == "Not Found":
            print(f"Skipping {director_name} because IMDb URL is 'Not Found'.")
            continue

        # Extract the IMDb ID from the URL
        try:
            imdb_id = imdb_url.split('/name/')[1].split('/')[0]
        except IndexError:
            print(f"Invalid IMDb URL for {director_name}. Skipping.")
            continue

        # Construct the awards URL
        awards_url = f"https://www.imdb.com/name/{imdb_id}/awards/"

        print(f"Scraping awards for: {director_name} ({awards_url})")

        try:
            # Fetch the awards page
            response = requests.get(awards_url, headers=headers, timeout=10)
            response.raise_for_status()

            # Parse the page
            soup = BeautifulSoup(response.content, "html.parser")
            award_entries = soup.find_all('li', class_='ipc-metadata-list-summary-item')

            # Process each award entry
            for award in award_entries:
                year_and_outcome = award.find('a', class_='ipc-metadata-list-summary-item__t')
                year_and_outcome_text = year_and_outcome.text.strip() if year_and_outcome else "N/A"

                award_name = award.find('span', class_='ipc-metadata-list-summary-item__tst')
                award_name_text = award_name.text.strip() if award_name else "N/A"

                category = award.find('span', class_='ipc-metadata-list-summary-item__li awardCategoryName')
                category_text = category.text.strip() if category else "N/A"

                associated_work = award.find('a', class_='ipc-metadata-list-summary-item__li ipc-metadata-list-summary-item__li--link')
                associated_work_text = associated_work.text.strip() if associated_work else "N/A"

                # Append the data to the list
                awards_data.append({
                    "Director": director_name,
                    "Year & Outcome": year_and_outcome_text,
                    "Award Name": award_name_text,
                    "Category": category_text,
                    "Associated Work": associated_work_text
                })
        except requests.exceptions.RequestException as e:
            print(f"Error fetching awards for {director_name}: {e}")

        # Random delay to mimic human behavior (helps avoid blocking)
        time.sleep(random.uniform(1, 3))

    # Save the awards data after each chunk
    if chunk_idx == 1:
        # Overwrite file for the first chunk
        pd.DataFrame(awards_data).to_csv(output_file, index=False, encoding="utf-8")
    else:
        # Append to the file for subsequent chunks
        pd.DataFrame(awards_data).to_csv(output_file, mode='a', index=False, encoding="utf-8", header=False)

    # Clear memory after saving the chunk
    awards_data = []

    # Print awards data only for the first chunk
    if chunk_idx == 1:
        print("Sample awards data from the first chunk:")
        print(pd.read_csv(output_file).head())

print(f"All awards data saved to {output_file}")


# Opening the file and reading the content
with open(output_file, "r", encoding="utf-8") as file:
    reader = csv.reader(file)

    # Read the header
    header = next(reader)
    print("Header:", header)

    # Print the first 10 rows
    print("\nFirst 20 rows:")
    for i, row in enumerate(reader):
        print(row)
        if i == 19:
            break




[1;30;43mВыходные данные были обрезаны до нескольких последних строк (5000).[0m
Scraping awards for: Tilde Harkamp (https://www.imdb.com/name/nm0363087/awards/)
Scraping awards for: Deborah Chow (https://www.imdb.com/name/nm1278887/awards/)
Scraping awards for: Joanna Angel (https://www.imdb.com/name/nm0744884/awards/)
Skipping Ksenia Ratushnaya because IMDb URL is 'Not Found'.
Scraping awards for: Lola Amaria (https://www.imdb.com/name/nm1146909/awards/)
Scraping awards for: Mahshid Afsharzadeh (https://www.imdb.com/name/nm0996324/awards/)
Scraping awards for: Anusha Rizvi (https://www.imdb.com/name/nm3470556/awards/)
Scraping awards for: Eliane Caffé (https://www.imdb.com/name/nm0128464/awards/)
Scraping awards for: Sabine Krayenbühl (https://www.imdb.com/name/nm0470483/awards/)
Scraping awards for: Zeva Oelbaum (https://www.imdb.com/name/nm3572143/awards/)
Scraping awards for: Emma Rozanski (https://www.imdb.com/name/nm3143391/awards/)
Scraping awards for: Khrystyna Syvolap (https

NameError: name 'csv' is not defined

Check values

In [None]:
import csv
output_file = "directors_awards_all.csv"

 # Opening the file and reading the content
with open(output_file, "r", encoding="utf-8") as file:
    reader = csv.reader(file)
 # Read the header
    header = next(reader)
    print("Header:", header)

    # Print the first 10 rows
    print("\nFirst 20 rows:")
    for i, row in enumerate(reader):
        print(row)
        if i == 19:
            break

[1;30;43mВыходные данные были обрезаны до нескольких последних строк (5000).[0m
['Inês de Medeiros', '1999 Nominee Crystal Star', 'Crystal Star', 'Best European Short', 'Senhor Jerónimo']
['Inês de Medeiros', '1999 Nominee European Film Award', 'European Film Award', 'European Short Film', 'Senhor Jerónimo']
['Inês de Medeiros', '1997 Winner Golden Globe', 'Golden Globe', 'Best Actress', 'Pandora']
['Inês de Medeiros', '1996 Nominee Golden Globe', 'Golden Globe', 'Best Actress', 'Down to Earth']
['Inês de Medeiros', '1998 Nominee Best Film Award', 'Best Film Award', 'Best Short Fiction Film', 'Senhor Jerónimo']
['Inês de Medeiros', '2007 Winner Audience Award', 'Audience Award', 'Best Medium Length Film', 'Cartas a Uma Ditadura']
['Inês de Medeiros', '1998 Winner Grand Prize', 'Grand Prize', 'Best Film', 'Senhor Jerónimo']
['Inês de Medeiros', '1999 Nominee Grand Prix', 'Grand Prix', 'European', 'Senhor Jerónimo']
['Katja Colja', '2019 Nominee Golden Goblet', 'Golden Goblet', 'Best F

Download file with awards

In [None]:
from google.colab import files

# Specify the file to download
filename = "directors_awards_all.csv"

# Download the file
files.download(filename)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Uploading the awards from Github

In [None]:
import pandas as pd

# URL of the CSV file
url = "https://raw.githubusercontent.com/baizhankyzy/female-directors/refs/heads/main/directors_awards_all.csv"

# Load the CSV file into a Pandas DataFrame
try:
    directors_data = pd.read_csv(url)
    print("File successfully loaded!")
    print(directors_data.head())  # Display the first 5 rows as a preview
except Exception as e:
    print(f"An error occurred: {e}")

File successfully loaded!
          Director                         Year & Outcome  \
0  Leslie Zemeckis                 2012 Nominee Gold Hugo   
1  Leslie Zemeckis  2012 Winner Hollywood Discovery Award   
2  Leslie Zemeckis                2017 Nominee Jury Award   
3  Leslie Zemeckis                  2017 Winner IFS Award   
4  Leslie Zemeckis               2018 Winner Best Of Fest   

                  Award Name               Category  \
0                  Gold Hugo       Best Documentary   
1  Hollywood Discovery Award       Best Documentary   
2                 Jury Award  Best Documentary Film   
3                  IFS Award       Best Documentary   
4               Best Of Fest            Feature Doc   

               Associated Work  
0               Bound by Flesh  
1               Bound by Flesh  
2  Mabel, Mabel, Tiger Trainer  
3  Mabel, Mabel, Tiger Trainer  
4  Mabel, Mabel, Tiger Trainer  


Awards vs Nominations for each year visualisation

In [55]:
import pandas as pd
import plotly.graph_objects as go

# URL of the CSV file
url = "https://raw.githubusercontent.com/baizhankyzy/female-directors/refs/heads/main/directors_awards_all.csv"

# Load the CSV file into a Pandas DataFrame
try:
    directors_data = pd.read_csv(url)
    print("File successfully loaded!")
    print(directors_data.head())  # Display the first 5 rows as a preview
except Exception as e:
    print(f"An error occurred: {e}")

# Ensure columns are clean and in the expected format
directors_data['Year & Outcome'] = directors_data['Year & Outcome'].str.strip()

# Extract the year and outcome (Winner/Nominee) from the "Year & Outcome" column
directors_data['Year'] = directors_data['Year & Outcome'].str.extract(r'(\d{4})')  # Extract the year (4 digits)
directors_data['Outcome'] = directors_data['Year & Outcome'].apply(lambda x: 'Winner' if 'Winner' in x else 'Nominee')

# Filter data for the years between 1994 and 2024
filtered_data = directors_data[(directors_data['Year'] >= '1994') & (directors_data['Year'] <= '2024')]

# Group by year and outcome to calculate counts
yearly_awards = filtered_data.groupby(['Year', 'Outcome']).size().reset_index(name='Count')

# Pivot data to create separate columns for 'Winner' and 'Nominee'
pivot_yearly = yearly_awards.pivot(index='Year', columns='Outcome', values='Count').fillna(0)

# Add a column for Total Nominations (Winner + Nominee)
pivot_yearly['Total Nominations'] = pivot_yearly['Winner'] + pivot_yearly['Nominee']

# Calculate win ratio as a percentage
pivot_yearly['Win Ratio (%)'] = (pivot_yearly['Winner'] / pivot_yearly['Total Nominations']) * 100

# Ensure the years are sorted
pivot_yearly = pivot_yearly.sort_index()

# Create an interactive stacked bar chart with Plotly
fig = go.Figure()

# Add bars for "Winner"
fig.add_trace(go.Bar(
    x=pivot_yearly.index,
    y=pivot_yearly['Winner'],
    name='Winner',
    marker_color='blue',
    hovertext=[
        f"Year: {year}<br>Wins: {wins}<br>Total Nominations: {total}<br>Win Ratio: {ratio:.2f}%"
        for year, wins, total, ratio in zip(
            pivot_yearly.index,
            pivot_yearly['Winner'],
            pivot_yearly['Total Nominations'],
            pivot_yearly['Win Ratio (%)']
        )
    ],
    hoverinfo="text"  # Hover info only
))

# Add bars for the remaining "Total Nominations" (excluding Winner)
fig.add_trace(go.Bar(
    x=pivot_yearly.index,
    y=pivot_yearly['Total Nominations'] - pivot_yearly['Winner'],  # Nominee count
    name='Nominee',
    marker_color='orange',
    hovertext=[
        f"Year: {year}<br>Wins: {wins}<br>Total Nominations: {total}<br>Win Ratio: {ratio:.2f}%"
        for year, wins, total, ratio in zip(
            pivot_yearly.index,
            pivot_yearly['Winner'],
            pivot_yearly['Total Nominations'],
            pivot_yearly['Win Ratio (%)']
        )
    ],
    hoverinfo="text"  # Hover info only
))

# Update layout for better presentation
fig.update_layout(
    title='Awards vs. Nominations by Year (1994–2024)',
    xaxis_title='Year',
    yaxis_title='Count',
    barmode='stack',  # Use stacked bars for visualization
    legend_title='Outcome',
    template='plotly_white',
    xaxis=dict(tickangle=45),
    showlegend=True
)

# Display the chart in the notebook
fig.show()


An error occurred: HTTP Error 404: Not Found


Download html

In [56]:

# Save the chart as an HTML file
output_file = "awards_vs_nominations_chart_rq3.html"
fig.write_html(output_file)
print(f"Interactive chart saved as '{output_file}'")

# Download the HTML file to the local system
files.download(output_file)

Interactive chart saved as 'awards_vs_nominations_chart_rq3.html'


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>