<a href="https://colab.research.google.com/github/baizhankyzy/female-directors/blob/main/Project_new.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install rdflib


Collecting rdflib
  Downloading rdflib-7.1.1-py3-none-any.whl.metadata (11 kB)
Collecting isodate<1.0.0,>=0.7.2 (from rdflib)
  Downloading isodate-0.7.2-py3-none-any.whl.metadata (11 kB)
Downloading rdflib-7.1.1-py3-none-any.whl (562 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m562.4/562.4 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading isodate-0.7.2-py3-none-any.whl (22 kB)
Installing collected packages: isodate, rdflib
Successfully installed isodate-0.7.2 rdflib-7.1.1


RDF GRaph

In [None]:
from rdflib import ConjunctiveGraph, URIRef, Literal, Namespace
from rdflib.namespace import RDF, RDFS
from urllib.parse import quote
import requests
from google.colab import files

# Fetch the JSON data
url = "https://raw.githubusercontent.com/baizhankyzy/female-directors/refs/heads/main/dataset_final.json"
response = requests.get(url)
data = response.json()['results']['bindings']

# Initialize the graph
g = ConjunctiveGraph()

# Define namespaces
wd = Namespace("http://www.wikidata.org/entity/")
wdt = Namespace("http://www.wikidata.org/prop/direct/")
wikidata = Namespace("http://www.wikidata.org/")

# Add data to the graph
for item in data:
    film = URIRef(item['film']['value'])
    director = URIRef(item['director']['value'])
    release_year = Literal(item['releaseYear']['value'], datatype=RDFS.Literal)
    film_label = Literal(item['filmLabel']['value'], lang='en')
    director_label = Literal(item['directorLabel']['value'], lang='en')

    genre_label = item.get('genreLabel', {}).get('value', "Unknown Genre")
    genre_uri = URIRef(f"http://www.wikidata.org/genre/{quote(genre_label)}")

    country_label = item.get('countryLabel', {}).get('value', "Unknown Country")
    country_uri = URIRef(f"http://www.wikidata.org/country/{quote(country_label)}")

    # Avoid blank nodes by always ensuring explicit URIs or Literals
    if isinstance(film, URIRef) and isinstance(director, URIRef):
        # Add RDF triples
        g.add((film, wdt.P31, wd.Q11424))  # Film type
        g.add((film, RDFS.label, film_label))
        g.add((film, wdt.P57, director))
        g.add((film, wdt.P577, release_year))
        g.add((film, wdt.P136, genre_uri))
        g.add((film, wdt.P19, country_uri))
        g.add((director, RDFS.label, director_label))
        g.add((genre_uri, RDFS.label, Literal(genre_label, lang='en')))
        g.add((country_uri, RDFS.label, Literal(country_label, lang='en')))

# Serialize the graph
output_file = "director_graph.nq"
g.serialize(output_file, format="nquads")

# Download the fixed file
files.download(output_file)
print(f"Data saved as {output_file} and ready for download.")

  g = ConjunctiveGraph()


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Data saved as director_graph.nq and ready for download.


# Loading the graph

In [None]:
from rdflib import ConjunctiveGraph, URIRef, Literal, Namespace
from rdflib.namespace import RDF, RDFS
from urllib.parse import quote
import requests
from google.colab import files

g = ConjunctiveGraph()
g.parse("director_graph_final.nq", format="nquads")

  g = ConjunctiveGraph()


FileNotFoundError: [Errno 2] No such file or directory: '/director_graph.nq'

Download graph from github to Google Colab

In [14]:
from rdflib import ConjunctiveGraph, URIRef, Literal, Namespace
from rdflib.namespace import RDF, RDFS
import requests

# GitHub raw URL for the .nq file
github_raw_url = "https://raw.githubusercontent.com/baizhankyzy/female-directors/refs/heads/main/director_graph_final.nq"

# Download the nquads file from GitHub
nquads_file = "director_graph_final.nq"
response = requests.get(github_raw_url)

if response.status_code == 200:
    with open(nquads_file, 'wb') as file:
        file.write(response.content)
    print(f"File {nquads_file} downloaded successfully.")
else:
    print(f"Failed to download file: {response.status_code}")
    response.raise_for_status()

# Load the graph
g = ConjunctiveGraph()
try:
    g.parse(nquads_file, format="nquads")
    print("Graph loaded successfully.")
except Exception as e:
    print(f"Error loading graph: {e}")

# Now you can query the graph


File director_graph_final.nq downloaded successfully.


  g = ConjunctiveGraph()


Graph loaded successfully.


## Research questions
### How has the number of women directors in feature films changed over the last 30 years, including fluctuations during specific periods and potential external factors such as societal events or industry shifts?




In [15]:
# Iterate through each year from 1994 to 2024 and count films
print("Year\tNumber of Films")
for year in range(1994, 2025):
    # Construct the query dynamically for each year
    query = f"""
    SELECT (COUNT(?film) AS ?film_count)
    WHERE {{
      ?film <http://www.wikidata.org/prop/direct/P31> <http://www.wikidata.org/entity/Q11424> .
      ?film <http://www.wikidata.org/prop/direct/P577> "{year}"^^<http://www.w3.org/2000/01/rdf-schema#Literal> .
    }}
    """

    # Execute the query
    results = g.query(query)

    # Print the result for the current year
    for row in results:
        print(f"{year}\t{row['film_count']}")

Year	Number of Films
1994	224
1995	247
1996	242
1997	255
1998	292
1999	305
2000	313
2001	364
2002	371
2003	389
2004	438
2005	502
2006	553
2007	586
2008	620
2009	697
2010	614
2011	707
2012	773
2013	827
2014	824
2015	844
2016	1448
2017	897
2018	886
2019	865
2020	627
2021	712
2022	788
2023	706
2024	468


## **2nd research question**



```
# Which countries have the highest and lowest numbers of women directors in feature films?
```



In [16]:
from rdflib.plugins.sparql import prepareQuery

# Prepare the SPARQL query to count directors per country
query = prepareQuery("""
PREFIX wd: <http://www.wikidata.org/entity/>
PREFIX wdt: <http://www.wikidata.org/prop/direct/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>

SELECT ?countryLabel (COUNT(DISTINCT ?director) AS ?director_count)
WHERE {
  # Instance of film
  ?film wdt:P31 wd:Q11424 .

  # Directed by a director
  ?film wdt:P57 ?director .

  # Country of origin
  ?film wdt:P19 ?country .
  ?country rdfs:label ?countryLabel . # Get country label
  FILTER(LANG(?countryLabel) = "en") # English label
  FILTER(?countryLabel != "Unknown Country") # Exclude Unknown Country
}
GROUP BY ?countryLabel
ORDER BY DESC(?director_count)
""")

# Execute the query
results = g.query(query)

# Print the results
print("Country\tNumber of Women Directors")
for row in results:
    print(f"{row['countryLabel']}\t{row['director_count']}")




Country	Number of Women Directors
Unknown Country	4552
United States of America	833
Germany	630
France	424
Canada	249
Spain	227
United Kingdom	174
Italy	171
India	143
Sweden	126
Russia	118
Switzerland	107
Israel	103
Japan	101
Brazil	99
Poland	96
Argentina	91
Denmark	85
South Korea	75
Netherlands	74
People's Republic of China	71
Czech Republic	70
Austria	70
Iran	68
Australia	68
Finland	57
Turkey	55
Mexico	54
Norway	50
Hungary	48
Belgium	47
Ukraine	46
Portugal	40
Indonesia	38
Peru	37
New Zealand	33
Bulgaria	30
Lebanon	29
Nigeria	29
Serbia	28
Romania	27
Colombia	26
Bosnia and Herzegovina	25
Morocco	24
Chile	20
Taiwan	19
Pakistan	18
Bangladesh	18
State of Palestine	18
Kenya	18
Greece	17
Croatia	17
Georgia	17
Estonia	17
Venezuela	16
Jordan	16
Ireland	15
Egypt	15
Slovakia	15
South Africa	15
Uruguay	15
Mandatory Palestine	15
Kingdom of Denmark	14
Latvia	13
Roman Palestine	13
Kingdom of Jerusalem	13
early Islamic period in Palestine	13
Occupied Enemy Territory Administration	13
Ayyubid dynasty

In [12]:
query = prepareQuery("""
SELECT ?film ?country
WHERE {
  ?film <http://www.wikidata.org/country/> ?country .
}
""")

### 3d Research question

What is the relationship between independent cinema and the rise of women directors in feature films?

In [None]:
print("Year\tGenre\tNumber of Women Directors")

for year in range(1994, 2025):
    # Construct the query dynamically for each year
    query = f"""
    PREFIX wd: <http://www.wikidata.org/entity/>
    PREFIX wdt: <http://www.wikidata.org/prop/direct/>
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>

    SELECT ?genreLabel (COUNT(DISTINCT ?director) AS ?director_count)
    WHERE {{
      # Instance of film
      ?film wdt:P31 wd:Q11424 .

      # Directed by a director
      ?film wdt:P57 ?director .

      # Release year matches the current year
      ?film wdt:P577 "{year}"^^<http://www.w3.org/2000/01/rdf-schema#Literal> .

      # Genre classification
      ?film wdt:P136 ?genre .
      ?genre rdfs:label ?genreLabel .

      # Focus on independent cinema and other genres
      FILTER(LANG(?genreLabel) = "en")
    }}
    GROUP BY ?genreLabel
    ORDER BY DESC(?director_count)
    """

    # Execute the query
    results = g.query(query)

    # Print the results for the current year
    for row in results:
        print(f"{year}\t{row['genreLabel']}\t{row['director_count']}")



Year	Genre	Number of Women Directors
1994	drama film	79
1994	documentary film	45
1994	Unknown Genre	35
1994	comedy film	25
1994	comedy drama	13
1994	LGBT-related film	11
1994	romance film	9
1994	children's film	6
1994	biographical film	6
1994	action film	5
1994	thriller film	5
1994	film based on a novel	4
1994	fantasy film	4
1994	experimental film	4
1994	erotic film	4
1994	horror film	4
1994	romantic comedy	4
1994	teen film	3
1994	compilation film	3
1994	adventure film	2
1994	crime film	2
1994	Christmas film	2
1994	queer film	1
1994	science fiction comedy	1
1994	anthology film	1
1994	historical film	1
1994	concert film	1
1994	comedy horror	1
1994	crossover fiction	1
1994	romantic drama	1
1994	coming-of-age fiction	1
1994	erotic thriller	1
1994	silent film	1
1994	buddy film	1
1994	ghost film	1
1994	cyberpunk	1
1994	science fiction film	1
1994	vampire film	1
1994	slapstick	1
1994	dance film	1
1994	road movie	1
1994	vigilante film	1
1994	prison film	1
1994	family film	1
1994	animated film