<a href="https://colab.research.google.com/github/baizhankyzy/female-directors/blob/main/project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Load and inspect data

In [4]:
import requests
import json

# URL to your dataset
url = "https://raw.githubusercontent.com/baizhankyzy/female-directors/refs/heads/main/finaldataset.json"

# Fetch and parse the JSON data.
# The timeout stops the cell from hanging forever on a stalled connection, and
# raise_for_status() reports HTTP errors (404, 500, ...) right here instead of
# letting them surface later as a JSON decode error or a KeyError.
response = requests.get(url, timeout=30)
response.raise_for_status()
data = response.json()  # Convert the JSON content into a Python object

# Navigate to the relevant data: the records live under results -> bindings
items = data['results']['bindings']  # Access the list of film entries

# Inspect the structure
print(type(items))  # Should print <class 'list'>
print(items[:3])    # Preview the first 3 records

<class 'list'>
[{'director': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q55418'}, 'directorLabel': {'xml:lang': 'en', 'type': 'literal', 'value': 'Margarethe von Trotta'}, 'film': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q325355'}, 'filmLabel': {'xml:lang': 'en', 'type': 'literal', 'value': 'The Promise'}, 'releaseDate': {'datatype': 'http://www.w3.org/2001/XMLSchema#dateTime', 'type': 'literal', 'value': '1994-01-01T00:00:00Z'}, 'genreLabel': {'xml:lang': 'en', 'type': 'literal', 'value': 'drama film'}, 'countryLabel': {'xml:lang': 'en', 'type': 'literal', 'value': 'Switzerland'}}, {'director': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q55418'}, 'directorLabel': {'xml:lang': 'en', 'type': 'literal', 'value': 'Margarethe von Trotta'}, 'film': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q325355'}, 'filmLabel': {'xml:lang': 'en', 'type': 'literal', 'value': 'The Promise'}, 'releaseDate': {'datatype': 'http://www.w3.org/2001/XMLSch

## Iterate Over the Data
Loop over the `items` list and pull out the fields of each record to check that they parse as expected:

In [None]:
# Only preview a handful of records: printing every entry of the dataset
# floods the cell output and bloats the saved notebook.
preview_count = 5

for item in items[:preview_count]:
    # Extract relevant details (these three keys are read unconditionally)
    film = item['film']['value']
    director = item['director']['value']
    release_date = item['releaseDate']['value']
    # Optional fields: fall back to None when the key is missing from a record
    genre_label = item.get('genreLabel', {}).get('value', None)
    country_label = item.get('countryLabel', {}).get('value', None)

    # Print for verification
    print(f"Film: {film}, Director: {director}, Release Date: {release_date}, Genre: {genre_label}, Country: {country_label}")

## RDF Graph creation

In [6]:
from rdflib import Graph, URIRef, Literal, Namespace
from rdflib.namespace import RDF, RDFS

g = Graph()

# Vocabulary terms shared by every record; built once instead of per iteration
film_class = URIRef("http://www.wikidata.org/entity/Q11424")  # Film type
schema_director = URIRef("http://schema.org/director")
schema_release_date = URIRef("http://schema.org/releaseDate")
schema_genre = URIRef("http://schema.org/genre")
schema_country = URIRef("http://schema.org/countryOfOrigin")

for item in items:
    # Create RDF entities
    film_uri = URIRef(item['film']['value'])
    director_uri = URIRef(item['director']['value'])

    # Core triples. Graph is a set of triples, so records that repeat the same
    # film (one row per genre/country combination) do not create duplicates.
    g.add((film_uri, RDF.type, film_class))
    g.add((film_uri, schema_director, director_uri))
    g.add((film_uri, schema_release_date, Literal(item['releaseDate']['value'])))

    # Keep the human-readable names as rdfs:label (with their language tag when
    # the record provides one) so the graph can be queried by title or name
    # rather than only by Wikidata URI.
    for key, subject in (('filmLabel', film_uri), ('directorLabel', director_uri)):
        if key in item:
            label = Literal(item[key]['value'], lang=item[key].get('xml:lang'))
            g.add((subject, RDFS.label, label))

    # Optional descriptive fields: skipped when the key is missing or empty
    for key, predicate in (('genreLabel', schema_genre), ('countryLabel', schema_country)):
        value = item.get(key, {}).get('value')
        if value:
            g.add((film_uri, predicate, Literal(value)))

# Serialize the graph (trailing ';' keeps the returned Graph repr out of the output)
g.serialize("output.rdf", format="xml");

<Graph identifier=N0ca8c3e9fb7349f083ca3bebb72acc94 (<class 'rdflib.graph.Graph'>)>

## Download the RDF File (Google Colab)

In [7]:
# Trigger a browser download of the serialized graph.
# google.colab is only importable inside Colab, so guard the import to keep
# "Restart & Run All" working in a local Jupyter environment as well.
try:
    from google.colab import files
except ImportError:
    print("Not running in Google Colab; skipping browser download of output.rdf")
else:
    files.download("output.rdf")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Stages of EDA

Parsing JSON to RDF

## Filter: films released before 2000

In [None]:
# Query the graph for the ten earliest films released before 2000.
# The graph built in this notebook stores release dates under
# schema:releaseDate (it contains no wdt:P577 triples), so the pattern has to
# use that predicate to match anything. The label is OPTIONAL so a film is
# still listed, by URI, when the graph holds no rdfs:label for it.
query_films_before_2000 = g.query(
    """
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    PREFIX schema: <http://schema.org/>

    SELECT ?film ?filmLabel ?releaseDate
    WHERE {
        ?film schema:releaseDate ?releaseDate .
        OPTIONAL { ?film rdfs:label ?filmLabel . }
        FILTER (STR(?releaseDate) < "2000-01-01T00:00:00Z")
    }
    ORDER BY ?releaseDate
    LIMIT 10
    """
)

# Print the results; an unbound OPTIONAL variable comes back as None
for film, film_label, release_date in query_films_before_2000:
    title = film_label if film_label is not None else film
    print(f"{title} - {release_date}")

Renaissance Man - 1994-01-01T00:00:00Z
The Promise - 1994-01-01T00:00:00Z
Black Beauty - 1994-01-01T00:00:00Z
Priest - 1994-01-01T00:00:00Z
The Little Rascals - 1994-01-01T00:00:00Z
Corrina, Corrina - 1994-01-01T00:00:00Z
I Like It Like That - 1994-01-01T00:00:00Z
Embrace of the Vampire - 1994-01-01T00:00:00Z
Kommt Mausi raus?! - 1994-01-01T00:00:00Z
Dreamplay - 1994-01-01T00:00:00Z
