<a href="https://colab.research.google.com/github/baizhankyzy/female-directors/blob/main/project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Stages of EDA

In [9]:
import requests
import json
import rdflib
from rdflib import Namespace, URIRef, Literal
from rdflib.namespace import RDF, RDFS

# URL of the raw JSON file on GitHub
url = 'https://raw.githubusercontent.com/baizhankyzy/female-directors/refs/heads/main/data1.json'  # Replace with the actual URL

# Fetch the JSON file
response = requests.get(url)
if response.status_code == 200:
    data = response.json()  # Parse JSON data
else:
    raise Exception(f"Failed to fetch JSON file. HTTP Status Code: {response.status_code}")

# Bind namespaces
wd = Namespace("http://www.wikidata.org/entity/")
wdt = Namespace("http://www.wikidata.org/prop/direct/")
art = Namespace("https://w3id.org/artchives/")

# Create an empty graph
g = rdflib.Graph()

# Process the JSON data
for entry in data:
    # Extract variables
    director = URIRef(entry['director'])
    film = URIRef(entry['film'])
    release_date = Literal(entry['releaseDate'])
    director_label = Literal(entry['directorLabel'])
    film_label = Literal(entry['filmLabel'])

    # Add triples to the graph
    g.add((director, RDF.type, wd['Q5']))  # Assuming director is a person (Q5 in Wikidata)
    g.add((director, RDFS.label, director_label))
    g.add((director, wdt['P57'], film))  # P57 = directed film
    g.add((film, RDFS.label, film_label))
    g.add((film, wdt['P577'], release_date))  # P577 = publication date

# Serialize the graph to check the results
print(g.serialize(format='turtle'))

@prefix ns1: <http://www.wikidata.org/prop/direct/> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .

<http://www.wikidata.org/entity/Q100265400> a <http://www.wikidata.org/entity/Q5> ;
    rdfs:label "Debs Paterson" ;
    ns1:P57 <http://www.wikidata.org/entity/Q100268366>,
        <http://www.wikidata.org/entity/Q4689603> .

<http://www.wikidata.org/entity/Q100272989> a <http://www.wikidata.org/entity/Q5> ;
    rdfs:label "Lauriane Escaffre" ;
    ns1:P57 <http://www.wikidata.org/entity/Q114081248> .

<http://www.wikidata.org/entity/Q100276107> a <http://www.wikidata.org/entity/Q5> ;
    rdfs:label "Krischka Stoffels" ;
    ns1:P57 <http://www.wikidata.org/entity/Q58950315> .

<http://www.wikidata.org/entity/Q100348028> a <http://www.wikidata.org/entity/Q5> ;
    rdfs:label "Marta Pessoa" ;
    ns1:P57 <http://www.wikidata.org/entity/Q112684634> .

<http://www.wikidata.org/entity/Q100415> a <http://www.wikidata.org/entity/Q5> ;
    rdfs:label "Saralisa Volm" ;
    ns1:P57 <h

In [16]:
# Query the graph
query_films_before_2000 = g.query(
    """
    SELECT ?filmLabel ?releaseDate
    WHERE {
        ?film rdfs:label ?filmLabel ;
              <http://www.wikidata.org/prop/direct/P577> ?releaseDate .
        FILTER (STR(?releaseDate) < "2000-01-01T00:00:00Z")
    }
    ORDER BY ?releaseDate
    """
)

# Print the results
for film_label, release_date in query_films_before_2000:
    print(f"{film_label} - {release_date}")

Renaissance Man - 1994-01-01T00:00:00Z
The Promise - 1994-01-01T00:00:00Z
Black Beauty - 1994-01-01T00:00:00Z
Priest - 1994-01-01T00:00:00Z
The Little Rascals - 1994-01-01T00:00:00Z
Corrina, Corrina - 1994-01-01T00:00:00Z
I Like It Like That - 1994-01-01T00:00:00Z
Embrace of the Vampire - 1994-01-01T00:00:00Z
Kommt Mausi raus?! - 1994-01-01T00:00:00Z
Dreamplay - 1994-01-01T00:00:00Z
A Business Affair - 1994-01-01T00:00:00Z
The Beans of Egypt, Maine - 1994-01-01T00:00:00Z
Mixed Nuts - 1994-01-01T00:00:00Z
Go Fish - 1994-01-01T00:00:00Z
Scooby-Doo! in Arabian Nights - 1994-01-01T00:00:00Z
Oh God - 1994-01-01T00:00:00Z
Grande Petite - 1994-01-01T00:00:00Z
Q3040399 - 1994-01-01T00:00:00Z
I Can't Sleep - 1994-01-01T00:00:00Z
Oublie-moi - 1994-01-01T00:00:00Z
The Butterfly Lifts the Cat Up - 1994-01-01T00:00:00Z
Angie - 1994-01-01T00:00:00Z
Q3212546 - 1994-01-01T00:00:00Z
Mina Tannenbaum - 1994-01-01T00:00:00Z
Q3258654 - 1994-01-01T00:00:00Z
Je t'aime quand même - 1994-01-01T00:00:00Z
La Pis

## Filter

In [15]:
# Query the graph
query_films_before_2000 = g.query(
    """
    SELECT ?filmLabel ?releaseDate
    WHERE {
        ?film rdfs:label ?filmLabel ;
              <http://www.wikidata.org/prop/direct/P577> ?releaseDate .
        FILTER (?releaseDate < "2000-01-01T00:00:00Z"^^xsd:dateTime)
    }
    ORDER BY ?releaseDate
    """
)

# Print the results
for film_label, release_date in query_films_before_2000:
    print(f"{film_label} - {release_date}")