# Querying [TMDB](https://www.kaggle.com/datasets/tmdb/tmdb-movie-metadata) movie information from ApertureDB

This notebook works against an instance of ApertureDB, which can run either in the [cloud](https://cloud.aperturedata.io) or as [local Docker container(s)](https://docs.aperturedata.io/Setup/server/Local).

The dataset is hosted on Kaggle and is available via an mlcroissant link.


In [None]:
# Install the notebook's Python dependencies into the kernel's environment.
# NOTE(review): the `aperturedb` package imported below is not installed here —
# presumably it is already available in the environment; confirm.
%pip install --quiet mlcroissant pandas dotenv

## Import all the modules needed

In [None]:
import pandas as pd
from IPython.display import display


from aperturedb.CommonLibrary import (
    execute_query,
    create_connector
)
from aperturedb.Utils import Utils


In [None]:
# Create the ApertureDB client used by every query cell below.
# NOTE(review): connection settings are presumably resolved by
# create_connector() from the local configuration — confirm.
client = create_connector()

# Wrap the client in the Utils helper and call its summary of the instance.
utils = Utils(client)
utils.summary()


## Query time!
### Find all the movies Tom Hanks has been a part of

In [None]:
# Command 1: find the Professional entity named "Tom Hanks" and expose it to
# later commands in the same query as reference 1.
find_professional = {
    "FindEntity": {
        "_ref": 1,
        "with_class": "Professional",
        "constraints": {"name": ["==", "Tom Hanks"]},
        "results": {"all_properties": True},
    }
}

# Command 2: find every Movie entity connected to reference 1, returning only
# the listed properties.
find_connected_movies = {
    "FindEntity": {
        "_ref": 2,
        "is_connected_to": {"ref": 1},
        "with_class": "Movie",
        "results": {
            "list": ["_uniqueid", "movie_id", "title", "popularity", "budget"],
        },
    }
}

q = [find_professional, find_connected_movies]

_, response, _ = execute_query(client, q)

# The response holds one result per command, in command order.
professional_entities = response[0]["FindEntity"]["entities"]
movie_entities = response[1]["FindEntity"]["entities"]

display(pd.json_normalize(professional_entities))
display(pd.json_normalize(movie_entities))

movie_ids = [entity["movie_id"] for entity in movie_entities]
display(movie_ids)


### Get more info.

The response for the cast and movie entities is still missing the character information, because it is stored as properties on the connection between the two. Let's merge that information in to get richer details about the movies Tom Hanks has been a part of.

In [None]:
def describe_role(connection):
    """Describe one Professional-Movie connection as a short phrase.

    Args:
        connection: Property dict of a single connection, as returned in the
            ``connections`` list of a ``FindConnection`` result.

    Returns:
        ``"as character: <character>"`` when the connection carries a
        ``character`` property, otherwise ``"as <job> in <department>"``.
        A missing ``job`` or ``department`` is rendered as ``"unknown"``
        instead of raising ``KeyError``.
    """
    if "character" in connection:
        return f"as character: {connection['character']}"
    job = connection.get("job", "unknown")
    department = connection.get("department", "unknown")
    return f"as {job} in {department}"


def build_role_query(professional_uid, movie_uid):
    """Build the query that fetches the connection between one pair.

    Args:
        professional_uid: ``_uniqueid`` of the Professional entity.
        movie_uid: ``_uniqueid`` of the Movie entity.

    Returns:
        A three-command query: find the professional (ref 1), find the movie
        connected to it (ref 2), then find the connection from ref 2 to ref 1
        with all of its properties.
    """
    return [
        {
            "FindEntity": {
                "_ref": 1,
                "with_class": "Professional",
                "constraints": {"_uniqueid": ["==", professional_uid]},
                "results": {"all_properties": True},
            }
        },
        {
            "FindEntity": {
                "_ref": 2,
                "is_connected_to": {"ref": 1},
                "with_class": "Movie",
                "constraints": {"_uniqueid": ["==", movie_uid]},
                "results": {"all_properties": True},
            }
        },
        {
            "FindConnection": {
                "src": 2,
                "dst": 1,
                "results": {"all_properties": True},
            }
        },
    ]


# Entities returned by the previous cell: the matched professional(s) and the
# movies connected to them.
matched_professionals = response[0]["FindEntity"]["entities"]
matched_movies = response[1]["FindEntity"]["entities"]

# One row per movie; the role details are attached as an extra column below.
professional = pd.json_normalize(matched_movies)

# Build exactly one entry per movie row so the column assignment below can
# never fail on a length mismatch: a movie without a returned connection gets
# None, and if several professionals matched, their roles are joined.
professional_details = []
for movie in matched_movies:
    roles = []
    for person in matched_professionals:
        role_query = build_role_query(person["_uniqueid"], movie["_uniqueid"])
        _, responsec, _ = execute_query(client, role_query)

        found = responsec[2]["FindConnection"]
        if found.get("returned", 0) > 0:
            # As before, only the first returned connection is described.
            roles.append(describe_role(found["connections"][0]))
    professional_details.append("; ".join(roles) if roles else None)

display(len(professional_details))
professional['details'] = professional_details

display(professional)

### Find 2 cast people. Find the movies in which they both appear (Logical AND)

Here we search for Tom Hanks and Meg Ryan. The `all` in the `is_connected_to` clause of the final `FindEntity` means: find the set of movies that are connected to both cast members.

In [None]:
from aperturedb.CommonLibrary import execute_query

def _find_professional_by_name(ref, name):
    """Build a FindEntity command for the Professional with the given name.

    Args:
        ref: Reference number other commands in the same query use to point
            at the entities found by this command.
        name: Value the ``name`` property must match.

    Returns:
        The FindEntity command as a dict, requesting all properties.
    """
    return {
        "FindEntity": {
            "_ref": ref,
            "with_class": "Professional",
            "constraints": {"name": ["in", [name]]},
            "results": {"all_properties": True},
        }
    }


# Movies that are connected to *both* referenced professionals ("all").
find_shared_movies = {
    "FindEntity": {
        "is_connected_to": {
            "all": [{"ref": 1}, {"ref": 2}],
        },
        "with_class": "Movie",
        "results": {"all_properties": True},
    }
}

q = [
    _find_professional_by_name(1, "Tom Hanks"),
    _find_professional_by_name(2, "Meg Ryan"),
    find_shared_movies,
]

_, response, _ = execute_query(client, q)

# The third command (index 2) holds the movies shared by both professionals.
pd.json_normalize(response[2]["FindEntity"]["entities"])


## We can write the same queries in SPARQL.

Let's try the above examples in SPARQL, wherever possible; note that SPARQL does not deal with properties on relations.

In [None]:
from aperturedb.SPARQL import SPARQL
import json


# SPARQL front end over the same ApertureDB client, created with debug enabled.
sparql = SPARQL(client, debug=True)

# Print the namespaces, properties and connections the wrapper exposes, so the
# prefixes used in the queries below can be looked up.
namespace_map = {prefix: str(uri) for prefix, uri in sparql.namespaces.items()}
print("namespaces:", json.dumps(namespace_map, indent=2))

property_map = {
    sparql.graph.qname(uri): str(value)
    for uri, value in sparql.properties.items()
}
print("properties:", json.dumps(property_map, indent=2))

connection_map = {
    sparql.graph.qname(uri): str(value)
    for uri, value in sparql.connections.items()
}
print("connections:", json.dumps(connection_map, indent=2))


### Find all the movies Tom Hanks has been a part of

In [None]:

# SPARQL counterpart of the first FindEntity query: select title, popularity
# and budget of every movie ?m that has a c:HasCast connection to the
# professional ?p whose p:name is "Tom Hanks".
query = """
SELECT  ?title ?pop ?budget
WHERE {
  ?p p:name "Tom Hanks" .
  ?m c:HasCast ?p .
  ?m p:title ?title ;
    p:popularity ?pop ;
    p:budget ?budget .
}
"""

# Run the query, convert the result via to_dataframe() and render it.
results = sparql.query(query)
df = sparql.to_dataframe(results)
display(df)


### Find 2 cast people. Find the movies they have both been part of.

In [None]:
# Movies whose cast includes both Tom Hanks and Meg Ryan: the `,` repeats the
# c:HasCast predicate for a second object, so both blank-node patterns must
# match the same movie ?m.
# Only variables that are bound in the WHERE clause are projected; selecting
# an unbound variable would just add an always-empty column.
query = """
SELECT  ?title ?pop ?budget
WHERE {
  ?m c:HasCast [p:name "Tom Hanks"] , [p:name "Meg Ryan"] ;
    p:title ?title ;
    p:popularity ?pop ;
    p:budget ?budget .
}
"""

# Run the query, convert the result via to_dataframe() and render it.
results = sparql.query(query)
df = sparql.to_dataframe(results)
display(df)

# Augment the SPARQL data with a federated query and response.

## Merge info from Wikidata

Building on the query above, let's write a query that also fetches the location of the selected movie. That data is not present in the dataset, so it is fetched from Wikidata.

In [None]:
# Federated query: the local part selects the movie shared by Tom Hanks and
# Meg Ryan together with its genres; the SERVICE block then asks the Wikidata
# endpoint for a film with the same English label and its location.
#   wdt:P31 wd:Q11424  -> "instance of" "film"
#   wdt:P840           -> "narrative location"
# BIND(strlang(...)) turns the plain title into an English-tagged literal so
# it can match Wikidata's language-tagged rdfs:label values.
# Each variable is projected once (a repeated ?wTitle was removed).
query = """
PREFIX wd: <http://www.wikidata.org/entity/>
PREFIX wdt: <http://www.wikidata.org/prop/direct/>
SELECT  ?title ?pop ?budget ?wTitle ?wmovie ?genreLabel ?loclabel
WHERE {
  ?m c:HasCast [p:name "Tom Hanks"] , [p:name "Meg Ryan"] ;
    p:title ?title ;
    p:popularity ?pop ;
    p:budget ?budget .
  ?m c:HasGenre ?genre.
  ?genre p:name ?genreLabel.
    BIND(strlang(?title, "en") AS ?stitle)
    SERVICE <https://query.wikidata.org/sparql> {
      ?wmovie wdt:P31 wd:Q11424.
      ?wmovie rdfs:label ?wTitle.
      ?wmovie wdt:P840 ?location.
      ?location rdfs:label ?loclabel.
      ?wmovie rdfs:label ?stitle.
      FILTER(lang(?wTitle) = "en")
      FILTER(lang(?loclabel) = "en")
    }
}
"""

# Run the query, convert the result via to_dataframe() and render it.
results = sparql.query(query)
df = sparql.to_dataframe(results)
display(df)