In [4]:
import os
from SPARQLWrapper import SPARQLWrapper, JSON
import random
from tqdm import tqdm

In [5]:
def generate_triplet_samples_sports(num_samples):
    """Fetch sports-related triplets from the public DBpedia SPARQL endpoint.

    The query selects triples whose subject and object are IRIs, whose
    predicate IRI contains "sports" or "athlete", and whose subject and
    object both carry a ``foaf:name``.

    Args:
        num_samples: Integer substituted into the query's ``LIMIT`` clause
            (via ``%d``) and also used as the cap on returned triplets.

    Returns:
        A list of ``(subject_name, predicate_iri, object_name)`` tuples.
        Only the first row seen for each subject IRI is kept, so the list
        can be shorter than the number of rows the endpoint returned.
    """
    sparql = SPARQLWrapper("http://dbpedia.org/sparql")
    sparql.setReturnFormat(JSON)

    # The foaf prefix is declared explicitly so the query does not depend on
    # the endpoint pre-registering it.
    query = """
    PREFIX foaf: <http://xmlns.com/foaf/0.1/>
    SELECT DISTINCT ?subject ?subjectName ?predicate ?object ?objectName
    WHERE {
        ?subject ?predicate ?object.
        FILTER (isIRI(?subject) && isIRI(?object) &&
                (CONTAINS(STR(?predicate), "sports") || CONTAINS(STR(?predicate), "athlete")))
        ?subject foaf:name ?subjectName.
        ?object foaf:name ?objectName.
    }
    LIMIT %d
    """ % num_samples

    sparql.setQuery(query)
    results = sparql.query().convert()

    triplets = []
    seen_subjects = set()
    for row in results["results"]["bindings"]:
        subject_iri = row["subject"]["value"]
        # NOTE(review): de-duplicating on the subject IRI keeps a single
        # triple per subject; confirm that is the intended sampling.
        if subject_iri in seen_subjects:
            continue
        seen_subjects.add(subject_iri)

        triplets.append((
            row["subjectName"]["value"],
            row["predicate"]["value"],
            row["objectName"]["value"],
        ))
        if len(triplets) == num_samples:
            break

    return triplets


In [17]:
def generate_triplet_samples_movies(num_samples):
    """Fetch film-related triplets from the public DBpedia SPARQL endpoint.

    The query selects triples whose subject and object are IRIs, whose
    predicate IRI contains one of several film-related keywords ("film",
    "artist", "starring", "director", "producer", "writer", "composer",
    "cinematography", "editor"), and whose subject and object both carry a
    ``foaf:name``.

    Args:
        num_samples: Integer substituted into the query's ``LIMIT`` clause
            (via ``%d``) and also used as the cap on returned triplets.

    Returns:
        A list of ``(movie_name, predicate_iri, actor_name)`` tuples. Only
        the first row seen for each ``?movie`` IRI is kept, so the list can
        be shorter than the number of rows the endpoint returned.
    """
    sparql = SPARQLWrapper("http://dbpedia.org/sparql")
    sparql.setReturnFormat(JSON)

    # The foaf prefix is declared explicitly so the query does not depend on
    # the endpoint pre-registering it.
    query = """
    PREFIX foaf: <http://xmlns.com/foaf/0.1/>
    SELECT DISTINCT ?movie ?movieName ?predicate ?actor ?actorName
    WHERE {
        ?movie ?predicate ?actor.
        FILTER (isIRI(?movie) && isIRI(?actor) &&
                (CONTAINS(STR(?predicate), "film") || CONTAINS(STR(?predicate), "artist") ||
                 CONTAINS(STR(?predicate), "starring") ||
                 CONTAINS(STR(?predicate), "director") ||
                 CONTAINS(STR(?predicate), "producer") ||
                 CONTAINS(STR(?predicate), "writer") ||
                 CONTAINS(STR(?predicate), "composer") ||
                 CONTAINS(STR(?predicate), "cinematography") ||
                 CONTAINS(STR(?predicate), "editor")))
        ?movie foaf:name ?movieName.
        ?actor foaf:name ?actorName.
    }
    LIMIT %d
    """ % num_samples

    sparql.setQuery(query)
    results = sparql.query().convert()

    triplets = []
    seen_movies = set()
    for row in tqdm(results["results"]["bindings"]):
        movie_iri = row["movie"]["value"]
        # NOTE(review): de-duplicating on the movie IRI keeps a single triple
        # per IRI; confirm that is the intended sampling.
        if movie_iri in seen_movies:
            continue
        seen_movies.add(movie_iri)

        triplets.append((
            row["movieName"]["value"],
            row["predicate"]["value"],
            row["actorName"]["value"],
        ))
        if len(triplets) == num_samples:
            break

    return triplets


In [19]:
%%time
# Request movie triplets, then reduce every predicate IRI to the text after
# its last '/'. Each formatted entry is a list: [subject name, short predicate, object name].
# NOTE(review): the recorded run iterated over 10000 rows despite the much
# larger limit requested here - presumably an endpoint-side cap; confirm.
triplets = generate_triplet_samples_movies(10_000_000)
formatted_triplets = [
    [subject_name, predicate_iri.split('/')[-1], object_name]
    for subject_name, predicate_iri, object_name in tqdm(triplets)
]

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████| 10000/10000 [00:00<00:00, 2586043.53it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 1397/1397 [00:00<00:00, 1870233.86it/s]

CPU times: user 34.9 ms, sys: 20.9 ms, total: 55.8 ms
Wall time: 13.5 s





In [20]:
from collections import defaultdict
# Group the formatted triplets by their first element (the subject name);
# each key maps to the list of triplets that start with that name.
kg_dict = defaultdict(list)
for entry in formatted_triplets:
    subject_name = entry[0]
    kg_dict[subject_name].append(entry)

In [28]:
# Keep only the movies that have more than two triplets.
# The values stored here are lists, so the default factory is `list` (as in
# kg_dict): looking up an absent movie yields an empty list rather than 0.
filtered_kg_dict = defaultdict(list)
for movie, movie_triplets in kg_dict.items():
    if len(movie_triplets) > 2:
        filtered_kg_dict[movie] = movie_triplets

### Generation prompt

```
Pretend you are a knowledge graph expert and well-versed in linguistics capabilities. Given a set of KG triplets, create meaningful, contextual sentences from every triplet.
```

### OpenAI generation

In [31]:
import openai

# Set up OpenAI API credentials.
# The key is read from the OPENAI_API_KEY environment variable instead of
# being embedded in the notebook, where anyone with access to the file or its
# history could read it.
# NOTE(review): the key that used to be hard-coded on this line should be
# treated as compromised and revoked.
openai.api_key = os.environ.get("OPENAI_API_KEY")
if not openai.api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set; export it before running this notebook."
    )

In [34]:
def generate_sentences(movie_name, triplets):
    """Ask the chat model to turn one movie's triplets into a paragraph.

    Args:
        movie_name: Name interpolated into the prompt ahead of the triplets.
        triplets: The triplets for that movie; interpolated into the prompt
            using their default string formatting.

    Returns:
        The raw object returned by ``openai.ChatCompletion.create``, or
        ``None`` if the call raised (the exception is printed, not re-raised,
        so one failed request does not abort a batch).
    """
    prompt = f"""
    Pretend you are a knowledge graph expert and well-versed in linguistics capabilities. Given a set of KG triplets about movies and their artists names, create a single meaningful, contextual paragraph from the set of triplets.
    This paragraph shall represent the information available in the triplets directly. DO NOT HALLUCINATE WHILE GENERATING THE INFORMATION. Let's make sure that we generate what is available in the triplets.
    ```{movie_name}: {triplets}```
    """
    try:
        return openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "user", "content": prompt},
            ]
        )
    except Exception as e:
        # Deliberate best-effort: report the failure and let the caller decide.
        print(e)
        return None

In [35]:
# Generate one paragraph per movie and collect the text of the first choice
# from each successful response; failures are reported and skipped.
candidate_paras = []
# NOTE(review): the [1:] slice leaves out the first movie in the dict -
# confirm whether skipping it is intentional.
movie_names = list(filtered_kg_dict.keys())[1:]
for movie_name in tqdm(movie_names):
    response = generate_sentences(movie_name, filtered_kg_dict[movie_name])
    if not response:
        print(f"Candidate generation failed for {movie_name}")
        continue
    candidate_paras.append(response['choices'][0]['message']['content'])

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 43/43 [03:25<00:00,  4.78s/it]


In [39]:
# Write each generated paragraph followed by a newline.
# - The encoding is fixed to UTF-8 so non-ASCII names in the generated text
#   do not depend on the platform's default encoding.
# - write() is used because each item is a single string (writelines() would
#   iterate over it character by character).
# - The with-block closes the file, so no explicit close() is needed.
with open('candidates_movies.txt', 'w', encoding='utf-8') as f:
    for para in candidate_paras:
        f.write(para + '\n')