In [2]:
import os
from SPARQLWrapper import SPARQLWrapper, JSON
import random

In [73]:
def generate_triplet_samples(num_samples):
    sparql = SPARQLWrapper("http://dbpedia.org/sparql")
    sparql.setReturnFormat(JSON)

    query = """
    SELECT DISTINCT ?subject ?subjectName ?predicate ?object ?objectName
    WHERE {
        ?subject ?predicate ?object.
        FILTER (isIRI(?subject) && isIRI(?object) &&
                (CONTAINS(STR(?predicate), "sports") || CONTAINS(STR(?predicate), "athlete")))
        ?subject foaf:name ?subjectName.
        ?object foaf:name ?objectName.
    }
    LIMIT %d
    """ % num_samples

    sparql.setQuery(query)
    results = sparql.query().convert()

    triplets = []
    subject_set = set()
    for result in results["results"]["bindings"]:
        subject = result["subject"]["value"]
        subject_name = result["subjectName"]["value"]
        predicate = result["predicate"]["value"]
        object = result["object"]["value"]
        object_name = result["objectName"]["value"]
        
        if subject not in subject_set:
            triplets.append((subject_name, predicate, object_name))
            subject_set.add(subject)
            if len(triplets) == num_samples:
                break

    return triplets


In [74]:
triplets = generate_triplet_samples(1000)
# Print the generated triplet samples
formatted_triplets = []
for triplet in triplets:
    triplet = list(triplet)
    triplet[1] = triplet[1].split('/')[-1]
    formatted_triplets.append(triplet)

In [76]:
formatted_triplets

[['California Baptist University',
  'sportsNickname',
  'California Baptist Lancers'],
 ['California Lutheran University',
  'sportsNickname',
  'Cal Lutheran Kingsmen and Regals'],
 ['', 'sportsNickname', 'Cal Poly Mustangs'],
 ['California State Polytechnic University, Pomona',
  'sportsNickname',
  'Cal Poly Pomona Broncos'],
 ['California State University, Bakersfield',
  'sportsNickname',
  'Cal State Bakersfield Roadrunners'],
 ['California State University, Fullerton',
  'sportsNickname',
  'Cal State Fullerton Titans'],
 ['Sacramento State University', 'sportsNickname', 'Sacramento State hornets'],
 ['California State University, Chico',
  'sportsNickname',
  'Chico State Wildcats'],
 ['California State University, Fresno',
  'sportsNickname',
  'Fresno State Bulldogs'],
 ['California State University, Long Beach',
  'sportsNickname',
  'Long Beach State'],
 ['', 'sportsNickname', 'Cal State Dominguez Hills Toros'],
 ['California State University, East Bay',
  'sportsNickname'

In [70]:
# import requests
# from bs4 import BeautifulSoup

# def extract_triplets_and_neighbors():
#     base_url = "http://dbpedia.org/sparql"
#     query = """
#     PREFIX dbo: <http://dbpedia.org/ontology/>
#     PREFIX dbr: <http://dbpedia.org/resource/>
#     SELECT DISTINCT ?subject ?predicate ?object
#     WHERE {
#         ?subject rdf:type dbo:SportsEvent .
#         ?subject ?predicate ?object .
#     }
#     LIMIT 100
#     """

#     response = requests.get(base_url, params={'format': 'json', 'query': query})
#     if response.status_code != 200:
#         print("Failed to retrieve data from DBpedia.")
#         return []

#     results = response.json().get('results', {}).get('bindings', [])
#     triplets = []
#     for result in results:
#         subject = result['subject']['value']
#         predicate = result['predicate']['value']
#         obj = result['object']['value']
#         triplets.append((subject, predicate, obj))

#     triplets_with_neighbors = []
#     for triplet in triplets:
#         subject = triplet[0]
#         neighbors = get_neighbors(subject)
#         triplets_with_neighbors.append((triplet, neighbors))

#     return triplets_with_neighbors

# def get_neighbors(subject):
#     base_url = f"http://dbpedia.org/page/{subject.split('/')[-1]}"
#     response = requests.get(base_url)
#     if response.status_code != 200:
#         print(f"Failed to retrieve neighbors for {subject}")
#         return []

#     soup = BeautifulSoup(response.content, 'html.parser')
#     neighbor_tags = soup.select('.infobox a')
#     neighbors = [tag.text for tag in neighbor_tags[:10]]
#     return neighbors


In [71]:
# num_triplets = 100
# triplets = extract_triplets_and_neighbors()


In [77]:
from SPARQLWrapper import SPARQLWrapper, JSON

def extract_triplets():
    sparql = SPARQLWrapper("http://dbpedia.org/sparql")

    # Query to get 100 subjects related to sports and their top 10 neighbors
    query = """
    PREFIX dbo: <http://dbpedia.org/ontology/>
    SELECT DISTINCT ?subject ?predicate ?object
    WHERE {
        ?subject a dbo:SportsEvent.
        ?subject ?predicate ?object.
        FILTER(isIRI(?object)).
    }
    LIMIT 100
    """

    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()

    triplets = []
    for result in results["results"]["bindings"]:
        subject = result["subject"]["value"]
        predicate = result["predicate"]["value"]
        object = result["object"]["value"]
        triplets.append((subject, predicate, object))

    return triplets

Generation prompt 

```
Pretend you are a knowledge graph expert and well-versed in linguistics capabilities. Given a set of KG triplets, create meaningful, contextual sentences from every triplet.
```

### OpenAI generation

In [96]:
import openai

# Set up OpenAI API credentials
openai.api_key = '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'

In [110]:
def generate_sentences(triplets):
    sentences = []
    prompt = f"""
    Pretend you are a knowledge graph expert and well-versed in linguistics capabilities. Given a set of KG triplets, create meaningful, contextual sentences from every triplet.
    {triplets}
    """
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "user", "content": prompt},
        ]
    )

    return response


In [112]:
response = generate_sentences(formatted_triplets[0:5])

In [118]:
response['choices'][0]['message']['content'].split('\n')

['1. The California Baptist University has a sports nickname, the California Baptist Lancers.',
 '2. The students of California Lutheran University are often referred to as the Cal Lutheran Kingsmen and Regals for their athletic prowess.',
 '3. The name of the sports team at an unknown institution in California is the Cal Poly Mustangs.',
 '4. The athletes representing the California State Polytechnic University, Pomona proudly call themselves the Cal Poly Pomona Broncos.',
 '5. The Cal State Bakersfield Roadrunners are the chosen sports nickname for the California State University, Bakersfield.']