# WonderCat Notebook

In [36]:
import requests, base64, warnings, re
import pandas as pd
import numpy as np

warnings.filterwarnings('ignore')


## Call API and Store Data

In [37]:
%%time

# Gather all QIDs from the dataframe.
def get_QIDS(df):
    """Return the valid, unique Wikidata QIDs of ``df['QID']`` for a SPARQL VALUES clause.

    Args:
        df: DataFrame with a 'QID' column. Entries that are not strings
            (e.g. None/NaN for posts without a QID) are ignored.

    Returns:
        A single string of space-separated, 'wd:'-prefixed ids in order of
        first appearance, e.g. 'wd:Q42 wd:Q7'. Empty string if nothing is valid.
    """
    qid_pattern = re.compile(r'Q\d+')

    # Filter on type *before* touching the regex: matching a missing value
    # (None/NaN) raises TypeError.
    candidates = (
        value.strip() for value in df['QID'].unique() if isinstance(value, str)
    )

    # fullmatch, not match: the ids are spliced verbatim into the SPARQL
    # text, so anything other than exactly 'Q<digits>' must be rejected.
    # dict.fromkeys de-duplicates (stripping can create repeats) and keeps order.
    valid_qids = dict.fromkeys(
        qid for qid in candidates if qid_pattern.fullmatch(qid)
    )

    # Append 'wd:' prefix for sparql query.
    return ' '.join('wd:' + qid for qid in valid_qids)


# Build SPARQL query.
def build_query_call_api(QIDS):
    """Run the WonderCat metadata query against the Wikidata SPARQL endpoint.

    Args:
        QIDS: Space-separated, 'wd:'-prefixed item ids (e.g. 'wd:Q42 wd:Q7').
            The string is inserted verbatim into the VALUES clause, so it
            must already be validated by the caller.

    Returns:
        The decoded JSON response as a dict.

    Raises:
        requests.exceptions.HTTPError: the endpoint answered with an error status.
        requests.exceptions.Timeout: no answer within the timeout.
    """
    # Build SPARQL Query.
    # Only ?pubDate and ?genre are OPTIONAL: an item must also match the
    # P31, P495 and P625 patterns, otherwise it is absent from the result.
    sparql_query = """
    SELECT DISTINCT
        ?item ?itemLabel
        (group_concat(DISTINCT(?dateLabel); separator=',') as ?pubDates)
        (group_concat(DISTINCT(?genreLabel); separator=',') as ?genres)
        (group_concat(DISTINCT(?countryOriginLabel); separator=',') as ?origin)
        (group_concat(DISTINCT(?coordinatesLabel); separator=',') as ?coordinates)

        WHERE {
            VALUES ?item { %s }
            ?item wdt:P31 ?instanceof.
            OPTIONAL{?item wdt:P577 ?pubDate}.
            OPTIONAL{?item wdt:P136 ?genre}.
            ?item wdt:P495 ?origin.
            ?origin wdt:P625 ?coordinates.

            SERVICE wikibase:label {
            bd:serviceParam wikibase:language 'en,en'.
            ?item rdfs:label ?itemLabel.
            ?pubDate rdfs:label ?dateLabel.
            ?genre rdfs:label ?genreLabel.
            ?origin rdfs:label ?countryOriginLabel.
            ?coordinates rdfs:label ?coordinatesLabel.
            }
        }
        GROUP BY ?item ?itemLabel
    """ % QIDS

    # Call API
    url = 'https://query.wikidata.org/bigdata/namespace/wdq/sparql'
    # Without a timeout, requests waits forever on a stalled connection.
    # NOTE(review): consider sending a descriptive User-Agent header as well.
    response = requests.get(
        url,
        params={'query': sparql_query, 'format': 'json'},
        timeout=60,
    )
    # Fail with the real HTTP error (rate limit, server-side timeout, ...)
    # instead of an unrelated JSON decoding error further down.
    response.raise_for_status()

    return response.json()

# Create dataframe from API results.
def api_to_dataframe(res):
    """Flatten a Wikidata SPARQL JSON response into a cleaned DataFrame.

    Args:
        res: Decoded JSON response; rows are read from
            ``res['results']['bindings']`` and each cell from its 'value' key.

    Returns:
        DataFrame with one row per binding. 'item' is reduced to the bare
        QID and renamed 'QID'; 'pubDates' is parsed to datetime (NaT when
        unparseable); 'lon' and 'lat' are added as strings extracted from
        'coordinates'. An empty response yields an empty frame with the
        same columns.
    """
    # One flat {variable: value} dict per result row.
    rows = [
        {name: cell['value'] for name, cell in binding.items()}
        for binding in res['results']['bindings']
    ]

    if rows:
        wiki_df = pd.DataFrame(rows)
    else:
        # No bindings: keep the expected columns so the clean-up below (and
        # any caller) works instead of failing with KeyError.
        wiki_df = pd.DataFrame(
            columns=['item', 'itemLabel', 'pubDates', 'genres', 'origin', 'coordinates']
        )

    # Clean up item/QID field: entity URI -> bare QID.
    wiki_df['item'] = wiki_df['item'].str.replace(r'.*/(Q\d+)', r'\1', regex=True)
    wiki_df = wiki_df.rename(columns={'item': 'QID'})

    # Clean up date field: keep the first YYYY-MM-DD and drop everything after
    # it (time part and any further concatenated dates). Dates pandas cannot
    # represent ("out of bounds", too old) are coerced to NaT.
    wiki_df['pubDates'] = wiki_df['pubDates'].str.replace(
        r'(\d{4}-\d{2}-\d{2}).*', r'\1', regex=True
    )
    wiki_df['pubDates'] = pd.to_datetime(wiki_df['pubDates'], errors='coerce')

    # Create Longitude and Latitude columns. Every 'Point(lon lat)' is replaced
    # by one of its numbers, so several points stay comma-separated.
    # The replacement must be a single-backslash group reference; a doubled
    # backslash in a raw string inserts a literal backslash instead.
    reg_pattern = r'Point\(([-]?\d+\.?\d+)\s([-]?\d+\.?\d+)\)'
    wiki_df['lon'] = wiki_df['coordinates'].str.replace(reg_pattern, r'\1', regex=True)
    wiki_df['lat'] = wiki_df['coordinates'].str.replace(reg_pattern, r'\2', regex=True)

    return wiki_df

CPU times: user 5 µs, sys: 1 µs, total: 6 µs
Wall time: 17.9 µs


## Write WonderCat API Results to File

In [38]:
%%time

# Fetch the WonderCat posts from the WordPress API.
# NOTE(review): read_wordpress_post_with_pagination is not defined in the cells
# shown here; presumably it pages through the WordPress REST API. Confirm.
wp_call = read_wordpress_post_with_pagination()

# Reshape wp_call (JSON) into a dataframe. `data` is the input of the Wikidata
# and network-graph cells further down.
# NOTE(review): transform_to_dataframe is likewise defined outside this view.
data = transform_to_dataframe(wp_call)

# Notebook display: preview the first rows.
data.head()

CPU times: user 2.23 s, sys: 350 ms, total: 2.58 s
Wall time: 10.3 s


Unnamed: 0,id,author,date,benefit,experience,technology,text,QID
0,924,1,2025-06-16,Belief,Identification,xxx-I need to enter something new,he loves her?! She loves him?! She has real fe...,Q125881442
1,922,6,2025-06-15,Resilience,Being Wrong,Plot Twist,A completely unexpected twist immediately afte...,Q125881442
2,921,6,2025-06-15,Intimacy,Wonder,Secret Discloser,"""Have you ever had a really good day?"" (Escola...",Q125881442
3,916,10,2025-06-05,,Confusion,I Voice,"“I awoke to two sweaty, meaty hands shaking th...",Q1150792
4,915,10,2025-06-05,,Curiosity,Suspense,Alaska finished her cigarette and flicked it i...,Q1150792


## WikiData Functions

In [45]:
%%time

# Gather all QIDs from the dataframe.
def get_QIDS(df):
    """Return the valid, unique Wikidata QIDs of ``df['QID']`` for a SPARQL VALUES clause.

    Args:
        df: DataFrame with a 'QID' column. Entries that are not strings
            (e.g. None/NaN for posts without a QID) are ignored.

    Returns:
        A single string of space-separated, 'wd:'-prefixed ids in order of
        first appearance, e.g. 'wd:Q42 wd:Q7'. Empty string if nothing is valid.
    """
    # Raw string: a plain 'Q\d+' relies on an invalid escape sequence.
    qid_pattern = re.compile(r'Q\d+')

    # Filter on type *before* touching the regex: matching a missing value
    # (None/NaN) raises TypeError.
    candidates = (
        value.strip() for value in df['QID'].unique() if isinstance(value, str)
    )

    # fullmatch, not match: the ids are spliced verbatim into the SPARQL
    # text, so anything other than exactly 'Q<digits>' must be rejected.
    # dict.fromkeys de-duplicates (stripping can create repeats) and keeps order.
    valid_qids = dict.fromkeys(
        qid for qid in candidates if qid_pattern.fullmatch(qid)
    )

    # Append 'wd:' prefix for sparql query.
    return ' '.join('wd:' + qid for qid in valid_qids)


# Build SPARQL query.
def build_query_call_api(QIDS):
    """Run the WonderCat metadata query against the Wikidata SPARQL endpoint.

    Args:
        QIDS: Space-separated, 'wd:'-prefixed item ids (e.g. 'wd:Q42 wd:Q7').
            The string is inserted verbatim into the VALUES clause, so it
            must already be validated by the caller.

    Returns:
        The decoded JSON response as a dict.

    Raises:
        requests.exceptions.HTTPError: the endpoint answered with an error status.
        requests.exceptions.Timeout: no answer within the timeout.
    """
    # Build SPARQL Query.
    # Only ?pubDate and ?genre are OPTIONAL: an item must also match the
    # P31, P495 and P625 patterns, otherwise it is absent from the result.
    sparql_query = """
    SELECT DISTINCT
        ?item ?itemLabel
        (group_concat(DISTINCT(?dateLabel); separator=',') as ?pubDates)
        (group_concat(DISTINCT(?genreLabel); separator=',') as ?genres)
        (group_concat(DISTINCT(?countryOriginLabel); separator=',') as ?origin)
        (group_concat(DISTINCT(?coordinatesLabel); separator=',') as ?coordinates)

        WHERE {
            VALUES ?item { %s }
            ?item wdt:P31 ?instanceof.
            OPTIONAL{?item wdt:P577 ?pubDate}.
            OPTIONAL{?item wdt:P136 ?genre}.
            ?item wdt:P495 ?origin.
            ?origin wdt:P625 ?coordinates.

            SERVICE wikibase:label {
            bd:serviceParam wikibase:language 'en,en'.
            ?item rdfs:label ?itemLabel.
            ?pubDate rdfs:label ?dateLabel.
            ?genre rdfs:label ?genreLabel.
            ?origin rdfs:label ?countryOriginLabel.
            ?coordinates rdfs:label ?coordinatesLabel.
            }
        }
        GROUP BY ?item ?itemLabel
    """ % QIDS

    # Call API
    url = 'https://query.wikidata.org/bigdata/namespace/wdq/sparql'
    # Without a timeout, requests waits forever on a stalled connection.
    # NOTE(review): consider sending a descriptive User-Agent header as well.
    response = requests.get(
        url,
        params={'query': sparql_query, 'format': 'json'},
        timeout=60,
    )
    # Fail with the real HTTP error (rate limit, server-side timeout, ...)
    # instead of an unrelated JSON decoding error further down.
    response.raise_for_status()

    return response.json()

# Create dataframe from API results.
def api_to_dataframe(res):
    """Flatten a Wikidata SPARQL JSON response into a cleaned DataFrame.

    Args:
        res: Decoded JSON response; rows are read from
            ``res['results']['bindings']`` and each cell from its 'value' key.

    Returns:
        DataFrame with one row per binding. 'item' is reduced to the bare
        QID and renamed 'QID'; 'pubDates' is parsed to datetime (NaT when
        unparseable); 'lon' and 'lat' are added as strings extracted from
        'coordinates'. An empty response yields an empty frame with the
        same columns.
    """
    # One flat {variable: value} dict per result row.
    rows = [
        {name: cell['value'] for name, cell in binding.items()}
        for binding in res['results']['bindings']
    ]

    if rows:
        wiki_df = pd.DataFrame(rows)
    else:
        # No bindings: keep the expected columns so the clean-up below (and
        # any caller) works instead of failing with KeyError.
        wiki_df = pd.DataFrame(
            columns=['item', 'itemLabel', 'pubDates', 'genres', 'origin', 'coordinates']
        )

    # Clean up item/QID field: entity URI -> bare QID.
    wiki_df['item'] = wiki_df['item'].str.replace(r'.*/(Q\d+)', r'\1', regex=True)
    wiki_df = wiki_df.rename(columns={'item': 'QID'})

    # Clean up date field: keep the first YYYY-MM-DD and drop everything after
    # it (time part and any further concatenated dates). Dates pandas cannot
    # represent ("out of bounds", too old) are coerced to NaT.
    wiki_df['pubDates'] = wiki_df['pubDates'].str.replace(
        r'(\d{4}-\d{2}-\d{2}).*', r'\1', regex=True
    )
    wiki_df['pubDates'] = pd.to_datetime(wiki_df['pubDates'], errors='coerce')

    # Create Longitude and Latitude columns. Every 'Point(lon lat)' is replaced
    # by one of its numbers, so several points stay comma-separated.
    reg_pattern = r'Point\(([-]?\d+\.?\d+)\s([-]?\d+\.?\d+)\)'
    wiki_df['lon'] = wiki_df['coordinates'].str.replace(reg_pattern, r'\1', regex=True)
    wiki_df['lat'] = wiki_df['coordinates'].str.replace(reg_pattern, r'\2', regex=True)

    # Genres already arrive concatenated per item (group_concat in the query),
    # so no pandas-side re-grouping is needed here.
    return wiki_df

CPU times: user 7 µs, sys: 1e+03 ns, total: 8 µs
Wall time: 10 µs


In [46]:
%%time

# Get QIDS: one space-separated string of 'wd:'-prefixed ids taken from data['QID'].
qids = get_QIDS(data)

# Call Wikidata API (one HTTP request to the SPARQL endpoint for all ids).
api_results = build_query_call_api(qids)

# Convert API data to dataframe (one row per item returned by the query).
wikidata = api_to_dataframe(api_results)

# The steps below are currently disabled.

# # Merge with WonderCat dataframe.
# wikidata = data[['QID', 'title']].merge(wikidata, how = 'inner', on = 'QID')

# # Save dataframe as .tsv
# wikidata.to_csv("wikidata.tsv", sep = "\t", index = False)

# # See if columns that have lists are recognized.
# print (wikidata.map(lambda x: isinstance(x, list)).all())

# Notebook display: preview the first rows.
wikidata.head()


CPU times: user 11.6 ms, sys: 2.83 ms, total: 14.5 ms
Wall time: 74.9 ms


Unnamed: 0,QID,itemLabel,pubDates,genres,origin,coordinates,lon,lat
0,Q1170769,The Essence of Christianity,1841-01-01,,Germany,Point(10.0 51.0),10.0,51.0
1,Q2446285,Pippi Longstocking,1945-11-01,"fantasy,children's fiction",Sweden,Point(15.0 61.0),15.0,61.0
2,Q16733998,Our Sister Killjoy,1977-01-01,fiction,Ghana,"Point(-1.08 8.03),Point(-1.2 8.1)","-1.08,-1.2","8.03,8.1"
3,Q5477055,Fox in Socks,1965-06-19,,United States,Point(-98.5795 39.828175),-98.5795,39.828175
4,Q265954,Slaughterhouse-Five,1969-03-01,"science fiction,black comedy,metafiction,anti-...",United States,Point(-98.5795 39.828175),-98.5795,39.828175


## Create Network Data with Arguments

In [None]:
%%time

def create_nodes_and_links(dataframe, column1='title', column2='technology'):
    """Build weighted links between two columns plus the sized node table.

    Args:
        dataframe: DataFrame with 'title', 'technology', 'experience' and
            'author' columns, plus ``column1``/``column2`` if they differ
            (KeyError if a column is missing).
        column1: Column providing the link sources ('from').
        column2: Column providing the link targets ('to').

    Returns:
        A ``(links, nodes)`` tuple. ``links`` has columns 'from' and 'to'
        (node ids) and 'weight' (number of rows sharing that pair).
        ``nodes`` has columns 'label', 'category', 'size' (number of rows
        carrying that label in that category) and 'id'.
    """
    # Source column -> node category.
    node_categories = {
        'title': 'title',
        'technology': 'technology',
        'experience': 'experience',
        'author': 'user',
    }
    # Make sure both link endpoints get nodes; a column outside the four
    # standard ones uses its own name as category.
    for column in (column1, column2):
        node_categories.setdefault(column, column)

    def strip_text(value):
        # Only strings carry stray whitespace. Numeric ids and missing values
        # pass through untouched (the .str accessor would turn them into NaN).
        return value.strip() if isinstance(value, str) else value

    # Clean every label once so links and nodes agree on the spelling.
    cleaned = {
        column: dataframe[column].map(strip_text).to_numpy()
        for column in node_categories
    }

    # Create nodes; "size" is the label's frequency within its category.
    nodes = pd.concat(
        [
            pd.DataFrame({'label': cleaned[column], 'category': category})
            for column, category in node_categories.items()
        ],
        ignore_index=True,
    )
    nodes = nodes.groupby(['label', 'category']).size().to_frame(name='size').reset_index()

    # Create node "id's."
    nodes['id'] = nodes.index

    # Create link/edge pairs and their weights. The endpoint categories are
    # carried along so each endpoint can be resolved to the right node.
    links = pd.DataFrame({
        'from': cleaned[column1],
        'from_category': node_categories[column1],
        'to': cleaned[column2],
        'to_category': node_categories[column2],
    })
    links = (
        links.groupby(['from', 'to', 'from_category', 'to_category'])
        .size()
        .to_frame(name='weight')
        .reset_index()
    )

    # Replace link labels with node ids. The lookup is keyed on
    # (label, category): the same label may exist in two categories.
    node_ids = dict(zip(zip(nodes['label'], nodes['category']), nodes['id']))
    links['from'] = [node_ids[key] for key in zip(links['from'], links['from_category'])]
    links['to'] = [node_ids[key] for key in zip(links['to'], links['to_category'])]
    links = links[['from', 'to', 'weight']]

    return (links, nodes)

# Create links and nodes from the WonderCat dataframe.
links, nodes = create_nodes_and_links(data)

# Save data as tab-separated files (no index column) under ../main/.
links.to_csv("../main/links.tsv", sep = "\t", index = False)
nodes.to_csv("../main/nodes.tsv", sep = "\t", index = False)


## Create Data for Network Graph

In [6]:
%%time

def create_nodes_and_links(dataframe):
    """Build the weighted links and sized nodes of the WonderCat network graph.

    Links run title -> technology, technology -> experience and
    experience -> author. Nodes are the distinct (label, category) pairs
    found in those four columns.

    Args:
        dataframe: DataFrame with 'title', 'technology', 'experience' and
            'author' columns (KeyError if one is missing).

    Returns:
        A ``(links, nodes)`` tuple. ``links`` has columns 'from' and 'to'
        (node ids) and 'weight' (number of rows sharing that pair).
        ``nodes`` has columns 'label', 'category', 'size' (number of rows
        carrying that label in that category) and 'id'.
    """
    # Source column -> node category.
    node_categories = {
        'title': 'title',
        'technology': 'technology',
        'experience': 'experience',
        'author': 'user',
    }
    # (source column, target column) of every link type.
    edge_columns = [
        ('title', 'technology'),
        ('technology', 'experience'),
        ('experience', 'author'),
    ]

    def strip_text(value):
        # Only strings carry stray whitespace. Numeric ids and missing values
        # pass through untouched (the .str accessor would turn them into NaN,
        # which silently drops every link to a non-string author id).
        return value.strip() if isinstance(value, str) else value

    # Clean every label once so links and nodes agree on the spelling.
    cleaned = {
        column: dataframe[column].map(strip_text).to_numpy()
        for column in node_categories
    }

    # Create nodes; "size" is the label's frequency within its category.
    nodes = pd.concat(
        [
            pd.DataFrame({'label': cleaned[column], 'category': category})
            for column, category in node_categories.items()
        ],
        ignore_index=True,
    )
    nodes = nodes.groupby(['label', 'category']).size().to_frame(name='size').reset_index()

    # Create node "id's."
    nodes['id'] = nodes.index

    # Create link/edge pairs and join them. The endpoint categories are
    # carried along so each endpoint can be resolved to the right node.
    links = pd.concat(
        [
            pd.DataFrame({
                'from': cleaned[source],
                'from_category': node_categories[source],
                'to': cleaned[target],
                'to_category': node_categories[target],
            })
            for source, target in edge_columns
        ],
        ignore_index=True,
    )

    # Create link/edge weights (rows with a missing endpoint are dropped).
    links = (
        links.groupby(['from', 'to', 'from_category', 'to_category'])
        .size()
        .to_frame(name='weight')
        .reset_index()
    )

    # Replace link labels with node ids. The lookup is keyed on
    # (label, category): the same label may exist in two categories.
    node_ids = dict(zip(zip(nodes['label'], nodes['category']), nodes['id']))
    links['from'] = [node_ids[key] for key in zip(links['from'], links['from_category'])]
    links['to'] = [node_ids[key] for key in zip(links['to'], links['to_category'])]
    links = links[['from', 'to', 'weight']]

    return (links, nodes)

# Create links and nodes from the WonderCat dataframe.
links, nodes = create_nodes_and_links(data)

# Save data as tab-separated files (no index column) under ../main/.
links.to_csv("../main/links.tsv", sep = "\t", index = False)
nodes.to_csv("../main/nodes.tsv", sep = "\t", index = False)


CPU times: user 123 ms, sys: 6.92 ms, total: 130 ms
Wall time: 158 ms
