# WonderCat Notebook

In [7]:
import requests, base64, warnings, re
import pandas as pd
from qwikidata.sparql  import return_sparql_query_results

import constants
from functions import *

warnings.filterwarnings('ignore')

## Call API and Store Data

In [8]:
%%time

"""
WordPress API Credentials and Functions
"""
# Base URL of the WordPress REST collection queried by the functions below.
api_prefix = 'https://env-1120817.us.reclaim.cloud/wp-json/wp/v2/user-experience'

# Credentials are read from the local `constants` module rather than written in the notebook.
WP_USER = constants.WP_USER
WP_KEY = constants.WP_KEY
# NOTE(review): user and key are concatenated with no separator. HTTP Basic auth
# expects "user:password", so this presumably relies on one of the constants
# already carrying the ':' -- confirm against constants.py.
wp_credentials = WP_USER + WP_KEY
# Base64-encode the credential string and wrap it in a Basic Authorization header.
wp_token = base64.b64encode(wp_credentials.encode())
# NOTE(review): no request visible in this notebook passes wp_header -- confirm whether it is still needed.
wp_header = {'Authorization': 'Basic ' + wp_token.decode('utf-8')}

def get_total_pagecount():
    """Return the number of result pages reported by the WordPress endpoint.

    Requests page 1 of ``api_prefix`` at 100 items per page and reads the
    ``X-WP-TotalPages`` response header. No ``headers=`` argument is passed,
    so the request is sent without the Authorization header.

    Returns:
        int: The value of the ``X-WP-TotalPages`` header.

    Raises:
        requests.HTTPError: If the endpoint answers with a 4xx/5xx status.
        KeyError: If a successful response lacks the ``X-WP-TotalPages`` header.
    """
    api_url = f'{api_prefix}?page=1&per_page=100'
    response = requests.get(api_url)
    # Surface the HTTP status of a failed call instead of an opaque KeyError
    # on the header lookup below.
    response.raise_for_status()
    return int(response.headers['X-WP-TotalPages'])

def read_wordpress_post_with_pagination():
    """Download every page of records from the WordPress endpoint.

    Asks ``get_total_pagecount`` how many pages exist, then requests each page
    (100 items per page) in order and concatenates the decoded JSON items.
    Requests are sent without an Authorization header.

    Returns:
        list: The JSON items of all pages, in page order. Empty when the
        endpoint reports zero pages.

    Raises:
        requests.HTTPError: If any page request answers with a 4xx/5xx status.
    """
    total_pages = get_total_pagecount()
    all_page_items_json = []
    for current_page in range(1, total_pages + 1):
        api_url = f"{api_prefix}?page={current_page}&per_page=100"
        page_items = requests.get(api_url)
        # Without this check an error payload (a JSON object) would be passed
        # to extend(), which would silently append its keys as strings.
        page_items.raise_for_status()
        all_page_items_json.extend(page_items.json())
    return all_page_items_json

"""
Transform API JSON to Dataframe
"""
def transform_to_dataframe(api_call):
    """Flatten the WordPress API records into one tabular DataFrame.

    Keeps the ``id``, ``author``, ``date``, ``benefit``, ``experience`` and
    ``technology`` fields, adds ``title`` and ``QID`` columns taken from the
    ``title_of_creative_work`` and ``wikidata-qid`` entries of ``acf``, and
    replaces each of the three taxonomy columns with its ``name`` value.

    Args:
        api_call: Records to load; passed straight to ``pd.DataFrame``. The
            notebook passes the list returned by
            ``read_wordpress_post_with_pagination``.

    Returns:
        pandas.DataFrame: Columns ``id``, ``author``, ``date``, ``benefit``,
        ``experience``, ``technology``, ``title`` and ``QID``.
    """
    api_data = pd.DataFrame(api_call)
    api_data = api_data[['id', 'author', 'date', 'benefit', 'experience', 'technology', 'acf']] # Select columns to work with. Add 'wikidata' when ready.
    # Pull the two custom fields out of the nested 'acf' column.
    api_data['title'] = pd.json_normalize(api_data['acf'])['title_of_creative_work']
    api_data['QID'] = pd.json_normalize(api_data['acf'])['wikidata-qid']
    # Each taxonomy column is normalised twice: the first pass is parked in a
    # temporary '*_del' column, the second pass extracts the 'name' field.
    # NOTE(review): assigning the first pass to a single column presumably only
    # works while every record carries exactly one term per taxonomy -- confirm
    # against the API payload before relying on this for multi-term records.
    api_data['bene_del'] = pd.json_normalize(api_data['benefit'])
    api_data['benefit'] = pd.json_normalize(api_data['bene_del'])['name']
    api_data['exp_del'] = pd.json_normalize(api_data['experience'])
    api_data['experience'] = pd.json_normalize(api_data['exp_del'])['name']
    api_data['tech_del'] = pd.json_normalize(api_data['technology'])
    api_data['technology'] = pd.json_normalize(api_data['tech_del'])['name']
    # Drop the nested source column and the temporary helper columns.
    del api_data['acf'], api_data['bene_del'], api_data['exp_del'], api_data['tech_del']

    return api_data

CPU times: user 25 μs, sys: 16 μs, total: 41 μs
Wall time: 45.8 μs


## Wikidata Functions

In [9]:
%%time

# Build and run the Wikidata SPARQL query for every QID in the dataframe.
def build_query_call_api(df):
    """Query Wikidata for the publication date, country and genre of each QID.

    Args:
        df: Table whose ``df['QID']`` column exposes ``.unique()``; the
            notebook passes the WonderCat DataFrame.

    Returns:
        The value returned by ``return_sparql_query_results`` for the query.
        ``api_to_dataframe`` reads ``res['results']['bindings']`` from it.
    """
    # Keep only well-formed QIDs, in first-seen order. Non-string entries such
    # as NaN are skipped before the regex runs (matching them raises
    # TypeError), and fullmatch rejects any value with trailing text so that
    # free-form user input can never leak into the query string.
    qid_pattern = re.compile(r'Q\d+')
    candidates = (qid.strip() for qid in df['QID'].unique() if isinstance(qid, str))
    valid_qids = list(dict.fromkeys(qid for qid in candidates if qid_pattern.fullmatch(qid)))

    # Every id gets its own 'wd:' prefix here, so the template below must not
    # add another one: the previous 'wd:%s' produced 'wd:wd:Q...' for the
    # first id, which names no entity and dropped that item from the results.
    values_clause = ' '.join('wd:' + qid for qid in valid_qids)

    # Build SPARQL Query.
    sparql_query = """
    SELECT DISTINCT
        ?item ?pubDate
        ?pubPlace ?countryOriginLabel ?coordinateLocal
        ?genreLabel

    WHERE {
        VALUES ?item { %s }

        ?item wdt:P31 ?instanceof .
        OPTIONAL {?item wdt:P136 ?genre} .
        OPTIONAL {?item wdt:P577 ?pubDate} .
        OPTIONAL {?item wdt:P495 ?countryOrigin} .

        SERVICE wikibase:label { bd:serviceParam wikibase:language "en,en". }
    }
    """ % (values_clause,)

    # Call API
    return return_sparql_query_results(sparql_query)

# Create dataframe from API results.
def api_to_dataframe(res):
    """Flatten a SPARQL JSON response into a DataFrame keyed by QID.

    Args:
        res: Mapping with a ``res['results']['bindings']`` list, where each
            binding maps a variable name to a dict holding a ``'value'`` entry.

    Returns:
        pandas.DataFrame: One row per binding and one column per variable.
        The ``item`` column is renamed to ``QID`` and reduced to the bare
        identifier. ``pubDate``, when present, is reduced to its year. If
        there are no bindings, an empty frame with only a ``QID`` column is
        returned.
    """
    # One flat record per binding: variable name -> its 'value'.
    records = [
        {field: cell['value'] for field, cell in binding.items()}
        for binding in res['results']['bindings']
    ]

    if not records:
        # No bindings means no 'item' column to clean; hand back an empty
        # frame that can still be merged on 'QID'.
        return pd.DataFrame(columns=['QID'])

    wiki_df = pd.DataFrame(records)

    # Clean up item/QID field: keep only the trailing 'Q<digits>' of the entity URI.
    wiki_df['item'] = wiki_df['item'].str.replace(r'.*/(Q\d+)', r'\1', regex=True)
    wiki_df = wiki_df.rename(columns={'item': 'QID'})

    # Clean up date field. Only the year is kept because some dates are
    # "out of bounds" (too old). pubDate is OPTIONAL in the query, so the
    # column is absent when no returned item has one.
    if 'pubDate' in wiki_df.columns:
        wiki_df['pubDate'] = wiki_df['pubDate'].str.replace(
            r'(\d{4})-\d{2}-\d{2}.*', r'\1', regex=True
        )

    return wiki_df

CPU times: user 24 μs, sys: 10 μs, total: 34 μs
Wall time: 42 μs


In [10]:
%%time

# Fetch every page of records from the WordPress API (one HTTP request per page).
wp_call = read_wordpress_post_with_pagination()

# Reshape wp_call (json) into the flat WonderCat dataframe used by the rest of the notebook.
data = transform_to_dataframe(wp_call)

# Preview the first rows.
data.head()

CPU times: user 11.7 s, sys: 1.13 s, total: 12.9 s
Wall time: 31 s


Unnamed: 0,id,author,date,benefit,experience,technology,title,QID
0,710,1,2025-01-24T19:18:33,xxx-I need to enter something new,xxx-I need to enter something new,xxx-I need to enter something new,testing,Q3456789
1,474,4,2025-01-15T19:26:15,Bias Reduction,Alienation,Almighty Heart + Soliloquy,Test,Q223880
2,362,5,2025-01-09T21:27:04,Faith,Wonder,Enigma,Mystery Plays,Q240911
3,364,5,2025-01-09T20:56:26,Generosity,Wonder,Stretch,Oedipus,Q148643
4,363,5,2025-01-09T20:55:14,Faith,Wonder,Plot Twist,Oedipus,Q148643


In [None]:
%%time

# Query Wikidata for the QIDs found in the WonderCat records.
api_results = build_query_call_api(data)

# Flatten the SPARQL response and inner-join it onto the WonderCat records by QID.
dataframe = data.merge(api_to_dataframe(api_results), how = 'inner', on = 'QID')

# Persist the merged table as tab-separated values, without the index column.
dataframe.to_csv("../main/wonderCat_data.tsv", index = False, sep = "\t")

dataframe.head()

CPU times: user 54.7 ms, sys: 35 ms, total: 89.7 ms
Wall time: 3.83 s


Unnamed: 0,QID,pubDate,countryOriginLabel,genreLabel,id,author,date,benefit,experience,technology,title
0,Q128518,2000,United States,historical film,111,5,2024-11-15T16:10:59,Tend and Befriend,Courage,Human God Voice,Gladiator
1,Q128518,2023,United States,melodrama,111,5,2024-11-15T16:10:59,Tend and Befriend,Courage,Human God Voice,Gladiator
2,Q171048,1995,United States,fantasy film,234,5,2025-01-09T17:16:51,Connection,Identification,Soliloquy in a Soliloquy,Toy Story
3,Q132689,1942,United States,romance film,308,5,2025-01-09T19:13:45,Expanded Inner List of Cultural Norms,Self-Acceptance,Equilateral Love Triange,Casablanca
4,Q132689,1942,United States,war film,308,5,2025-01-09T19:13:45,Expanded Inner List of Cultural Norms,Self-Acceptance,Equilateral Love Triange,Casablanca


## Create Data for Network Graph

In [12]:
%%time

def create_nodes_and_links(dataframe):
    """Build the node table and weighted edge list for the network graph.

    The graph has three layers linked in order: title -> technology ->
    experience. A node is a distinct (label, category) pair, so the same text
    used in two categories yields two separate nodes.

    Args:
        dataframe: pandas.DataFrame with ``title``, ``technology`` and
            ``experience`` columns. It is not modified.

    Returns:
        tuple: ``(links, nodes)``.
            ``links`` has columns ``from``, ``to`` (node ids) and ``weight``
            (number of rows containing that pair).
            ``nodes`` has columns ``label``, ``category``, ``size`` (number of
            rows containing that label in that category) and ``id``.
    """
    # Graph layers, in link order. Category names equal the source column names.
    categories = ['title', 'technology', 'experience']

    def _trim(value):
        # Only strings carry stray whitespace; leave NaN and other values alone.
        return value.strip() if isinstance(value, str) else value

    # Trim labels once, on a private copy, so that nodes and links agree on
    # the cleaned text. The previous cleanup replaced the pattern '\\w', which
    # never removed whitespace and was applied to the links only.
    labels = dataframe[categories].copy()
    for column in categories:
        labels[column] = labels[column].map(_trim)

    # One node per (label, category); "size" is how often it occurs.
    nodes = pd.concat(
        [
            labels[[category]].rename(columns={category: 'label'}).assign(category=category)
            for category in categories
        ]
    )
    nodes = nodes.groupby(['label', 'category']).size().to_frame(name='size').reset_index()

    # Create node "id's."
    nodes['id'] = nodes.index

    # Resolve ids per category. A single label -> id map would let a label
    # shared by two categories (e.g. a technology and an experience with the
    # same text) overwrite one id with the other and misroute links.
    id_lookup = {category: {} for category in categories}
    for label, category, node_id in zip(nodes['label'], nodes['category'], nodes['id']):
        id_lookup[category][label] = node_id

    # Create link/edge pairs between consecutive layers, already as node ids.
    edge_frames = []
    for source, target in zip(categories, categories[1:]):
        # Rows missing either endpoint cannot form a link.
        pairs = labels[[source, target]].dropna()
        edge_frames.append(
            pd.DataFrame(
                {
                    'from': [id_lookup[source][label] for label in pairs[source]],
                    'to': [id_lookup[target][label] for label in pairs[target]],
                }
            )
        )

    # Create link/edge weights.
    links = pd.concat(edge_frames, ignore_index=True)
    links = links.groupby(['from', 'to']).size().to_frame(name='weight').reset_index()

    return (links, nodes)

# Create links and nodes from `data` (the WordPress records), not from the Wikidata-merged `dataframe`.
links, nodes = create_nodes_and_links(data)

# Save both tables as tab-separated files without the index column.
links.to_csv("../main/links.tsv", sep = "\t", index = False)
nodes.to_csv("../main/nodes.tsv", sep = "\t", index = False)


CPU times: user 131 ms, sys: 14.1 ms, total: 145 ms
Wall time: 368 ms
