In [1]:
import ast
import json
import os
import time
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

In [50]:
def form_values(form):
    """Collect the submittable ``<input>`` fields of a form as a dict.

    Args:
        form: Parsed form element exposing ``find_all("input")``; each input
            must expose ``get(attribute_name)``.

    Returns:
        Dict mapping each input's ``name`` to its ``value`` (``None`` when the
        input has no ``value`` attribute). If several inputs share a name,
        the last one wins.
    """
    result = {}
    for input_element in form.find_all("input"):
        name = input_element.get("name")
        # Inputs without a name are never part of a submitted form; keeping
        # them would put a ``None`` key into the data that is POSTed later.
        if not name:
            continue
        result[name] = input_element.get("value")
    return result


def parse_actions(ul):
    """Extract the selectable actions from a list of ``PostBack`` links.

    Every ``<a>`` found in ``ul`` must have ``href="#"`` and an ``onclick``
    attribute whose first ``;``-separated statement has the form
    ``PostBack(<action>, <value>)``.

    Args:
        ul: Parsed element exposing ``find_all("a")``.

    Returns:
        List of ``(link_text, {"pbAction": action, "pbValue": value})``
        tuples, in the order the links are returned by ``find_all``.

    Raises:
        AssertionError: If a link does not match the expected shape.
        ValueError, SyntaxError: If the ``PostBack`` arguments are not plain
            literals.
    """
    result = []
    for a in ul.find_all("a"):
        assert a.get("href") == "#"
        # Only the first statement of the handler carries the PostBack call.
        action_js = a.get("onclick").split(";")[0]
        assert action_js.startswith("PostBack(")
        assert action_js.endswith(")")
        # The argument list comes from a remote page, so parse it as literals
        # only; ``eval`` would execute arbitrary expressions embedded in it.
        action, value = ast.literal_eval(action_js[len("PostBack("):-1])
        text = a.find_all(text=True)
        result.append((
            " ".join(text),
            {
                "pbAction": action,
                "pbValue": value
            }
        ))
    return result


def parse_text(page_body):
    """Flatten the top-level ``<div>``/``<p>`` children of the body to text.

    Args:
        page_body: Parsed element exposing ``select(css_selector)``.

    Returns:
        One line per selected element that contains any non-blank text. Within
        a line, the element's stripped text fragments are joined by single
        spaces; lines are joined by ``"\\n"``.
    """
    paragraphs = []
    for block in page_body.select("body>div,body>p"):
        # Strip every text fragment first, then drop the ones left empty.
        stripped = (fragment.strip() for fragment in block.find_all(text=True))
        line = " ".join(piece for piece in stripped if piece)
        if line:
            paragraphs.append(line)
    return "\n".join(paragraphs)

In [3]:
def parse_node(url, node_page_response):
    """Parse one story page into its title, text, form state and actions.

    Args:
        url: URL the page was requested from; the form's ``action`` attribute
            is resolved relative to it.
        node_page_response: Response object exposing ``status_code`` and
            ``text``.

    Returns:
        ``None`` when the text of the page's ``<h1>`` starts with ``"Rate"``
        (presumably the end-of-story rating page -- TODO confirm). Otherwise a
        tuple ``(header_text, text, action_form_url, action_form_values,
        actions)``, where ``actions`` is the list built by ``parse_actions``
        (empty when the page has no ``<ul>``).

    Raises:
        AssertionError: If the response status code is not 200.
    """
    assert node_page_response.status_code == 200
    # NOTE(review): no parser is named here, so the parse tree depends on
    # whichever parser bs4 selects in the current environment.
    page = BeautifulSoup(node_page_response.text)
    page_body = page.find("body")

    header_h1 = page_body.find("h1")
    header_text = " ".join(header_h1.find_all(text=True)).strip()
    # Removed so the heading is not repeated in the body text extracted below.
    header_h1.decompose()
    if header_text.startswith("Rate"):
        return None

    # Drop these elements (when present) before the body text is extracted.
    for item_id in ["svbanner", "m_navbar"]:
        item = page_body.find(id=item_id)
        if item:
            item.decompose()

    action_form = page_body.find("form")
    # Resolve the (relative) form target against the page URL with the
    # standard resolver instead of splitting on "/" and "./" by hand, which
    # mis-resolves "../" or absolute targets and URLs with "/" in the query.
    action_form_url = urljoin(url, action_form.get("action"))
    action_form_values = form_values(action_form)
    action_form.decompose()

    action_list = page_body.find("ul")
    if action_list:
        actions = parse_actions(action_list)
        action_list.decompose()
    else:
        actions = []
    text = parse_text(page_body)

    return header_text, text, action_form_url, action_form_values, actions

In [4]:
def parse_story_graph(first_page_url, timeout):
    """Crawl a story depth-first and return it as a nested dict.

    Args:
        first_page_url: URL of the story's first page, fetched with GET.
        timeout: Seconds to sleep before each follow-up POST. Despite the
            name, this is a politeness delay and not an HTTP timeout.

    Returns:
        ``None`` when ``parse_node`` returns ``None`` for the first page.
        Otherwise ``{"title", "content", "actions"}`` where ``actions`` is a
        list of ``{"action": link_text, "subgraph": <same structure or None>}``.
        A page whose ``(title, content)`` pair was already visited is returned
        with an empty ``actions`` list and is not expanded again.
    """
    # Visited (title, content) pairs; a set keeps the membership test O(1)
    # instead of scanning a list that grows with every page.
    context = set()

    # NOTE: recursion depth grows with the length of the longest chain of
    # not-yet-visited pages.
    def __parse_subgraph(page_url, response):
        resp_parsed = parse_node(page_url, response)
        if resp_parsed is None:
            return None
        title, content, form_url, form_state, actions = resp_parsed
        if (title, content) in context:
            return {"title": title, "content": content, "actions": []}
        context.add((title, content))
        action_subgraphs = []
        for action_text, action_variables in actions:
            time.sleep(timeout)
            # Submit the page's form fields with the chosen action's PostBack
            # fields layered on top.
            resp = requests.post(form_url, data=dict(form_state, **action_variables))
            action_subgraphs.append({"action": action_text, "subgraph": __parse_subgraph(form_url, resp)})
        return {"title": title, "content": content, "actions": action_subgraphs}

    return __parse_subgraph(first_page_url, requests.get(first_page_url))

In [5]:
def get_game_id_tags(url):
    """Fetch a story's landing page and return its game id and tag names.

    Args:
        url: URL of the story page, fetched with GET.

    Returns:
        Tuple ``(game_id, tags)``: ``game_id`` is the decimal string that
        follows ``../game/player/?`` in the play button's ``window.open``
        call, and ``tags`` is the list of link texts found next to the last
        ``<h3>`` whose text contains ``"Tags"`` (empty if there is none).

    Raises:
        AssertionError: If the response status is not 200 or the play button
            does not match the expected shape.
        ValueError, SyntaxError: If the ``window.open`` arguments are not
            plain literals.
    """
    resp = requests.get(url)
    assert resp.status_code == 200
    page = BeautifulSoup(resp.text)
    button = page.find(class_="storygame-play-button").find("input")
    evt = button.get("onclick")
    prefix, suffix = "window.open(", "); return false;"
    assert evt.startswith(prefix) and evt.endswith(suffix)
    # The handler text comes from a remote page, so parse its arguments as
    # literals only; ``eval`` would execute arbitrary expressions in it.
    open_args = ast.literal_eval(evt[len(prefix):-len(suffix)])
    # A single-argument call parses to a bare string rather than a tuple.
    game_url = open_args if isinstance(open_args, str) else open_args[0]
    assert game_url.startswith("../game/player/?")
    game_id = game_url.replace("../game/player/?", "")
    assert game_id.isdecimal()

    tag_links = []
    for header in page.find_all("h3"):
        if "Tags" in " ".join(header.find_all(text=True)):
            # The tag links are looked up in the heading's parent element.
            tag_links = header.parent.find_all("a")
    tags = [" ".join(link.find_all(text=True)) for link in tag_links]

    return game_id, tags

In [6]:
# Spot check against one live story page; the cell displays the returned (game_id, tags) tuple.
get_game_id_tags("https://chooseyourstory.com/story/A_Witch's_Inheritance.aspx")

('52067', ['Contest Entry', 'Drama', 'Fantasy', 'Female Protagonist'])

In [7]:
def parse_story_list(url, tags_manual, timeout):
    """Scrape one category listing into a DataFrame with one row per story.

    Args:
        url: URL of the listing page, fetched with GET.
        tags_manual: List of tags placed before each story's scraped tags.
        timeout: Seconds to sleep before and after each per-story request.
            Despite the name, this is a politeness delay, not an HTTP timeout.

    Returns:
        DataFrame with columns ``id``, ``title``, ``author``, ``length``,
        ``difficulty``, ``rating`` and ``tags`` (``;``-joined). Every column
        is converted to stripped strings.

    Raises:
        AssertionError: If the listing response status is not 200.
    """
    resp = requests.get(url)
    assert resp.status_code == 200
    table = BeautifulSoup(resp.text).find(id="MainContentPlaceHolder_StoriesGridView")
    # The first row is skipped (presumably the header row -- TODO confirm).
    rows = list(table.find_all("tr"))[1:]
    records = []
    for row in tqdm(rows):
        td_title, td_author, td_length, td_difficulty, td_rating = row.find_all("td")
        link_title = td_title.find("a")

        text_title = " ".join(link_title.find_all(text=True))
        # Resolve the relative story link against the listing URL with the
        # standard resolver rather than splitting on "/" and "../" by hand.
        url_title = urljoin(url, link_title.get("href"))
        author = " ".join(td_author.find("a").find_all(text=True))
        # Length and difficulty are read from the ``alt`` text of an image.
        length_text = td_length.find("img").get("alt")
        difficulty_text = td_difficulty.find("img").get("alt")
        rating = " ".join(td_rating.find_all(text=True))

        time.sleep(timeout)
        game_id, tags = get_game_id_tags(url_title)
        time.sleep(timeout)

        records.append({
            "id": game_id,
            "title": text_title,
            "author": author,
            "length": length_text,
            "difficulty": difficulty_text,
            "rating": rating,
            "tags": ";".join(tags_manual + tags),
        })
    df = pd.DataFrame.from_records(records)
    for column in df.columns:
        df[column] = df[column].apply(str).str.strip()
    return df

In [8]:
# Create the output directories used by the cells below; exist_ok makes re-running this cell a no-op.
os.makedirs("data", exist_ok=True)
os.makedirs("data/stories", exist_ok=True)

In [9]:
# Build the story index once and cache it as CSV; later runs load the cached
# file instead of scraping every category listing again.
if not os.path.exists("data/stories-list.csv"):
    # (listing URL, tags added manually to every story found in that listing)
    story_lists = [
        ("https://chooseyourstory.com/Stories/Fantasy.aspx", ["fantasy"]),
        ("https://chooseyourstory.com/Stories/Grimdark_Fantasy.aspx", ["grimdark", "fantasy"]),
        ("https://chooseyourstory.com/Stories/Sci-Fi.aspx", ["scifi"]),
        ("https://chooseyourstory.com/Stories/Modern.aspx", ["modern"]),
        ("https://chooseyourstory.com/Stories/Horror.aspx", ["horror"]),
        ("https://chooseyourstory.com/Stories/Love__0x26__Dating.aspx", ["dating"]),
        ("https://chooseyourstory.com/Stories/Mystery__0x2f__Thriller.aspx", ["mystery", "thriller"]),
        ("https://chooseyourstory.com/Stories/Family_Friendly.aspx", ["family-friendly"]),
        ("https://chooseyourstory.com/Stories/Historical.aspx", ["historical"]),
        ("https://chooseyourstory.com/Stories/Fan_Fiction.aspx", ["fan-fiction"]),
        ("https://chooseyourstory.com/Stories/Edutainment.aspx", ["edutainment"]),
    ]
    dfs = []
    for url, tags in story_lists:
        df = parse_story_list(url, tags, 0.1)
        dfs.append(df)
    df = pd.concat(dfs).reset_index(drop=True)
    # Normalise tags: lower-case, de-duplicate and sort, then re-join.
    df["tags"] = (
        df["tags"].str.lower()
        .apply(lambda val: val.split(";"))
        .apply(lambda val: sorted(set(val)))
        .apply(lambda val: ";".join(val))
    )
    # Map the site's descriptive length labels to their leading number;
    # labels missing from the dict map to None.
    length_to_number = {
        '1 - Make sure not to blink': 1,
        "2 - So short yo' momma thought it was a recipe": 2,
        '3 - A nice jog down the driveway': 3,
        '4 - A well spent lunch break': 4,
        '5 - Not going to lose any sleep': 5,
        "6 - It'll be a while, better grab a Snickers&reg;": 6,
        '7 - It keeps going and going': 7,
        '8 - Even light has to break at the rest stop': 8
    }
    df["length"] = df["length"].apply(length_to_number.get)
    # Same mapping for the difficulty labels.
    difficulty_to_number = {
        '1 - no possible way to lose': 1,
        '2 - walk in the park': 2,
        '3 - trek through the forest': 3,
        '4 - march in the swamp': 4,
        '5 - run through the jungle': 5,
        '6 - wandering through the desert': 6,
        '7 - wade in shark infested water': 7,
        '8 - mosie through a minefield': 8
    }
    df["difficulty"] = df["difficulty"].apply(difficulty_to_number.get)
    # Drop rows whose rating is "?". Take an explicit copy so the column
    # assignments below act on an independent frame instead of a slice of the
    # concatenated one (which triggers SettingWithCopyWarning).
    df = df.loc[df["rating"] != "?"].copy()
    df["id"] = df["id"].apply(int)
    df["rating"] = df["rating"].apply(float)
    df.to_csv("data/stories-list.csv", index=False)
else:
    df = pd.read_csv("data/stories-list.csv")
df.head()

Unnamed: 0,id,title,author,length,difficulty,rating,tags
0,25370,90 Minute Storygame: Prepare To Die,TheSophia,2,3,4.13,fantasy;war
1,12487,A Blaze of Glory: Chapter One,Marmotlord,3,4,4.52,fantasy;part of series;war;zombie
2,50303,A Chosen Hero,EbonVasilis,6,5,5.71,contest entry;fantasy;romance
3,27800,A Dragon's Legend,Wolfmist,2,3,3.72,animal perspective;fantasy
4,60232,A Fiery Winter,lopz66,6,6,5.42,contest entry;fantasy


In [51]:
# Spot check: parse the first page of one story and display the resulting tuple.
parse_node(
    "https://chooseyourstory.com/story/viewer/default.aspx?StoryId=38507",
    requests.get("https://chooseyourstory.com/story/viewer/default.aspx?StoryId=38507"),
)

('Introduction',
 "TOWER OF DOOM Your village lies in ruins. Destroyed by a mighty dragon. But you have no time to weep. You must avenge your people! The sky is gloomy, the cold wind stinging your bones. The tower stands tall, like a gladiator, it's door open for any potential challenger. Rumour has it that this tower contains on of the most grand treasure present in this world. Anyone who has dared to step into this tower has never returned. People say that it is guarded by the most dangerous creatures of the dark. The treasure is said to be on the top most floor, guarded by a dragon! The same dragon that destroyed your village! You go through the door and see a spiral staircase leading upwards. You pull out your sword, check your possessions, and step forth... Combat: 1/7 Life: 7/7 Coins: 5/100 Page Items Links Inventory",
 'https://chooseyourstory.com/story/viewer/default.aspx?StoryId=38507',
 {'__VIEWSTATE': '/wEPDwUKLTQ5MDI2MjY3NGRkOpXGwF8G1+hYcV19jlDmirYibRWl1SQDZ1oqAknk9sw=',
  

In [53]:
from datetime import datetime

In [75]:
# Story ids that are never crawled. The reason is not recorded in this
# notebook (presumably these stories failed or did not finish -- TODO confirm).
# Built once here instead of on every loop iteration.
skipped_story_ids = {12792, 60285, 60858, 40688, 58936, 17104, 12196, 38112, 1469, 304, 25338, 26739, 62013, 59840, 65204, 347,
                     66699, 49628, 56501, 65762, 40954, 66471, 65632, 47011, 51069, 29606}

# Crawl every listed story into data/stories/<id>.json. Stories that already
# have a file are skipped, so the loop can be interrupted and resumed.
for i, story_id in enumerate(df["id"]):
    print(i, len(df), story_id, datetime.now())
    if story_id in skipped_story_ids:
        continue
    story_url = f"https://chooseyourstory.com/story/viewer/default.aspx?StoryId={story_id}"
    destination = f"data/stories/{story_id}.json"
    if os.path.exists(destination):
        continue
    data = parse_story_graph(story_url, 0.5)
    # Write to a temporary name and rename afterwards. If serialisation fails
    # or is interrupted, no truncated <id>.json is left behind for the
    # existence check above to mistake for a finished story.
    partial_destination = destination + ".part"
    with open(partial_destination, "w") as target:
        json.dump(data, target)
    os.replace(partial_destination, destination)

0 884 25370 2021-10-20 19:53:29.147611
1 884 12487 2021-10-20 19:53:29.147793
2 884 50303 2021-10-20 19:53:29.147857
3 884 27800 2021-10-20 19:53:29.147909
4 884 60232 2021-10-20 19:53:29.147973
5 884 14237 2021-10-20 19:53:29.148034
6 884 14188 2021-10-20 19:53:29.148090
7 884 11951 2021-10-20 19:53:29.149126
8 884 34956 2021-10-20 19:53:29.149323
9 884 36321 2021-10-20 19:53:29.149529
10 884 27880 2021-10-20 19:53:29.149727
11 884 60671 2021-10-20 19:53:29.149911
12 884 61367 2021-10-20 19:53:29.150094
13 884 10489 2021-10-20 19:53:29.150276
14 884 25548 2021-10-20 19:53:29.150446
15 884 45379 2021-10-20 19:53:29.150611
16 884 5587 2021-10-20 19:53:29.150774
17 884 65742 2021-10-20 19:53:29.150936
18 884 13922 2021-10-20 19:53:29.151097
19 884 25584 2021-10-20 19:53:29.151267
20 884 17576 2021-10-20 19:53:29.151448
21 884 1118 2021-10-20 19:53:29.151629
22 884 10721 2021-10-20 19:53:29.151811
23 884 10808 2021-10-20 19:53:29.151989
24 884 106 2021-10-20 19:53:29.152208
25 884 57778 2

In [77]:
# Collect the names of story files that do not contain valid JSON. Nothing is
# deleted here; the cell only displays how many such files were found.
to_remove = []
for file in os.listdir("data/stories"):
    if file.endswith(".json"):
        with open(os.path.join("data/stories", file), "r") as src:
            try:
                json.load(src)
            except json.JSONDecodeError:
                to_remove.append(file)
len(to_remove)

0