In [1]:
import pandas as pd
import json
import re
import os
from itertools import chain
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegressionCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
STORIES_DIRECTORY = "data/stories/"

In [3]:
PATTERNS_TO_CUT = [
    "Page Items Links Inventory",
    "Does [0-9]+ damage",
    "Hereâ€™s a rundown of how you did.*((\.)|$)",
    "Current Floor\: [0-9]+",
    "Combat\: [0-9]+\/[0-9]+ Life\: [0-9]+\/[0-9]+ Coins\: [0-9]+\/[0-9]+",
    "(([a-zA-Z0-9\#.]|(\s*,\s*))+\s*\{(\s*[a-zA-Z-]+\s*\:\s*[\!a-zA-Z0-9\#%. ]+\s*;)*\s*\}\s*)+",
    "Your Health\: [0-9]+/[0-9]+",
    "Your Score is [0-9]+",
]

def parse_story_as_dataframe(filename):
    def _process_content(content):
        for pattern in PATTERNS_TO_CUT:
            content = re.sub(pattern, "", content).strip()
        return content
    
    def _parse_node(previous_ids, previous_action, current_id, node):
        content = _process_content(node["content"])
        result = [
            {
                "id": "-".join(previous_ids + [current_id]),
                "parent_id": "-".join(previous_ids),
                "input": previous_action,
                "output": content,
            }
        ]
        for i, item in enumerate(node["actions"]):
            if item["subgraph"]:
                result += _parse_node(
                    previous_ids + [str(current_id)],
                    item["action"],
                    str(i),
                    item["subgraph"]
                )
        return result
    
    
    with open(filename, "r") as src:
        data = json.load(src)
    return pd.DataFrame.from_records(_parse_node([], "[ROOT]", "0", data))

In [4]:
df_stories = pd.read_csv("data/stories-list.csv")
df_stories.head()

Unnamed: 0,id,title,author,length,difficulty,rating,tags
0,25370,90 Minute Storygame: Prepare To Die,TheSophia,2,3,4.13,fantasy;war
1,12487,A Blaze of Glory: Chapter One,Marmotlord,3,4,4.52,fantasy;part of series;war;zombie
2,50303,A Chosen Hero,EbonVasilis,6,5,5.71,contest entry;fantasy;romance
3,27800,A Dragon's Legend,Wolfmist,2,3,3.72,animal perspective;fantasy
4,60232,A Fiery Winter,lopz66,6,6,5.42,contest entry;fantasy


In [5]:
df_stories.shape

(884, 7)

In [6]:
df_stories_content = []
for story_id in df_stories["id"]:
    file_name = f"data/stories/{story_id}.json"
    if os.path.exists(file_name):
        df_stories_content.append(
            parse_story_as_dataframe(file_name).assign(story_id=story_id)
        )
df_stories_content = pd.concat(df_stories_content).reset_index(drop=True)
df_stories_content.head()

Unnamed: 0,id,parent_id,input,output,story_id
0,0,,[ROOT],"""Alright, men, you are about to be thrown into...",25370
1,0,,[ROOT],"The land of Kronnland is a mythical, wonderful...",12487
2,0-0,0,Start Danny's Campaign,Danny Blaze\nBackground :\nBorn in the summer ...,12487
3,0-0-0,0-0,Continue,With all the townsfolk transformed into mindle...,12487
4,0-0-0-0,0-0-0,Get back to Bren and warn him about the danger.,You run down the hill as Andrew's army regroup...,12487


In [7]:
df_stories_content.loc[df_stories_content["input"] == ""]

Unnamed: 0,id,parent_id,input,output,story_id
46174,0-0-1-0-3,0-0-1-0,,You decide that you need to talk to some more ...,37603
46176,0-0-1-0-5,0-0-1-0,,,37603


In [8]:
df_stories_content = df_stories_content.loc[df_stories_content["input"] != ""]

In [9]:
df_stories_content.shape

(62743, 5)

In [10]:
df_stories_content = df_stories_content.loc[df_stories_content["output"] != ""]
df_stories_content.head()

Unnamed: 0,id,parent_id,input,output,story_id
0,0,,[ROOT],"""Alright, men, you are about to be thrown into...",25370
1,0,,[ROOT],"The land of Kronnland is a mythical, wonderful...",12487
2,0-0,0,Start Danny's Campaign,Danny Blaze\nBackground :\nBorn in the summer ...,12487
3,0-0-0,0-0,Continue,With all the townsfolk transformed into mindle...,12487
4,0-0-0-0,0-0-0,Get back to Bren and warn him about the danger.,You run down the hill as Andrew's army regroup...,12487


In [11]:
df_stories_content = df_stories_content.loc[~df_stories_content["input"].str.contains("^Restart")]
df_stories_content = df_stories_content.loc[~df_stories_content["input"].str.contains("Reset Game")]
df_stories_content = df_stories_content.loc[~df_stories_content["input"].str.contains("\(Restart")]
df_stories_content = df_stories_content.loc[~df_stories_content["input"].str.contains("\(Restart the game")]
df_stories_content = df_stories_content.loc[~df_stories_content["input"].str.contains("Let me restart")]

while True:
    mask_keep = (df_stories_content["parent_id"].isin(df_stories_content["id"])) | (df_stories_content["input"] == "[ROOT]")
    df_stories_content = df_stories_content.loc[mask_keep]
    if not any(~mask_keep):
        break
df_stories_content.head()

Unnamed: 0,id,parent_id,input,output,story_id
0,0,,[ROOT],"""Alright, men, you are about to be thrown into...",25370
1,0,,[ROOT],"The land of Kronnland is a mythical, wonderful...",12487
2,0-0,0,Start Danny's Campaign,Danny Blaze\nBackground :\nBorn in the summer ...,12487
3,0-0-0,0-0,Continue,With all the townsfolk transformed into mindle...,12487
4,0-0-0-0,0-0-0,Get back to Bren and warn him about the danger.,You run down the hill as Andrew's army regroup...,12487


In [12]:
story_tree_node_count = df_stories_content.groupby("story_id")["id"].count()
story_tree_node_count

story_id
22       365
41        51
106       36
140       13
161      102
        ... 
66624     14
66703      1
66709     44
66888     86
66895     19
Name: id, Length: 858, dtype: int64

In [13]:
df_stories_content = df_stories_content.loc[~df_stories_content["story_id"].isin(
    story_tree_node_count[story_tree_node_count == 1].index
)]
df_stories_content.head()

Unnamed: 0,id,parent_id,input,output,story_id
1,0,,[ROOT],"The land of Kronnland is a mythical, wonderful...",12487
2,0-0,0,Start Danny's Campaign,Danny Blaze\nBackground :\nBorn in the summer ...,12487
3,0-0-0,0-0,Continue,With all the townsfolk transformed into mindle...,12487
4,0-0-0-0,0-0-0,Get back to Bren and warn him about the danger.,You run down the hill as Andrew's army regroup...,12487
6,0-0-0-1,0-0-0,Watch the battle from your hideout.,"Although worried, you stay in your hideout and...",12487


In [14]:
df_stories = df_stories.loc[df_stories["id"].isin(set(df_stories_content["story_id"]))]
df_stories.head()

Unnamed: 0,id,title,author,length,difficulty,rating,tags
1,12487,A Blaze of Glory: Chapter One,Marmotlord,3,4,4.52,fantasy;part of series;war;zombie
2,50303,A Chosen Hero,EbonVasilis,6,5,5.71,contest entry;fantasy;romance
3,27800,A Dragon's Legend,Wolfmist,2,3,3.72,animal perspective;fantasy
4,60232,A Fiery Winter,lopz66,6,6,5.42,contest entry;fantasy
5,14237,"A Hero is Born, Episode 2",hugo23,4,3,4.56,fantasy;part of series;serious;war


In [15]:
df_stories.shape

(835, 7)

In [16]:
df_stories["tags"] = df_stories["tags"].str.replace("family friendly", "family-friendly")
df_stories = df_stories.loc[~df_stories["tags"].str.contains("foreign language")]
df_stories["tags"] = df_stories["tags"].apply(lambda val: ";".join(
    sorted(set(val.split(";")) - {"part of series", "poetry", "rpg", "fan-fiction"}),
))
df_stories.head()

Unnamed: 0,id,title,author,length,difficulty,rating,tags
1,12487,A Blaze of Glory: Chapter One,Marmotlord,3,4,4.52,fantasy;war;zombie
2,50303,A Chosen Hero,EbonVasilis,6,5,5.71,contest entry;fantasy;romance
3,27800,A Dragon's Legend,Wolfmist,2,3,3.72,animal perspective;fantasy
4,60232,A Fiery Winter,lopz66,6,6,5.42,contest entry;fantasy
5,14237,"A Hero is Born, Episode 2",hugo23,4,3,4.56,fantasy;serious;war


In [17]:
df_stories.shape

(826, 7)

In [18]:
tags = sorted(set(chain(*df_stories["tags"].apply(lambda text: text.split(";")))))
tags

['',
 'action adventure',
 'animal perspective',
 'anti-hero',
 'based off a true story',
 'biblical',
 'contest entry',
 'cyberpunk',
 'cyoa movie',
 'dating',
 'drama',
 'dystopia',
 'edgelord',
 'edutainment',
 'family-friendly',
 'fantasy',
 'female protagonist',
 'grimdark',
 'historical',
 'horror',
 'humor',
 'lgbt',
 'modern',
 'mystery',
 'post-apocalyptic',
 'previously featured',
 'psychological',
 'puzzle',
 'quiz',
 'romance',
 'science fiction',
 'scifi',
 'serious',
 'socially important',
 'spiritual',
 'superhero',
 'supernatural',
 'thriller',
 'villain protagonist',
 'war',
 'western',
 'zombie']

In [19]:
pd.set_option("display.max_columns", 50)

In [20]:
df_tag_corrs = pd.DataFrame.from_records([{tag: tag in row["tags"] for tag in tags} for _, row in df_stories.iterrows()]).corr()
for tag, row in df_tag_corrs.iterrows():
    print(tag)
    print(row.sort_values())
    print()
    print("-" * 80)
    print()


                         NaN
action adventure         NaN
animal perspective       NaN
anti-hero                NaN
based off a true story   NaN
biblical                 NaN
contest entry            NaN
cyberpunk                NaN
cyoa movie               NaN
dating                   NaN
drama                    NaN
dystopia                 NaN
edgelord                 NaN
edutainment              NaN
family-friendly          NaN
fantasy                  NaN
female protagonist       NaN
grimdark                 NaN
historical               NaN
horror                   NaN
humor                    NaN
lgbt                     NaN
modern                   NaN
mystery                  NaN
post-apocalyptic         NaN
previously featured      NaN
psychological            NaN
puzzle                   NaN
quiz                     NaN
romance                  NaN
science fiction          NaN
scifi                    NaN
serious                  NaN
socially important       NaN
spiritual    

In [21]:
df_stories_content.loc[df_stories_content["input"].str.contains("Restart")]

Unnamed: 0,id,parent_id,input,output,story_id
31075,0-0-0-0-0-0-0,0-0-0-0-0-0,You died too early! Restart the game,"Wednesday, 15 May 2015\nLeo\n-----\nThe black ...",40697
31079,0-0-0-0-0-1-0-0-0,0-0-0-0-0-1-0-0,You died too early! Restart the game,"Wednesday, 15 May 2015\nLeo\n-----\nThe black ...",40697
31148,0-0-0-0-0-1-0-1-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-...,0-0-0-0-0-1-0-1-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-...,You died too early! Restart the game,"Wednesday, 15 May 2015\nLeo\n-----\nThe black ...",40697
31164,0-0-0-0-0-2-0,0-0-0-0-0-2,You died too early! Restart the game,"Wednesday, 15 May 2015\nLeo\n-----\nThe black ...",40697
52385,0-2-0-0-0-0-0-1-1-1-0,0-2-0-0-0-0-0-1-1-1,"Oops, I Failed, Let Me Restart","You turn on your computer, go to agar.io, and ...",32705


In [22]:
df_stories_content.loc[df_stories_content["input"].str.contains("Reset Game")]

Unnamed: 0,id,parent_id,input,output,story_id


In [23]:
counts = []
for _, row in df_stories_content[["id", "story_id"]].drop_duplicates().iterrows():
    sdf = df_stories_content.loc[df_stories_content["story_id"] == row["story_id"]]
    count = sdf.loc[(sdf["id"] != row["id"]) & sdf["id"].str.startswith(row["id"])].shape[0]
    counts.append({"story_id": row["story_id"], "id": row["id"], "children_count": count})
df_stories_content = df_stories_content.merge(
    pd.DataFrame.from_records(counts),
    left_on=["story_id", "id"],
    right_on=["story_id", "id"],
)
df_stories_content.head()

Unnamed: 0,id,parent_id,input,output,story_id,children_count
0,0,,[ROOT],"The land of Kronnland is a mythical, wonderful...",12487,29
1,0-0,0,Start Danny's Campaign,Danny Blaze\nBackground :\nBorn in the summer ...,12487,8
2,0-0-0,0-0,Continue,With all the townsfolk transformed into mindle...,12487,7
3,0-0-0-0,0-0-0,Get back to Bren and warn him about the danger.,You run down the hill as Andrew's army regroup...,12487,0
4,0-0-0-1,0-0-0,Watch the battle from your hideout.,"Although worried, you stay in your hideout and...",12487,5


In [24]:
from sklearn.model_selection import train_test_split

In [25]:
tag_vc = df_stories["tags"].value_counts()
tag_vc

fantasy                                                                                          45
humor;modern                                                                                     32
horror                                                                                           24
modern                                                                                           14
fantasy;serious                                                                                  14
                                                                                                 ..
historical;modern;war                                                                             1
action adventure;anti-hero;contest entry;edgelord;fantasy;grimdark;modern;villain protagonist     1
animal perspective;humor;modern                                                                   1
anti-hero;edgelord;modern;psychological;thriller                                                  1


In [26]:
df_stories_train = df_stories.loc[df_stories["tags"].isin(tag_vc[tag_vc == 1].index)]
df_stories_train.head()

Unnamed: 0,id,title,author,length,difficulty,rating,tags
1,12487,A Blaze of Glory: Chapter One,Marmotlord,3,4,4.52,fantasy;war;zombie
2,50303,A Chosen Hero,EbonVasilis,6,5,5.71,contest entry;fantasy;romance
9,36321,A Tale of War: Thragnazil,TheBossWriter,3,6,4.33,drama;fantasy;serious;war
11,60671,Anaria: Quest for the Sword of Light,BrighamOlsen24,3,6,4.09,action adventure;fantasy;romance;serious
12,61367,Anaria: The Demon Necromancer,BrighamOlsen24,3,6,3.3,action adventure;fantasy


In [27]:
df_stories_split_bytagcount = df_stories.loc[df_stories["tags"].isin(tag_vc[tag_vc > 1].index)]
df_stories_split_bytagcount.head()

Unnamed: 0,id,title,author,length,difficulty,rating,tags
3,27800,A Dragon's Legend,Wolfmist,2,3,3.72,animal perspective;fantasy
4,60232,A Fiery Winter,lopz66,6,6,5.42,contest entry;fantasy
5,14237,"A Hero is Born, Episode 2",hugo23,4,3,4.56,fantasy;serious;war
6,14188,A Hero is Born..,hugo23,2,4,3.97,fantasy;serious
8,34956,A Magical Tale,adventurer,2,3,3.37,fantasy


In [28]:
df_stories_split_bytagcount_train, df_stories_test = train_test_split(df_stories_split_bytagcount, stratify=df_stories_split_bytagcount["tags"], random_state=42, test_size=0.3)
df_stories_split_bytagcount_train.head()

Unnamed: 0,id,title,author,length,difficulty,rating,tags
241,7794,Terra Proxima: The Lines are Drawn,Rommel,6,4,5.78,scifi;serious;war
20,17576,Bob and His Adventure,ohlookitstrang,2,3,3.33,fantasy;humor
766,22537,The Maze Runner,jkid14,2,3,3.0,humor;serious
559,13208,My Sweet Anna,ZombieAuthor,2,3,3.66,dating;drama;romance
657,48788,Surviving Pre-School,Ogre11,3,2,5.2,family-friendly;humor


In [29]:
df_stories_test.head()

Unnamed: 0,id,title,author,length,difficulty,rating,tags
664,24586,The Dreamcage,TheSophia,3,3,3.85,family-friendly;humor
867,27879,The Land of Bad Writing,Will11,2,1,5.16,edutainment;humor;socially important
480,10359,Survive the Zombies,Killa_Robot,5,6,6.22,horror;humor;zombie
446,61040,house among the thorns,sunnypony03,3,3,3.5,horror
377,22578,The Lemonade Business 2,MegLuvMTrench,1,3,3.2,humor;modern


In [30]:
df_stories_train_concat = pd.concat([df_stories_train, df_stories_split_bytagcount_train])
df_stories_train_concat = df_stories_train_concat.sample(len(df_stories_train_concat), random_state=42).reset_index(drop=True)
df_stories_train_concat.head()

Unnamed: 0,id,title,author,length,difficulty,rating,tags
0,34134,Harry Potter and the Philosopher's Drone,jbranch1998,2,5,3.72,contest entry;fantasy;romance;socially important
1,49850,The Exploits of Fail-Man,RoyalGhost_007,6,4,4.66,contest entry;drama;humor;modern;serious;super...
2,51067,Duo,Digit,3,6,4.42,contest entry;dating;dystopia;post-apocalyptic
3,44543,Airport Nightmare,MsGwinn,4,3,4.56,fantasy;modern;romance
4,54559,The Maize Runner,jodithewitch,2,2,3.8,contest entry;humor


In [31]:
df_stories_content["input"] = df_stories_content["input"].str.replace("\xa0", " ")
df_stories_content["output"] = df_stories_content["output"].str.replace("\xa0", " ")

In [32]:
os.makedirs("data/cleaned", exist_ok=True)
df_stories.reset_index(drop=True).to_csv("data/cleaned/stories.csv", index=False)
df_stories_train_concat.reset_index(drop=True).to_csv("data/cleaned/stories-train.csv", index=False)
df_stories_test.reset_index(drop=True).to_csv("data/cleaned/stories-test.csv", index=False)
df_stories_content.reset_index(drop=True).to_csv("data/cleaned/story-trees.csv", index=False)