In [1]:
import nltk.data
import numpy as np
import pandas as pd
from tqdm import tqdm

In [2]:
df_story_trees = pd.read_csv("data/cleaned/story-trees.csv")
df_story_trees.head()

Unnamed: 0,id,parent_id,input,output,story_id,children_count
0,0,,[ROOT],"The land of Kronnland is a mythical, wonderful...",12487,29
1,0-0,0,Start Danny's Campaign,Danny Blaze\nBackground :\nBorn in the summer ...,12487,8
2,0-0-0,0-0,Continue,With all the townsfolk transformed into mindle...,12487,7
3,0-0-0-0,0-0-0,Get back to Bren and warn him about the danger.,You run down the hill as Andrew's army regroup...,12487,0
4,0-0-0-1,0-0-0,Watch the battle from your hideout.,"Although worried, you stay in your hideout and...",12487,5


In [3]:
df_stories_train = pd.read_csv("data/cleaned/stories-train.csv")
df_stories_train.head()

Unnamed: 0,id,title,author,length,difficulty,rating,tags
0,34134,Harry Potter and the Philosopher's Drone,jbranch1998,2,5,3.72,contest entry;fantasy;romance;socially important
1,49850,The Exploits of Fail-Man,RoyalGhost_007,6,4,4.66,contest entry;drama;humor;modern;serious;super...
2,51067,Duo,Digit,3,6,4.42,contest entry;dating;dystopia;post-apocalyptic
3,44543,Airport Nightmare,MsGwinn,4,3,4.56,fantasy;modern;romance
4,54559,The Maize Runner,jodithewitch,2,2,3.8,contest entry;humor


In [4]:
df_stories_test = pd.read_csv("data/cleaned/stories-test.csv")
df_stories_test.head()

Unnamed: 0,id,title,author,length,difficulty,rating,tags
0,24586,The Dreamcage,TheSophia,3,3,3.85,family-friendly;humor
1,27879,The Land of Bad Writing,Will11,2,1,5.16,edutainment;humor;socially important
2,10359,Survive the Zombies,Killa_Robot,5,6,6.22,horror;humor;zombie
3,61040,house among the thorns,sunnypony03,3,3,3.5,horror
4,22578,The Lemonade Business 2,MegLuvMTrench,1,3,3.2,humor;modern


In [5]:
df_guttenberg_sentences = pd.read_csv("data/guttenberg-sentences.csv")
df_guttenberg_sentences.head()

Unnamed: 0,text
0,"""Yessir, I do."
1,!+« riefen sie.
2,!chaleur du !
3,!« sagte Bebel.
4,!” I like that.


In [6]:
df_story_trees["parent_id"] = df_story_trees["parent_id"].fillna("")
df_story_trees.head()

Unnamed: 0,id,parent_id,input,output,story_id,children_count
0,0,,[ROOT],"The land of Kronnland is a mythical, wonderful...",12487,29
1,0-0,0,Start Danny's Campaign,Danny Blaze\nBackground :\nBorn in the summer ...,12487,8
2,0-0-0,0-0,Continue,With all the townsfolk transformed into mindle...,12487,7
3,0-0-0-0,0-0-0,Get back to Bren and warn him about the danger.,You run down the hill as Andrew's army regroup...,12487,0
4,0-0-0-1,0-0-0,Watch the battle from your hideout.,"Although worried, you stay in your hideout and...",12487,5


In [7]:
df_story_trees.shape

(60882, 6)

In [8]:
sentence_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [9]:
def get_stories_sentences(df_story_trees):
    stories_sentences = set()
    for text in tqdm(pd.concat([df_story_trees["input"], df_story_trees["output"]]).drop_duplicates()):
        sents = sentence_tokenizer.tokenize(text)
        for sent in sents:
            if sent:
                stories_sentences.add(sent)
    stories_sentences = sorted(stories_sentences, key=len)
    stories_sentences = pd.Series(stories_sentences)
    stories_sentences_lengths = stories_sentences.apply(len)
    stories_sentences = stories_sentences[(stories_sentences_lengths >= stories_sentences_lengths.quantile(0.05)) & \
                                          (stories_sentences_lengths <= stories_sentences_lengths.quantile(0.99))]
    df = pd.DataFrame({
        "id": range(len(stories_sentences)),
        "text": stories_sentences.tolist(),
    })
    df = df.set_index("id")
    return df

In [10]:
df_story_sentences = get_stories_sentences(df_story_trees)
df_story_sentences.head()

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 75333/75333 [00:15<00:00, 5015.01it/s]


Unnamed: 0_level_0,sentence
id,Unnamed: 1_level_1
0,"""Sorry, Soren."""
1,You were eaten.
2,I’m sorry Suzy.
3,the voice asks.
4,"Yeah, I guess."""


In [11]:
from joblib.parallel import Parallel, delayed

In [12]:
def get_usable_contexts(sentence_id, sentence, df_story_trees):
    mask = df_story_trees["input"].str.contains(sentence, regex=False) | df_story_trees["output"].str.contains(sentence, regex=False)
    result = []
    for _, row in df_story_trees.loc[mask].iterrows():
        sid = row["story_id"]
        cid = row["id"]
        sdf = df_story_trees.loc[df_story_trees["story_id"] == sid]
        cids = [cid] + sdf.loc[sdf["parent_id"].str.startswith(cid), "id"].tolist()
        result.append((sid, cids))
    return result


def get_df_usable_contexts(df_story_trees, df):
    result = []
    for sentence_id, row in df.iterrows():
        result.append((sentence_id, get_usable_contexts(sentence_id, row["text"], df_story_trees)))
    return result

In [13]:
story_sentences_usable_contexts = list(Parallel(n_jobs=-1)(
    delayed(get_df_usable_contexts)(df_story_trees, df_story_sentences.iloc[i * 2048 : (i + 1) * 2048])
    for i in tqdm(range(int(np.ceil(len(df_story_sentences) / 2048))))
))

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 305/305 [51:50<00:00, 10.20s/it]


In [14]:
context_sentence_mapping = []
for block in tqdm(story_sentences_usable_contexts):
    for sentence_id, relations in block:
        for story_id, context_ids in relations:
            for context_id in context_ids:
                context_sentence_mapping.append({
                    "story_id": story_id,
                    "context_id": context_id,
                    "sentence_id": sentence_id
                })

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 305/305 [00:04<00:00, 68.49it/s]


In [15]:
df_context_sentence_mapping = pd.DataFrame.from_records(context_sentence_mapping)
df_context_sentence_mapping.head(10)

Unnamed: 0,story_id,context_id,sentence_id
0,37696,0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-1-0-0-0-0-0-0-0-...,0
1,37696,0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-1-0-0-0-0-0-0-0-...,0
2,14312,0-0-0-0-0-0-0-0-1-0-0-1-0-1-1-1-2,1
3,14312,0-0-0-0-0-0-0-0-1-0-0-1-0-1-1-1-2-0,1
4,31013,0-2-0-0-0-0-0-1-0-0-1-0-0-0-0,2
5,31013,0-2-0-0-0-0-0-1-0-0-1-0-0-0-0-0,2
6,31013,0-2-0-0-0-0-0-1-0-0-1-0-0-0-0-1,2
7,31013,0-2-0-0-0-0-0-1-0-0-1-0-0-0-0-1-0,2
8,31013,0-2-0-0-0-0-0-1-0-0-1-0-0-0-0-1-0-0,2
9,31013,0-2-0-0-0-0-0-1-0-0-1-0-0-0-0-1-0-0-0,2


In [18]:
df_story_sentences["length"] = df_story_sentences["text"].apply(len)
df_story_sentences = df_story_sentences.sort_values("length")
del df_story_sentences["length"]
df_story_sentences.head()

Unnamed: 0_level_0,sentence
id,Unnamed: 1_level_1
0,"""Sorry, Soren."""
3421,Are they alive?
3420,What DO you do?
3419,"""This is yours."
3418,Leave the halls


In [20]:
df_story_sentences.to_csv("data/cleaned/story-sentences.csv", index=True)
df_context_sentence_mapping.to_csv("data/cleaned/story-context-sentence-mapping.csv", index=False)