# Lowlands analysis

## Imports
Load the necessary modules and initialise the Frog language-processing pipeline.

In [56]:
import os
import csv
import re
import random as rd
import pandas as pd
from frog import Frog, FrogOptions


# Configure Frog with the parser, morphological analysis, chunking and
# named-entity recognition switched off; multi-word units are requested.
# NOTE(review): the start-up log printed by this cell reports
# "MWU disabled, because the Parser is deselected", so mwu=True appears to
# have no effect while parser=False -- confirm whether this is intended.
frog_options = FrogOptions(
    parser=False,
    morph=False,
    chunking=False,
    mwu=True,
    ner=False,
)
frog = Frog(frog_options)

# Remember the directory the notebook was started from; the transcript
# folder is looked up relative to it.
wd = os.getcwd()

20210730:113147:681:config read from: /home/bma-vandijk/lama/share/frog/nld/frog.cfg
20210730:113147:681:configuration version = 0.12
20210730:113147:681: MWU disabled, because the Parser is deselected
20210730:113147:682:mblem-:Initiating lemmatizer...
ucto: textcat configured from: /home/bma-vandijk/lama/share/ucto/textcat.cfg
20210730:113149:216:tagger-tagger-:reading subsets from /home/bma-vandijk/lama/share/frog/nld//subsets.cgn
20210730:113149:216:tagger-tagger-:reading constraints from /home/bma-vandijk/lama/share/frog/nld//constraints.cgn
20210730:113149:216:Fri Jul 30 11:31:49 2021 Initialization done.


## Preprocessing
We load all the stories into a dictionary that maps each story id to its story string (it is built from a list of [story id, story string] pairs).
The transcriptions contain some extra notational information, which is removed with the regular-expression function remove().


In [140]:
# T

def remove(raw):
    """Strip transcription notation from a raw story string.

    The substitutions below are applied one after another, in the listed
    order, each one operating on the result of the previous one.

    Parameters
    ----------
    raw : str
        The transcript text as read from file.

    Returns
    -------
    str
        The text with the notation removed. Surrounding whitespace is left
        untouched, so removed items may leave double spaces behind.
    """
    substitutions = (
        (r'(X|x)x{2,}', ''),                    # unclear speech codes (Xxx-xxx)
        (r'\w+\.?\*n', ''),                     # self-made words, onomatopoeias (*n)
        (r'(\w+\.?\*a)', ''),                   # broken off words (*a)
        (r'\.?\*v', ''),                        # marker for foreign words (*v); the word itself stays
        (r'(\w+\-)+(\w+)\.?\*u', r'\2'),        # restarts (*u): keep only the fully pronounced word
        # Disabled rule (capitalized words, single letters), kept for reference:
        # (r"([A-Z]+(\w)*-?(\w)+('(\w)+)?)", ''),
        (r'[eE]hm', ''),                        # standard interjection
        (r'\?{2,}', ''),                        # runs of two or more question marks
    )

    cleaned = raw
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned
    


def extract_storystrings():
    """Read every transcript file and return the cleaned story texts.

    Files are taken from the ``Lowlands_transcripts`` folder under the
    module-level working directory ``wd``. They are visited group by group:
    a file belongs to a group when the first two characters of its name
    occur in that group's prefix list, and within a group the file names
    are processed in sorted order.

    Returns
    -------
    dict
        Maps the first four characters of each file name to the file's
        content after it has been passed through ``remove()``. Should two
        files share those four characters, the later one overwrites the
        earlier one.
    """
    theme_prefixes = (
        ['01', '11', '21'],
        ['02', '12', '22'],
        ['03', '13', '23'],
        ['04', '14', '24'],
        ['05', '15', '25'],
    )

    collected = {}

    for prefixes in theme_prefixes:
        filenames = sorted(
            name
            for name in os.listdir(f'{wd}/Lowlands_transcripts')
            if name[:2] in prefixes
        )

        for filename in filenames:
            # utf-8-sig drops a leading byte-order mark if the file has one
            with open(f'{wd}/Lowlands_transcripts/{filename}', mode='r', encoding='utf-8-sig') as handle:
                collected[filename[:4]] = remove(handle.read())

    return collected

In [141]:
# Load and clean all transcripts once; `stories` is the dict returned by
# extract_storystrings() and is the input to the analysis cells below.
stories = extract_storystrings()

## Analysis
First we would like to see how type/token ratios develop as the story chains progress. Does this differ between the three different stories?

In [146]:
def ttr(storydict):
    """Compute the type/token ratio of every story.

    Each story is lower-cased and split on whitespace; the ratio is the
    number of distinct tokens divided by the total number of tokens,
    rounded to two decimals. Because the split is on whitespace only,
    punctuation attached to a word is part of that token.

    Parameters
    ----------
    storydict : dict
        Maps a story id to the story text (str).

    Returns
    -------
    dict
        Maps each story id to its rounded type/token ratio. A story with no
        tokens at all (empty or whitespace-only text) gets 0.0 instead of
        raising ZeroDivisionError.
    """
    ratios = {}

    for story_id, text in storydict.items():
        # Tokenise once and reuse the list for both counts.
        words = text.lower().split()

        if not words:
            # The ratio is undefined without tokens; report 0.0 rather than
            # aborting the whole analysis on a single empty story.
            ratios[story_id] = 0.0
            continue

        ratios[story_id] = round(len(set(words)) / len(words), 2)

    return ratios

In [147]:
# Display the type/token ratio per story id for all loaded stories.
ttr(stories)

{'0100': 0.54,
 '0101': 0.51,
 '0102': 0.43,
 '0103': 0.47,
 '0104': 0.48,
 '0105': 0.52,
 '0106': 0.45,
 '1100': 0.54,
 '1101': 0.47,
 '1102': 0.4,
 '1103': 0.46,
 '1104': 0.48,
 '1105': 0.59,
 '1106': 0.59,
 '1107': 0.58,
 '2100': 0.54,
 '2101': 0.64,
 '2102': 0.52,
 '2103': 0.67,
 '2104': 0.7,
 '2105': 0.69,
 '2106': 0.71,
 '0200': 0.53,
 '0201': 0.47,
 '0202': 0.5,
 '0203': 0.56,
 '0204': 0.74,
 '0205': 0.62,
 '0206': 0.64,
 '1200': 0.53,
 '1201': 0.44,
 '1202': 0.44,
 '1203': 0.48,
 '1204': 0.46,
 '1205': 0.44,
 '1206': 0.48,
 '2200': 0.53,
 '2201': 0.47,
 '2202': 0.42,
 '2203': 0.45,
 '2204': 0.5,
 '2205': 0.54,
 '2206': 0.63,
 '0300': 0.5,
 '0301': 0.43,
 '0302': 0.46,
 '0303': 0.53,
 '0304': 0.5,
 '0305': 0.7,
 '0306': 0.66,
 '1300': 0.5,
 '1301': 0.48,
 '1302': 0.44,
 '1303': 0.43,
 '1304': 0.5,
 '1305': 0.49,
 '1306': 0.55,
 '2300': 0.5,
 '2301': 0.48,
 '2302': 0.53,
 '2303': 0.44,
 '2304': 0.47,
 '2305': 0.59,
 '2306': 0.58,
 '0400': 0.54,
 '0401': 0.45,
 '0402': 0.41,
 '040