In [None]:
# This notebook's output is redacted because it contains personal data
import pandas as pd
import re
import numpy as np

In [None]:
# Parse the raw diary text into one record per date token.
# A token counts as a date when it contains a dotted number triple (digits.digits.digits).
date_pattern = re.compile(r".*[0-9]+[.][0-9]+[.][0-9]+.*")
with open("dreams.txt", encoding='utf-8') as file:
    content = file.read()

# Raw string: "\s" inside a plain literal is an invalid escape sequence
# (SyntaxWarning on Python 3.12+). The capturing group keeps the separators in
# the token list, so spacing and punctuation are preserved in `line`.
content = re.split(r'([\s,;()]+)', content)
lines = []
date = None
line = ""
for word in content:
    if date_pattern.match(word):
        # A new date token closes the entry collected for the previous date.
        # Text seen before the very first date is not reset and therefore ends
        # up in the first entry.
        if date is not None:
            lines.append({"date": date, "content": line})
            line = ""
        date = word
    else:
        line += word
# NOTE(review): the text collected after the last date token is never appended,
# so the final entry of dreams.txt is missing from `dreams`. Left unchanged on
# purpose: later cells address rows by hard-coded labels that would shift if a
# row were added here. Confirm whether the last entry should be kept.
dreams = pd.DataFrame(lines)
dreams.head()
# DataFrame with columns 'date' and 'content'

In [None]:
# List the distinct raw date tokens.
dreams["date"].unique()

In [None]:
# Show the rows whose date token contains a hyphen.
dreams[dreams["date"].str.contains("-")]

In [None]:
# Convert date column to datetime
# dayfirst=True makes pandas read ambiguous tokens with the day before the month.
dreams["date"] = pd.to_datetime(dreams["date"], dayfirst=True)
dreams["date"]

In [None]:
# Extracting time of day and status from first line
# re.match anchors at the start of the string and "." does not match a newline,
# so number_idx is True only when a digit occurs before the first newline.
number_idx = dreams["content"].apply(lambda x: bool(re.match(r'.*[0-9].*', x)))
# Take the text before the first newline of each flagged row and split it on "-";
# expand=True spreads the parts over integer-named columns.
times = dreams[number_idx].content.str.split("\n", expand=True)[0].str.split("-", expand=True)
times

In [None]:
# Rows whose first line produced a third part (column 2).
times[times[2].notna()]

In [None]:
# Copy that third part into the "status" column of dreams (aligned on the row
# index), then remove it from times.
# NOTE: because of the in-place drop this cell fails when run a second time;
# column 2 no longer exists afterwards.
dreams.loc[times[times[2].notna()].index, "status"] = times[times[2].notna()][2]
times.drop(2, axis=1, inplace=True)
times

In [None]:
# Distinct values of the first part, which becomes start_time further down.
times[0].unique()

In [None]:
# Replacing additional keywords
times[0] = times[0].str.replace("?", "")
times[0].unique()

In [None]:
times[0] = times[0].str.replace("Uhr", "")
times[0].unique()

In [None]:
times[0] = times[0].str.replace(" ", "")
times[0].unique()

In [None]:
dreams["start_time"] = times[0]
dreams[dreams["start_time"].notna()]["start_time"]

In [None]:
times2 = times[1].apply(lambda x: x.lstrip()).str.split(" ", expand=True)
times2

In [None]:
times2[0].unique()

In [None]:
temp = times2[times2[0].apply(lambda x: bool(re.match(r".*[^0-9:]", x)))][0]
dreams.loc[temp.index, "status"] = temp.values[0][-1]

In [None]:
times2.loc[temp.index, 0] = times2.loc[temp.index, 0][:-1]

In [None]:
dreams["end_time"] = times2[0]
dreams[dreams["end_time"].notna()]["end_time"]

In [None]:
# Additional entries correspond to dream status
times2[1].unique()

In [None]:
# Blank out the literal "Uhr" so that it is not stored as a status below.
times2.loc[times2[1] == "Uhr", 1] = None
times2[1].unique()

In [None]:
# SE = 'weak memory', KE = 'no memory'
# Record the marker in the "dream" column of dreams, then clear it from times2
# so that it is not also stored as a status.
dreams.loc[times2[times2[1] == "SE"].index, "dream"] = "SE"
dreams.loc[times2[times2[1] == "KE"].index, "dream"] = "KE"
times2.loc[times2[1] == "SE", 1] = None
times2.loc[times2[1] == "KE", 1] = None
times2[1].unique()

In [None]:
# Whatever is left in column 1 is stored as status (aligned on the row index).
dreams.loc[times2[times2[1].notna()].index, "status"] = times2[times2[1].notna()][1]
dreams[dreams["status"].notna()]

In [None]:
times2[2].unique()

In [None]:
# Same treatment for column 2: move the KE marker into "dream" and clear it.
dreams.loc[times2[times2[2] == "KE"].index, "dream"] = "KE"
times2.loc[times2[2] == "KE", 2] = None
times2[2].unique()

In [None]:
# Remaining column-2 values are stored as status; for a row that also had a
# value in column 1 this overwrites the status written above.
dreams.loc[times2[times2[2].notna()].index, "status"] = times2[times2[2].notna()][2]
dreams[dreams["status"].notna()]

In [None]:
# Two entries with two statuses
# Ignoring this anomaly
times2[times2[3].notna()]

In [None]:
# Only the KE marker is taken from column 3; other values there are not copied.
dreams.loc[times2[times2[3] == "KE"].index, "dream"] = "KE"
times2.loc[times2[3] == "KE", 3] = None
times2[3].unique()

In [None]:
# For the rows flagged by number_idx, drop everything up to and including the
# first newline; that first line was split into the time/status columns above.
dreams.loc[number_idx, "content"] = dreams[number_idx].content.str.split("\n").apply(lambda x: "\n".join(x[1:]))

In [None]:
# Recompute the flag on the shortened content. As before, the pattern only
# looks at the text before the first newline.
number_idx = dreams["content"].apply(lambda x: bool(re.match(r'.*[0-9].*', x)))
dreams[number_idx]

In [None]:
# Checking if all the content still containing numbers is valid and no time of day information was missed
# Position 17 is a hard-coded spot check.
dreams[number_idx]["content"].iloc[17]

In [None]:
# Rows for which an SE/KE marker was recorded.
dreams[dreams["dream"].notna()]

In [None]:
dreams.iloc[80]

In [None]:
# Every marked row except the first one (index[1:]).
# NOTE(review): why the first marked row is excluded is not visible in the code - confirm.
index = dreams[dreams["dream"].notna()].index[1:]
dreams.loc[index]

In [None]:
# Removing extracted status, start_time and end_time information from content column
# The content of the selected rows is replaced by the SE/KE marker, then the
# helper column is dropped. NOTE: because of the in-place drop this cell fails
# when run a second time.
dreams.loc[index, "content"] = dreams.loc[index, "dream"]
dreams.drop("dream", axis=1, inplace=True)
dreams.loc[index]

In [None]:
# Display the rows ordered by content length, longest first (dreams itself is not reordered).
len_index = dreams.content.str.len().sort_values(ascending=False).index
dreams.reindex(len_index)

In [None]:
# Splitting on keyword 'status'
# Column 0 holds the text before the first "Zustand:", column 1 the text after it.
status = dreams.content.str.split("Zustand:",expand=True)
status

In [None]:
# Keep only the part before the keyword as content.
dreams["content"] = status[0]

In [None]:
# Store the part after the keyword as status. Only column 1 is used; any
# further columns (keyword occurring more than once in a row) are not copied.
dreams.loc[status[1].notna(), "status"] = status[1]

In [None]:
# Splitting on keyword 'notes'
notes = dreams.content.str.split("Notizen:", expand=True)
notes

In [None]:
dreams["content"] = notes[0]

In [None]:
dreams.loc[notes[1].notna(), "notes"] = notes[1]

In [None]:
dreams[dreams["notes"].notna()]

In [None]:
# The status text extracted above may itself contain a "Notizen:" part; split it off as well.
notes2 = dreams.status.str.split("Notizen:", expand=True)
notes2

In [None]:
dreams["status"] = notes2[0]

In [None]:
# Overwrites notes written from the content split for rows present in both.
dreams.loc[notes2[1].notna(), "notes"] = notes2[1]

In [None]:
# Same split for the singular keyword "Notiz:".
notes3 = dreams.content.str.split("Notiz:", expand=True)
notes3

In [None]:
dreams["content"] = notes3[0]

In [None]:
dreams.loc[notes3[1].notna(), "notes"] = notes3[1]

In [None]:
dreams["status"].unique()

In [None]:
dreams["notes"].unique()

In [None]:
# Rows whose content still contains a colon.
dreams[dreams.content.str.contains(":")]

In [None]:
# New data format
# The first CSV column becomes the index; it is renamed to "date" below.
daily = pd.read_csv("daily.csv", index_col=0)
daily.head()

In [None]:
# Keep the two text columns and rename them to the column names used in dreams.
daily = daily[["Traumtagebuch", "Notes"]]
daily.index.name = "date"
daily = daily.reset_index().rename({"Traumtagebuch": "content", "Notes": "notes"}, axis=1)
daily.head()

In [None]:
daily.date = pd.to_datetime(daily.date)
daily.head()

In [None]:
# Append the new-format rows, order everything by date and renumber the index.
# Columns that exist only in dreams are NaN for the daily rows.
dreams = pd.concat([dreams, daily]).sort_values("date").reset_index(drop=True)

In [None]:
# Remove newlines
for col in ["content", "status", "notes"]:
    dreams[col] = dreams[col].str.replace("\n", "")
dreams.head()

In [None]:
# Replacing empty lines with nan
# The pattern matches cells that are empty or contain only whitespace.
dreams = dreams.replace(r'^\s*$', np.nan, regex=True)

In [None]:
# Splitting multiple dreams in one night by using the keyphrase '...'
# dream_split is only used for inspection; the split that is kept happens further down.
dream_split = dreams.content.str.split(r'[.][.][.]',expand=True)
dream_split

In [None]:
# Rows that produced a seventh part (column 6).
dream_split[dream_split[6].notna()][6]

In [None]:
# NOTE: in-place drop; this cell fails when run a second time.
dream_split.drop(6, axis=1, inplace=True)

In [None]:
dream_split[dream_split[5].notna()][5]

In [None]:
dream_split.iloc[398]

In [None]:
# Replace each content string by the list of its parts (NaN stays NaN).
dreams.content = dreams.content.str.split(r'[.][.][.]')

In [None]:
# Every dream gets its own entry
# explode repeats the original index label for every part of a row.
dreams = dreams.explode("content")

In [None]:
dreams

In [None]:
dreams[dreams.content.isna()]

In [None]:
# Replace nan entries with 'no memory'
# "KE" is the raw marker; it is mapped to 'no memory' in a later cell.
dreams.loc[dreams.content.isna(), "content"] = "KE"

In [None]:
# Strip leading whitespace; trailing whitespace is kept.
dreams.content = dreams.content.apply(lambda x: x.lstrip())

In [None]:
# The 20 most frequent content strings with their row counts.
dreams.groupby("content").count()["date"].sort_values(ascending=False).head(20)

In [None]:
# Look at the full night belonging to the first row with empty content.
timestamp = dreams[dreams["content"] == ""]["date"].iloc[0]
dreams[dreams["date"] == timestamp]

In [None]:
# Drop rows whose content is the empty string.
dreams = dreams[dreams["content"] != ""]

In [None]:
dreams.groupby("content").count()["date"].sort_values(ascending=False).head(10)

In [None]:
dreams.sample(20)

In [None]:
dreams.groupby("content").count()["date"].sort_values(ascending=False).head(10)

In [None]:
dreams[dreams["content"] == "- "]

In [None]:
# Label-based slice; labels are hard-coded and may be duplicated after explode.
dreams.loc[98:100]

In [None]:
# Rows whose label is one higher than a "- " row.
# NOTE: overwrites the name `index` used in earlier cells.
index = dreams[dreams["content"] == "- "].index +1
dreams.loc[index]

In [None]:
dreams.groupby("content").count()["date"].sort_values(ascending=False).head(10)

In [None]:
# Replacing several no memory keywords with 'no memory'
# isin compares whole strings, so only rows consisting exactly of a keyword are replaced.
no_memory = ["KE", "Keine Erinnerung", "keine Erinnerung", "- ", "N/A"]
dreams.loc[dreams["content"].isin(no_memory), "content"] = "no memory"

In [None]:
dreams.groupby("content").count()["date"].sort_values(ascending=False).head(10)

In [None]:
# Replacing several weak memory keywords with 'weak memory'
weak_memory = ["SE", "Schwache Erinnerung"]
dreams.loc[dreams["content"].isin(weak_memory), "content"] = "weak memory"

In [None]:
dreams.groupby("content").count()["date"].sort_values(ascending=False).head(10)

In [None]:
# Renumber the rows: explode left repeated index labels and the filtering left gaps.
# The hard-coded labels in the following cells refer to this new numbering.
dreams = dreams.reset_index(drop=True)

In [None]:
# Marking wet dreams
# Boolean mask of rows whose content contains "feucht" (case-insensitive).
wet_index = dreams.content.str.contains("feucht", case=False)
dreams[wet_index].content

In [None]:
# Rows consisting exactly of one of these strings are relabelled 'wet dream'.
wet_dream = ["Feuchttraum", "(Feuchttraum)", "Schwache Erinnerung (feucht)", "(Feuchttraum) "]
dreams.loc[dreams["content"].isin(wet_dream), "content"] = "wet dream"

In [None]:
# Rows that still contain "feucht" after the relabelling.
index = dreams.content.str.contains("feucht", case=False)
dreams[index].content

In [None]:
dreams.loc[1178].content

In [None]:
# Manually unflag row 1178 in the mask computed before the relabelling.
# NOTE(review): hard-coded label; only valid for the data this was developed on.
wet_index[1178] = False

In [None]:
# TODO Mark wet dreams
#dreams.loc[index, "content"] = "weak memory"

In [None]:
dreams.groupby("content").count()["date"].sort_values(ascending=False).head(10)

In [None]:
dreams.sample(10)

In [None]:
# Data cleaning
for index in [86,152,99,772]:
    dreams.loc[index,"content"] = "no memory"
dreams.loc[99,"status"] = "k"
dreams.loc[598,"content"] = "weak memory"
dreams.loc[84,"note"] = dreams.loc[84,"content"]

In [None]:
content_filter = content_filter = (dreams["content"] != "no memory") & (dreams["content"] != "weak memory")
dreams[(content_filter)].sample(30)["content"]

In [None]:
# Mapping keywords to categories
# Redacted due to contained personal data
from mappings import categories
# Start with a separate empty list per row.
dreams["category"] = [[] for _ in range(dreams.shape[0])]
for category, keywords in categories.items():
    # The keywords are joined with "|" and used as one case-insensitive regex.
    # NOTE(review): keywords are not escaped - presumably none of them contains
    # regex metacharacters unintentionally; verify in mappings.
    index = dreams.content.str.contains('|'.join(keywords), case=False)
    # x + [category] builds a new list for every matching row.
    dreams.loc[index, "category"] = dreams[index]["category"].apply(lambda x: x + [category])

In [None]:
# Show one random row that has no category yet and passes content_filter.
dreams[(dreams.category.str.len() == 0) & (content_filter)].sample(1)[["content", "category"]].style.set_properties(subset = ['content'], **{'width': '800px'})

In [None]:
wet_index

In [None]:
# Cleaning
dreams.loc[wet_index, "category"] = ["wet"] * wet_index.sum()
dreams.at[413,"category"] = ["gaming"]
dreams.at[787,"category"] = ["gaming"]
dreams.at[224, "category"] = ["gaming"]
dreams.at[72,"category"] = ["driving"]
dreams.at[299,"category"] = ["work", "biking"]
dreams.at[963,"category"] = ["workout"]
dreams.at[364,"category"] = ["media"]
dreams.at[546,"category"] = ["nightmare", "gaming", "vacation"]
dreams.at[881,"category"] = ["violence"]

In [None]:
# Using spacy entity recognition to extract people and locations
import spacy
# NOTE(review): Counter is not used in the cells visible here.
from collections import Counter
# Load the 'de_core_news_lg' pipeline; if loading raises OSError, download the
# model once and load it again.
try:
    nlp = spacy.load('de_core_news_lg')
except OSError:
    from spacy.cli import download
    download('de_core_news_lg')
    nlp = spacy.load('de_core_news_lg')

In [None]:
# Try the pipeline on one random row that passes content_filter and print
# (text, label) for each recognised entity.
doc = nlp(dreams[content_filter].sample(1)["content"].iloc[0])
print([(X.text, X.label_) for X in doc.ents])

In [None]:
def apply_ner(x):
    """Run the module-level `nlp` pipeline on `x` and return its entities.

    Returns a list of (label, text) tuples, one per entity in `doc.ents`,
    in the order the pipeline reports them.
    """
    return [(entity.label_, entity.text) for entity in nlp(x).ents]

In [None]:
# Run entity recognition only on rows that pass content_filter; all other rows
# get NaN in "entities" for now.
dreams.loc[content_filter, "entities"] = dreams[content_filter].content.apply(apply_ner)
dreams["entities"]

In [None]:
# Manually clear the entities of row 1490.
# NOTE(review): hard-coded label; only valid for the data this was developed on.
dreams.loc[1490, "entities"] = ""

In [None]:
# Use the empty string as "no entities" placeholder. Iterating over "" yields
# nothing, so the list comprehensions below return an empty list for these rows.
dreams.loc[dreams.entities.isna(), "entities"] = ""

In [None]:
# Texts of all entities labelled "PER", per row.
dreams["people"] =dreams["entities"].apply(lambda x: [item[1] for item in x if item[0] == "PER"])
dreams["people"]

In [None]:
# TODO Map new individuals

In [None]:
# Distinct names from position 150 onwards (the offset is hard-coded).
dreams["people"].explode().unique()[150:]

In [None]:
# Mappings for misclassified entities
# Redacted due to containing personal data
from mappings import change_cat, remove_name
def map_category(x):
    """Correct the label of one (label, text) entity tuple.

    If the entity text is a key of `change_cat`, the label is replaced by the
    mapped value. Otherwise, if the text is listed in `remove_name`, the label
    becomes "MISC_PER". Any other tuple is returned unchanged.
    """
    text = x[1]
    if text in change_cat:
        return (change_cat[text], text)
    if text in remove_name:
        return ("MISC_PER", text)
    return x

In [None]:
# Apply the label corrections to every entity of every row. Rows holding the ""
# placeholder become an empty list here.
dreams["entities"] = dreams["entities"].apply(lambda x: [map_category(item) for item in x])

In [None]:
# Rebuild "people" from the corrected labels.
dreams["people"] =dreams["entities"].apply(lambda x: [item[1] for item in x if item[0] == "PER"])

In [None]:
# Mapping name synonyms
# Redacted due to containing personal data
from mappings import name_map
def map_people(x):
    """Return the `name_map` entry for `x`, or `x` itself when there is none."""
    return name_map[x] if x in name_map else x

In [None]:
len(name_map)

In [None]:
# Replace every name that has an entry in name_map; other names stay as they are.
dreams["people"] = dreams["people"].apply(lambda x: [map_people(item) for item in x])

In [None]:
# Number of rows per name (counted on the "content" column), 30 most frequent.
dreams.explode("people").groupby("people").count()["content"].sort_values(ascending=False).head(30)

In [None]:
dreams[dreams.people.notna()]["people"].explode().unique()

In [None]:
from mappings import female, male
def map_gender(x):
    """Return True if the name `x` is classified as female, else False.

    Names listed in `female` or `male` are decided by those lists (`female`
    is checked first). Any other name is classified by its last letter:
    ending in "a", "e" or "u" counts as female.

    An empty string returns False; indexing `x[-1]` used to raise IndexError
    for it.
    """
    if x in female:
        return True
    if x in male:
        return False
    # endswith with a tuple is False for "", unlike x[-1] which raises.
    return x.endswith(("a", "e", "u"))

In [None]:
# Mark gender if name ends in a,e or u
dreams["female"] = dreams["people"].apply(lambda x: [map_gender(item.split(" ")[0]) for item in x])
dreams["female"].explode().sum(), dreams["female"].explode().count() - dreams["female"].explode().sum()

In [None]:
people = dreams[dreams.people.str.len() != 0].explode("people")
people.loc[:,"female"] = dreams[dreams.people.str.len() != 0].explode("female")

In [None]:
index = 5
people.groupby("people")["female"].agg(pd.Series.unique).iloc[index*28:(index+1)*28]

In [None]:
# Stopped here

In [None]:
# Texts of all entities labelled "LOC", per row.
dreams["locations"] =dreams["entities"].apply(lambda x: [item[1] for item in x if item[0] == "LOC"])
dreams["locations"]

In [None]:
dreams["locations"].explode().unique()

In [None]:
# All entity labels that occur in the data.
dreams["entities"].apply(lambda x: [item[0] for item in x]).explode().unique()

In [None]:
# Distinct texts of entities labelled "ORG".
dreams["entities"].apply(lambda x: [item[1] for item in x if item[0] == "ORG"]).explode().unique()

In [None]:
# Distinct texts of entities labelled "MISC".
dreams["entities"].apply(lambda x: [item[1] for item in x if item[0] == "MISC"]).explode().unique()

In [None]:
dreams

In [None]:
# Write the result to disk, including the index.
# NOTE: list-valued columns are written as their text representation and have
# to be parsed again when the CSV is read back.
dreams.to_csv("dreams.csv")