In [206]:
import os
import pandas as pd

In [207]:
import spacy
model = spacy.load("en_core_web_lg")


In [208]:
path_clinical = "/Users/johannesalkofer/Downloads/pdf_to_text/schizophrenia_output_txt"
path_control = "/Users/johannesalkofer/Downloads/pdf_to_text/one_folder_control_output_txt"

In [209]:
# defines seperator constant

SEPARATOR = "=================================================="

# collects file names and their content

data = []
for filename in os.listdir(path_clinical):
    if filename.endswith(".txt"):
        file_path = os.path.join(path_clinical, filename)
        with open(file_path, "r", encoding="utf-8") as f:
            full_text = f.read()
        
        # splits the text and only keeps the part after the separator
        # if the separator isn't found, it defaults to the full text

        parts = full_text.split(SEPARATOR)
        clean_text = parts[-1].strip() 
        
        data.append({"filename": filename, "text": clean_text})

# convert to DataFrame

clinical = pd.DataFrame(data)

In [210]:
# defines separator constant
SEPARATOR = "=================================================="

# collects file names and their content

data = []
for filename in os.listdir(path_control):
    if filename.endswith(".txt"):
        file_path = os.path.join(path_control, filename)
        with open(file_path, "r", encoding="utf-8") as f:
            full_text = f.read()
        
        # splits the text and keeps only the part after the separator
        # if the separator isn't found, it defaults to the full text

        parts = full_text.split(SEPARATOR)
        clean_text = parts[-1].strip() 
        
        data.append({"filename": filename, "text": clean_text})

# convert to DataFrame
control = pd.DataFrame(data)

In [211]:
control

Unnamed: 0,filename,text
0,2025-12-19_personal_account_from_shame_to_stor...,I have spent countless nights trapped in my ow...
1,2025-12-19_personal_account_art_heals_combatin...,"From an early age, I struggled with unrealisti..."
2,2025-12-19_personal_account_the_pendulum_of_my...,If there was ever a chance I would’ve been tol...
3,2025-12-19_personal_account_dreaming_beyond_my...,I couldn’t dream. Not in the sleeping sense bu...
4,2025-12-19_personal_account_doing_it_afraid.txt,The first notable experience that I had with a...
5,2025-12-19_personal_account_an_unrecognized_sy...,Bipolar rage is a waking nightmare for the per...
6,2025-12-19_personal_account_from_darkness_to_l...,Early Childhood\n\nI encountered my first obse...
7,2025-12-19_personal_account_my_journey_through...,"For most of my life, I lived under the weight ..."
8,2025-12-19_personal_account_from_shame_to_stor...,I sat across from the psychiatrist at my unive...
9,2025-12-19_personal_account_teetering_on_a_tig...,Have you ever seen a person walking on a tight...


In [212]:
clinical

Unnamed: 0,filename,text
0,2025-12-18_personal_account_sharing_my_paranoi...,Last night my internet went down. I tried disc...
1,2025-12-18_personal_account_exposure_therapy.txt,During both my schizoaffective disorder episod...
2,2025-12-18_personal_account_creativity_and_sch...,"When I was a small child, I worked on fistfuls..."
3,2025-12-18_personal_account_my_experience_with...,My experience with psychiatric services has be...
4,2025-12-18_personal_account_recovery_champions...,Having a mental health problem can be a fright...
5,2025-12-18_personal_account_schizophrenia_and_...,"When I had my first psychotic episode, I was s..."
6,2025-12-18_personal_account_the_onset_of_my_sc...,My name is Leif Gregersen and I have been diag...
7,2025-12-18_personal_account_a_psychotic_experi...,It’s been roughly one month since I had a suic...
8,2025-12-18_personal_account_day_to_day_living_...,When I was initially diagnosed with schizophre...
9,2025-12-18_personal_account_my_49-year_recover...,Although I was a very insecure girl and young ...


In [213]:
clinical["group"] = "scz"
control["group"] = "hc"

In [214]:
clinical

Unnamed: 0,filename,text,group
0,2025-12-18_personal_account_sharing_my_paranoi...,Last night my internet went down. I tried disc...,scz
1,2025-12-18_personal_account_exposure_therapy.txt,During both my schizoaffective disorder episod...,scz
2,2025-12-18_personal_account_creativity_and_sch...,"When I was a small child, I worked on fistfuls...",scz
3,2025-12-18_personal_account_my_experience_with...,My experience with psychiatric services has be...,scz
4,2025-12-18_personal_account_recovery_champions...,Having a mental health problem can be a fright...,scz
5,2025-12-18_personal_account_schizophrenia_and_...,"When I had my first psychotic episode, I was s...",scz
6,2025-12-18_personal_account_the_onset_of_my_sc...,My name is Leif Gregersen and I have been diag...,scz
7,2025-12-18_personal_account_a_psychotic_experi...,It’s been roughly one month since I had a suic...,scz
8,2025-12-18_personal_account_day_to_day_living_...,When I was initially diagnosed with schizophre...,scz
9,2025-12-18_personal_account_my_49-year_recover...,Although I was a very insecure girl and young ...,scz


In [215]:
df = pd.concat([clinical, control], axis=0)

In [216]:
df = df.replace("\n", " ", regex=True)

In [217]:
# adds custom stopwords

custom_stops = {"er", "erm", "oh", "yeah", "mm", "mhm", "y"}

for w in custom_stops:
    model.Defaults.stop_words.add(w)
    model.vocab[w].is_stop = True


In [218]:
model.Defaults.stop_words

{"'d",
 "'ll",
 "'m",
 "'re",
 "'s",
 "'ve",
 'a',
 'about',
 'above',
 'across',
 'after',
 'afterwards',
 'again',
 'against',
 'all',
 'almost',
 'alone',
 'along',
 'already',
 'also',
 'although',
 'always',
 'am',
 'among',
 'amongst',
 'amount',
 'an',
 'and',
 'another',
 'any',
 'anyhow',
 'anyone',
 'anything',
 'anyway',
 'anywhere',
 'are',
 'around',
 'as',
 'at',
 'back',
 'be',
 'became',
 'because',
 'become',
 'becomes',
 'becoming',
 'been',
 'before',
 'beforehand',
 'behind',
 'being',
 'below',
 'beside',
 'besides',
 'between',
 'beyond',
 'both',
 'bottom',
 'but',
 'by',
 'ca',
 'call',
 'can',
 'cannot',
 'could',
 'did',
 'do',
 'does',
 'doing',
 'done',
 'down',
 'due',
 'during',
 'each',
 'eight',
 'either',
 'eleven',
 'else',
 'elsewhere',
 'empty',
 'enough',
 'er',
 'erm',
 'even',
 'ever',
 'every',
 'everyone',
 'everything',
 'everywhere',
 'except',
 'few',
 'fifteen',
 'fifty',
 'first',
 'five',
 'for',
 'former',
 'formerly',
 'forty',
 'four',


In [219]:
def get_cleaned_tokens(text):
    spacy_text = model(text)
    list_of_tokens = [
        token.text
        for token in spacy_text
        if (not token.is_punct)
        and (not token.is_stop)
        and token.text.strip()              # removes whitespace-only tokens
    ]
    return list_of_tokens

def get_chunks(list_of_tokens):
    chunks = [list_of_tokens[i:i+10] for i in range(0, len(list_of_tokens), 10)]
    return chunks


In [220]:
get_cleaned_tokens("hello world how are you doing")

['hello', 'world']

In [221]:
df["list_tokens"] = df["text"].apply(get_cleaned_tokens)

In [222]:
df

Unnamed: 0,filename,text,group,list_tokens
0,2025-12-18_personal_account_sharing_my_paranoi...,Last night my internet went down. I tried disc...,scz,"[night, internet, went, tried, disconnecting, ..."
1,2025-12-18_personal_account_exposure_therapy.txt,During both my schizoaffective disorder episod...,scz,"[schizoaffective, disorder, episodes, obsessiv..."
2,2025-12-18_personal_account_creativity_and_sch...,"When I was a small child, I worked on fistfuls...",scz,"[small, child, worked, fistfuls, clay, grandfa..."
3,2025-12-18_personal_account_my_experience_with...,My experience with psychiatric services has be...,scz,"[experience, psychiatric, services, bad, good,..."
4,2025-12-18_personal_account_recovery_champions...,Having a mental health problem can be a fright...,scz,"[Having, mental, health, problem, frightening,..."
5,2025-12-18_personal_account_schizophrenia_and_...,"When I had my first psychotic episode, I was s...",scz,"[psychotic, episode, sick, seven, long, months..."
6,2025-12-18_personal_account_the_onset_of_my_sc...,My name is Leif Gregersen and I have been diag...,scz,"[Leif, Gregersen, diagnosed, schizoaffective, ..."
7,2025-12-18_personal_account_a_psychotic_experi...,It’s been roughly one month since I had a suic...,scz,"[roughly, month, suicide, attempt, trying, hol..."
8,2025-12-18_personal_account_day_to_day_living_...,When I was initially diagnosed with schizophre...,scz,"[initially, diagnosed, schizophrenia, meds, ho..."
9,2025-12-18_personal_account_my_49-year_recover...,Although I was a very insecure girl and young ...,scz,"[insecure, girl, young, woman, managed, functi..."


In [223]:
df["list_of_cunks"] = df["list_tokens"].apply(get_chunks)

In [224]:
df.iloc[0,-1]

[['night',
  'internet',
  'went',
  'tried',
  'disconnecting',
  'reconnecting',
  'clicked',
  'diagnostic',
  'know',
  'DSN'],
 ['schizo-',
  'phrenia',
  'entered',
  'mind',
  'acquaintance',
  'met',
  'neighborhood',
  'looking',
  'fiancée',
  'shoulder'],
 ['hacking',
  'computer',
  'messing',
  'internet',
  'entered',
  'mind',
  'thought',
  'called',
  'stupid',
  'wanted'],
 ['prove',
  'schizophrenia',
  'control',
  'suddenly',
  'pops',
  'head',
  'experience',
  'learned',
  'react',
  'false'],
 ['thoughts',
  'paranoia',
  'ordeal',
  'internet',
  'called',
  'mom',
  'tried',
  'help',
  'problem',
  'advised'],
 ['internet',
  'provider',
  'called',
  're-',
  'corded',
  'message',
  'told',
  'internet',
  'neighborhood',
  'problem'],
 ['solved',
  'neighbor',
  'paranoid',
  'thinking',
  'schiz-',
  'ophrenia',
  'control',
  'thinking',
  'enters',
  'exits'],
 ['mind',
  'specific',
  'delusions',
  'far',
  'acquaintance',
  'messing',
  'internet',


In [225]:
df_exploded = df.explode("list_of_cunks").reset_index(drop=True)

In [226]:
df_exploded

Unnamed: 0,filename,text,group,list_tokens,list_of_cunks
0,2025-12-18_personal_account_sharing_my_paranoi...,Last night my internet went down. I tried disc...,scz,"[night, internet, went, tried, disconnecting, ...","[night, internet, went, tried, disconnecting, ..."
1,2025-12-18_personal_account_sharing_my_paranoi...,Last night my internet went down. I tried disc...,scz,"[night, internet, went, tried, disconnecting, ...","[schizo-, phrenia, entered, mind, acquaintance..."
2,2025-12-18_personal_account_sharing_my_paranoi...,Last night my internet went down. I tried disc...,scz,"[night, internet, went, tried, disconnecting, ...","[hacking, computer, messing, internet, entered..."
3,2025-12-18_personal_account_sharing_my_paranoi...,Last night my internet went down. I tried disc...,scz,"[night, internet, went, tried, disconnecting, ...","[prove, schizophrenia, control, suddenly, pops..."
4,2025-12-18_personal_account_sharing_my_paranoi...,Last night my internet went down. I tried disc...,scz,"[night, internet, went, tried, disconnecting, ...","[thoughts, paranoia, ordeal, internet, called,..."
...,...,...,...,...,...
1264,2025-12-19_personal_account_content_with_a_cau...,"Five years ago, I faced one of the toughest de...",hc,"[years, ago, faced, toughest, decisions, life,...","[medication, helped, stabilize, moods, allowin..."
1265,2025-12-19_personal_account_content_with_a_cau...,"Five years ago, I faced one of the toughest de...",hc,"[years, ago, faced, toughest, decisions, life,...","[enabled, function, thrive, personal, life, pr..."
1266,2025-12-19_personal_account_content_with_a_cau...,"Five years ago, I faced one of the toughest de...",hc,"[years, ago, faced, toughest, decisions, life,...","[forward, work, love, advocating, PiZetta, Med..."
1267,2025-12-19_personal_account_content_with_a_cau...,"Five years ago, I faced one of the toughest de...",hc,"[years, ago, faced, toughest, decisions, life,...","[connections, empowering, voices, fostering, w..."


In [227]:
df_exploded.to_parquet("tabelle.parquet")

In [228]:
text = df.iloc[1,3]

In [229]:
len(text)

525

In [230]:
text

['schizoaffective',
 'disorder',
 'episodes',
 'obsessive',
 'behaviors',
 'detrimental',
 'health',
 'obsessively',
 'cleaning',
 'every\xad',
 'thing',
 'owned',
 'possibly',
 'repeatedly',
 'emptied',
 'room',
 'cleaned',
 'single',
 'thing',
 'owned',
 'including',
 'room',
 'scrubbed',
 'ceilings',
 'walls',
 'windows',
 'hired',
 'carpet',
 'cleaner',
 'professionally',
 'clean',
 'carpet',
 'twice',
 'cleaned',
 'hands',
 'repetitively',
 'thoroughly',
 'tons',
 'little',
 'cuts',
 'looked',
 'run',
 'buckets',
 'glass',
 'shards',
 'germaphobe',
 'believed',
 'messiah',
 'thought',
 'external',
 'purity',
 'created',
 'internal',
 'purity',
 'save',
 'world',
 'needed',
 'pure',
 'Immediately',
 'second',
 'episode',
 'germ\xad',
 'aphobe',
 'Overcoming',
 'abnormal',
 'fear',
 'germs',
 'took',
 'mental',
 'flexibility',
 'worked',
 'doctor',
 'talk',
 'therapy',
 'searched',
 'reasons',
 'abnormal',
 'fear',
 'eventually',
 'disclosed',
 'thought',
 'messiah',
 'correctly',
 

In [231]:
for element in text:
    print(element)

schizoaffective
disorder
episodes
obsessive
behaviors
detrimental
health
obsessively
cleaning
every­
thing
owned
possibly
repeatedly
emptied
room
cleaned
single
thing
owned
including
room
scrubbed
ceilings
walls
windows
hired
carpet
cleaner
professionally
clean
carpet
twice
cleaned
hands
repetitively
thoroughly
tons
little
cuts
looked
run
buckets
glass
shards
germaphobe
believed
messiah
thought
external
purity
created
internal
purity
save
world
needed
pure
Immediately
second
episode
germ­
aphobe
Overcoming
abnormal
fear
germs
took
mental
flexibility
worked
doctor
talk
therapy
searched
reasons
abnormal
fear
eventually
disclosed
thought
messiah
correctly
pure
person
couple
years
stopped
obsessively
hand
washing
uncovered
deeper
reasons
hand
washing
came
wanting
con­
trol
life
schizoaffective
disorder
epi­
sode
disorienting
precarious
felt
control
material
possessions
clean
mind
cleaner
able
think
clearly
control
took
time
face
fears
generated
hand
washing
determined
reasons
excessively
w

In [232]:
text = " ".join(text)   # combines list into one string
words = text.split()


In [233]:
sents = [sent.text for sent in model(text).sents]

In [234]:
sents

['schizoaffective disorder episodes obsessive behaviors detrimental health obsessively cleaning every\xad thing owned possibly repeatedly emptied room cleaned single thing owned including room scrubbed ceilings walls windows hired carpet cleaner professionally clean carpet twice cleaned hands repetitively thoroughly tons little cuts looked run buckets glass shards germaphobe believed messiah thought external purity created internal purity save world needed pure Immediately second episode',
 'germ\xad aphobe Overcoming abnormal fear germs took mental flexibility worked doctor talk therapy searched reasons abnormal fear eventually disclosed thought messiah correctly pure person couple years stopped obsessively hand washing uncovered deeper reasons hand washing came wanting con\xad trol life schizoaffective disorder epi\xad sode disorienting precarious felt control material possessions clean mind',
 'cleaner able think clearly control took time face fears generated hand washing determined

In [235]:
import spacy
from spacy.matcher import Matcher
from spacy.tokens import Span


def build_nlp():
    nlp = spacy.load("en_core_web_lg")
    matcher = Matcher(nlp.vocab)

    # 1. symptom onset expressions

    matcher.add(
        "SYMPTOM_ONSET",
        [
            [{"LOWER": "since"}, {"LOWER": {"IN": ["childhood", "infancy", "puberty", "teenage", "teens"]}}],
            [{"LOWER": "since"}, {"LOWER": "i"}, {"LOWER": "was"}, {"IS_DIGIT": True}],
            [{"LOWER": "back"}, {"LOWER": "in"}, {"LOWER": {"IN": ["college", "school", "university", "high", "uni"]}}],
            [{"LOWER": "in"}, {"LOWER": "my"}, {"LOWER": {"IN": ["early", "mid", "late"]}},
             {"LOWER": {"IN": ["teens", "twenties", "thirties"]}}],
        ],
    )

    # 2. episode-level markers

    matcher.add(
        "EPISODE_TEMPORAL",
        [
            [{"LOWER": "last"}, {"LOWER": "episode"}, {"LOWER": "in"}, {"ENT_TYPE": "DATE"}],
            [{"LOWER": "during"}, {"LOWER": "my"},
             {"LOWER": {"IN": ["worst", "last", "previous"]}},
             {"LOWER": {"IN": ["period", "episode"]}}, {"LOWER": "in"}, {"ENT_TYPE": "DATE"}],
        ],
    )

    # 3. treatment-related
    
    matcher.add(
        "TREATMENT_TEMPORAL",
        [
            [{"LOWER": {"IN": ["started", "began", "quit", "stopped"]}},
             {"LOWER": {"IN": ["meds", "medications", "therapy"]}},
             {"LOWER": "in"}, {"ENT_TYPE": "DATE"}],

            [{"LOWER": {"IN": ["hospitalised", "hospitalized"]}},
             {"IS_DIGIT": True}, {"LOWER": {"IN": ["years", "months", "weeks"]}},
             {"LOWER": "ago"}],
        ],
    )


    # 4. relapse
    
    matcher.add(
        "RELAPSE_TEMPORAL",
        [
            [{"LOWER": {"IN": ["relapsed", "relapse"]}},
             {"LOWER": "again"}, {"LOWER": {"IN": ["this", "last"]}},
             {"LOWER": {"IN": ["year", "month", "week"]}}],

            [{"LOWER": {"IN": ["episode", "episodes"]}},
             {"LOWER": "last"}, {"LOWER": {"IN": ["month", "year"]}}],
        ],
    )

    # 5. durations (corrected)

    matcher.add(
        "DURATION_TEMPORAL",
        [
            # for six months/for months/for a year
            [
                {"LOWER": "for"},
                {"LOWER": {"IN": ["a", "an"]}, "OP": "?"},
                {"IS_DIGIT": True, "OP": "?"},
                {"LOWER": {"IN": ["day", "days", "week", "weeks", "month",
                                   "months", "year", "years", "decade"]}}
            ],

            # past few months
            [
                {"LOWER": "past"},
                {"LOWER": {"IN": ["few", "couple", "several"]}},
                {"LOWER": {"IN": ["days", "weeks", "months", "years"]}}
            ]
        ],
    )

    # 6. life-event anchors

    matcher.add(
        "LIFE_EVENT_TEMPORAL",
        [
            [{"LOWER": {"IN": ["after", "before", "around", "during"]}},
             {"LOWER": {"IN": ["the", "my"]}, "OP": "?"},
             {"LOWER": {"IN": [
                 "divorce", "breakup", "accident", "pregnancy", "diagnosis",
                 "hospitalisation", "hospitalization", "finals", "lockdown"
             ]}}],
        ],
    )

    # 7. calendar markers

    seasons = ["spring", "summer", "autumn", "fall", "winter"]
    holidays = ["christmas", "easter", "ramadan", "new", "new-year", "new year"]

    matcher.add(
        "CALENDAR_TEMPORAL",
        [
            [{"LOWER": {"IN": ["this", "last"]}}, {"LOWER": {"IN": seasons}}],
            [{"LOWER": {"IN": ["this", "last"]}}, {"LOWER": {"IN": holidays}}],
            [{"LOWER": {"IN": ["earlier", "end"]}}, {"LOWER": "this"},
             {"LOWER": {"IN": ["week", "month", "year"]}}],
        ],
    )

    # 8. vague expressions
    
    matcher.add(
        "VAGUE_TEMPORAL",
        [
            [{"LOWER": "a"}, {"LOWER": "while"}, {"LOWER": "ago"}],
            [{"LOWER": "ages"}, {"LOWER": "ago"}],
            [{"LOWER": "the"}, {"LOWER": "other"}, {"LOWER": "day"}],
            [{"LOWER": "recently"}],
        ],
    )

    # 9. numeric date formats
    
    matcher.add(
        "NUMERIC_DATE",
        [
            # 12/2020, 05/21
            [{"SHAPE": "dd"}, {"TEXT": "/"}, {"SHAPE": {"IN": ["dd", "dddd"]}}],
            # 12.05.2020
            [{"SHAPE": "dd"}, {"TEXT": "."}, {"SHAPE": "dd"},
             {"TEXT": ".", "OP": "?"}, {"SHAPE": "dddd", "OP": "?"}],
            # 2020–21, 2019/2020
            [{"SHAPE": "dddd"}, {"TEXT": {"IN": ["–", "-", "/"]}}, {"SHAPE": {"IN": ["dd", "dddd"]}}],
        ],
    )

    # 10. diagnosis + date
    
    matcher.add(
        "DIAGNOSIS_DATE",
        [
            [{"LOWER": {"IN": ["diagnosed", "dx"]}}, {"LOWER": "in"}, {"ENT_TYPE": "DATE"}],
            [{"LOWER": {"IN": ["diagnosed", "dx"]}}, {"IS_DIGIT": True}],  # diagnosed at 14
            [{"LOWER": {"IN": ["got", "received"]}}, {"LOWER": "my"},
             {"LOWER": "diagnosis"}, {"LOWER": "in"}, {"ENT_TYPE": "DATE"}],
        ],
    )

    nlp.matcher = matcher
    return nlp


def extract_time_spans(text, nlp):
    doc = nlp(text)
    spans = []

    # built-in DATE entities

    for ent in doc.ents:
        if ent.label_ == "DATE":
            spans.append(ent.text)

    # rule-based matches

    for match_id, start, end in nlp.matcher(doc):
        spans.append(doc[start:end].text)

    # dedupe and preserve order
    
    seen = set()
    final = []
    for s in spans:
        if s not in seen:
            seen.add(s)
            final.append(s)

    return final


if __name__ == "__main__":
    nlp = build_nlp()

    text = """
    I was diagnosed in 2018 after the accident.
    Symptoms have been there since childhood and got worse last winter.
    I relapsed again this year after quitting meds two months ago.
    Started therapy in June. Hospitalised three years ago.
    Had another episode last month and during lockdown things escalated.
    I struggled for six months before the diagnosis.
    
    """

    print(extract_time_spans(text, nlp))


['2018', 'last winter', 'this year', 'two months ago', 'June', 'three years ago', 'last month', 'six months', 'diagnosed in 2018', 'after the accident', 'since childhood', 'relapsed again this year', 'Started therapy in June', 'episode last month', 'during lockdown', 'before the diagnosis']
