In [27]:
import pandas as pd
import re
from collections import Counter
import numpy as np
import glob

In [287]:
def preprocess_text(path):
    with open(path, "r") as f:
        text = f.read()
    text = re.sub(r"[.,-;!?–']", "", text)
    text = text.lower().split()
    length = len(text)
    cnt = Counter(text)
    for k,v in cnt.items():
        cnt[k] = v/length
    return cnt
    

def count_words_in_docs(paths):
    counteddocuments = []
    for path in paths:
        cnt = preprocess_text(path)
        counteddocuments.append(cnt)
    return counteddocuments
        

def z_standardization(word, document, counteddocuments):
    
    freqs_across_docs = [doc[word] for doc in counteddocuments]
    
    mue = np.mean(freqs_across_docs)
    sigma = np.std(freqs_across_docs)
    
    z = (document[word] - mue)/sigma
    
    return(z)

def create_doc_term_matrix(paths, n):
    counteddocuments = count_words_in_docs(paths)
    
    #n most frequent words in corpus
    mfw = Counter()
    for cnt in counteddocuments:
        mfw += cnt
    nmfw = [tupel[0] for tupel in mfw.most_common(n)]
    
    
    df = pd.DataFrame(columns=[re.sub(r"\w+\/|\.txt", "", path) for path in paths], index=nmfw)
    
    for path in paths:
        cnt = preprocess_text(path)  
        zs = []
        for w in nmfw:
            z = z_standardization(w, cnt, counteddocuments)
            zs.append(z)
        df[re.sub(r"\w+\/|\.txt", "", path)] = zs
    return(df)
    
    

In [290]:
paths = glob.glob("gerdracor_txt/*.txt")

In [291]:
create_doc_term_matrix(paths, 50)

Unnamed: 0,laube-monaldeschi,schnitzer-der-zigeunerbaron,muellner-die-schuld,delpons-der-ahnherr,chezy-euryanthe,birch-pfeiffer-vatersorgen,angely-prinz-tu-ta-tu,goethe-die-wette,wildgans-in-ewigkeit-amen,krueger-die-candidaten,...,eichendorff-die-freier,moser-krieg-oder-frieden,arnim-jerusalem,schnitzler-literatur,grillparzer-weh-dem-der-luegt,brentano-ponce-de-leon,kotzebue-die-spanier-in-peru,grillparzer-ein-bruderzwist-in-habsburg,heine-william-ratcliff,vischer-faust
ich,-0.599752,-0.496574,-1.061413,-0.275814,-0.540995,0.443428,0.801935,-0.049233,-1.158212,1.183618,...,-0.019931,0.775427,0.066495,1.532504,0.402704,0.825886,-0.231683,-0.978117,0.215023,-1.415424
und,1.520093,1.296983,0.632124,-0.689775,-0.951483,-0.495353,0.152829,1.245897,-0.172863,-0.624981,...,-0.127142,-1.394917,-0.699115,-1.416989,1.042528,0.381364,0.015214,1.205424,2.465358,-0.338442
die,0.100664,0.091467,0.115687,-1.249183,0.026458,0.266866,-0.537098,0.792005,-0.254466,0.157301,...,0.543975,-0.723092,0.447501,-0.882569,-0.864599,0.23742,-0.306261,1.329531,0.452753,0.485606
der,0.310714,0.458365,1.36289,0.025129,0.748212,0.026157,0.106321,-1.431741,0.438818,-0.486779,...,0.277684,-0.963221,0.638095,-0.991971,-0.086008,-0.710646,0.172344,1.612056,0.538213,1.048289
sie,-0.509012,-1.002036,-0.699733,-0.624187,-0.896786,0.932275,-1.165129,1.259844,1.972563,1.272361,...,0.456323,-0.265999,-0.594817,-0.656446,-0.741571,-0.250438,-0.994242,-0.891225,-0.963108,-0.970758
nicht,0.653827,-0.643956,0.249741,-0.867074,-0.99487,0.361085,-0.546727,0.242436,0.191016,0.258772,...,-0.370994,-0.572144,-0.737786,2.026975,0.901877,0.605473,-0.067217,0.090067,-0.431264,-0.289892
das,-0.023142,1.897442,0.061264,-0.742135,-2.140342,0.895919,-0.338151,-0.530875,0.855457,-1.383935,...,0.027779,-1.714108,-0.314039,1.171847,0.606649,-0.415306,-0.288306,0.105186,-0.483722,-0.453007
du,-0.066064,-1.00562,-0.130037,0.354982,1.799128,-0.281322,0.508786,-0.610769,-1.696177,-1.560362,...,-0.753314,-0.902496,-0.265668,2.701312,0.891427,0.488936,1.605479,-1.061609,-0.210092,-0.606737
ist,0.708087,0.087612,0.716183,-1.061924,-0.636117,0.414157,-0.256948,-0.549751,-0.475034,0.118444,...,-0.238637,-1.435449,-0.155929,1.010708,0.381731,0.686986,0.442553,-0.11058,-0.098229,0.302436
zu,0.135359,-0.27945,0.328177,-0.483278,-0.651267,0.512132,0.919995,1.687441,-0.159914,2.781928,...,-0.461769,-0.162717,-0.3032,0.15773,-0.891661,-0.173811,-0.28553,-0.158409,-0.918991,-0.057161
