In [1]:
import re
from collections import Counter
import spacy
from tqdm import tqdm
import numpy as np
import pandas as pd
from datasets import Dataset, load_dataset

  VERSION_SPEC = originalTextFor(_VERSION_SPEC)("specifier")
  MARKER_EXPR = originalTextFor(MARKER_EXPR())("marker")


In [5]:
# Load the spaCy pipeline 'de_core_news_lg'. `nlp` is applied to single words
# further down, and the `pos_` of the first token is used as the word's tag.
nlp = spacy.load('de_core_news_lg')

In [103]:
def word_cnt(text):
    """Count how often each word occurs in ``text``.

    Every character that is not an ASCII letter, a German umlaut or ``ß`` is
    turned into a space, and the result is split on whitespace. Matching is
    case-sensitive, so ``Der`` and ``der`` are counted separately.

    Side effect: prints the total number of words found.

    Args:
        text: The string to tokenize and count.

    Returns:
        A ``Counter`` mapping each word to its number of occurrences.
    """
    letters_only = re.sub('[^A-Za-zäöüÄÖÜß]', ' ', text)
    # str.split() without arguments never yields empty strings, so no
    # additional filtering is needed.
    tokens = letters_only.split()
    print(len(tokens))
    return Counter(tokens)

In [104]:
# Path of the reference text whose word frequencies are analysed below.
text_path = 'data/perplexity/gpt2_training_gutenberg.txt'
# Decode explicitly instead of relying on the platform's default encoding,
# which would garble umlauts/ß on systems whose locale is not UTF-8.
# Assumes the file is stored as UTF-8 — TODO confirm.
with open(text_path, encoding='utf-8') as f:
    text = f.read()



In [105]:
# Word -> occurrence count for the reference text read above.
# word_cnt also prints the total number of words as a side effect.
word_stat = word_cnt(text)

3630271


In [63]:
# pos_stat maps a part-of-speech tag to the list of corpus frequencies of all
# distinct words that received that tag. Each word is passed to the pipeline
# on its own, and only the first resulting token is looked at.
pos_stat = {}
for word, freq in tqdm(word_stat.items()):
    first_token = nlp(word)[0]
    pos_stat.setdefault(first_token.pos_, []).append(freq)

100%|██████████| 181121/181121 [13:34<00:00, 222.30it/s]


In [65]:
# Mean corpus frequency over all distinct words tagged NOUN.
noun_counts = pos_stat['NOUN']
sum(noun_counts) / len(noun_counts)

9.908719396207891

In [None]:
import numpy as np
import matplotlib.pyplot as plt


# One histogram per part-of-speech tag, showing how the corpus frequencies of
# the distinct words carrying that tag are distributed.
for pos, counts in pos_stat.items():
    print(pos)
    stat = np.asarray(counts)

    fig, ax = plt.subplots()
    # NOTE(review): the 3000 bins span the full range of counts while only
    # 0-100 is shown on the x axis — confirm the resulting bin width is intended.
    ax.hist(stat, bins=3000, facecolor='g')

    # The x axis holds per-word frequencies and the y axis an un-normalised
    # count of words, so label them as such.
    ax.set_xlabel('Occurrences of a word in the corpus')
    ax.set_ylabel('Number of distinct words')
    ax.set_title('Histogram of ' + pos)
    ax.set_xlim(0, 100)
    ax.grid(True)
    plt.show()
    # Release the figure so that looping over many tags does not pile up
    # open figures in memory.
    plt.close(fig)

In [None]:
# Every (word, count) pair of the reference text, least frequent words first.
{word: count for word, count in sorted(word_stat.items(), key=lambda pair: pair[1])}

In [2]:
# Load the table of generated poems; the filters and joins below use its
# 'LLM' and 'poem' columns.
generated_poems = pd.read_csv('data/generated_poems.csv')

# Preview the first rows only.
generated_poems.head()

Unnamed: 0.1,Unnamed: 0,LLM,rating:,poem
0,0,GPT2-large,13.842811,"berlin Nicht wahr , seit alten Zeiten schon\nJ..."
1,1,GPT2-large,26.905119,"Und doch , ihr gutes Recht , man muß sie wähle..."
2,2,GPT2-large,11.222012,"Es rinnet , rinnt das Blut vom weizen\nWir wie..."
3,3,GPT2-large,211.366778,Ein Nichts ist niemals auch kein Etwas\nEin Ni...
4,4,GPT2-large,18.153678,Denn ihrem Wahn verfallen allzu lose\nWas will...


In [7]:
# Split the poems into those whose LLM column is exactly 'GPT3' and all others.
is_gpt3 = generated_poems['LLM'] == 'GPT3'

GPT3_poems_df = generated_poems[is_gpt3]
GPT2_poems_df = generated_poems[~is_gpt3]


In [5]:
# Number of rows whose LLM is 'GPT3'. Refers to GPT3_poems_df, the frame built
# in the filtering cell; the name `GPT3_poems` is not assigned in any cell
# above, so it raised NameError on a fresh kernel.
len(GPT3_poems_df)

87

In [8]:
# Concatenate the poems of each subset into one text, with a blank line
# between consecutive poems.
GPT3_generated_poems, GPT2_generated_poems = (
    '\n\n'.join(frame['poem']) for frame in (GPT3_poems_df, GPT2_poems_df)
)

In [None]:
# Concatenate every poem (all models) into one text, with a blank line between
# consecutive poems.
# NOTE: this rebinds `generated_poems` from the DataFrame loaded from the CSV
# to a plain string, so the cell cannot be re-run without reloading the CSV.
generated_poems = '\n\n'.join(generated_poems['poem'])

In [9]:
# Persist the two joined texts, one file per subset.
# The encoding is given explicitly so that umlauts/ß are written identically
# regardless of the platform's default encoding.
for out_path, poems_text in (
    ('data/perplexity/GPT3_generated_poems.txt', GPT3_generated_poems),
    ('data/perplexity/GPT2_generated_poems.txt', GPT2_generated_poems),
):
    with open(out_path, 'w', encoding='utf-8') as f:
        f.write(poems_text)

In [109]:
# Word frequencies of the generated poems (word_cnt prints the total number of
# words), followed by the vocabulary sizes of the generated and reference texts.
generated_word_stat = word_cnt(generated_poems)
print(len(generated_word_stat), len(word_stat), sep='\n')

54711
6642
181121


In [100]:
# gen_pos_stat maps a part-of-speech tag to the list of frequencies (within
# the generated poems) of all distinct words that received that tag. Each
# word is passed to the pipeline on its own; only its first token is used.
gen_pos_stat = {}
for word, freq in tqdm(generated_word_stat.items()):
    first_token = nlp(word)[0]
    gen_pos_stat.setdefault(first_token.pos_, []).append(freq)

100%|██████████| 6642/6642 [00:30<00:00, 218.86it/s]


In [102]:
# Mean frequency (within the generated poems) over all distinct words tagged VERB.
verb_counts = gen_pos_stat['VERB']
sum(verb_counts) / len(verb_counts)

2.9234234234234235

In [88]:
# For every distinct word of the generated poems, look up how often it occurs
# in the reference text (0 if it never occurs there).
# generated_words holds (word, reference_count) pairs; generated_reff_cnt
# holds just the counts, in the same order.
generated_words = [(word, word_stat.get(word, 0)) for word in generated_word_stat]
generated_reff_cnt = [ref_count for _, ref_count in generated_words]


In [89]:
# Indices that order the generated words by ascending reference-text frequency.
# Deliberately not named `sorted`: rebinding that name hides the built-in
# sorted() for the rest of the kernel session and breaks cells that call it.
order = np.argsort(np.asarray(generated_reff_cnt))

# (word, reference_count) pairs, words unseen in the reference text first.
[generated_words[i] for i in order]

[('tagbefehl', 0),
 ('browsin', 0),
 ('rätselherde', 0),
 ('befleißet', 0),
 ('verheißer', 0),
 ('saphirrangel', 0),
 ('muren', 0),
 ('rächtte', 0),
 ('wesensgrunde', 0),
 ('spurenlos', 0),
 ('zeugenschleier', 0),
 ('verfällen', 0),
 ('versöhnungsstätte', 0),
 ('verbürgst', 0),
 ('hirnger', 0),
 ('erf', 0),
 ('dreiundsiebzig', 0),
 ('eingebundnen', 0),
 ('weiterreichen', 0),
 ('lich', 0),
 ('teufelsspein', 0),
 ('geglücktem', 0),
 ('losgelöscht', 0),
 ('dräht', 0),
 ('schürts', 0),
 ('drachenmeister', 0),
 ('make', 0),
 ('oak', 0),
 ('online', 0),
 ('götterhass', 0),
 ('abasch', 0),
 ('niedermetzeln', 0),
 ('klapse', 0),
 ('wahrheitsfrüchte', 0),
 ('rummels', 0),
 ('flammenwerfern', 0),
 ('bärenladen', 0),
 ('vogelweib', 0),
 ('böttest', 0),
 ('vergangnern', 0),
 ('ichen', 0),
 ('gesprießen', 0),
 ('naja', 0),
 ('traumgesichtes', 0),
 ('marmorhöhlen', 0),
 ('erdensterne', 0),
 ('wortverlangen', 0),
 ('erdenwege', 0),
 ('umtauchst', 0),
 ('unneue', 0),
 ('erwählend', 0),
 ('selbstzweck'