In [1]:
import pandas as pd
import re, math
from tqdm import tqdm
text_df = pd.read_csv('gutenberg/gutenberg.csv')

text_df.head()

Unnamed: 0,ID,author,title,subtitle,text,Unnamed: 5
0,0,alexis,Entführung,,"O Lady Judith, spröder Schatz,\n Drückt...",
1,0,alexis,Entführung,,"Hart ist der Sitz und knapp und schmal,\n ...",
2,0,alexis,Entführung,,Sechs Nächte lag ich in Sumpf und Moor\n ...,
3,1,alexis,Walpurgisnacht,,"Liebe Mutter, heut' Nacht heulte Regen und Win...",
4,1,alexis,Walpurgisnacht,,"Liebe Mutter, es donnerte auf dem Brocken drob...",


In [2]:
verse_name = 'text'

# Preprocessing for Bert Training

In [5]:
# EOV: End of verse; SOV: Start of Verse; SST: Start of Strophe; EST: End of Strophe
# SOP: Start of poem; EOP: End of Poem

text_df_prep = text_df.copy()
text_df_prep[verse_name] = text_df_prep[verse_name].apply(lambda x: re.sub(r'[^a-zäöüA-ZÄÖÜ,.\n]',' ', str(x)))
text_df_prep[verse_name] = text_df_prep[verse_name].apply(lambda x: (' [EOV] [SOV] ').join([line.lstrip() for line in str(x).split('\n') if line] )+ ' [EST] ')

#text_df_prep = text_df_prep.groupby(['title']).sum()



text = ''
for _, row in text_df_prep.iterrows():
    
    text += row[verse_name]
    
with open('bert_training_gutenberg.txt', 'w') as f:
    f.write(text)

# Preprocessing for GPT2 Training

In [11]:
text_df = pd.read_csv('gutenberg/gutenberg.csv')
print(len(text_df))


text_df = text_df.drop(text_df[text_df.author == 'schiller'].index)
print(len(text_df))
text_df.head()


105990
104841


Unnamed: 0,ID,author,title,subtitle,text,Unnamed: 5
0,0,alexis,Entführung,,"O Lady Judith, spröder Schatz,\n Drückt...",
1,0,alexis,Entführung,,"Hart ist der Sitz und knapp und schmal,\n ...",
2,0,alexis,Entführung,,Sechs Nächte lag ich in Sumpf und Moor\n ...,
3,1,alexis,Walpurgisnacht,,"Liebe Mutter, heut' Nacht heulte Regen und Win...",
4,1,alexis,Walpurgisnacht,,"Liebe Mutter, es donnerte auf dem Brocken drob...",


In [12]:
title_name = 'title'
verse_name = 'text'


text_df_prep = text_df.copy()
text_df_prep[verse_name] = text_df_prep[verse_name].apply(lambda x: re.sub(r'[^a-zäöüA-ZÄÖÜß?!,;:.\n ]','', str(x)))
text_df_prep[verse_name] = text_df_prep[verse_name].apply(lambda x: ('\n').join([line.lstrip() for line in str(x).split('\n') if line] )+ '\n')

text_df_strophes = text_df_prep.groupby(['ID']).sum()[verse_name]

text_df_titles = text_df_prep.groupby(['ID']).first()[title_name]

strophes_df = pd.concat([text_df_titles, text_df_strophes], axis=1)

text = ''
for _ , row in strophes_df.iterrows():
    
    if 'h4' in row[title_name]:
        title = 'Gedicht'
    else: 
        title = row[title_name]
    text += 'Titel: ' + str(title) + '\n \n' + row[verse_name] + '<|endoftext|>'+'\n'
    
    
text = re.sub('<\|endoftext\|>','',text)
with open('gpt2_training_no_schiller.txt', 'w') as f:
    f.write(text)

# Preprocessing for GPT2 Training with Train/Test Split

In [3]:
text_df = pd.read_csv('gutenberg/gutenberg.csv')
print(len(text_df))

105990


In [17]:
title_name = 'title'
verse_name = 'text'


text_df_prep = text_df.copy()
text_df_prep[verse_name] = text_df_prep[verse_name].apply(lambda x: re.sub(r'[^a-zäöüA-ZÄÖÜß?!,;:.\n ]','', str(x)))
text_df_prep[verse_name] = text_df_prep[verse_name].apply(lambda x: ('\n').join([line.lstrip() for line in str(x).split('\n') if line] )+ '\n')

text_df_strophes = text_df_prep.groupby(['ID']).sum()[verse_name]

text_df_titles = text_df_prep.groupby(['ID']).first()[title_name]

text_df_authors = text_df_prep.groupby(['ID']).first()['author']


strophes_df = pd.DataFrame(list(zip(list(text_df_authors),list(text_df_titles),list(text_df_strophes))),columns=['author','title','strophe'])#pd.concat([text_df_authors,text_df_titles, text_df_strophes], axis=1)

strophes_df['text'] = strophes_df.apply(lambda x: 'Titel: ' + x.title + '\n\n' + x.strophe +'\n',axis=1)




In [66]:
train_test_frac = 0.14
train_test_frac_large = 0.25
authors = set(list(strophes_df['author']))

text_train = ''
text_test =''

for author in authors:
    if author not in ['schiller','hoelderl']:
        frac_used = train_test_frac
    else:
        print('large')
        print('author')
        frac_used = train_test_frac_large

    author_df = strophes_df[strophes_df.author == author]
    author_df = author_df.sample(frac=1).reset_index(drop=True)

    author_test_cnt = math.ceil(len(author_df)*frac_used)

    author_df_test = author_df[:author_test_cnt+1]
    author_df_train = author_df[author_test_cnt:]

    author_new_test_text = ''.join(list(author_df_test.text))
    author_new_train_text = ''.join(list(author_df_train.text))

    text_test += author_new_test_text
    text_train += author_new_train_text





In [69]:
with open('gpt2_training_gutenberg_train.txt', 'w') as f:
    f.write(text_train)

with open('gpt2_training_gutenberg_test.txt', 'w') as f:
    f.write(text_test)

In [13]:
with open('gutenberg/gpt2_training_gutenberg_train.txt') as f:
    text_train = f.read()

author = 'hoelderl'
strophes_df_author = pd.DataFrame(list(zip(list(text_df_authors),list(text_df_titles),list(text_df_strophes))),columns=['author','title','strophe'])

strophes_author = list(strophes_df_author[strophes_df_author.author == author].strophe)
strophes_title = list(strophes_df_author[strophes_df_author.author == author].title)
strophes_chosen = []
for idx, strophe in tqdm(enumerate(strophes_author)):
    verse_lst = strophe.strip().split('\n')
    found = False
    cnt = 0
    for verse in verse_lst:
        if verse in text_train:
            found = True
            cnt += 1
    if cnt < 4:
        for verse in verse_lst:
            if verse in text_train:
                pass
    if not found:
        strophes_chosen.append('Titel: ' +strophes_title[idx] +'\n\n' + strophe)


322it [01:27,  3.68it/s]


In [14]:
author_text_test = '\n'.join(strophes_chosen)

In [16]:
with open('no_hoel_no_train.txt', 'w') as f:
    f.write(author_text_test)

In [6]:
with open('gutenberg/gpt2_training_only_hoelderlin.txt') as f:
    text_exclude = f.read()
    
with open('gutenberg/gpt2_training_gutenberg_test.txt') as f:
    test_lines = f.readlines()

lines_out = []
for line in test_lines:
    if line == '\n':
        lines_out.append(line)

    else:
        if not line.strip() in text_exclude:
            lines_out.append(line)

lines_out_lb = [lines_out[0]]
for idx, line in enumerate(lines_out[1:]):
    if not(line == '\n' and lines_out[idx-1] == '\n'):
        lines_out_lb.append(line)

lines_out = lines_out_lb




In [7]:
len(lines_out)

103003

In [8]:
len(test_lines)

107848

In [5]:
test_text_clean = ''.join(lines_out)

with open('gpt2_training_gutenberg_no_hoelderlin_test.txt', 'w') as f:
    f.write(test_text_clean)

In [109]:
with open('gutenberg/gpt2_training_gutenberg_train.txt') as f:
    text_exclude = f.read()

with open('gutenberg/training/gpt2_training_gutenberg_no_schiller_train.txt') as f:
    text_2= f.read()

In [110]:
len(text_exclude)

18710292

In [111]:
len(text_2)

18710292