# Preprocessing of the DTA-Rhyme Corpus

Corpus: https://github.com/tnhaider/german-rhyme-corpus/tree/master/Diachron_Sample_DTA_DTR_Rhyme_Annotated

This notebook takes the anotated corpus as input and outputs a list of rhyming and not rhyming pair of words. 

There are some problems due to inconsistencies in the 

In [3]:
import pandas as pd
import os
from bs4 import BeautifulSoup as bs
import re
import itertools
import pickle

In [4]:
fpath = '/home/andreas/copyrighted_data/rhyme_corpus'

filenames = [os.path.join(fpath,fname) for fname in os.listdir(fpath) if fname.endswith('.xml')]

In [21]:
def get_text(raw_line):
    raw_line = re.sub('<[^>]+>', ' ', str(raw_line))
    raw_line = re.sub('\s\s+' , ' ', str(raw_line))
    raw_line = re.sub('ſ', 's', str(raw_line))
    raw_line = re.sub('uͤ', 'ü', str(raw_line))
    raw_line = re.sub('aͤ', 'ä', str(raw_line))
    raw_line = re.sub('oͤ', 'ö', str(raw_line)).lower()
    raw_line = re.sub(r'[^a-zäöü ]', '', str(raw_line))
    
    return raw_line

line_lst = []
rhyme_lst = []
problem_lst = []

corr_lst = []
f_lst = []

error_cnt = 0
solved_cnt = 0
no_rhyme_cnt = 0

for file in filenames:
    with open(file, "r",encoding="utf-8") as file:
        # Read each line in the file, readlines() returns a list of lines
        content = file.readlines()
        # Combine the lines in the list into a string
        content = "".join(content)
        bs_content = bs(content, "lxml")

    result = bs_content.find_all("lg")
    for item in result:
        solved = 0
        bs_item = bs(str(item),'lxml')
        
        subitems = bs_item.find_all('lg')
        if len(subitems) <= 1:
            try:
                rhyme = re.search(r'rhyme=\"(.*?)\"', str(item)).group(1)
            except: 
                rhyme = 'z'
            lines = bs_item.find_all('l')
            
            lines_clean = []
            for line in lines:
                valid = True
                text = get_text(line)
                if re.search(r'<l><hi rendition="c',str(line)):   # rendition c indicates the use of blocks
                                                                      # but contains content at author Holz Arno
                    line_body = re.sub('<[^>]+>', '', str(line))
                    if len(line_body < 7) or line[:-1].isdecimal():  #[:-1] because of dot
                        valid = False
                    if re.search(r'<hi rendition="#et',str(line)) and line_body < 10:
                        valid = False
                
                        
                if re.search(r'<l><hi rendition="#et',str(line)):   #rendition et at beginning of line indicates
                                                                        #that it might be appended to the previous line
                    words = re.sub('<[^>]+>', '', str(line)).split()
                    
                    
                        
                    if len(words) < 3 and len(lines_clean) > 0:
                        lines_clean[-1] += ' ' + text
                        valid = False
                        solved = 1
                        
                if valid:
                    lines_clean.append(text)

            lines = [line for line in lines_clean if line] 
            
            
            if len(lines) != len(rhyme):
                print('######## error #########')
                print('rhyme:')
                print(rhyme)
                print('lines:')
                print(lines)
                print('file:')
                print(file)
                print('item:')
                print(str(item))
                print('\n')
                solved = 0
                error_cnt +=1
                
                '''lines_corr = []
                cnt = 0
                while cnt < len(lines): 
                    if cnt < len(lines)-1:
                        if len(lines[cnt+1].split()) < 3:
                            lines_corr.append(lines[cnt] + ' ' + lines[cnt + 1])
                            cnt += 2
                        else:
                            lines_corr.append(lines[cnt])
                            cnt +=1
                    else: 
                        lines_corr.append(lines[cnt])
                        cnt +=1                   
                
                
                if len(lines_corr) != len(rhyme): 
                    problem_lst.append(2)
                    cnt_p1 +=1
                    line_lst.append(lines)
                else: 
                    problem_lst.append(1)
                    line_lst.append(lines_corr)'''
            elif rhyme == 'z':
                no_rhyme_cnt += 1
            else:
                problem_lst.append(0)
                line_lst.append(lines)
                rhyme_lst.append(rhyme)
                f_lst.append(file)
                
            solved_cnt += solved
        else:
            pass


In [4]:
no_rhyme_cnt

168

# Just for illustration purposes

The cell belows shows the inconsistent use for "<l><\l>" in the corpus.
    
Sometimes "<\l>" comes inside a vers and sometimes indeed it marks the complete verse even if it spreads on two lines

In [14]:
for file in filenames:
    with open(file, "r",encoding="utf-8") as file:
        # Read each line in the file, readlines() returns a list of lines
        content = file.readlines()
        # Combine the lines in the list into a string
        content = "".join(content)
        bs_content = bs(content, "lxml")

    result = bs_content.find_all("lg")
    for item in result:
        bs_item = bs(str(item),'lxml')
        
        subitems = bs_item.find_all('lg')
        if len(subitems) <= 1:
            try:
                rhyme = re.search(r'rhyme=\"(.*?)\"', str(item)).group(1)
            except: 
                rhyme = 'z'
            lines = bs_item.find_all('l')
            
            for i in range(len(lines)):
                if re.search(r'<l><hi rendition="#et">',str(lines[i])):
                    print('#et at beginning of verse')
                    print('verse:')
                    print(str(lines[i]))
                    print('file:')
                    print(str(file))
                    
            for i in range(len(lines)):
                if (not re.search(r'<l><hi rendition="#et">',str(lines[i]))) and re.search(r'rendition="#et">',str(lines[i])):
                    print('\n')
                    print('#et inside verse')
                    print('verse:')
                    print(str(lines[i]))
                    print('file:')
                    print(str(file))



#et inside verse
verse:
<l>Daſs dich, mit goldenen Städten und<lb></lb>
<hi rendition="#et">Schlachten,</hi></l>
file:
<_io.TextIOWrapper name='rhyme_corpus/Ramler_KarlWilhelm_1767_gold_p9_s79_TEI-P5.xml' mode='r' encoding='utf-8'>


#et inside verse
verse:
<l>Zum hohen Kapitol dein ſtolzer Wagen<lb></lb>
<hi rendition="#et">trug. —</hi></l>
file:
<_io.TextIOWrapper name='rhyme_corpus/Ramler_KarlWilhelm_1767_gold_p9_s79_TEI-P5.xml' mode='r' encoding='utf-8'>


#et inside verse
verse:
<l><hi rendition="#in">O</hi> du, dem glühend Eiſen, donnernd<lb></lb>
<hi rendition="#et">Feuer</hi></l>
file:
<_io.TextIOWrapper name='rhyme_corpus/Ramler_KarlWilhelm_1767_gold_p9_s79_TEI-P5.xml' mode='r' encoding='utf-8'>


#et inside verse
verse:
<l>Die frommen Dichter zu zerſchmettern,<lb></lb>
<hi rendition="#et">Ungeheuer,</hi></l>
file:
<_io.TextIOWrapper name='rhyme_corpus/Ramler_KarlWilhelm_1767_gold_p9_s79_TEI-P5.xml' mode='r' encoding='utf-8'>


#et inside verse
verse:
<l>Wer zur Verheerung b

<l>Boruſſiens gerechter Held ſoll ſie-<lb></lb>
<hi rendition="#et">gen!</hi></l>
file:
<_io.TextIOWrapper name='rhyme_corpus/Ramler_KarlWilhelm_1767_gold_p9_s79_TEI-P5.xml' mode='r' encoding='utf-8'>


#et inside verse
verse:
<l>Bald wird er im Triumph zu ſeinen Kin-<lb></lb>
<hi rendition="#et">dern fliegen.</hi></l>
file:
<_io.TextIOWrapper name='rhyme_corpus/Ramler_KarlWilhelm_1767_gold_p9_s79_TEI-P5.xml' mode='r' encoding='utf-8'>


#et inside verse
verse:
<l>Er kömmt, das Haupt mit Stralen<lb></lb>
<hi rendition="#et">rund umwunden,</hi></l>
file:
<_io.TextIOWrapper name='rhyme_corpus/Ramler_KarlWilhelm_1767_gold_p9_s79_TEI-P5.xml' mode='r' encoding='utf-8'>


#et inside verse
verse:
<l>Als er den Python ſchlug und ihm mit<lb></lb>
<hi rendition="#et">tauſend Wunden</hi></l>
file:
<_io.TextIOWrapper name='rhyme_corpus/Ramler_KarlWilhelm_1767_gold_p9_s79_TEI-P5.xml' mode='r' encoding='utf-8'>


#et inside verse
verse:
<l>Eilt, ihn in Erz den Enkeln auf-<lb></lb>
<hi rendition="#et



#et inside verse
verse:
<l>doch goͤnnt ſie mir die erſten Freuden ſtun-<lb></lb>
<hi rendition="#et">den</hi><lb></lb>
<fw place="bottom" type="catch">Jch</fw><lb></lb>
<pb facs="#f0335" n="275"></pb>
<fw place="top" type="header"><hi rendition="#b">Sinnreden.</hi></fw></l>
file:
<_io.TextIOWrapper name='rhyme_corpus/Stieler_Kasparvon_1660_gold_p63_s282_TEI-P5.xml' mode='r' encoding='utf-8'>
#et at beginning of verse
verse:
<l><hi rendition="#et">heit zieren;</hi></l>
file:
<_io.TextIOWrapper name='rhyme_corpus/Weckherlin_GeorgRodolf_1641_gold_p37_s117_TEI-P5.xml' mode='r' encoding='utf-8'>
#et at beginning of verse
verse:
<l><hi rendition="#et">trug/ vntrew/</hi></l>
file:
<_io.TextIOWrapper name='rhyme_corpus/Weckherlin_GeorgRodolf_1641_gold_p37_s117_TEI-P5.xml' mode='r' encoding='utf-8'>
#et at beginning of verse
verse:
<l><hi rendition="#et">dienſt verkehret;</hi></l>
file:
<_io.TextIOWrapper name='rhyme_corpus/Weckherlin_GeorgRodolf_1641_gold_p37_s117_TEI-P5.xml' mode='r' encodi

#et at beginning of verse
verse:
<l><hi rendition="#et">kund/</hi></l>
file:
<_io.TextIOWrapper name='rhyme_corpus/Weckherlin_GeorgRodolf_1641_gold_p37_s117_TEI-P5.xml' mode='r' encoding='utf-8'>
#et at beginning of verse
verse:
<l><hi rendition="#et">glick/</hi></l>
file:
<_io.TextIOWrapper name='rhyme_corpus/Weckherlin_GeorgRodolf_1641_gold_p37_s117_TEI-P5.xml' mode='r' encoding='utf-8'>
#et at beginning of verse
verse:
<l><hi rendition="#et">ſchaiden.</hi></l>
file:
<_io.TextIOWrapper name='rhyme_corpus/Weckherlin_GeorgRodolf_1641_gold_p37_s117_TEI-P5.xml' mode='r' encoding='utf-8'>
#et at beginning of verse
verse:
<l><hi rendition="#et">Des Einen tod hat dich mit flucht/</hi></l>
file:
<_io.TextIOWrapper name='rhyme_corpus/Weckherlin_GeorgRodolf_1641_gold_p37_s117_TEI-P5.xml' mode='r' encoding='utf-8'>
#et at beginning of verse
verse:
<l><hi rendition="#et">Des andern flucht mit tod belohnet.</hi></l>
file:
<_io.TextIOWrapper name='rhyme_corpus/Weckherlin_GeorgRodolf_1641_gold_p37_

#et at beginning of verse
verse:
<l><hi rendition="#et">cket,</hi></l>
file:
<_io.TextIOWrapper name='rhyme_corpus/Uz_JohannPeter_1755_gold_p42_s297_TEI-P5.xml' mode='r' encoding='utf-8'>
#et at beginning of verse
verse:
<l><hi rendition="#et">chen</hi></l>
file:
<_io.TextIOWrapper name='rhyme_corpus/Uz_JohannPeter_1755_gold_p42_s297_TEI-P5.xml' mode='r' encoding='utf-8'>
#et at beginning of verse
verse:
<l><hi rendition="#et">chen!</hi></l>
file:
<_io.TextIOWrapper name='rhyme_corpus/Uz_JohannPeter_1755_gold_p42_s297_TEI-P5.xml' mode='r' encoding='utf-8'>
#et at beginning of verse
verse:
<l><hi rendition="#et">Glieder:</hi></l>
file:
<_io.TextIOWrapper name='rhyme_corpus/Uz_JohannPeter_1755_gold_p42_s297_TEI-P5.xml' mode='r' encoding='utf-8'>
#et at beginning of verse
verse:
<l><hi rendition="#et">ſchmuͤckt,</hi></l>
file:
<_io.TextIOWrapper name='rhyme_corpus/Uz_JohannPeter_1755_gold_p42_s297_TEI-P5.xml' mode='r' encoding='utf-8'>
#et at beginning of verse
verse:
<l><hi rendition="#e



#et inside verse
verse:
<l>Wird dir der Schlaf ein gleiches Schreckniß<lb></lb>
<hi rendition="#et">bringen,</hi></l>
file:
<_io.TextIOWrapper name='rhyme_corpus/Fontane_Theodor_1851_gold_p57_s346_TEI-P5.xml' mode='r' encoding='utf-8'>


#et inside verse
verse:
<l>Und ſchwand dann, rückwärts ſchreitend, in der<lb></lb>
<hi rendition="#et">Thür.</hi></l>
file:
<_io.TextIOWrapper name='rhyme_corpus/Fontane_Theodor_1851_gold_p57_s346_TEI-P5.xml' mode='r' encoding='utf-8'>


#et inside verse
verse:
<l>Was ſchreckt das Traumbild mich des todten<lb></lb>
<hi rendition="#et">Mannes</hi></l>
file:
<_io.TextIOWrapper name='rhyme_corpus/Fontane_Theodor_1851_gold_p57_s346_TEI-P5.xml' mode='r' encoding='utf-8'>


#et inside verse
verse:
<l>Das Schwert des Henkers wär’ wie Glas zer-<lb></lb>
<hi rendition="#et">ſprungen,</hi></l>
file:
<_io.TextIOWrapper name='rhyme_corpus/Fontane_Theodor_1851_gold_p57_s346_TEI-P5.xml' mode='r' encoding='utf-8'>


#et inside verse
verse:
<l>Es ſoll nicht mehr ſei

#et at beginning of verse
verse:
<l><hi rendition="#et">das Licht/</hi></l>
file:
<_io.TextIOWrapper name='rhyme_corpus/DOPPELT Silesius_Angelus_1675_gold_p32_s32_TEI-P5.xml' mode='r' encoding='utf-8'>
#et at beginning of verse
verse:
<l><hi rendition="#et">pflicht.</hi></l>
file:
<_io.TextIOWrapper name='rhyme_corpus/DOPPELT Silesius_Angelus_1675_gold_p32_s32_TEI-P5.xml' mode='r' encoding='utf-8'>
#et at beginning of verse
verse:
<l><hi rendition="#et">dem HErꝛn.</hi></l>
file:
<_io.TextIOWrapper name='rhyme_corpus/DOPPELT Silesius_Angelus_1675_gold_p32_s32_TEI-P5.xml' mode='r' encoding='utf-8'>
#et at beginning of verse
verse:
<l><hi rendition="#et">Ding/</hi></l>
file:
<_io.TextIOWrapper name='rhyme_corpus/DOPPELT Silesius_Angelus_1675_gold_p32_s32_TEI-P5.xml' mode='r' encoding='utf-8'>
#et at beginning of verse
verse:
<l><hi rendition="#et">und blitzt.</hi></l>
file:
<_io.TextIOWrapper name='rhyme_corpus/DOPPELT Silesius_Angelus_1675_gold_p32_s32_TEI-P5.xml' mode='r' encoding='utf-



#et inside verse
verse:
<l>Die ich manchem Liebes-Strick’/ auch ſo manchen groſſen<lb></lb>
<hi rendition="#et">Hauffen</hi></l>
file:
<_io.TextIOWrapper name='rhyme_corpus/Neumark_Georg_1666_gold_p63_s89_TEI-P5.xml' mode='r' encoding='utf-8'>


#et inside verse
verse:
<l>Muß nun rennen/ aͤchtzen/ ſchreyen: Ach mein hochbetruͤb-<lb></lb>
<hi rendition="#et">tes Hertz/</hi></l>
file:
<_io.TextIOWrapper name='rhyme_corpus/Neumark_Georg_1666_gold_p63_s89_TEI-P5.xml' mode='r' encoding='utf-8'>


#et inside verse
verse:
<l>Schaffet zwey verliebten Hertzen/ meinem und dem eurem<lb></lb>
<hi rendition="#et">Ruh’.</hi></l>
file:
<_io.TextIOWrapper name='rhyme_corpus/Neumark_Georg_1666_gold_p63_s89_TEI-P5.xml' mode='r' encoding='utf-8'>


#et inside verse
verse:
<l>Fuͤg es allezeit alſo/ daß ſich gleich und gleich ge-<lb></lb>
<hi rendition="#et">ſellt/</hi></l>
file:
<_io.TextIOWrapper name='rhyme_corpus/Neumark_Georg_1666_gold_p63_s89_TEI-P5.xml' mode='r' encoding='utf-8'>


#et inside vers



#et inside verse
verse:
<l>Und vor Abbruch/ Schimpf und Spott deines of<supplied>-</supplied><lb></lb>
<hi rendition="#et">fentlichen Feindes.</hi></l>
file:
<_io.TextIOWrapper name='rhyme_corpus/Neumark_Georg_1666_gold_p63_s89_TEI-P5.xml' mode='r' encoding='utf-8'>


#et inside verse
verse:
<l>Geht dir gleich ein Laſter an/ bilde dir daruͤm<lb></lb>
<hi rendition="#et">nicht ein/</hi></l>
file:
<_io.TextIOWrapper name='rhyme_corpus/Neumark_Georg_1666_gold_p63_s89_TEI-P5.xml' mode='r' encoding='utf-8'>


#et inside verse
verse:
<l>Daß dem allerhoͤchſten Gott koͤnne was verbor-<lb></lb>
<hi rendition="#et">gen ſeyn.</hi></l>
file:
<_io.TextIOWrapper name='rhyme_corpus/Neumark_Georg_1666_gold_p63_s89_TEI-P5.xml' mode='r' encoding='utf-8'>


#et inside verse
verse:
<l>Dann/ wenn noch die Jugend bluͤht/ faſſe weiſer<lb></lb>
<hi rendition="#et">Leute Lehren/</hi></l>
file:
<_io.TextIOWrapper name='rhyme_corpus/Neumark_Georg_1666_gold_p63_s89_TEI-P5.xml' mode='r' encoding='utf-8'>


#et i

#et at beginning of verse
verse:
<l><hi rendition="#et">ein/</hi></l>
file:
<_io.TextIOWrapper name='rhyme_corpus/Schmolck_Benjamin_1712_gold_p44_s420_TEI-P5.xml' mode='r' encoding='utf-8'>
#et at beginning of verse
verse:
<l><hi rendition="#et">Munde/</hi></l>
file:
<_io.TextIOWrapper name='rhyme_corpus/Schmolck_Benjamin_1712_gold_p44_s420_TEI-P5.xml' mode='r' encoding='utf-8'>
#et at beginning of verse
verse:
<l><hi rendition="#et">bekriegt/</hi></l>
file:
<_io.TextIOWrapper name='rhyme_corpus/Schmolck_Benjamin_1712_gold_p44_s420_TEI-P5.xml' mode='r' encoding='utf-8'>


#et inside verse
verse:
<l><hi rendition="#fr"><hi rendition="#et">gebohren.</hi></hi></l>
file:
<_io.TextIOWrapper name='rhyme_corpus/Schmolck_Benjamin_1712_gold_p44_s420_TEI-P5.xml' mode='r' encoding='utf-8'>


#et inside verse
verse:
<l><hi rendition="#c">9.</hi>
<hi rendition="#et">(ſey/</hi></l>
file:
<_io.TextIOWrapper name='rhyme_corpus/Schmolck_Benjamin_1712_gold_p44_s420_TEI-P5.xml' mode='r' encoding='utf-8'>

#et at beginning of verse
verse:
<l><hi rendition="#et">That/</hi></l>
file:
<_io.TextIOWrapper name='rhyme_corpus/Schmolck_Benjamin_1712_gold_p44_s420_TEI-P5.xml' mode='r' encoding='utf-8'>
#et at beginning of verse
verse:
<l><hi rendition="#et">glich iſt/</hi></l>
file:
<_io.TextIOWrapper name='rhyme_corpus/Schmolck_Benjamin_1712_gold_p44_s420_TEI-P5.xml' mode='r' encoding='utf-8'>
#et at beginning of verse
verse:
<l><hi rendition="#et">Bruͤder/</hi></l>
file:
<_io.TextIOWrapper name='rhyme_corpus/Schmolck_Benjamin_1712_gold_p44_s420_TEI-P5.xml' mode='r' encoding='utf-8'>
#et at beginning of verse
verse:
<l><hi rendition="#et">Hoͤhe/</hi></l>
file:
<_io.TextIOWrapper name='rhyme_corpus/Schmolck_Benjamin_1712_gold_p44_s420_TEI-P5.xml' mode='r' encoding='utf-8'>


#et inside verse
verse:
<l><hi rendition="#c">2.</hi>
<hi rendition="#et">(Erde/</hi></l>
file:
<_io.TextIOWrapper name='rhyme_corpus/Schmolck_Benjamin_1712_gold_p44_s420_TEI-P5.xml' mode='r' encoding='utf-8'>


#et inside ve

#et at beginning of verse
verse:
<l><hi rendition="#et"><hi rendition="#fr">reich iſt/</hi></hi></l>
file:
<_io.TextIOWrapper name='rhyme_corpus/Zesen_Philippvon_1640_gold_p15_s19_TEI-P5.xml' mode='r' encoding='utf-8'>
#et at beginning of verse
verse:
<l><hi rendition="#et"><hi rendition="#fr">gleich iſt.</hi></hi></l>
file:
<_io.TextIOWrapper name='rhyme_corpus/Zesen_Philippvon_1640_gold_p15_s19_TEI-P5.xml' mode='r' encoding='utf-8'>


#et inside verse
verse:
<l>Doch irrſt du, Freund, ſo bald du ſagſt, ſie ſchwanke hin<lb></lb>
<hi rendition="#et">und her!</hi></l>
file:
<_io.TextIOWrapper name='rhyme_corpus/Platen_Augustvon_1828_gold_p33_s112_TEI-P5.xml' mode='r' encoding='utf-8'>


#et inside verse
verse:
<l>Der Winter iſt ein Greis, doch ſchickt der Lenz den<lb></lb>
<hi rendition="#et">Duft</hi></l>
file:
<_io.TextIOWrapper name='rhyme_corpus/Platen_Augustvon_1828_gold_p33_s112_TEI-P5.xml' mode='r' encoding='utf-8'>


#et inside verse
verse:
<l>
<hi rendition="#in">D</hi>a, wie fa

#et at beginning of verse
verse:
<l><hi rendition="#et">Tropffe</hi></l>
file:
<_io.TextIOWrapper name='rhyme_corpus/Logau_Friedrichvon_1654_gold_p63_s63_TEI-P5.xml' mode='r' encoding='utf-8'>
#et at beginning of verse
verse:
<l><hi rendition="#et">Kopffe?</hi></l>
file:
<_io.TextIOWrapper name='rhyme_corpus/Logau_Friedrichvon_1654_gold_p63_s63_TEI-P5.xml' mode='r' encoding='utf-8'>
#et at beginning of verse
verse:
<l><hi rendition="#et">Freyer</hi></l>
file:
<_io.TextIOWrapper name='rhyme_corpus/Logau_Friedrichvon_1654_gold_p63_s63_TEI-P5.xml' mode='r' encoding='utf-8'>
#et at beginning of verse
verse:
<l><hi rendition="#et">Dreyer?</hi></l>
file:
<_io.TextIOWrapper name='rhyme_corpus/Logau_Friedrichvon_1654_gold_p63_s63_TEI-P5.xml' mode='r' encoding='utf-8'>
#et at beginning of verse
verse:
<l><hi rendition="#et">ginge.</hi></l>
file:
<_io.TextIOWrapper name='rhyme_corpus/Logau_Friedrichvon_1654_gold_p63_s63_TEI-P5.xml' mode='r' encoding='utf-8'>
#et at beginning of verse
verse:
<l><

#et at beginning of verse
verse:
<l><hi rendition="#et">ſtallen/</hi></l>
file:
<_io.TextIOWrapper name='rhyme_corpus/Weise_Christian_1701_gold_p49_s308_TEI-P5.xml' mode='r' encoding='utf-8'>
#et at beginning of verse
verse:
<l><hi rendition="#et">gelang,</hi></l>
file:
<_io.TextIOWrapper name='rhyme_corpus/Weiсe_ChristianFelix_1767_gold_p32_s100_TEI-P5.xml' mode='r' encoding='utf-8'>
#et at beginning of verse
verse:
<l><hi rendition="#et">ruht,</hi></l>
file:
<_io.TextIOWrapper name='rhyme_corpus/Weiсe_ChristianFelix_1767_gold_p32_s100_TEI-P5.xml' mode='r' encoding='utf-8'>
#et at beginning of verse
verse:
<l><hi rendition="#et">Thier!</hi></l>
file:
<_io.TextIOWrapper name='rhyme_corpus/Weiсe_ChristianFelix_1767_gold_p32_s100_TEI-P5.xml' mode='r' encoding='utf-8'>
#et at beginning of verse
verse:
<l><hi rendition="#et">groß,</hi></l>
file:
<_io.TextIOWrapper name='rhyme_corpus/Weiсe_ChristianFelix_1767_gold_p32_s100_TEI-P5.xml' mode='r' encoding='utf-8'>
#et at beginning of verse
ver



#et inside verse
verse:
<l>
<hi rendition="#et">Trutz, blanke Hans.</hi>
</l>
file:
<_io.TextIOWrapper name='rhyme_corpus/Liliencron_Detlevvon_1883_gold_p34_s153_TEI-P5.xml' mode='r' encoding='utf-8'>


#et inside verse
verse:
<l>
<hi rendition="#et">Trutz, blanke Hans.</hi>
</l>
file:
<_io.TextIOWrapper name='rhyme_corpus/Liliencron_Detlevvon_1883_gold_p34_s153_TEI-P5.xml' mode='r' encoding='utf-8'>


#et inside verse
verse:
<l>
<hi rendition="#et">Trutz, blanke Hans.</hi>
</l>
file:
<_io.TextIOWrapper name='rhyme_corpus/Liliencron_Detlevvon_1883_gold_p34_s153_TEI-P5.xml' mode='r' encoding='utf-8'>


#et inside verse
verse:
<l>
<hi rendition="#et">Trutz, blanke Hans.</hi>
</l>
file:
<_io.TextIOWrapper name='rhyme_corpus/Liliencron_Detlevvon_1883_gold_p34_s153_TEI-P5.xml' mode='r' encoding='utf-8'>


#et inside verse
verse:
<l>
<hi rendition="#et">Trutz, blanke Hans.</hi>
</l>
file:
<_io.TextIOWrapper name='rhyme_corpus/Liliencron_Detlevvon_1883_gold_p34_s153_TEI-P5.xml' mode='r' enc

#et at beginning of verse
verse:
<l><hi rendition="#et">ſende,</hi></l>
file:
<_io.TextIOWrapper name='rhyme_corpus/Spindler_ChristianGotthold_1745_gold_p31_s58_TEI-P5.xml' mode='r' encoding='utf-8'>
#et at beginning of verse
verse:
<l><hi rendition="#et">auf,</hi></l>
file:
<_io.TextIOWrapper name='rhyme_corpus/Spindler_ChristianGotthold_1745_gold_p31_s58_TEI-P5.xml' mode='r' encoding='utf-8'>
#et at beginning of verse
verse:
<l><hi rendition="#et">Haͤhnen,</hi></l>
file:
<_io.TextIOWrapper name='rhyme_corpus/Spindler_ChristianGotthold_1745_gold_p31_s58_TEI-P5.xml' mode='r' encoding='utf-8'>
#et at beginning of verse
verse:
<l><hi rendition="#et">drehen,</hi></l>
file:
<_io.TextIOWrapper name='rhyme_corpus/Spindler_ChristianGotthold_1745_gold_p31_s58_TEI-P5.xml' mode='r' encoding='utf-8'>
#et at beginning of verse
verse:
<l><hi rendition="#et">ſchreibet,</hi></l>
file:
<_io.TextIOWrapper name='rhyme_corpus/Spindler_ChristianGotthold_1745_gold_p31_s58_TEI-P5.xml' mode='r' encoding='utf

#et at beginning of verse
verse:
<l><hi rendition="#et">das Licht/</hi></l>
file:
<_io.TextIOWrapper name='rhyme_corpus/Silesius_Angelus_1657_gold_p33_s33_TEI-P5.xml' mode='r' encoding='utf-8'>
#et at beginning of verse
verse:
<l><hi rendition="#et">pflicht.</hi></l>
file:
<_io.TextIOWrapper name='rhyme_corpus/Silesius_Angelus_1657_gold_p33_s33_TEI-P5.xml' mode='r' encoding='utf-8'>
#et at beginning of verse
verse:
<l><hi rendition="#et">dem HErꝛn.</hi></l>
file:
<_io.TextIOWrapper name='rhyme_corpus/Silesius_Angelus_1657_gold_p33_s33_TEI-P5.xml' mode='r' encoding='utf-8'>
#et at beginning of verse
verse:
<l><hi rendition="#et">Ding/</hi></l>
file:
<_io.TextIOWrapper name='rhyme_corpus/Silesius_Angelus_1657_gold_p33_s33_TEI-P5.xml' mode='r' encoding='utf-8'>
#et at beginning of verse
verse:
<l><hi rendition="#et">und blitzt.</hi></l>
file:
<_io.TextIOWrapper name='rhyme_corpus/Silesius_Angelus_1657_gold_p33_s33_TEI-P5.xml' mode='r' encoding='utf-8'>
#et at beginning of verse
verse:
<l>



#et inside verse
verse:
<l>Weil ihn mein treuſter Freund mit ſeinem Blut<lb></lb>
<hi rendition="#et">benetzt.</hi></l>
file:
<_io.TextIOWrapper name='rhyme_corpus/Canitz_FriedrichRudolphLudwigvon_1700_gold_p19_s66_TEI-P5.xml' mode='r' encoding='utf-8'>


#et inside verse
verse:
<l>Wohin erſt mancher kaum nach langem Schweiß ge-<lb></lb>
<hi rendition="#et">diehen/</hi></l>
file:
<_io.TextIOWrapper name='rhyme_corpus/Canitz_FriedrichRudolphLudwigvon_1700_gold_p19_s66_TEI-P5.xml' mode='r' encoding='utf-8'>


#et inside verse
verse:
<l>Doch halt/ es moͤchte mich der Schmertz zu weit verlei-<lb></lb>
<hi rendition="#et">ten/</hi></l>
file:
<_io.TextIOWrapper name='rhyme_corpus/Canitz_FriedrichRudolphLudwigvon_1700_gold_p19_s66_TEI-P5.xml' mode='r' encoding='utf-8'>


#et inside verse
verse:
<l>Wie manchen der ſein Grab mit Lorbeer denckt zu<lb></lb>
<hi rendition="#et">kroͤnen/</hi></l>
file:
<_io.TextIOWrapper name='rhyme_corpus/Canitz_FriedrichRudolphLudwigvon_1700_gold_p19_s66_TEI-P5

#et at beginning of verse
verse:
<l><hi rendition="#et">gen/</hi></l>
file:
<_io.TextIOWrapper name='rhyme_corpus/Hofmannswaldau_ChristianHofmannvon_1679_gold_p25_s29_TEI-P5.xml' mode='r' encoding='utf-8'>
#et at beginning of verse
verse:
<l><hi rendition="#et">Muth/</hi></l>
file:
<_io.TextIOWrapper name='rhyme_corpus/Hofmannswaldau_ChristianHofmannvon_1679_gold_p25_s29_TEI-P5.xml' mode='r' encoding='utf-8'>
#et at beginning of verse
verse:
<l><hi rendition="#et">kan.</hi></l>
file:
<_io.TextIOWrapper name='rhyme_corpus/Hofmannswaldau_ChristianHofmannvon_1679_gold_p25_s29_TEI-P5.xml' mode='r' encoding='utf-8'>
#et at beginning of verse
verse:
<l><hi rendition="#et">kraͤncken/</hi></l>
file:
<_io.TextIOWrapper name='rhyme_corpus/Hofmannswaldau_ChristianHofmannvon_1679_gold_p25_s29_TEI-P5.xml' mode='r' encoding='utf-8'>
#et at beginning of verse
verse:
<l><hi rendition="#et">dencken.</hi></l>
file:
<_io.TextIOWrapper name='rhyme_corpus/Hofmannswaldau_ChristianHofmannvon_1679_gold_p25_s2

#et at beginning of verse
verse:
<l><hi rendition="#et">in das wolbeflam̃te Feuer/</hi></l>
file:
<_io.TextIOWrapper name='rhyme_corpus/Fleming_Paul_1642_gold_p47_s226_TEI-P5.xml' mode='r' encoding='utf-8'>
#et at beginning of verse
verse:
<l><hi rendition="#et">und der Himmel/ der Euch paart/</hi></l>
file:
<_io.TextIOWrapper name='rhyme_corpus/Fleming_Paul_1642_gold_p47_s226_TEI-P5.xml' mode='r' encoding='utf-8'>
#et at beginning of verse
verse:
<l><hi rendition="#et"><hi rendition="#et">geb auch Art</hi></hi></l>
file:
<_io.TextIOWrapper name='rhyme_corpus/Fleming_Paul_1642_gold_p47_s226_TEI-P5.xml' mode='r' encoding='utf-8'>
#et at beginning of verse
verse:
<l><hi rendition="#et">aus der ſchoͤnen Gluht noch heuer.</hi></l>
file:
<_io.TextIOWrapper name='rhyme_corpus/Fleming_Paul_1642_gold_p47_s226_TEI-P5.xml' mode='r' encoding='utf-8'>


#et inside verse
verse:
<l><hi rendition="#fr"><hi rendition="#et">gieſſe Gunſt</hi></hi></l>
file:
<_io.TextIOWrapper name='rhyme_corpus/Fleming_

#et at beginning of verse
verse:
<l><hi rendition="#et">daß euer Mund mit unſerm hertzlich lache.</hi></l>
file:
<_io.TextIOWrapper name='rhyme_corpus/Fleming_Paul_1642_gold_p47_s226_TEI-P5.xml' mode='r' encoding='utf-8'>
#et at beginning of verse
verse:
<l><hi rendition="#et">Die Doris rufft mit tauſent Najadinnen/</hi></l>
file:
<_io.TextIOWrapper name='rhyme_corpus/Fleming_Paul_1642_gold_p47_s226_TEI-P5.xml' mode='r' encoding='utf-8'>
#et at beginning of verse
verse:
<l><hi rendition="#et">auff/ Chloris/ auff/ mit tauſent Napeinnen.</hi></l>
file:
<_io.TextIOWrapper name='rhyme_corpus/Fleming_Paul_1642_gold_p47_s226_TEI-P5.xml' mode='r' encoding='utf-8'>


#et inside verse
verse:
<l><hi rendition="#fr"><hi rendition="#et">durch manches Land und Meer</hi></hi></l>
file:
<_io.TextIOWrapper name='rhyme_corpus/Fleming_Paul_1642_gold_p47_s226_TEI-P5.xml' mode='r' encoding='utf-8'>


#et inside verse
verse:
<l><hi rendition="#fr"><hi rendition="#et">zu eurem frommen her/</hi></hi></l>
fil



#et inside verse
verse:
<l>Der auch trotz Noth und Tod beſtaͤndig iſt und<lb></lb>
<hi rendition="#et">bleibt.</hi></l>
file:
<_io.TextIOWrapper name='rhyme_corpus/MБhlpfort_Heinrich_1686_gold_p60_s478_TEI-P5.xml' mode='r' encoding='utf-8'>


#et inside verse
verse:
<l>Verklaͤrter Leiber Glantz/ der Sonn und Sternen<lb></lb>
<hi rendition="#et">gleich/</hi></l>
file:
<_io.TextIOWrapper name='rhyme_corpus/MБhlpfort_Heinrich_1686_gold_p60_s478_TEI-P5.xml' mode='r' encoding='utf-8'>


#et inside verse
verse:
<l><hi rendition="#in">S</hi>O hat nun die Gedult <hi rendition="#fr">Herr
                Grundmanns</hi> uͤber-<lb></lb>
<hi rendition="#et">wunden/</hi></l>
file:
<_io.TextIOWrapper name='rhyme_corpus/MБhlpfort_Heinrich_1686_gold_p60_s478_TEI-P5.xml' mode='r' encoding='utf-8'>


#et inside verse
verse:
<l>So brechen nunmehr an die hoͤchſt- erwuͤnſchte Stun-<lb></lb>
<hi rendition="#et">den/</hi></l>
file:
<_io.TextIOWrapper name='rhyme_corpus/MБhlpfort_Heinrich_1686_gold_p60_s478

# Rendition test

Find and display uses of different renditions

In [25]:
rendition ='rendition="#c'

for file in filenames:
    with open(file, "r",encoding="utf-8") as file:
        # Read each line in the file, readlines() returns a list of lines
        content = file.readlines()
        # Combine the lines in the list into a string
        content = "".join(content)
        bs_content = bs(content, "lxml")

    result = bs_content.find_all("lg")
    for item in result:
        bs_item = bs(str(item),'lxml')
        
        subitems = bs_item.find_all('lg')
        if len(subitems) <= 1:
            try:
                rhyme = re.search(r'rhyme=\"(.*?)\"', str(item)).group(1)
            except: 
                rhyme = 'z'
            lines = bs_item.find_all('l')
            
            for i in range(len(lines)):
                if re.search(rendition,str(lines[i])):
                    print('\n')
                    print('verse:')
                    print(str(lines[i]))
                    print('file:')
                    print(str(file))
                    line = re.sub('<[^>]+>', '', str(lines[i]))
                    print(line)
                    if line[:-1].isdecimal():
                        print('numeric')
                        
                    



verse:
<l><hi rendition="#c">4.</hi></l>
file:
<_io.TextIOWrapper name='rhyme_corpus/Neumark_Georg_1666_gold_p63_s89_TEI-P5.xml' mode='r' encoding='utf-8'>
4.
numeric


verse:
<l><hi rendition="#c">1.</hi></l>
file:
<_io.TextIOWrapper name='rhyme_corpus/Neumark_Georg_1666_gold_p63_s89_TEI-P5.xml' mode='r' encoding='utf-8'>
1.
numeric


verse:
<l><hi rendition="#c">2.</hi></l>
file:
<_io.TextIOWrapper name='rhyme_corpus/Neumark_Georg_1666_gold_p63_s89_TEI-P5.xml' mode='r' encoding='utf-8'>
2.
numeric


verse:
<l><hi rendition="#c">3.</hi></l>
file:
<_io.TextIOWrapper name='rhyme_corpus/Neumark_Georg_1666_gold_p63_s89_TEI-P5.xml' mode='r' encoding='utf-8'>
3.
numeric


verse:
<l><hi rendition="#c">4.</hi></l>
file:
<_io.TextIOWrapper name='rhyme_corpus/Neumark_Georg_1666_gold_p63_s89_TEI-P5.xml' mode='r' encoding='utf-8'>
4.
numeric


verse:
<l><hi rendition="#c">5.</hi></l>
file:
<_io.TextIOWrapper name='rhyme_corpus/Neumark_Georg_1666_gold_p63_s89_TEI-P5.xml' mode='r' encoding='utf-8



verse:
<l><hi rendition="#c">1.</hi></l>
file:
<_io.TextIOWrapper name='rhyme_corpus/Schmolck_Benjamin_1712_gold_p44_s420_TEI-P5.xml' mode='r' encoding='utf-8'>
1.
numeric


verse:
<l><hi rendition="#c">2.</hi></l>
file:
<_io.TextIOWrapper name='rhyme_corpus/Schmolck_Benjamin_1712_gold_p44_s420_TEI-P5.xml' mode='r' encoding='utf-8'>
2.
numeric


verse:
<l><hi rendition="#c">3.</hi></l>
file:
<_io.TextIOWrapper name='rhyme_corpus/Schmolck_Benjamin_1712_gold_p44_s420_TEI-P5.xml' mode='r' encoding='utf-8'>
3.
numeric


verse:
<l><hi rendition="#c">4.</hi></l>
file:
<_io.TextIOWrapper name='rhyme_corpus/Schmolck_Benjamin_1712_gold_p44_s420_TEI-P5.xml' mode='r' encoding='utf-8'>
4.
numeric


verse:
<l><hi rendition="#c">5.</hi></l>
file:
<_io.TextIOWrapper name='rhyme_corpus/Schmolck_Benjamin_1712_gold_p44_s420_TEI-P5.xml' mode='r' encoding='utf-8'>
5.
numeric


verse:
<l><hi rendition="#c">6.</hi></l>
file:
<_io.TextIOWrapper name='rhyme_corpus/Schmolck_Benjamin_1712_gold_p44_s420_TEI-P5



verse:
<l><hi rendition="#c">4.</hi></l>
file:
<_io.TextIOWrapper name='rhyme_corpus/Schmolck_Benjamin_1712_gold_p44_s420_TEI-P5.xml' mode='r' encoding='utf-8'>
4.
numeric


verse:
<l><hi rendition="#c">5.</hi></l>
file:
<_io.TextIOWrapper name='rhyme_corpus/Schmolck_Benjamin_1712_gold_p44_s420_TEI-P5.xml' mode='r' encoding='utf-8'>
5.
numeric


verse:
<l><hi rendition="#c">6.</hi></l>
file:
<_io.TextIOWrapper name='rhyme_corpus/Schmolck_Benjamin_1712_gold_p44_s420_TEI-P5.xml' mode='r' encoding='utf-8'>
6.
numeric


verse:
<l><hi rendition="#c">7.</hi></l>
file:
<_io.TextIOWrapper name='rhyme_corpus/Schmolck_Benjamin_1712_gold_p44_s420_TEI-P5.xml' mode='r' encoding='utf-8'>
7.
numeric


verse:
<l><hi rendition="#c">1.</hi></l>
file:
<_io.TextIOWrapper name='rhyme_corpus/Schmolck_Benjamin_1712_gold_p44_s420_TEI-P5.xml' mode='r' encoding='utf-8'>
1.
numeric


verse:
<l><hi rendition="#c">2.</hi></l>
file:
<_io.TextIOWrapper name='rhyme_corpus/Schmolck_Benjamin_1712_gold_p44_s420_TEI-P5



verse:
<l><hi rendition="#c">9.</hi></l>
file:
<_io.TextIOWrapper name='rhyme_corpus/Schmolck_Benjamin_1712_gold_p44_s420_TEI-P5.xml' mode='r' encoding='utf-8'>
9.
numeric


verse:
<l><hi rendition="#c">1.</hi></l>
file:
<_io.TextIOWrapper name='rhyme_corpus/Schmolck_Benjamin_1712_gold_p44_s420_TEI-P5.xml' mode='r' encoding='utf-8'>
1.
numeric


verse:
<l><hi rendition="#c">2.</hi></l>
file:
<_io.TextIOWrapper name='rhyme_corpus/Schmolck_Benjamin_1712_gold_p44_s420_TEI-P5.xml' mode='r' encoding='utf-8'>
2.
numeric


verse:
<l><hi rendition="#c">3.</hi></l>
file:
<_io.TextIOWrapper name='rhyme_corpus/Schmolck_Benjamin_1712_gold_p44_s420_TEI-P5.xml' mode='r' encoding='utf-8'>
3.
numeric


verse:
<l><hi rendition="#c">4.</hi></l>
file:
<_io.TextIOWrapper name='rhyme_corpus/Schmolck_Benjamin_1712_gold_p44_s420_TEI-P5.xml' mode='r' encoding='utf-8'>
4.
numeric


verse:
<l><hi rendition="#c">5.</hi></l>
file:
<_io.TextIOWrapper name='rhyme_corpus/Schmolck_Benjamin_1712_gold_p44_s420_TEI-P5



verse:
<l><hi rendition="#c">1.</hi></l>
file:
<_io.TextIOWrapper name='rhyme_corpus/Schmolck_Benjamin_1712_gold_p44_s420_TEI-P5.xml' mode='r' encoding='utf-8'>
1.
numeric


verse:
<l><hi rendition="#c">2.</hi></l>
file:
<_io.TextIOWrapper name='rhyme_corpus/Schmolck_Benjamin_1712_gold_p44_s420_TEI-P5.xml' mode='r' encoding='utf-8'>
2.
numeric


verse:
<l><hi rendition="#c">3.</hi></l>
file:
<_io.TextIOWrapper name='rhyme_corpus/Schmolck_Benjamin_1712_gold_p44_s420_TEI-P5.xml' mode='r' encoding='utf-8'>
3.
numeric


verse:
<l><hi rendition="#c">4.</hi></l>
file:
<_io.TextIOWrapper name='rhyme_corpus/Schmolck_Benjamin_1712_gold_p44_s420_TEI-P5.xml' mode='r' encoding='utf-8'>
4.
numeric


verse:
<l><hi rendition="#c">5.</hi></l>
file:
<_io.TextIOWrapper name='rhyme_corpus/Schmolck_Benjamin_1712_gold_p44_s420_TEI-P5.xml' mode='r' encoding='utf-8'>
5.
numeric


verse:
<l><hi rendition="#c">6.</hi></l>
file:
<_io.TextIOWrapper name='rhyme_corpus/Schmolck_Benjamin_1712_gold_p44_s420_TEI-P5



verse:
<l><hi rendition="#c">10.</hi></l>
file:
<_io.TextIOWrapper name='rhyme_corpus/Schmolck_Benjamin_1712_gold_p44_s420_TEI-P5.xml' mode='r' encoding='utf-8'>
10.
numeric


verse:
<l><hi rendition="#c">2.</hi></l>
file:
<_io.TextIOWrapper name='rhyme_corpus/Schmolck_Benjamin_1712_gold_p44_s420_TEI-P5.xml' mode='r' encoding='utf-8'>
2.
numeric


verse:
<l><hi rendition="#c">2.</hi></l>
file:
<_io.TextIOWrapper name='rhyme_corpus/Schmolck_Benjamin_1712_gold_p44_s420_TEI-P5.xml' mode='r' encoding='utf-8'>
2.
numeric


verse:
<l><hi rendition="#c">3.</hi></l>
file:
<_io.TextIOWrapper name='rhyme_corpus/Schmolck_Benjamin_1712_gold_p44_s420_TEI-P5.xml' mode='r' encoding='utf-8'>
3.
numeric


verse:
<l><hi rendition="#c">4.</hi></l>
file:
<_io.TextIOWrapper name='rhyme_corpus/Schmolck_Benjamin_1712_gold_p44_s420_TEI-P5.xml' mode='r' encoding='utf-8'>
4.
numeric


verse:
<l><hi rendition="#c">5.</hi></l>
file:
<_io.TextIOWrapper name='rhyme_corpus/Schmolck_Benjamin_1712_gold_p44_s420_TEI-

KeyboardInterrupt: 

In [22]:
rhyme_df = pd.DataFrame(list(zip(line_lst, rhyme_lst,problem_lst,f_lst)),
               columns =['text', 'rhyme','problem','file'])

In [23]:
rhyme_df.to_csv('rhyme_df.csv')

In [24]:
picklefile = open('rhyme_df.pkl', 'wb')
pickle.dump(rhyme_df[['text','rhyme']], picklefile)

In [6]:
rhyme_df

Unnamed: 0,text,rhyme,problem,file
0,"[o du dem glühend eisen donnerndfeuer, aus off...",abab,0,<_io.TextIOWrapper name='rhyme_corpus/Ramler_K...
1,"[wer zur verheerung blühender geschlechter, di...",abab,0,<_io.TextIOWrapper name='rhyme_corpus/Ramler_K...
2,"[ganz nahe war ich schon dem styxganz nahe, de...",abab,0,<_io.TextIOWrapper name='rhyme_corpus/Ramler_K...
3,"[verdammt zum spott bey bodenlosenfässern, und...",abab,0,<_io.TextIOWrapper name='rhyme_corpus/Ramler_K...
4,"[voll tapfrer brennen sah ich ihrelieder, ihr ...",abab,0,<_io.TextIOWrapper name='rhyme_corpus/Ramler_K...
...,...,...,...,...
8359,"[quält schmerz und krankheit deine glieder, ma...",abab,0,<_io.TextIOWrapper name='rhyme_corpus/Kerner_J...
8360,"[hier legt natur mit linden armen, dich an die...",abab,0,<_io.TextIOWrapper name='rhyme_corpus/Kerner_J...
8361,"[der wasser gute geister singen, hier aus krys...",abab,0,<_io.TextIOWrapper name='rhyme_corpus/Kerner_J...
8362,"[ja kranker wie ein kind ans herze, der mutter...",abab,0,<_io.TextIOWrapper name='rhyme_corpus/Kerner_J...


In [7]:
from preprocess_rhymes import rhyming_words

In [8]:
rhyming_words(rhyme_df)

Unnamed: 0,word1,word2,rhyme
0,donnerndfeuer,zerschmetternungeheuer,1
1,flammt,stammt,1
2,geschlechter,seinetöchter,1
3,gebracht,umgebracht,1
4,nahe,rasselnsahe,1
...,...,...,...
43440,bringen,vertraut,0
43441,herze,legt,0
43442,schmerze,bewegt,0
43443,durchbeben,heit,0


In [64]:

rhyme_pairs = []
no_rhyme_pairs = []

rhyme_df_clean = rhyme_df.drop(rhyme_df[rhyme_df['problem']>0].index)

for index, row in rhyme_df_clean.iterrows():
    
    scheme = row['rhyme']
    sent = row['text']
    file = row['file']
    new_pairs = []
    letters = set(scheme)
    for letter in letters:
        indices = [idx for idx, char in enumerate(scheme) if char == letter]
        
        if len(indices) >= 2: 
            new_pair = [sent[i].split()[-1] for i in indices]
            new_pairs.append(new_pair)
                
            if len(indices) > 2:
                rhyme_pairs += [list(item) for item in list(itertools.combinations(new_pair,2))]
            elif len(indices) == 2:
                rhyme_pairs.append(new_pair)

        else:
            #print(sent)
            #print(scheme)
            #print(indices)
            pass
        
    for idx in range(len(new_pairs)-1):                                          # ugly, but does the job fast
        try:
            no_rhyme_pairs.append([new_pairs[idx][0],new_pairs[idx+1][0]])
            no_rhyme_pairs.append([new_pairs[idx][1],new_pairs[idx+1][1]])
        except:
            pass
    

['oanfang sonder ende', 'du grosses a und o', 'wir küssen deine hände', 'und sind von hertzen froh', 'weil du uns noch ein jahr', 'mit segen läst beschlüssen', 'da wir bekennen müssen', 'dein thun sey wunderbar']
<_io.TextIOWrapper name='rhyme_corpus/Schmolck_Benjamin_1712_gold_p44_s420_TEI-P5.xml' mode='r' encoding='utf-8'>
['der letzte wochentag                            ist hin', 'davor ich dir verbunden bin', 'du bist mein gott das a und o', 'und deine weiheit schafft es so']
<_io.TextIOWrapper name='rhyme_corpus/Schmolck_Benjamin_1712_gold_p44_s420_TEI-P5.xml' mode='r' encoding='utf-8'>
['mein freund ein schüler von apollen', 'ein meister in der homelie herr professor e  b                                 d', 'glaubt da die tauben küssen wollen', 'aus zärtlichkeit und harmonie']
<_io.TextIOWrapper name='rhyme_corpus/Karsch_AnnaLuise_1792_gold_p7_s50_TEI-P5.xml' mode='r' encoding='utf-8'>
['so lange eine menschheit ist', 'so lange jesus bleibt der christ', 'so bleibet di das a und 

In [45]:
rhyme_stat = [1]*len(rhyme_pairs) + [0]*len(no_rhyme_pairs)
pairs = rhyme_pairs + no_rhyme_pairs

for pair in pairs:
    
    if len(pair) < 1:
        print(pair)
word_1 = [pair[0] for pair in pairs]
word_2 = [pair[1] for pair in pairs]

rhyme_word_df = pd.DataFrame(list(zip(word_1, word_2,rhyme_stat)),
               columns =['word1', 'word2','rhyme'])

In [46]:
rhyme_word_df

Unnamed: 0,word1,word2,rhyme
0,donnerndfeuer,zerschmetternungeheuer,1
1,flammt,stammt,1
2,geschlechter,seinetöchter,1
3,gebracht,umgebracht,1
4,nahe,rasselnsahe,1
...,...,...,...
43440,bringen,vertraut,0
43441,herze,legt,0
43442,schmerze,bewegt,0
43443,durchbeben,heit,0


In [61]:
stat_df = rhyme_word_df.copy()

stat_df['len word1'] = stat_df['word1'].apply(lambda x: len(x))
stat_df['len word2'] = stat_df['word2'].apply(lambda x: len(x))
pd.set_option('display.max_rows', 200)
stat_df.sort_values(by=['len word2']).head(100)

Unnamed: 0,word1,word2,rhyme,len word1,len word2
43041,keim,a,0,4,1
20259,a,a,1,1,1
20081,a,a,1,1,1
28217,jahr,o,0,4,1
42818,hus,a,0,3,1
20281,la,a,1,2,1
20156,ha,a,1,2,1
42842,gern,i,0,4,1
41848,ist,u,0,3,1
19279,du,u,1,2,1


In [17]:
shuffled_df = rhyme_word_df.sample(frac=1).reset_index(drop=True)


In [19]:
ratio = 0.8

df_train = shuffled_df[:int(len(shuffled_df)*ratio)]
df_val = shuffled_df[int(len(shuffled_df)*ratio):]

df_train.to_csv('data/train.csv',index=False)
df_val.to_csv('data/val.csv',index=False)

shuffled_df.to_csv('data/rhymes_clean.csv', index=False)

In [14]:
label_dict = {1:'y',0:'n'}

train_data_df = shuffled_df.copy()

train_data_df['rhyme'] = train_data_df['rhyme'].apply(lambda x: label_dict[x])

train_data_df.to_csv('data/rhymes_clean.tsv', index=False, sep='\t',header = None)


train_data_df

Unnamed: 0,word1,word2,rhyme
0,geblühm,dräut,n
1,gleicht,anbeut,n
2,saamen,namen,y
3,bliebe,liebe,y
4,wiedere,niedere,y
...,...,...,...
42633,geschlacht,gemacht,y
42634,rächet,schwächet,y
42635,bereit,ewigkeit,y
42636,leise,brot,n


'aber oh  du'