### Bibliotheken importieren

In [23]:
import pandas as pd
import string
from g2p_en import G2p
from pyphonetics import Soundex

# pip install g2p_en <- erforderlich für die Phonemtranskribierung analog zur TwistList
# pip install pyphonetics <- Soundex

### Datenimport

In [24]:
# Read the raw CSV exports into DataFrames with explicit column names.
#
# header=0 makes pandas consume the first line of each file as the header and
# replace it with `names`. This is the same line that used to be read as a data
# row (header=None) and then removed with drop(0); it is now discarded at parse
# time, so the frames are indexed from 0 and no separate drop step is needed.
# nrows is one lower than before because the first line no longer counts as a
# data row, i.e. the same data rows are kept.
#
# Forward slashes are used in the paths so they resolve on Windows as well as
# on POSIX systems (backslash-separated paths only work on Windows).
cols_twistlist = ['text', 'phonemes', 'keywords', 'source']
twistList = pd.read_csv(
    'data source 2/raw-data/raw-data/twister_dataset.csv',
    delimiter=',',
    header=0,
    names=cols_twistlist,
    nrows=2999,
)

# Only the 'text' column of the Kaggle file is loaded; '0' merely names the
# file's other column so that `usecols` can select 'text' by name.
cols_kaggle = ['0', 'text']
kaggle = pd.read_csv(
    'data source 2/raw-data/raw-data/datasetKaggle.csv',
    delimiter=',',
    header=0,
    names=cols_kaggle,
    usecols=['text'],
    nrows=999,
)

### Datenzusammenführung

In [25]:
# Trim both sources down to the columns that are used from here on.
dfTL = twistList.drop(columns=['keywords', 'source'])
dfKaggle = kaggle.copy()

# Select the Kaggle tongue twisters whose text has no exact match among the
# TwistList texts. Despite its name, `duplic` holds the NON-duplicate rows.
already_in_twistlist = dfKaggle['text'].isin(dfTL['text'])
duplic = dfKaggle[~already_in_twistlist]

# Stack the TwistList rows and the remaining Kaggle rows into one frame with a
# freshly numbered index.
finDF = pd.concat([dfTL, duplic], ignore_index=True)


### Phonem- und Soundex-Spalten hinzufügen

In [26]:
# SOUNDEX
# Encode every word of every tongue twister with Soundex and store the codes
# of each text as a list in the new 'soundex' column.

soundex = Soundex()
sx_list = []

for text in finDF['text']:
    # split() without an argument cuts at any run of whitespace, so words
    # separated by tabs, line breaks or repeated spaces are encoded one by one
    # (split(' ') only cut at single spaces and left such words fused).
    # It also makes a preceding strip() unnecessary.
    words = text.split()
    # Only tokens that contain at least one letter are encoded; tokens made up
    # solely of digits or punctuation are skipped.
    # Note (from the original author): punctuation is ignored by the Soundex
    # encoding, so it is not removed from the words beforehand.
    word_sx_list = [
        soundex.phonetics(word)
        for word in words
        if any(char.isalpha() for char in word)
    ]
    sx_list.append(word_sx_list)

# Attach the per-text code lists as a new column.
finDF['soundex'] = sx_list

In [27]:
# PHONEMES
# Transcribe every word of every tongue twister with g2p_en and store the
# per-word results of each text as a list in the 'phonemes' column.

g2p = G2p()
pn_list = []

punctuation_chars = string.punctuation  # string containing all ASCII punctuation characters
# Translation table that deletes every punctuation character in one pass.
strip_punctuation = str.maketrans('', '', punctuation_chars)

for text in finDF['text']:
    # Remove all punctuation before transcription.
    text = text.translate(strip_punctuation)
    # split() without an argument cuts at any run of whitespace and never
    # yields empty tokens. With split(' '), repeated spaces and stand-alone
    # punctuation (which is empty once stripped) produced empty-string tokens
    # that were passed to g2p and got their own entry in the phoneme list.
    words = text.split()
    # Each word is transcribed on its own so the result keeps one entry per word.
    text_phonemes = [g2p(word) for word in words]
    pn_list.append(text_phonemes)

# Store the list as the 'phonemes' column of the DataFrame (this replaces the
# values that column held before).
finDF['phonemes'] = pn_list

### Finales Dataframe speichern

In [28]:
# Show the combined DataFrame (columns: text, phonemes, soundex) as the cell output.
finDF

Unnamed: 0,text,phonemes,soundex
0,Peter Piper picked a peck of pickled peppers. ...,"[[P, IY1, T, ER0], [P, AY1, P, ER0], [P, IH1, ...","[P360, P160, P230, A000, P200, O100, P243, P16..."
1,I saw Susie sitting in a shoe shine shop. Wher...,"[[AY1], [S, AO1], [S, UW1, Z, IY0], [S, IH1, T...","[I000, S000, S200, S352, I500, A000, S000, S50..."
2,How many boards Could the Mongols hoard If the...,"[[HH, AW1], [M, EH1, N, IY0], [B, AO1, R, D, Z...","[H000, M500, B632, C430, T000, M524, H630, I10..."
3,How can a clam cram in a clean cream can?,"[[HH, AW1], [K, AE1, N], [AH0], [K, L, AE1, M]...","[H000, C500, A000, C450, C650, I500, A000, C45..."
4,Send toast to ten tense stout saints' ten tall...,"[[S, EH1, N, D], [T, OW1, S, T], [T, UW1], [T,...","[S530, T230, T000, T500, T520, S330, S532, T50..."
...,...,...,...
2671,Then step up mister and twist your tongue,"[[DH, EH1, N], [S, T, EH1, P], [AH1, P], [M, I...","[T500, S310, U100, M236, A530, T230, Y600, T520]"
2672,Now Kissle will whistle at busty Miss. Russell...,"[[N, AW1], [K, IH1, S, AH0, L], [W, IH1, L], [...","[N000, K240, W400, W234, A300, B230, M200, R24..."
2673,"Purple paper people, purple paper people, purp...","[[P, ER1, P, AH0, L], [P, EY1, P, ER0], [P, IY...","[P614, P160, P140, P614, P160, P140, P614, P16..."
2674,De doorgaans dappere Durgerdammer drukker Dirk...,"[[D, IY1], [D, AO1, R, G, AH0, N, Z], [D, AE1,...","[D000, D625, D160, D626, D626, D620, D636, D00..."


In [29]:
# Write the DataFrame to a CSV file, leaving out the row index.
filename = 'tt_dataset.csv'
finDF.to_csv(filename, index=False)