### Examine phoneme proportions in words of specific length, using a training database
Actual exploration starts after defining the sort_dict_val(dict_value_list) function.
Each cell has only one function (usually in this order): 
 1. either it defines a function (usually in order to create or sort a dictionary)
 2. or it creates lists that can be used to make nice graphs
 3. or it creates graphs from the above created lists

In [1]:
import os
import glob
import re
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import speech_rate
from scipy.stats.stats import pearsonr
from collections import defaultdict

In [2]:
# This is the link to the data on which you want to explore
# You need it in the two cells defining some sort of build...dictionary, in the line starting with: os.chdir
training_db = "C:/Users/alexutza_a/Abschlussarbeit/DB_Verbmobil/Evaluation/Training"

In [3]:
# Creates a dict of phoneme proportions in words of specific lengths (as no of phonemes) - w_dur in msec.
# Looks like: {"a" : {1pho_w : [w_dur1, pho_prop1, pho_dur1, w_dur2, pho_prop2, ...], 2pho_w : [...], ...}, 
#              "b" : {1pho_w : [w_dur1, pho_prop1, pho_dur1, w_dur2, pho_prop2, ...], 2pho_w: [...], ...}, 
#               ...}
def build_simple_dict():
    simple_phoprop_dict = defaultdict(dict) # the dict to be returned
    simple_prop_dict = defaultdict(list)   # the value dict
    os.chdir(training_db)
    
    #Iterate over the training files
    for datei in glob.glob("*.par"):
        work_file = open(datei)
        for line in work_file:
            if re.match("MAU", line):
                w_dur, phon_count, syl_count = speech_rate.word_duration(datei, int(line.split()[3]))
                key = str(phon_count)+"pho_w"
                simple_prop_dict[key].append(round(w_dur*0.0625, 2))                 # w_dur used in msec.
                simple_prop_dict[key].append(round(int(line.split()[2])/w_dur, 3))   # calc pho prop as pho_dur/w_dur
                #simple_prop_dict[key].append(round(int(line.split()[2])*0.0625), 3)
                if key in simple_phoprop_dict[line.split()[4]].keys():
                    simple_phoprop_dict[line.split()[4]][key] += simple_prop_dict[key]
                else: 
                    simple_phoprop_dict[line.split()[4]][key] = simple_prop_dict[key]
                simple_prop_dict.clear()
        work_file.close()
    return simple_phoprop_dict

In [4]:
# Example call of the simple phoneme proportions dict
%time print(build_simple_dict()["OY"])

{'16pho_w': [1009.0, 0.129, 1019.0, 0.128], '3pho_w': [329.81, 0.637, 299.81, 0.533, 249.81, 0.6], '13pho_w': [709.19, 0.183], '5pho_w': [429.69, 0.372, 359.69, 0.417, 239.69, 0.417], '4pho_w': [519.75, 0.5]}
Wall time: 7.72 s


In [12]:
# Help function for getting name and duration of other long pho in word (vowels & nasals & /x/ & /s/)
corr_phonemes = ["a", "a~", "e", "E", "I", "i", "O", "o", "U", "u", "Y", "y", "9", "2", "a:", "a~:", "e:", "E:", "i:",
                 "o:", "u:", "y:", "2:", "OY", "aU", "aI", "@", "6", "m", "n", "N", "x", "s"]
def get_longP(datei, word_no, zeile):
    work_file = open(datei)
    long_Plist = []
    z = 0
    for line in work_file:
        if re.match("MAU", line) and word_no == int(line.split()[3]):
            if (line.split()[4] in corr_phonemes) and (z != zeile):
                w_dur, phon_count, syl_count = speech_rate.word_duration(datei, int(line.split()[3]))
                #print("z: " + str(z))
                #print("zeile: " + str(zeile))
                long_Plist.append(line.split()[4])
                long_Plist.append(round(int(line.split()[2])/w_dur, 3))
            elif z == zeile:
                continue
        z += 1
    work_file.close()
    return long_Plist

In [13]:
# Creates a dict of phoneme proportions in words of specific lengths (as no of phonemes) - w_dur in msec.
# It considers also the presence of other correlating phonemes in word
# Looks like: {"a" : {1pho_w : [filename, word_nr, 0, [], w_dur2, pho_dur2, ...], 
#                     2pho_w : [w_dur1, pho_dur1, no._other_long_pho, [otherPho1, prop_otherPho1, ...], w_dur2, pho_dur2, [...], ...}, 
#              "b" : {1pho_w : [w_dur1, pho_dur1, 0, [], w_dur2, pho_dur2, ...], 2pho_w: [...], ...}, 
#               ...}
def build_complex_dict():
    simple_phoprop_dict = defaultdict(dict) # the dict to be returned
    simple_prop_dict = defaultdict(list)   # the value dict
    
    os.chdir(training_db)
    
    #Iterate over the training files
    for datei in glob.glob("*.par"):
        work_file = open(datei)
        zeile = 0                              # a row counter; counting starts at beginning of file
        for line in work_file:
            if re.match("MAU", line):
                w_dur, phon_count, syl_count = speech_rate.word_duration(datei, int(line.split()[3]))
                key = str(phon_count)+"pho_w"
                simple_prop_dict[key].append(str(datei)[-20:])       # file name 
                simple_prop_dict[key].append(int(line.split()[3]))   # word number
                simple_prop_dict[key].append(str(line.split()[4]))   # phoneme
                simple_prop_dict[key].append(round(w_dur*0.0625, 2))                 # w_dur used in msec.
                simple_prop_dict[key].append(round(int(line.split()[2])/w_dur, 3))   # calc pho prop as pho_dur/w_dur
                simple_prop_dict[key].append(get_longP(datei, int(line.split()[3]), zeile))  # list of other pho in word
                if key in simple_phoprop_dict[line.split()[4]].keys():
                    simple_phoprop_dict[line.split()[4]][key] += simple_prop_dict[key]
                else: 
                    simple_phoprop_dict[line.split()[4]][key] = simple_prop_dict[key]
                simple_prop_dict.clear()
            zeile += 1
        work_file.close()
    return simple_phoprop_dict

In [14]:
# Example call of the complex phoneme proportions dict
%time print(build_complex_dict()["OY"])

{'16pho_w': ['g376acn2_009_AKX.par', 28, 'OY', 1009.0, 0.129, ['n', 0.059], 'g376acn2_069_AKX.par', 13, 'OY', 1019.0, 0.128, ['n', 0.039]], '3pho_w': ['g376acn2_030_AKX.par', 34, 'OY', 329.81, 0.637, ['n', 0.182], 'g376acn2_035_AKX.par', 15, 'OY', 299.81, 0.533, ['n', 0.133], 'g378acn2_028_AKX.par', 5, 'OY', 249.81, 0.6, ['n', 0.24]], '13pho_w': ['g376acn2_041_AKX.par', 3, 'OY', 709.19, 0.183, ['n', 0.085]], '5pho_w': ['g378acn2_014_AKX.par', 29, 'OY', 429.69, 0.372, ['n', 0.093], 'g378acn2_102_AKX.par', 0, 'OY', 359.69, 0.417, ['n', 0.139], 'g378acn2_102_AKX.par', 3, 'OY', 239.69, 0.417, ['n', 0.208]], '4pho_w': ['g379acn2_106_AKX.par', 1, 'OY', 519.75, 0.5, []]}
Wall time: 19.1 s


In [15]:
# Build the simple phoneme proportions dictionary
%time simple_pprop_dict = build_simple_dict()

Wall time: 7.18 s


In [16]:
# Bild the complex phoneme proportions dictionary
%time complex_pprop_dict = build_complex_dict()

Wall time: 17.9 s


In [17]:
from operator import itemgetter

# sort needed dict_list from the simple (!) dict
def sort_dict_val(dict_value_list):
#    dict_value_list = simple_pprop_dict["a:"]["2pho_w"]
    l_gr = [dict_value_list[i:i+2] for i in range(0, len(dict_value_list),2)] # transform list in list of 2-elem-lists
    l_sorted = sorted(l_gr,key=itemgetter(0)) # sort list of of 2-elem-lists
    org_simpleDict_val = []
    for i in range(len(l_sorted)):    # flatten sorted list
        org_simpleDict_val += l_sorted[i]
    return org_simpleDict_val
#print(sort_dict_val(simple_pprop_dict["a:"]["2pho_w"])[:14])

In [19]:
# Test data for plot /a:/-durations in 2pho_w against w_dur
simple_wdur_list = sort_dict_val(complex_pprop_dict["a:"]["2pho_w"])[0::4]
#simple_wdur_list = [ round(x/1000, 3) for x in simple_wdur_list]     #transform w_dur in sec (from msec)
simple_pdur_list = sort_dict_val(complex_pprop_dict["a:"]["2pho_w"])[1::4]
print(len(simple_pdur_list))
print(simple_pdur_list[:5])
print(simple_wdur_list[:5])

TypeError: '<' not supported between instances of 'float' and 'str'

In [None]:
# Test data for plot /a:/-proportions in 2pho_w against w_dur
simple_dur_list = sort_dict_val(simple_pprop_dict["a:"]["2pho_w"])[::2]
simple_dur_list = [ round(x/1000, 3) for x in simple_dur_list]     #transform w_dur in sec (from msec)
simple_prop_list = sort_dict_val(simple_pprop_dict["a:"]["2pho_w"])[1::2]
print(len(simple_dur_list))

In [None]:
# This Graph shows that the duration of /a:/ in 2-phoneme-words correlates very well with word duration
x = range(100)
plt.plot(x, simple_wdur_list, label = "word duration (ms)")
plt.plot(x, simple_pdur_list, label = "duration of /a:/ (ms)")
plt.title("Duration of /a:/ vs. increasing duration of 2-phon-long words")
plt.legend()
plt.show()

In [None]:
# This scatter plot confirms the result of the previous plot
print("Pearson correlation coefficient for /a:/ in 2-phoneme-long words:")
print(pearsonr(simple_pdur_list, simple_wdur_list)[0])
m, b = np.polyfit(simple_pdur_list, simple_wdur_list,1) # elem. of the regression function
plt.scatter(simple_pdur_list, simple_wdur_list, color = "darkorange")
plt.plot((np.unique(simple_pdur_list)), (m*(np.unique(simple_pdur_list)) + b), "-", color="k") # plot regression line
plt.xlabel("Duration of /a:/ in msec")
plt.ylabel("Word duration in msec")
plt.show()

In [None]:
# This Graph shows that the proportion of /a:/ in 2-phoneme-words doesn't correlate with word duration
x = range(100)
plt.plot(x, simple_dur_list, label = "word duration (s)")
plt.plot(x, simple_prop_list, label = "proportion of /a:/")
plt.legend()
plt.show()

In [None]:
# This scatter plot confirms the result of the previous plot
print(pearsonr(simple_prop_list, simple_dur_list))
plt.scatter(simple_prop_list, simple_dur_list, color = "darkorange")
plt.show()

In [None]:
# Would suggest it makes sense to use only the interquartile area, or even just the mean/median value
plt.boxplot(simple_prop_list, labels = ["a:"], showmeans=True)
plt.show()
print("Median: " + str(np.median(simple_prop_list)))
print("Mean: " + str(np.mean(simple_prop_list)))
Q1 = np.percentile(simple_prop_list, 25)
Q3 = np.percentile(simple_prop_list, 75)
print("Q1: " + str(Q1) + "   Q3: " + str(Q3))

In [None]:
# This plot confirms the intuition, that the percent occupied by /a:/ decreases with length of word (as # of phonemes)
# Boxplot with /a:/-proportions overview on all word legths
prop_lists = []

# sort data by increasing word length
key_list = sorted(simple_pprop_dict["a:"].keys())
first_3 = key_list[:3]
key_list = key_list[3:] + first_3

# generate list of lists for plotting
for el in key_list:
    prop_lists.append(simple_pprop_dict["a:"][el][1::2])

# create a list of count/word-length
count_list = []
for el in prop_lists:
    count_list.append(len(el))

# actually plot data
plt.figure(figsize=(25, 17))
plt.boxplot(prop_lists, labels = key_list, showmeans=True)
plt.title("Proportion variation of /a:/ at different word lengths")
plt.show()
print(count_list)
# print some boxplot relevant info (median, mean, Q1, Q3)
for el in prop_lists:   
    print(key_list[prop_lists.index(el)] + ":")
    Q1 = np.percentile(el, 25)
    Q3 = np.percentile(el, 75)
    print("  Median: " + str(round(np.median(el), 3)) + " Mean: " + str(round(np.mean(el), 3)) + 
          " Q1: " + str(round(Q1, 3)) + "   Q3: " + str(round(Q3, 3)))
   

In [None]:
# Test data for plot /a:/-durations in 3pho_w against w_dur
simple_w3dur_list = sort_dict_val(simple_ddict["a:"]["3pho_w"])[::2]
#simple_wdur_list = [ round(x/1000, 3) for x in simple_wdur_list]     #transform w_dur in sec (from msec)
simple_p3dur_list = sort_dict_val(simple_ddict["a:"]["3pho_w"])[1::2]
#print(len(simple_p3dur_list))
print(simple_pdur_list[:5])
print(simple_wdur_list[:5])

In [None]:
# This scatter plot confirms the result of the previous plot
print(pearsonr(simple_p3dur_list, simple_w3dur_list)[0])
m, b = np.polyfit(simple_p3dur_list, simple_w3dur_list,1) # elem. of the regression function
plt.scatter(simple_p3dur_list, simple_w3dur_list, color = "darkorange")
plt.plot((np.unique(simple_p3dur_list)), (m*(np.unique(simple_p3dur_list)) + b), "-", color="k") # plot regression line
plt.xlabel("Duration of /a:/ in msec")
plt.ylabel("Word duration in msec")
plt.show()

In [None]:
from operator import itemgetter
# sort needed dict_list from the complex dict
complex_dict_la2p = complex_pprop_dict["a:"]["2pho_w"]
l_gr = [complex_dict_la2p[i:i+6] for i in range(0, len(complex_dict_la2p),6)] # transform list in list of 2-elem-lists
l_sorted = sorted(l_gr,key=itemgetter(3)) # sort list of of 2-elem-lists
org_complexDict_la2p = []
for i in range(len(l_sorted)):    # flatten sorted list
    org_complexDict_la2p += l_sorted[i]
print(org_complexDict_la2p[:30])