# Introduction

- This notebook is for visualization of acoustic change over time.
- Example data are from the TransVoices project: two YouTube vloggers who are transwomen, whose videos over a period of seven years have been transcribed as TextGrids.
- The TextGrids were force aligned using a multi-tier version of FAVE, and formant measurements were made using ifcformant (thanks to Ron Sprouse for those scripts).
- The files are "[name].multi_align.TextGrid" in the folder "multi_align" and "[name].ifc" in the folder "ifc_files".
- The .ifc and .TextGrid files are merged, then normalized by speaker and by local speech rate.
test link 

# Initialize and read in files

In [1]:
import pandas as pd
pd.options.mode.chained_assignment = None
import os
import matplotlib.pyplot as plt
import numpy as np
from audiolabel import read_label

In [2]:
# Input folders, given relative to the notebook's working directory.
# wdifc holds the .ifc formant files and wdtg holds the force-aligned TextGrids;
# every later cell builds its file paths from these two names.
wdifc = './ifc_files/'
wdtg = './multi_align/'

In [3]:
# Build one dataframe per input folder: the file name plus the speaker, date, and
# title parsed out of it. Files are named like JV_013110_Title.ifc and
# JV_013110_Title.multi_align.TextGrid.
# The listings are sorted because os.listdir() returns entries in arbitrary order.
ifcdf = pd.DataFrame(sorted(os.listdir(wdifc)), columns=['ifcname'])
code = ifcdf.ifcname.str.extract(r'^(?P<speaker>.+)_(?P<date>\d+)_(?P<title>.+)\.ifc$', expand=True)
# A file that does not fit the naming pattern parses to NaN in all three fields.
# Drop such rows: pd.merge() matches NaN keys with each other, so an unparsed
# .ifc file would otherwise be paired with an unparsed TextGrid in the next cell.
ifcdf = pd.concat([ifcdf, code], axis=1).dropna(subset=['speaker', 'date', 'title'])
# Same for the TextGrid folder
tgdf = pd.DataFrame(sorted(os.listdir(wdtg)), columns=['tgname'])
codetg = tgdf.tgname.str.extract(r'^(?P<speaker>.+)_(?P<date>\d+)_(?P<title>.+)\.multi_align\.TextGrid$', expand=True)
tgdf = pd.concat([tgdf, codetg], axis=1).dropna(subset=['speaker', 'date', 'title'])

In [4]:
# Pair every .ifc file with the TextGrid that has the same speaker, date, and title.
# The inner join keeps only recordings that are present in both folders.
code = ['speaker','date','title']
matchdf = ifcdf.merge(tgdf, how='inner', on=code)

In [5]:
# Read the 'phone' and 'word' tiers of every matched TextGrid with audiolabel.
# The result is one dataframe of phones and one of words, covering all files.
tgnames = [os.path.join(wdtg, name) for name in matchdf['tgname']]
phonedf, worddf = read_label(tgnames, 'praat', tiers=['phone','word'], addcols=['barename', 'fidx'])

In [6]:
# Split each phone label into the part before its trailing digits ('vowel') and the
# trailing stress digit ('stress'). Labels that do not end in a digit get NaN in both.
stress = phonedf.phone.str.extract(r'^(?P<vowel>.+)(?P<stress>\d+)$', expand=True)
# Add the two columns with assign() rather than concat(): re-running this cell then
# overwrites 'vowel' and 'stress' instead of appending duplicate columns.
phonedf = phonedf.assign(vowel=stress['vowel'], stress=stress['stress'])
phonedf['stress'].count() # number of phones carrying a stress digit, i.e. syllables, in phonedf

57578

# Get local speech rate from textgrid
Thanks to Geoff Bacon for this section. The first function gets the local speech rate in syllables per 20 TextGrid rows, counting every vowel that carries a stress digit (whatever its value) as one syllable. The second and third functions together get the local speech rate in syllables per 30 seconds of the TextGrid, counted the same way.

In [7]:
def make_rolling_stress_count(df, window_size):
    """
    Return a copy of `df` with a per-file moving syllable count added.

    `df` must have the columns `stress` and `fname`; `fname` identifies each
    video and is used as the grouping key. Two columns are added:

    - `binary_stress`: 1 where `stress` is non-null (the row is counted as a
      syllable), otherwise 0.
    - `rolling_stress_count`: sum of `binary_stress` over a centered window of
      `window_size` rows, computed separately within each `fname`. Rows at the
      beginning and end of a file, where the window is incomplete, are filled
      from the nearest complete window (`bfill`, then `ffill`).

    The input frame is left unmodified. In the result `fname` is the first
    column and the index is a fresh 0..n-1 range. A file with fewer than
    `window_size` rows has no complete window, so its counts stay NaN.
    """
    counted = df.copy()
    counted['binary_stress'] = counted['stress'].notnull().astype(int)

    def _centered_count(flags):
        # Only the numeric flag column is rolled, so text columns never reach sum().
        return flags.rolling(window_size, center=True).sum().bfill().ffill()

    counted['rolling_stress_count'] = (
        counted.groupby('fname')['binary_stress'].transform(_centered_count)
    )

    # Keep the layout callers already receive: 'fname' first, then every other column.
    ordered = ['fname'] + [col for col in counted.columns if col != 'fname']
    return counted[ordered].reset_index(drop=True)

In [8]:
# Row-based local speech rate: syllable count within a centered window of 20 rows
df = phonedf = make_rolling_stress_count(phonedf, 20)

In [9]:
from datetime import timedelta

def process(dataframe, offset):
    """
    Add a time-based moving syllable count to the rows of a single file.

    `dataframe` must have the columns `t1` (in seconds) and `binary_stress`
    (1 for rows counted as syllables, else 0), with rows in increasing `t1`
    order. `offset` is the window length as a `timedelta`.

    Returns a copy of `dataframe` with these columns added:

    - `t1_as_datetime`: `t1` converted to a datetime, as time-based rolling requires.
    - `rolling_count`: sum of `binary_stress` over the window of length `offset`
      that ends at each row.
    - `offset`: time elapsed since the first row.
    - `beginning`: True for rows less than `offset` after the first row.

    Rows flagged as `beginning` have only a partial window behind them, so their
    `rolling_count` is replaced by the first count computed from a full window.
    If the whole file is shorter than `offset`, `rolling_count` is NaN throughout.
    The input frame is left unmodified.
    """
    result = dataframe.copy()
    # The window follows `offset` instead of a fixed '30s', so it always agrees
    # with the cutoff used for the `beginning` flag below.
    window = pd.Timedelta(offset)

    result['t1_as_datetime'] = pd.to_datetime(result['t1'], unit='s')
    # Select the column before summing so text columns are never aggregated.
    result['rolling_count'] = (
        result.rolling(window, on='t1_as_datetime')['binary_stress'].sum()
    )

    start = result['t1_as_datetime'].iloc[0]
    result['offset'] = result['t1_as_datetime'] - start
    result['beginning'] = result['offset'] < window

    result.loc[result['beginning'], 'rolling_count'] = np.nan
    result['rolling_count'] = result['rolling_count'].bfill()
    return result

In [10]:
def make_temporal_rolling_stress_count(df, time_in_seconds):
    """
    Return `df` with a time-based moving syllable count in a new `rolling_count` column.

    `df` must have the columns `fname`, `t1`, and `binary_stress`. The rows of
    each `fname` are handed to `process` separately, together with
    `time_in_seconds` converted to a `timedelta`; the per-file results are then
    stacked in order of first appearance of each `fname`, with a fresh 0..n-1 index.
    The helper columns created by `process` (`offset`, `t1_as_datetime`,
    `beginning`) are removed before returning.
    """
    offset = timedelta(seconds=time_in_seconds)

    # Work from the `df` argument, not the notebook-level `phonedf`, and give
    # `process` a private copy of each file's rows so nothing is written to a slice.
    per_file = [
        process(file_rows.copy(), offset)
        for _, file_rows in df.groupby('fname', sort=False)
    ]
    combined = pd.concat(per_file, ignore_index=True)
    return combined.drop(['offset', 't1_as_datetime', 'beginning'], axis=1)

In [11]:
# Time-based local speech rate: syllable count within a window of 30 seconds
df = phonedf = make_temporal_rolling_stress_count(phonedf, 30)

# Merge phone and word dataframes to create full textgrid dataframe

In [12]:
# Keep a copy of each word's 't1' as 't1_w': 't1' itself is used as the merge key
# below, so the word's own start time needs a column of its own.
worddf = worddf.assign(t1_w=worddf['t1'])

# Helper that attaches word information to the phones of one TextGrid
def mergepw(p, w):
    """
    Merge the phones of one file with the words of the same file.

    `p` holds the phone rows of a single file (all sharing one `fidx`); `w` holds
    the word rows of all files. Each phone is matched to the last word whose `t1`
    is at or before the phone's `t1`. Columns present in both inputs get the
    suffix `_p` (phone) or `_w` (word).
    """
    file_index = p['fidx'].iloc[0]
    words_in_file = w.loc[w['fidx'] == file_index, ['t1', 't1_w', 't2', 'word']]
    return pd.merge_asof(p, words_in_file, on='t1', suffixes=['_p', '_w'])

In [13]:
# Attach word information to every phone, one TextGrid (fidx) at a time
fulltgdf = phonedf.groupby('fidx').apply(mergepw, w=worddf)
fulltgdf = fulltgdf.rename(columns={'t1':'t1_p'}) # 't1' was the merge key; relabel it to match 't2_p'
# Remove the '.multi_align' suffix from barename. The pattern is an explicit,
# escaped, end-anchored regex: str.replace() has treated a bare pattern as a regex
# in some pandas versions and as a literal in others, and an unescaped '.' as a
# regex matches any character.
fulltgdf = fulltgdf.assign(barename = fulltgdf.barename.str.replace(r'\.multi_align$', '', regex=True))

# Merge fulltgdf with ifc files

In [14]:
# Columns to read from the .ifc files
usecols = ['sec','rms','f0', 'f1', 'f2', 'f3', 'f4']
# Read every matched .ifc file and tag each row with the file it came from
dfs = []
for ifcname in matchdf.ifcname:
    df = pd.read_table(os.path.join(wdifc, ifcname), usecols=usecols)
    df = df.assign(ifcname=ifcname)
    dfs.append(df)
fullifcdf = pd.concat(dfs)
# Remove the '.ifc' extension to get barename. The pattern is an explicit, escaped,
# end-anchored regex: a bare '.ifc' is a regex or a literal depending on the pandas
# version, and as a regex its '.' matches any character anywhere in the name.
fullifcdf = fullifcdf.assign(barename = fullifcdf.ifcname.str.replace(r'\.ifc$', '', regex=True))

In [15]:
# Helper that attaches TextGrid labels to the formant measurements of one recording
def mergeit(i, pw):
    """
    Merge the .ifc rows of one recording with the TextGrid rows of the same recording.

    `i` holds the .ifc rows of a single recording (all sharing one `barename`);
    `pw` holds the merged phone/word rows of all recordings. Each measurement is
    matched to the last TextGrid row whose `t1_p` is at or before the
    measurement's `sec`.
    """
    recording = i['barename'].iloc[0]
    labels_in_recording = pw[pw['barename'] == recording]
    return pd.merge_asof(i, labels_in_recording, left_on='sec', right_on='t1_p')

# Merge measurements and labels recording by recording
full_df = (
    fullifcdf
    .groupby('barename')
    .apply(mergeit, pw=fulltgdf)
)

In [16]:
# Remove bookkeeping columns that are no longer needed after the merges
dropcols = ['barename_x','barename_y','fidx','fname']
full_df = full_df.drop(labels=dropcols, axis=1)

# Parse speaker, date, and title out of each row's 'ifcname' and append them as columns
sdt = full_df['ifcname'].str.extract(r'^(?P<speaker>.+)_(?P<date>\d+)_(?P<title>.+)\.ifc$', expand=True)
full_df = pd.concat([full_df, sdt], axis=1)

In [17]:
# Treat empty 'phone' cells as missing, then drop those rows.
# The results are assigned back rather than produced with inplace=True on a column
# selection, which may act on a temporary copy and leave full_df unchanged.
full_df['phone'] = full_df['phone'].replace('', np.nan)
full_df = full_df.dropna(subset=['phone'])

In [18]:
# Add an 'ipa' column by looking each phone label up in the conversion table
ph2ipa = pd.read_table('arpabet2ipa.txt', names=('phone','ipa'))
full_df = pd.merge(full_df, ph2ipa, on='phone', how='left')
# Rows whose phone has no entry in the table are discarded
full_df = full_df.dropna(subset=['ipa'])

In [19]:
# Phone duration, and duration normalized by local speech rate:
# duration divided by the time-window syllable count ('rolling_count'), times 100
full_df['duration'] = full_df['t2_p'].sub(full_df['t1_p'])
full_df['norm_duration'] = full_df['duration'].div(full_df['rolling_count']).mul(100)

In [20]:
# Parse the MMDDYY date taken from the file name; values that fail to parse become NaT
full_df['olddatetime'] = pd.to_datetime(full_df['date'], errors='coerce', format='%m%d%y')
# String version of the parsed date, used below as the per-video grouping key
full_df['datetime'] = full_df['olddatetime'].astype(str).str.strip()

# Bin videos  by time

In [21]:
# Bin edges for each video (grouped by 'datetime'): 32 edges spaced 20 seconds apart,
# i.e. up to 31 bins, starting 0.01 s before the video's first 't1_p' so that the
# first row falls inside the first bin.
# np.arange builds the edges as an array up front; the previous form added the
# scalar to a Python list, which only worked when min() returned a NumPy scalar.
binseq = full_df.groupby('datetime').apply(
            lambda y: np.arange(0, 621, 20) + y.t1_p.min() - 0.01)

In [22]:
# Relative binning: for each video (grouped by 'datetime'), cut the range of 't1_p'
# into 30 equal-width bins and label every row with its bin number (labels=False).
ebins = pd.DataFrame(full_df.groupby('datetime').apply(
    lambda q: pd.cut(q['t1_p'], bins=30, labels=False)))
ebins = ebins.rename(columns={'t1_p':'binidx'}) # every video is divided into the same number of bins (30)

# Fixed-width binning: cut each video's 't1_p' at the 20-second edges stored for that
# video in binseq (r.name is the group's 'datetime' key). Rows that fall outside the
# edges get NaN.
bins = pd.DataFrame(full_df.groupby('datetime').apply(
    lambda r: pd.cut(r['t1_p'], bins=binseq.loc[r.name], labels=False)))
bins = bins.rename(columns={'t1_p':'binidx'}) # how many bins are occupied depends on the video's length

In [23]:
# Move the 'datetime' level out of the multi-index into a regular column,
# so that what remains as the index is the row label shared with full_df
bins, ebins = bins.reset_index(level=0), ebins.reset_index(level=0)

In [24]:
# Copy both sets of bin numbers into full_df; values are aligned on the row index
full_df = full_df.assign(bins=bins['binidx'], ebins=ebins['binidx'])

In [25]:
# Sanity check: display the last rows of full_df to confirm the merged data look right
full_df.tail()
# Alternative spot checks (uncomment one and make it the last line of the cell):
# full_df[full_df.datetime=='2007-01-10']
# full_df.loc[2950:3100,['t1_p','datetime','bins','ebins']]

Unnamed: 0,sec,rms,f1,f2,f3,f4,f0,ifcname,t1_p,t2_p,...,speaker,date,title,ipa,duration,norm_duration,olddatetime,datetime,bins,ebins
1507125,436.425,2370.6,840.0,1451.4,3140.0,3924.0,160.9,JV_120517_AdamsApple.ifc,436.2838,436.4734,...,JV,120517,AdamsApple,ɑ,0.1896,0.125563,2017-12-05,2017-12-05,21,29
1507126,436.435,2041.7,962.4,1450.8,3090.6,3914.4,161.0,JV_120517_AdamsApple.ifc,436.2838,436.4734,...,JV,120517,AdamsApple,ɑ,0.1896,0.125563,2017-12-05,2017-12-05,21,29
1507127,436.445,1350.4,1043.9,1466.7,3053.3,3918.9,163.3,JV_120517_AdamsApple.ifc,436.2838,436.4734,...,JV,120517,AdamsApple,ɑ,0.1896,0.125563,2017-12-05,2017-12-05,21,29
1507128,436.455,957.7,1045.8,1477.5,3073.5,3881.9,164.6,JV_120517_AdamsApple.ifc,436.2838,436.4734,...,JV,120517,AdamsApple,ɑ,0.1896,0.125563,2017-12-05,2017-12-05,21,29
1507129,436.465,946.3,928.5,1449.1,3097.0,3847.0,88.4,JV_120517_AdamsApple.ifc,436.2838,436.4734,...,JV,120517,AdamsApple,ɑ,0.1896,0.125563,2017-12-05,2017-12-05,21,29


# Normalize vowel formants

In [26]:
# Restrict the data to rows whose IPA symbol is one of the vowels listed here
vowels = ['ɔ','ɑ','i','u','ɛ','ɪ','ʊ','ʌ','æ','ə','eɪ','aɪ','oʊ','aʊ','ɔɪ','ɚ']
vowels_df = full_df.loc[full_df['ipa'].isin(vowels)]

In [27]:
# Z-score each formant within the groups defined by normcols (here: speaker only)
normcols = ['speaker']
zscorecols = ['f1', 'f2', 'f3', 'f4']

def zscore(x):
    """Center `x` on its mean and scale it by its standard deviation."""
    return (x - x.mean()) / x.std()

# Keep just the grouping and formant columns, then transform group by group;
# the result has one row per row of vowels_df
zscored = vowels_df[normcols + zscorecols].groupby(normcols).transform(zscore)
zscored = zscored.rename(columns={col: col + '_z' for col in zscorecols})

In [28]:
# Check that zscored has the same row labels as vowels_df, in the same order,
# so the two frames can be combined column-wise without misalignment.
(zscored.index == vowels_df.index).all()

True

In [29]:
# Append the z-scored formants to the original measurements.
# Z-score columns left over from an earlier run of this cell are dropped first,
# so running the cell again replaces them instead of adding duplicate columns.
vowels_df = pd.concat(
    [vowels_df.drop(zscored.columns, axis=1, errors='ignore'), zscored],
    axis=1,
)

In [30]:
# By-speaker mean and standard deviation of every variable for each vowel.
# The aggregations are named by string: pandas maps np.mean/np.std to its own
# 'mean'/'std' today, but passing the NumPy callables is deprecated and would
# later switch std from the sample to the population formula.
groupcols = ['speaker' ,'ipa']
meanvowels_df = vowels_df.groupby(groupcols).agg(['mean', 'std'])
# Flatten the two-level column labels into names such as 'f2_mean' and 'f2_std'
meanvowels_df.columns = meanvowels_df.columns.map('_'.join)

In [31]:
# Sanity checks: mean f2 of the corner vowels, printed speaker by speaker
corner_vowels = ['u','i','æ','ɑ']
divider = '*'*40
print(meanvowels_df.loc[('GN', corner_vowels), 'f2_mean'])
print(divider)
print(meanvowels_df.loc[('JV', corner_vowels), 'f2_mean'])
print(divider)
# Mean f0 over all vowel rows of each speaker
print('GN average f0 is', vowels_df.loc[vowels_df['speaker'] == 'GN', 'f0'].mean())
print('JV average f0 is', vowels_df.loc[vowels_df['speaker'] == 'JV', 'f0'].mean())

speaker  ipa
GN       i      2360.868494
         u      1788.909956
         æ      1939.861093
         ɑ      1516.440005
Name: f2_mean, dtype: float64
****************************************
speaker  ipa
JV       i      2282.620902
         u      1803.614434
         æ      1747.518762
         ɑ      1469.379074
Name: f2_mean, dtype: float64
****************************************
GN average f0 is 143.108815909
JV average f0 is 171.36128927


# Calculate mean and stdev of vowel variables across videos and bins

In [32]:
# Mean and standard deviation of every variable at several grouping levels.
# The aggregations are named by string: passing np.mean/np.std is deprecated in
# pandas and would later switch std from the sample to the population formula.
# After each aggregation the two-level column labels are flattened to names
# such as 'f1_mean' and 'f1_std'.

# Per video
video_mean = vowels_df.groupby(['datetime','speaker']).agg(['mean', 'std'])
video_mean.columns = video_mean.columns.map('_'.join)
# Drop rows in which every statistic is NaN (no usable tokens for that group)
video_mean = video_mean.dropna(axis=0, how='all')

# Per video and vowel
video_mean_sepvowels = vowels_df.groupby(['datetime','speaker','ipa']).agg(['mean', 'std'])
video_mean_sepvowels.columns = video_mean_sepvowels.columns.map('_'.join)

# Per video and 20-second bin
bin_mean = vowels_df.groupby(['datetime','speaker','bins']).agg(['mean', 'std'])
bin_mean.columns = bin_mean.columns.map('_'.join)

# Per video and equal-width bin (30 per video)
equal_bin_mean = vowels_df.groupby(['datetime','speaker','ebins']).agg(['mean', 'std'])
equal_bin_mean.columns = equal_bin_mean.columns.map('_'.join)

# Per video and word start time
tword_mean = vowels_df.groupby(['datetime','speaker','t1_w']).agg(['mean', 'std'])
tword_mean.columns = tword_mean.columns.map('_'.join)

## Do the same, but for only sibilants (s, z, sh, zh)

In [33]:
# Restrict the data to rows whose IPA symbol is one of the four sibilants
sibilants = ['s','ʃ','z','ʒ']
sibilants_df = full_df.loc[full_df['ipa'].isin(sibilants)]

In [34]:
# Mean and standard deviation of every variable for the sibilants, always split by
# sibilant ('ipa'). The aggregations are named by string: passing np.mean/np.std is
# deprecated in pandas and would later switch std from the sample to the
# population formula. Column labels are flattened to names such as 'f1_mean'.

# Per video
s_video_mean = sibilants_df.groupby(['datetime','speaker','ipa']).agg(['mean', 'std'])
s_video_mean.columns = s_video_mean.columns.map('_'.join)

# Per video and 20-second bin
s_bin_mean = sibilants_df.groupby(['datetime','speaker','bins','ipa']).agg(['mean', 'std'])
s_bin_mean.columns = s_bin_mean.columns.map('_'.join)

# Per video and equal-width bin (30 per video)
s_equal_bin_mean = sibilants_df.groupby(['datetime','speaker','ebins','ipa']).agg(['mean', 'std'])
s_equal_bin_mean.columns = s_equal_bin_mean.columns.map('_'.join)

# Per speaker, over all videos
groupcols = ['speaker' ,'ipa']
meansibilants_df = sibilants_df.groupby(groupcols).agg(['mean', 'std'])
meansibilants_df.columns = meansibilants_df.columns.map('_'.join)

# Export dataframes as csv files

In [35]:
# Export the vowel summary dataframes as UTF-8 CSV files in the working directory
# (the stray trailing comma after the equalbinmean export has been removed)
tword_mean.to_csv('twordmean.csv', encoding='utf-8')
equal_bin_mean.to_csv('equalbinmean.csv', encoding='utf-8')
bin_mean.to_csv('binmean.csv', encoding='utf-8')
video_mean.to_csv('videomean.csv', encoding='utf-8')
video_mean_sepvowels.to_csv('videomean_sepvowels.csv', encoding='utf-8')
meanvowels_df.to_csv('meanvowels.csv', encoding='utf-8')

In [36]:
# Export the sibilant dataframes as UTF-8 CSV files in the working directory
# (the stray trailing comma after the s_equalbinmean export has been removed)
s_equal_bin_mean.to_csv('s_equalbinmean.csv', encoding='utf-8')
s_bin_mean.to_csv('s_binmean.csv', encoding='utf-8')
s_video_mean.to_csv('s_videomean.csv', encoding='utf-8')
meansibilants_df.to_csv('meansibilants_df.csv', encoding='utf-8')
sibilants_df.to_csv('sibilants_df.csv', encoding='utf-8')

In [37]:
# Export the two largest dataframes as UTF-8 CSV files.
# These writes take the longest; the cell can be skipped if the files are not needed.
vowels_df.to_csv(path_or_buf='vowels_df.csv', encoding='utf-8')
full_df.to_csv(path_or_buf='full_df.csv', encoding='utf-8')