# Melodic distributions

Date: September 2020.

Author: Benjamin LeBrun.

In this notebook, we assess goodness-of-fit to the regional distributions of melodic bigrams. To do so, we first look at goodness-of-fit $p$-values obtained from a semi-parametric bootstrap (see `bootstrap.py`). Then, for each fit, if $p > 0.1$, we compare it to all other alternative distributions for the region above the corresponding $\hat{x}_{min}$.

In [152]:
import pandas as pd
import numpy as np
import os
import powerlaw as pl
from IPython.utils import io
from tqdm.notebook import tqdm

In [153]:
from distributions import Powerlaw, Exponential, Lognormal, Stretched_exponential, Powerlaw_with_cutoff

In [154]:
# Directory prefix for the per-region rank files (read below as data_source + 'pitch_ranks_<region>.csv').
data_source = '../data/melody/'
# Directory prefix for the bootstrap result files (read below as boot_source + 'pitch_<dist>_<region>.csv').
boot_source = 'bootstrap/melody/'

## 1. Goodness-of-fit $p$-values and parameter estimates

In [160]:
# Lookup table: distribution name (as used in file names and column labels)
# -> class that performs the fit.
dist_dict = {
    'powerlaw': Powerlaw,
    'exponential': Exponential,
    'lognormal': Lognormal,
    'stretched_exponential': Stretched_exponential,
    'truncated_powerlaw': Powerlaw_with_cutoff,
}

In [161]:
regions = ['global', 'africa', 'asia', 'europe', 'oceania', 'north_america', 'south_america',
           'middle_america_and_the_caribbean', 'middle_east']

# One row per region, one column per candidate distribution.
p_values = pd.DataFrame(index=regions)
x_mins = pd.DataFrame(index=regions)
parameters = pd.DataFrame(index=regions)

for dist in tqdm(['powerlaw', 'exponential', 'lognormal', 'stretched_exponential', 'truncated_powerlaw']):
    dist_ps, dist_xmins, dist_params = [], [], []
    for region in regions:
        try:
            bootstrap = np.array(pd.read_csv(boot_source+'pitch_'+dist+'_'+region+'.csv').D)
            data = pd.read_csv(data_source+'pitch_ranks_'+region+'.csv')
            fit = dist_dict[dist](data.ranks)
            KS = fit.D
            # p-value: share of bootstrap D statistics at least as large as the observed one
            p = len(bootstrap[bootstrap >= KS])/len(bootstrap)
            # Build the whole row before appending anything, so that a failure
            # part-way through cannot leave the three lists with different lengths.
            row = (p, fit.xmin, fit.get_parameters())
        except Exception as e:
            # Best effort: report the failure and record a missing value for this
            # (distribution, region) pair so the remaining fits still run.
            print(e, dist, region)
            row = (np.nan, np.nan, np.nan)
        dist_ps.append(row[0])
        dist_xmins.append(row[1])
        dist_params.append(row[2])
    p_values[dist] = dist_ps
    x_mins[dist] = dist_xmins
    parameters[dist] = dist_params

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))

Calculating best minimal value for power law fit
Calculating best minimal value for power law fit
Calculating best minimal value for power law fit
Calculating best minimal value for power law fit
Calculating best minimal value for power law fit
Calculating best minimal value for power law fit
Calculating best minimal value for power law fit
Calculating best minimal value for power law fit
Calculating best minimal value for power law fit
  return likelihoods/norm





In [162]:
p_values

Unnamed: 0,powerlaw,exponential,lognormal,stretched_exponential,truncated_powerlaw
global,0.0,0.0,0.007,0.0,0.0
africa,0.0,0.004,0.037,0.141,0.0
asia,0.0,0.0,0.0,0.217,0.0
europe,0.0,0.0,0.013,0.021,0.0
oceania,0.0,0.0,0.0,0.0,0.0
north_america,0.0,0.017,0.0,0.0,0.0
south_america,0.0,0.131,0.0,0.0,0.0
middle_america_and_the_caribbean,0.68,0.942,0.866,0.812,0.94
middle_east,0.0,0.413,0.686,0.682,0.41


In [163]:
x_mins

Unnamed: 0,powerlaw,exponential,lognormal,stretched_exponential,truncated_powerlaw
global,15.0,20.0,19.0,7.0,2.0
africa,16.0,15.0,13.0,11.0,15.0
asia,15.0,10.0,9.0,6.0,1.0
europe,3.0,1.0,2.0,6.0,2.0
oceania,8.0,6.0,4.0,2.0,7.0
north_america,1.0,2.0,2.0,2.0,4.0
south_america,1.0,20.0,2.0,3.0,2.0
middle_america_and_the_caribbean,15.0,15.0,15.0,15.0,15.0
middle_east,10.0,4.0,8.0,4.0,10.0


In [164]:
parameters

Unnamed: 0,powerlaw,exponential,lognormal,stretched_exponential,truncated_powerlaw
global,[6.529766354791297],[0.726211847356715],"[2.9145764980969777, 0.10394131858896394]","[0.08937204064364583, 2.1854766479209937]","[1.0000000666108444, 0.084847860267088]"
africa,[7.676337840340027],[0.36014174352553174],"[2.5866227318097454, 0.24511635411001792]","[0.08287682205380002, 2.5422462444276146]","[1.0000001357065975, 0.30901746016027387]"
asia,[6.866667325823124],[0.26386259508990373],"[2.3355008186573687, 0.3409829622139612]","[0.10756525900759636, 1.8956769474470874]","[1.0000000038635253, 0.07218689970439063]"
europe,[2.305867136118786],[0.2868705426744481],"[0.9451727314189641, 0.920879595039982]","[0.11214917727736515, 1.8173959594917939]","[1.0000013198365116, 0.13612391782847777]"
oceania,[3.4002574161006294],[0.21323578738293184],"[1.9142119542499185, 0.5652633580900399]","[0.17207378580654142, 1.142892502071481]","[1.000001335997875, 0.14831707649572437]"
north_america,[1.691190157072878],[0.22607782659911627],"[1.423172518462089, 0.766974067742713]","[0.21089608672787397, 1.0724267918208166]","[1.0000003917133218, 0.1309112499490951]"
south_america,[1.6270820609963186],[1.2465686801375773],"[1.5463579057596548, 0.9188702315685963]","[0.11379573012710156, 1.4782907549909226]","[1.000000086510513, 0.06701122066523596]"
middle_america_and_the_caribbean,[8.62229488039252],[0.4778949982722869],"[2.4521336080917555, 0.23894780477517594]","[0.12353391616432907, 1.9592314845147787]","[1.0000079338048573, 0.4236339563353775]"
middle_east,[6.23662052838434],[0.33507453171364976],"[2.120560410779368, 0.2843135367060978]","[0.2215097684793183, 1.3679558465257278]","[1.0000087510160416, 0.4053752532913337]"


Before proceeding, we must note that there are cases in which the estimates of $x_{min}$ do not account for a sufficient amount of the data. For instance, in `middle_america_and_the_caribbean`, $\hat{x}_{min}$ accounts for less than $3\%$ of the data. Therefore, despite achieving a statistically significant value of $p$, we did not consider these fits valid and instead chose as our estimate of $r_{min}$ the value corresponding to the next lowest value of $D$. We will correct these fits now.

In [165]:
def correct_fit(dist, data, xmin, bootfile):
    """Refit ``dist`` at a fixed ``xmin`` and recompute its bootstrap p-value.

    Parameters
    ----------
    dist : str
        Key into ``dist_dict`` selecting the distribution class to fit.
    data : str
        Location of a CSV file (anything ``pd.read_csv`` accepts) that has a
        ``ranks`` column.
    xmin : number
        Value forwarded to the distribution class as its ``xmin`` argument.
    bootfile : str
        Location of a CSV file that has a ``D`` column of bootstrap statistics.

    Returns
    -------
    tuple
        ``(p, fit)`` where ``p`` is the fraction of bootstrap ``D`` values that
        are greater than or equal to ``fit.D`` and ``fit`` is the fitted object.
    """
    boot_stats = np.array(pd.read_csv(bootfile).D)
    observations = pd.read_csv(data)
    fit = dist_dict[dist](observations.ranks, xmin=xmin)
    observed_stat = fit.D
    at_least_as_extreme = boot_stats[boot_stats >= observed_stat]
    p = len(at_least_as_extreme) / len(boot_stats)
    return p, fit

In [166]:
# Manual correction: refit at a hand-picked xmin. Paths are built from the
# config constants so they stay in step with data_source / boot_source.
region, dist, xmin = 'global', 'exponential', 3
p, fit = correct_fit(dist, data_source+'pitch_ranks_'+region+'.csv',
                     xmin, boot_source+'pitch_'+dist+'_'+region+'_xmin'+str(xmin)+'.csv')
p_values.at[region, dist] = p
x_mins.at[region, dist] = xmin
parameters.at[region, dist] = fit.get_parameters()

  (Theoretical_CDF * (1 - Theoretical_CDF))


In [167]:
# Manual correction: refit at a hand-picked xmin. Paths are built from the
# config constants so they stay in step with data_source / boot_source.
region, dist, xmin = 'south_america', 'exponential', 3
p, fit = correct_fit(dist, data_source+'pitch_ranks_'+region+'.csv',
                     xmin, boot_source+'pitch_'+dist+'_'+region+'_xmin'+str(xmin)+'.csv')
p_values.at[region, dist] = p
x_mins.at[region, dist] = xmin
parameters.at[region, dist] = fit.get_parameters()

In [168]:
# Manual correction: refit at a hand-picked xmin. Paths are built from the
# config constants so they stay in step with data_source / boot_source.
region, dist, xmin = 'middle_america_and_the_caribbean', 'stretched_exponential', 3
p, fit = correct_fit(dist, data_source+'pitch_ranks_'+region+'.csv',
                     xmin, boot_source+'pitch_'+dist+'_'+region+'_xmin'+str(xmin)+'.csv')
p_values.at[region, dist] = p
x_mins.at[region, dist] = xmin
parameters.at[region, dist] = fit.get_parameters()

In [169]:
# Manual correction: refit at a hand-picked xmin. Paths are built from the
# config constants so they stay in step with data_source / boot_source.
region, dist, xmin = 'middle_america_and_the_caribbean', 'lognormal', 3
p, fit = correct_fit(dist, data_source+'pitch_ranks_'+region+'.csv',
                     xmin, boot_source+'pitch_'+dist+'_'+region+'_xmin'+str(xmin)+'.csv')
p_values.at[region, dist] = p
x_mins.at[region, dist] = xmin
parameters.at[region, dist] = fit.get_parameters()

In [174]:
# Manual correction: refit at a hand-picked xmin. Paths are built from the
# config constants so they stay in step with data_source / boot_source.
region, dist, xmin = 'middle_america_and_the_caribbean', 'exponential', 3
p, fit = correct_fit(dist, data_source+'pitch_ranks_'+region+'.csv',
                     xmin, boot_source+'pitch_'+dist+'_'+region+'_xmin'+str(xmin)+'.csv')
p_values.at[region, dist] = p
x_mins.at[region, dist] = xmin
parameters.at[region, dist] = fit.get_parameters()

  (Theoretical_CDF * (1 - Theoretical_CDF))


In [202]:
# Manual correction: refit at a hand-picked xmin. Paths are built from the
# config constants so they stay in step with data_source / boot_source.
region, dist, xmin = 'middle_america_and_the_caribbean', 'powerlaw', 13
p, fit = correct_fit(dist, data_source+'pitch_ranks_'+region+'.csv',
                     xmin, boot_source+'pitch_'+dist+'_'+region+'_xmin'+str(xmin)+'.csv')
p_values.at[region, dist] = p
x_mins.at[region, dist] = xmin
parameters.at[region, dist] = fit.get_parameters()

Calculating best minimal value for power law fit


In [203]:
# Manual correction: refit at a hand-picked xmin. Paths are built from the
# config constants so they stay in step with data_source / boot_source.
region, dist, xmin = 'middle_east', 'powerlaw', 7
p, fit = correct_fit(dist, data_source+'pitch_ranks_'+region+'.csv',
                     xmin, boot_source+'pitch_'+dist+'_'+region+'_xmin'+str(xmin)+'.csv')
p_values.at[region, dist] = p
x_mins.at[region, dist] = xmin
parameters.at[region, dist] = fit.get_parameters()

Calculating best minimal value for power law fit


In [204]:
# Manual correction: refit at a hand-picked xmin. Paths are built from the
# config constants so they stay in step with data_source / boot_source.
region, dist, xmin = 'global', 'lognormal', 3
p, fit = correct_fit(dist, data_source+'pitch_ranks_'+region+'.csv',
                     xmin, boot_source+'pitch_'+dist+'_'+region+'_xmin'+str(xmin)+'.csv')
p_values.at[region, dist] = p
x_mins.at[region, dist] = xmin
parameters.at[region, dist] = fit.get_parameters()

In [219]:
# Manual correction: refit at a hand-picked xmin. Paths are built from the
# config constants so they stay in step with data_source / boot_source.
region, dist, xmin = 'middle_east', 'truncated_powerlaw', 4
p, fit = correct_fit(dist, data_source+'pitch_ranks_'+region+'.csv',
                     xmin, boot_source+'pitch_'+dist+'_'+region+'_xmin'+str(xmin)+'.csv')
p_values.at[region, dist] = p
x_mins.at[region, dist] = xmin
parameters.at[region, dist] = fit.get_parameters()

In [220]:
# Manual correction: refit at a hand-picked xmin. Paths are built from the
# config constants so they stay in step with data_source / boot_source.
region, dist, xmin = 'middle_america_and_the_caribbean', 'truncated_powerlaw', 2
p, fit = correct_fit(dist, data_source+'pitch_ranks_'+region+'.csv',
                     xmin, boot_source+'pitch_'+dist+'_'+region+'_xmin'+str(xmin)+'.csv')
p_values.at[region, dist] = p
x_mins.at[region, dist] = xmin
parameters.at[region, dist] = fit.get_parameters()

In [221]:
p_values

Unnamed: 0,powerlaw,exponential,lognormal,stretched_exponential,truncated_powerlaw
global,0.0,0.0,0.0,0.0,0.0
africa,0.0,0.004,0.037,0.141,0.0
asia,0.0,0.0,0.0,0.217,0.0
europe,0.0,0.0,0.013,0.021,0.0
oceania,0.0,0.0,0.0,0.0,0.0
north_america,0.0,0.017,0.0,0.0,0.0
south_america,0.0,0.0,0.0,0.0,0.0
middle_america_and_the_caribbean,0.0,0.001,0.0,0.0,0.0
middle_east,0.0,0.413,0.686,0.682,0.08


In [222]:
x_mins

Unnamed: 0,powerlaw,exponential,lognormal,stretched_exponential,truncated_powerlaw
global,15.0,3.0,3.0,7.0,2.0
africa,16.0,15.0,13.0,11.0,15.0
asia,15.0,10.0,9.0,6.0,1.0
europe,3.0,1.0,2.0,6.0,2.0
oceania,8.0,6.0,4.0,2.0,7.0
north_america,1.0,2.0,2.0,2.0,4.0
south_america,1.0,3.0,2.0,3.0,2.0
middle_america_and_the_caribbean,13.0,3.0,3.0,3.0,2.0
middle_east,7.0,4.0,8.0,4.0,4.0


In [223]:
parameters

Unnamed: 0,powerlaw,exponential,lognormal,stretched_exponential,truncated_powerlaw
global,[6.529766354791297],[0.17975963671881368],"[1.7174259196005737, 0.7430390937355924]","[0.08937204064364583, 2.1854766479209937]","[1.0000000666108444, 0.084847860267088]"
africa,[7.676337840340027],[0.36014174352553174],"[2.5866227318097454, 0.24511635411001792]","[0.08287682205380002, 2.5422462444276146]","[1.0000001357065975, 0.30901746016027387]"
asia,[6.866667325823124],[0.26386259508990373],"[2.3355008186573687, 0.3409829622139612]","[0.10756525900759636, 1.8956769474470874]","[1.0000000038635253, 0.07218689970439063]"
europe,[2.305867136118786],[0.2868705426744481],"[0.9451727314189641, 0.920879595039982]","[0.11214917727736515, 1.8173959594917939]","[1.0000013198365116, 0.13612391782847777]"
oceania,[3.4002574161006294],[0.21323578738293184],"[1.9142119542499185, 0.5652633580900399]","[0.17207378580654142, 1.142892502071481]","[1.000001335997875, 0.14831707649572437]"
north_america,[1.691190157072878],[0.22607782659911627],"[1.423172518462089, 0.766974067742713]","[0.21089608672787397, 1.0724267918208166]","[1.0000003917133218, 0.1309112499490951]"
south_america,[1.6270820609963186],[0.15287042987717797],"[1.5463579057596548, 0.9188702315685963]","[0.11379573012710156, 1.4782907549909226]","[1.000000086510513, 0.06701122066523596]"
middle_america_and_the_caribbean,[6.358801058542639],[0.21728661519959364],"[1.6474113388385436, 0.6608842297263667]","[0.17811659161373214, 1.2042285941191944]","[1.0000001614293414, 0.10935481764645807]"
middle_east,[4.169151429109972],[0.33507453171364976],"[2.120560410779368, 0.2843135367060978]","[0.2215097684793183, 1.3679558465257278]","[1.0000120123589606, 0.21480647554626023]"


---
## 2. Alternative distribution comparison

Recall that **each valid fit** must be compared to other fits using the likelihood ratio test. So, for each fit with a $p$-value greater than $0.1$, we compare it to all alternative fits above $\hat{x}_{min}$ using the likelihood ratio test. If the test is significant ($p < 0.05$) and the value of $LR$ is negative, we can reject the fit in question.

In [107]:
def likelihood_ratio_tests(fit, fit_dist):
    """Compare one candidate distribution against every alternative.

    Parameters
    ----------
    fit : object exposing ``distribution_compare(name_a, name_b)``
        In this notebook, a ``powerlaw.Fit`` built at the candidate's xmin.
    fit_dist : str
        Name of the candidate distribution. The notebook's own spellings
        ('powerlaw', 'truncated_powerlaw') are accepted and translated to the
        spellings used for the comparison ('power_law', 'truncated_power_law').

    Returns
    -------
    list
        Five entries, in the order power_law, lognormal, truncated_power_law,
        stretched_exponential, exponential. Each entry is whatever
        ``fit.distribution_compare`` returned (read elsewhere in this notebook
        as ``(LR, p-value)``), or the placeholder ``(-1, -1)`` for the
        candidate's comparison with itself and for any comparison that raised
        ``ZeroDivisionError``.
    """
    # Notebook spelling -> spelling expected by distribution_compare.
    aliases = {'powerlaw': 'power_law', 'truncated_powerlaw': 'truncated_power_law'}
    fit_dist = aliases.get(fit_dist, fit_dist)

    distributions = ['power_law', 'lognormal', 'truncated_power_law', 'stretched_exponential', 'exponential']
    placeholder = (-1, -1)
    results = []
    for dist in distributions:
        if dist == fit_dist:
            # a distribution is not compared with itself
            results.append(placeholder)
            continue
        # keep anything printed during the comparison out of the cell output
        with io.capture_output():
            try:
                res = fit.distribution_compare(fit_dist, dist)
            except ZeroDivisionError:
                res = placeholder
        results.append(res)

    return results

In [109]:
# Row labels, in the order likelihood_ratio_tests returns its results.
lr_rows = ['power_law', 'lognormal', 'truncated_power_law', 'stretched_exponential', 'exponential']
# Column names used in this notebook that differ from the names pl.Fit expects.
pl_names = {'powerlaw': 'power_law', 'truncated_powerlaw': 'truncated_power_law'}

dataframes = {}
for region in tqdm(p_values.index):
    data = pd.read_csv(data_source+'pitch_ranks_'+region+'.csv')
    df = pd.DataFrame(index=lr_rows)
    for dist in p_values.columns:
        # Only fits that passed the goodness-of-fit test (p > 0.1) get a column;
        # a missing (NaN) p-value fails this comparison and is skipped as well.
        if p_values.loc[region, dist] > 0.1:
            x_min = x_mins.loc[region, dist]
            pl_dist = pl_names.get(dist, dist)  # naming convention
            df[pl_dist] = likelihood_ratio_tests(pl.Fit(data.ranks, xmin=x_min, discrete=True), pl_dist)
    dataframes[region] = df

HBox(children=(FloatProgress(value=0.0, max=7.0), HTML(value='')))

  (Theoretical_CDF * (1 - Theoretical_CDF))





We now list the dataframes containing the likelihood ratio results for each region. Some notes on how to interpret these: the dataframe corresponding to each region contains the likelihood ratio test results for all plausible fits (i.e. $p > 0.1$). Each column corresponds to a valid fit. Each valid fit is then compared to all other fits in the same region as the valid fit. Rows are then the distributions being compared to. Each cell contains a tuple containing `(LR, p-value)`. A positive value of $LR$ indicates that the column distribution is favoured over the row distribution. A negative value with $p < 0.05$ therefore means that we can reject the column distribution, i.e. the distribution *in question*.

In [None]:
dataframes['global']

In [110]:
dataframes['africa']

Unnamed: 0,stretched_exponential
power_law,"(123.76360982178119, 3.895341633557874e-26)"
lognormal,"(4.36867788412021, 0.017526891168967135)"
truncated_power_law,"(59.2207968657254, 4.579075611588206e-12)"
stretched_exponential,"(-1, -1)"
exponential,"(47.385901268287625, 0.0)"


In [111]:
dataframes['asia']

Unnamed: 0,stretched_exponential
power_law,"(313.8564815946735, 9.255098664249016e-62)"
lognormal,"(16.473001811239506, 3.607223341357008e-06)"
truncated_power_law,"(127.24596066195431, 2.287927483039653e-22)"
stretched_exponential,"(-1, -1)"
exponential,"(73.51525557944173, 0.0)"


In [None]:
dataframes['europe']

In [112]:
dataframes['oceania']

power_law
lognormal
truncated_power_law
stretched_exponential
exponential


In [113]:
dataframes['north_america']

power_law
lognormal
truncated_power_law
stretched_exponential
exponential


In [114]:
dataframes['south_america']

Unnamed: 0,exponential
power_law,"(-inf, nan)"
lognormal,"(-1.4634891646169488, 0.2567835974790269)"
truncated_power_law,"(-1, -1)"
stretched_exponential,"(-1.5844176157827001, 0.07505595567824142)"
exponential,"(-1, -1)"


In [115]:
dataframes['middle_america_and_the_caribbean']

Unnamed: 0,power_law,exponential,lognormal,stretched_exponential
power_law,"(-1, -1)","(1.1755688752896014, 0.03310461012748955)","(1.380550186278239, 0.22362725105144277)","(1.554106245979625, 0.20397788742472267)"
lognormal,"(-1.380550186278239, 0.22362725105144277)","(-0.20498131098864047, 0.7253948700061696)","(-1, -1)","(0.17355605970138765, 0.20236748741819555)"
truncated_power_law,"(-1.098667498723436, 0.1382495700645)","(0.07690137656616602, 0.294805071692398)","(0.2818826875548097, 0.6669372369323177)","(0.45543874725619116, 0.5422256768261315)"
stretched_exponential,"(-1.554106245979625, 0.20397788742472267)","(-0.37853737069002746, 0.3842455280374447)","(-0.17355605970138765, 0.20236748741819555)","(-1, -1)"
exponential,"(-1.1755688752896014, 0.03310461012748955)","(-1, -1)","(0.20498131098864047, 0.7253948700061696)","(0.37853737069002746, 0.3842455280374447)"


In [116]:
dataframes['middle_east']

Unnamed: 0,exponential,lognormal,stretched_exponential
power_law,"(19.339348984318878, 9.570003204538158e-09)","(4.048255909640306, 0.057382716865207115)","(20.97480821428763, 1.5649248417772737e-05)"
lognormal,"(-0.12573470208813164, 0.934532659939667)","(-1, -1)","(1.5097245278805875, 0.04098268344198582)"
truncated_power_law,"(3.190980680398491, 0.006571755773990671)","(1.5105723414508718, 0.3088060069818447)","(4.826439910367211, 0.07115373446635886)"
stretched_exponential,"(-1.6354592299686885, 0.070518132938997)","(-0.21673328431830474, 0.45168623896281035)","(-1, -1)"
exponential,"(-1, -1)","(1.1139184555472172, 0.39207521654292643)","(1.6354592299686885, 0.070518132938997)"
