In [1]:
import math

import numpy as np
import polars as pl
import powerlawrs

In [2]:
# Path is relative to the directory the notebook is run from.
file = "../reference_data/blackouts.txt"

# polars and pandas do NOT do a good job detecting headers, so do not rely on
# auto-detection: the file is read explicitly with has_header=False.
df = pl.read_csv(file, has_header=False)
# to_series() with no argument selects the first column as a polars Series.
data = df.to_series()

# API
## Fitting procedure

In [3]:
# 1. Generate an alpha parameter via MLE for every x_min in the data.
#    The result is two parallel sequences: candidate x_min values and their alphas.
x_mins, alphas = powerlawrs.estimation.find_alphas_fast(data)
print(f"n: {len(data)}, n_x_mins: {len(x_mins)}, n_alphas: {len(alphas)}")

n: 211, n_x_mins: 210, n_alphas: 210


In [4]:
# 2. Find the (x_min, alpha) pair with the lowest KS statistic among the
#    candidates produced in step 1. This is the estimated best fit.
pareto_fit = powerlawrs.gof.gof(data, alphas=alphas, x_mins=x_mins)
print(f"{pareto_fit}")

ParetoFit(x_min=230000, alpha=1.2726372198302858, D=0.06067379629443781, len_tail=59)


Steps 1 and 2 above are abstracted away via ```powerlawrs.fit()```

## Numerical stability 
Comparison of ```find_alphas_exhaustive()``` and ```find_alphas_fast()```. The former is technically more accurate than the latter, but depending on the context the difference in precision is negligible relative to the significant performance improvement.

In [5]:
# 1. Generate an alpha parameter via MLE for every x_min in the data via find_alphas_exhaustive()
x_mins_ex, alphas_ex = powerlawrs.estimation.find_alphas_exhaustive(data)

# 2. Find the pair with the lowest KS statistic. This is the estimated best fit.
pareto_fit_ex = powerlawrs.gof.gof(data, alphas=alphas_ex, x_mins=x_mins_ex)

# Compare against pareto_fit, the result obtained earlier from find_alphas_fast().
print(f"find_alphas_exhaustive() alpha: {pareto_fit_ex.alpha}")
print(f"find_alphas_fast() alpha:\t{pareto_fit.alpha}")
print(f"Difference: {pareto_fit.alpha - pareto_fit_ex.alpha}")

find_alphas_exhaustive() alpha: 1.2726372198302882
find_alphas_fast() alpha:	1.2726372198302858
Difference: -2.4424906541753444e-15


## Parameter uncertainty

In [6]:
# Uncertainty of the fitted parameters: the two returned values are reported
# below as the sample standard deviations of x_min and alpha.
# NOTE(review): m=1000 is presumably the number of resamples -- confirm against the param_est docs.
xm_std, a_std = powerlawrs.estimation.param_est(data, m=1000)
print(f"stdev (sample) x_min: {xm_std}, stdev (sample) alpha: {a_std}")

stdev (sample) x_min: 80841.36176494199, stdev (sample) alpha: 0.25783447353318556


## Hypothesis test 

In [7]:
# Run the experiment
# Set the minimum precision of the p-value from the KS test.
precision = 0.01 # p-value should be accurate to within 0.01
# The fitted alpha, x_min and KS statistic D from pareto_fit are passed in alongside the data.
H0 = powerlawrs.hypothesis.hypothesis_test(data, precision, pareto_fit.alpha, pareto_fit.x_min, pareto_fit.D)

Generating M = 2500 simulated datasets of length n = 211 with tail size 59 and probability of the tail P(tail|data) = 0.2796208530805687


In [8]:
# hypothesis_test() calls powerlawrs.util.sim.calculate_sim_params() to determine the number of simulated datasets required given the desired precision. 
simparams_dict = powerlawrs.util.sim.calculate_sim_params(precision, data, pareto_fit.x_min)

In [9]:
# This requires 2500 synthetic datasets of length 211. 59 of the 211 samples will be drawn from a Pareto Type I with the parameters found above.
simparams_dict

{'num_sims_m': 2500,
 'sim_len_n': 211,
 'n_tail': 59,
 'p_tail': 0.2796208530805687}

## Stats module

In [10]:
powerlawrs.stats.descriptive.mean(data)

253868.68246445496

In [11]:
powerlawrs.stats.descriptive.variance(data, 1)

372476564023.59814

In [12]:
powerlawrs.stats.random.random_choice(data, 3)

[88000.0, 91000.0, 312000.0]

In [13]:
powerlawrs.stats.random.random_uniform(3)

[0.23704228210934275, 0.08119401045592678, 0.8492154011450803]

In [14]:
# Define a standard normal CDF in Python (math is imported in the notebook's import cell).
def norm_cdf(x):
    """Return P(Z <= x) for a standard normal variable Z, computed via math.erf."""
    return 0.5 * (1 + math.erf(x / math.sqrt(2.0)))

# Sample listed in ascending order.
# NOTE(review): ks_1sam_sorted presumably requires pre-sorted input -- confirm.
sorted_data = [-1.1, -0.5, 0.1, 0.2, 1.5]

# Call the Rust function, passing the Python function as the reference CDF.
# It returns three values, unpacked here as D+, D- and the overall maximum.
d_plus, d_minus, d_max = powerlawrs.stats.ks.ks_1sam_sorted(sorted_data, norm_cdf)

print(f"D+: {d_plus}")
print(f"D-: {d_minus}")
print(f"D max: {d_max}")

D+: 0.22074029056089706
D-: 0.13982783727702897
D max: 0.22074029056089706


## Util module

In [15]:
powerlawrs.util.linspace(0,10,5)

[0.0, 2.5, 5.0, 7.5, 10.0]

In [16]:
# Same call as in the hypothesis-test section, with the arguments written as
# literals: a precision of 0.01 and the fitted x_min of 230000.
simparams_dict = powerlawrs.util.sim.calculate_sim_params(0.01, data, 230000)
simparams_dict

{'num_sims_m': 2500,
 'sim_len_n': 211,
 'n_tail': 59,
 'p_tail': 0.2796208530805687}

In [17]:
# Convert the simparams dict to the Rust struct; the dict entries are passed as keyword arguments.
simparams_struct = powerlawrs.util.sim.PySimParams(**simparams_dict)

# Use the struct as an argument. 230000 is the fitted x_min.
# NOTE(review): 1.27 is presumably the fitted alpha rounded to two decimals -- confirm the argument order.
sim_data = powerlawrs.util.sim.generate_synthetic_datasets(data, 230000, simparams_struct, 1.27)

In [18]:
# Note: the library does not yet implement the zeta distribution for discrete data.
# Wrap the simulated datasets in a polars DataFrame for display.
pl.from_numpy(np.array(sim_data))

column_0,column_1,column_2,column_3,column_4,column_5,column_6,column_7,column_8,column_9,column_10,column_11,column_12,column_13,column_14,column_15,column_16,column_17,column_18,column_19,column_20,column_21,column_22,column_23,column_24,column_25,column_26,column_27,column_28,column_29,column_30,column_31,column_32,column_33,column_34,column_35,column_36,…,column_174,column_175,column_176,column_177,column_178,column_179,column_180,column_181,column_182,column_183,column_184,column_185,column_186,column_187,column_188,column_189,column_190,column_191,column_192,column_193,column_194,column_195,column_196,column_197,column_198,column_199,column_200,column_201,column_202,column_203,column_204,column_205,column_206,column_207,column_208,column_209,column_210
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
2.6032e6,25000.0,56000.0,32000.0,63500.0,100000.0,53000.0,15000.0,431152.454995,80000.0,18000.0,80000.0,9000.0,60000.0,3.5332e6,9000.0,20000.0,106850.0,754355.918425,29900.0,53000.0,7.7148e6,304004.166888,210882.0,50000.0,50000.0,1800.0,10000.0,40000.0,30001.0,18819.0,55000.0,206000.0,231209.546176,60000.0,56000.0,253550.983631,…,18351.0,60000.0,25000.0,303081.668252,30001.0,50000.0,272266.756948,40000.0,270864.166345,450785.775879,190000.0,210882.0,133000.0,163000.0,62000.0,95630.0,12000.0,24506.0,621443.264484,50000.0,33000.0,163000.0,32000.0,81000.0,459265.682523,32000.0,74000.0,66005.0,2.5587e6,124000.0,25000.0,160000.0,120000.0,12000.0,10000.0,63500.0,115000.0
160000.0,261639.854861,237159.376073,503386.396749,442249.227826,164500.0,33000.0,160000.0,427663.512461,46000.0,247692.799403,24000.0,207200.0,695413.36439,596460.214008,296835.744235,286085.916369,238241.932527,339457.963844,133000.0,2.5344e6,29000.0,51000.0,30500.0,130000.0,50000.0,75000.0,130000.0,120000.0,249252.340885,11529.0,244774.947495,100000.0,241913.518287,145000.0,11000.0,10300.0,…,173000.0,790439.503371,1.2987e6,353423.340608,671314.285042,306546.138567,1.1119e6,29900.0,395203.098595,518505.281797,263979.027588,18000.0,263941.710341,166000.0,158000.0,191000.0,120000.0,145000.0,551368.565919,14273.0,130000.0,53000.0,75000.0,232035.016177,43696.0,126000.0,100000.0,1000.0,471972.085265,173000.0,51000.0,25000.0,4150.0,95000.0,88000.0,376835.049682,37000.0
71000.0,173000.0,95630.0,891382.076316,100000.0,50000.0,10000.0,431937.606145,200000.0,5300.0,75000.0,24506.0,94285.0,210882.0,90000.0,51000.0,2000.0,29900.0,148000.0,124000.0,55000.0,2.2544e6,10000.0,75000.0,92000.0,210882.0,248742.733039,63500.0,112000.0,304118.058274,62000.0,362116.83819,359808.089767,3.3630e6,207200.0,115000.0,941379.50795,…,70000.0,106850.0,273689.792559,43000.0,114000.0,294628.124847,191000.0,40000.0,50000.0,883848.530316,145000.0,115000.0,14273.0,295241.510312,251151.203216,148000.0,17000.0,114000.0,389359.862955,1.2922e6,314172.501772,416812.184851,25000.0,43696.0,234258.896157,26334.0,50000.0,322898.152654,43000.0,58000.0,40000.0,373059.935609,383120.540368,95630.0,1800.0,30001.0,329610.985437
10000.0,351826.652419,1.1382e6,2000.0,113200.0,25000.0,284684.314478,81000.0,50000.0,106850.0,128000.0,25000.0,1000.0,60000.0,252886.743308,1.4852e6,351420.782517,114000.0,114500.0,145000.0,200000.0,160000.0,50000.0,100000.0,25000.0,26334.0,48000.0,160000.0,468733.156784,100000.0,219000.0,18351.0,637864.326779,274073.084319,415546.082335,210882.0,60000.0,…,29000.0,29000.0,468762.107139,372219.714661,166000.0,50000.0,5300.0,158000.0,26334.0,1.4835e6,241850.407459,18819.0,2.3991e6,206000.0,1.2292e6,299464.293609,43000.0,3.4505e6,302879.216567,39500.0,1.5803e6,63500.0,1.0520e6,257047.103823,43000.0,5300.0,460589.748417,145000.0,38500.0,11529.0,50462.0,4150.0,282557.154324,100000.0,164500.0,145000.0,70000.0
50000.0,29900.0,11000.0,25000.0,60000.0,50000.0,63500.0,71000.0,10000.0,81000.0,361582.219344,70000.0,80000.0,81000.0,203000.0,1.7594e6,160000.0,88000.0,1646.0,160000.0,756493.727659,230979.814233,454488.444537,92000.0,30001.0,369284.45299,122000.0,36073.0,66005.0,8000.0,51000.0,1.7157e6,25000.0,1.8329e6,18351.0,256979.996604,490386.598058,…,20000.0,106850.0,548601.208959,65000.0,50000.0,37000.0,50000.0,70000.0,1800.0,164500.0,29000.0,270305.005836,70000.0,751808.900018,145000.0,29000.0,15000.0,33000.0,351346.341776,1.0277e6,92000.0,56000.0,122000.0,65000.0,346671.529548,45000.0,232845.935966,51000.0,257103.365533,6.8195e6,100000.0,88000.0,115000.0,29900.0,631136.627979,94285.0,236645.823181
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
75000.0,32000.0,91000.0,7500.0,100000.0,148000.0,26334.0,38500.0,270685.111922,311747.184293,95630.0,782772.848431,26334.0,130000.0,7500.0,51000.0,120000.0,246580.120884,66005.0,1.4920e6,1.2434e6,65000.0,911583.36546,302051.101432,29000.0,40911.0,94285.0,71000.0,553595.904052,2900.0,25000.0,126000.0,145000.0,36073.0,30001.0,160000.0,88000.0,…,255741.357123,10000.0,25000.0,114500.0,295430.017993,160000.0,147000.0,341056.923231,81000.0,1800.0,128000.0,30500.0,521631.801659,788643.593468,92000.0,33000.0,95630.0,248771.967894,37000.0,25000.0,8000.0,504589.848349,19000.0,369107.679018,70000.0,24506.0,467778.764397,251314.130699,14273.0,160000.0,7500.0,50000.0,7500.0,18000.0,88000.0,20000.0,128000.0
281835.98938,18000.0,128000.0,74000.0,259891.494023,510337.162387,91000.0,146000.0,145000.0,164500.0,10000.0,937032.24042,11000.0,1.6170e6,29000.0,43696.0,1646.0,50000.0,592081.80352,56000.0,690779.56559,95000.0,80000.0,36073.0,305848.532101,40000.0,130000.0,242315.883858,210882.0,130000.0,331543.379946,336006.264688,407930.278014,712808.241774,55000.0,408017.827028,51000.0,…,506697.979348,95630.0,200000.0,35000.0,29900.0,91000.0,219000.0,32000.0,130000.0,33000.0,324137.754567,158000.0,60000.0,70000.0,126000.0,70000.0,26334.0,285889.8547,10300.0,50000.0,163000.0,1000.0,230290.691616,30000.0,50000.0,100000.0,164500.0,128000.0,968131.544525,130000.0,71000.0,100000.0,30001.0,231842.809053,261981.93366,365319.842833,50000.0
95630.0,474089.543605,50000.0,163000.0,25000.0,24000.0,120000.0,30000.0,259611.702614,2000.0,30500.0,1.0543e6,234624.846609,20000.0,43696.0,124000.0,20000.0,48000.0,5.8172e6,160000.0,92000.0,4.0747e6,339024.089362,114000.0,40000.0,90000.0,100000.0,332256.283313,50000.0,51000.0,200000.0,50000.0,114500.0,115000.0,15000.0,71000.0,431274.246363,…,130000.0,164500.0,166000.0,397761.124594,146000.0,452555.896509,66005.0,200000.0,114000.0,191000.0,681820.491274,376319.507652,75000.0,50000.0,32000.0,251021.661738,114500.0,237475.62939,2.1929e6,32000.0,5300.0,2.1027e6,2.7297e6,38500.0,206000.0,55000.0,158000.0,200000.0,14273.0,163000.0,37000.0,606334.671187,65000.0,1.5645e6,50462.0,273430.080531,40911.0
562282.275263,70000.0,19000.0,37000.0,120000.0,145000.0,71000.0,915357.338648,130000.0,452840.599679,279941.982329,62000.0,2900.0,5300.0,106850.0,90000.0,310171.10482,106850.0,770434.22062,24506.0,30000.0,56000.0,207200.0,654822.062727,206000.0,163000.0,219000.0,422350.310815,94285.0,71000.0,32000.0,10000.0,270541.609449,82500.0,158000.0,15000.0,33000.0,…,173000.0,70000.0,246526.848675,403354.338669,2000.0,329710.613433,124000.0,33000.0,88000.0,308153.066666,18351.0,92000.0,271137.663657,122000.0,568022.25393,53000.0,20000.0,50000.0,145000.0,4150.0,11529.0,903083.996936,146000.0,106850.0,39500.0,1646.0,10000.0,296260.224376,130000.0,32000.0,266461.730699,114500.0,326696.834017,142000.0,920117.28119,38500.0,112000.0


# Distributions
## Generic Power-Law
$C f(x)$, where $f(x) = x^{-\alpha}$

In [19]:
# Instantiate the class. The exponent 2.2726 is the alpha used in the
# "Pareto Type I" section (1.2726) plus one, with the same second argument of
# 230000; the pdf/cdf/ccdf outputs at 500000 agree between the two sections up
# to floating-point rounding.
# NOTE(review): second argument is presumably x_min -- confirm against the Powerlaw signature.
pl_class = powerlawrs.dist.powerlaw.Powerlaw(2.2726, 230000)

In [20]:
# pdf
pl_class.pdf(500000)

9.47430869971139e-07

In [21]:
# cdf
pl_class.cdf(500000)

0.627757791147596

In [22]:
# ccdf
pl_class.ccdf(500000)

0.372242208852404

In [23]:
# rv
# generate random U(0,1)
u = np.random.rand()
pl_class.rv(u)

350296.6575149392

In [24]:
# Log Likelihood of first 10 data
pl_class.loglikelihood(data[:10])

[-14.167286692248991,
 -11.907555004554373,
 -11.670580405021845,
 -8.447163936291041,
 -6.18495414433208,
 -13.122953520503547,
 -9.52761182944572,
 -6.437725522683576,
 -13.680018818629309,
 -9.232906008194748]

## Pareto Type I

In [25]:
# Instantiate the class with alpha = 1.2726 and x_min = 230000, i.e. the values
# of the best fit found earlier in this notebook, with alpha rounded to four decimals.
pareto_class = powerlawrs.dist.pareto.Pareto(1.2726, 230000)

In [26]:
# pdf
pareto_class.pdf(500000)

9.474308699711417e-07

In [27]:
# cdf
pareto_class.cdf(500000)

0.6277577911475959

In [28]:
# ccdf
pareto_class.ccdf(500000)

0.3722422088524041

In [29]:
# rv
# generate random U(0,1)
u = np.random.rand()
pareto_class.rv(u)

506192.0155102401

In [30]:
# Log Likelihood of first 10 data
# Note the -inf as x < pareto_class.x_min
pareto_class.loglikelihood(data[:10])

[-14.167286692248988,
 -inf,
 -inf,
 -inf,
 -inf,
 -13.122953520503545,
 -inf,
 -inf,
 -13.680018818629307,
 -inf]

In [31]:
data[:10]

column_1
i64
570000
210882
190000
46000
17000
360000
74000
19000
460000
65000


## Shifted Exponential
$f(x; \lambda, x_{min}) = \lambda e^{-\lambda (x - x_{min})}$

In [32]:
# instantiate the class
expo_class = powerlawrs.dist.exponential.Exponential(1.5, 0.0)

In [33]:
# pdf
expo_class.pdf(2)

0.07468060255179593

In [34]:
# cdf
expo_class.cdf(2)

0.950212931632136

In [35]:
# ccdf
expo_class.ccdf(2)

0.04978706836786395

In [36]:
# rv
# generate random U(0,1)
u = np.random.rand()
expo_class.rv(u)

0.5377268133049511

In [37]:
# Log-likelihood of 10 random variates, each generated from a fresh U(0,1) draw
X = [expo_class.rv(np.random.rand()) for _ in range(10)]
expo_class.loglikelihood(X)

[0.3439758356173637,
 -1.669975339116598,
 -0.5062693554051971,
 -0.9603357419156937,
 -0.4447345337166617,
 -0.07507787436567538,
 -0.009266733541575629,
 0.15271280742474244,
 0.012034621832913758,
 -0.014350311467326848]

## Log-Normal

In [38]:
# instantiate the class
lognormal_class = powerlawrs.dist.lognormal.Lognormal(5,1)

In [39]:
# pdf
lognormal_class.pdf(50)

0.004414730410382431

In [40]:
# cdf
lognormal_class.cdf(50)

0.1383026725766271

In [41]:
# ccdf
lognormal_class.ccdf(50)

0.8616973274233729

In [42]:
# rv
# generate random U(0,1)
u = np.random.rand()
lognormal_class.rv(u)

254.60780190456995

In [43]:
# Log Likelihood of first 10 data
lognormal_class.loglikelihood(data[:10])

[-48.23156695306425,
 -39.52492513891642,
 -38.669151666115724,
 -28.108458617854605,
 -21.898298898560117,
 -44.08491931904303,
 -31.42411507354539,
 -22.543027350758262,
 -46.270534238528505,
 -30.497310074517163]