In [1]:
import powerlawrs 
import polars as pl
import numpy as np

In [2]:
file = "../reference_data/blackouts.txt"

# Header auto-detection in polars and pandas is unreliable, so state explicitly
# that the first row is data rather than column names.
df = pl.read_csv(source=file, has_header=False)
# Take the first column (index 0) of the frame as a Series.
data = df.to_series(index=0)

# API
## Fitting procedure

In [3]:
# 1. Generate an alpha parameter via MLE for every x_min in the data.
#    The call returns two sequences that are unpacked as candidate x_mins and their alphas;
#    the output below shows 210 of each for the 211 observations.
x_mins, alphas = powerlawrs.estimation.find_alphas_fast(data)
print(f"n: {len(data)}, n_x_mins: {len(x_mins)}, n_alphas: {len(alphas)}")

n: 211, n_x_mins: 210, n_alphas: 210


In [4]:
# 2. Find the pair with the lowest KS statistic. This is the estimated best fit.
#    The printed result below is a Fitment with x_min, alpha, D and len_tail fields.
best_fit = powerlawrs.gof.gof(data, alphas=alphas, x_mins=x_mins)
print(f"{best_fit}")

Fitment(x_min=230000, alpha=1.2726372198302858, D=0.06067379629443781, len_tail=59)


Steps 1 and 2 above are abstracted away via ```powerlawrs.fit()```

## Numerical stability 
Comparison of ```find_alphas_exhaustive()``` and ```find_alphas_fast()```. The former is technically more accurate than the latter, but depending on the context the loss of precision is negligible compared with the significant performance improvement.

In [5]:
# 1. Generate an alpha parameter via MLE for every x_min in the data via find_alphas_exhaustive()
x_mins_ex, alphas_ex = powerlawrs.estimation.find_alphas_exhaustive(data)

# 2. Find the pair with the lowest KS statistic. This is the estimated best fit.
best_fit_ex = powerlawrs.gof.gof(data, alphas=alphas_ex, x_mins=x_mins_ex)

# Compare against `best_fit` from the find_alphas_fast() run in the earlier cell;
# the difference is reported as fast minus exhaustive.
print(f"find_alphas_exhaustive() alpha: {best_fit_ex.alpha}")
print(f"find_alphas_fast() alpha:\t{best_fit.alpha}")
print(f"Difference: {best_fit.alpha - best_fit_ex.alpha}")

find_alphas_exhaustive() alpha: 1.2726372198302882
find_alphas_fast() alpha:	1.2726372198302858
Difference: -2.4424906541753444e-15


## Parameter uncertainty

In [6]:
# Estimate the uncertainty of the fitted parameters: the two returned values are
# reported below as sample standard deviations of x_min and alpha.
# NOTE(review): m=1000 is presumably the number of resampled datasets — confirm against the library docs.
xm_std, a_std = powerlawrs.estimation.param_est(data, m=1000)
print(f"stdev (sample) x_min: {xm_std}, stdev (sample) alpha: {a_std}")

stdev (sample) x_min: 84389.68459018729, stdev (sample) alpha: 0.2557135477980042


## Hypothesis test 

In [7]:
# Run the experiment
# Set the minimum precision of the p-value of the KS test.
precision = 0.01 # p-value should be accurate to within 0.01
# Uses the fitted alpha, x_min and KS distance D from `best_fit`; the call logs the
# simulation plan (see output below). The returned H0 is not displayed in this cell.
H0 = powerlawrs.hypothesis.hypothesis_test(data, precision, best_fit.alpha, best_fit.x_min, best_fit.D)

Generating M = 2500 simulated datasets of length n = 211 with tail size 59 and probability of the tail P(tail|data) = 0.2796208530805687


In [8]:
# hypothesis_test() calls powerlawrs.util.sim.calculate_sim_params() to determine the
# number of simulated datasets required given the desired precision.
simparams_dict = powerlawrs.util.sim.calculate_sim_params(precision, data, best_fit.x_min)

In [9]:
# This calls for 2500 synthetic datasets of length 211. 59 of the 211 samples will be
# drawn from a Pareto Type I with the parameters found above.
simparams_dict

{'num_sims_m': 2500,
 'sim_len_n': 211,
 'n_tail': 59,
 'p_tail': 0.2796208530805687}

## Stats module

In [10]:
# Mean of the data, computed by the library's descriptive-statistics helper.
powerlawrs.stats.descriptive.mean(data)

253868.68246445496

In [11]:
# Variance of the data.
# NOTE(review): the second argument (1) is presumably the delta degrees of freedom,
# i.e. a sample variance — confirm against the library docs.
powerlawrs.stats.descriptive.variance(data, 1)

372476564023.59814

In [12]:
# Randomly pick 3 values from the data; the result is a list of floats (see output),
# and it changes from run to run.
powerlawrs.stats.random.random_choice(data, 3)

[60000.0, 160000.0, 1660000.0]

In [13]:
# Draw 3 uniform random numbers (the output shows values between 0 and 1); changes from run to run.
powerlawrs.stats.random.random_uniform(3)

[0.37432851300189807, 0.08709372804979165, 0.44552346171226165]

In [15]:
# Define a standard normal CDF in Python
import math


def norm_cdf(x):
    """Standard normal CDF: Phi(x) = 0.5 * (1 + erf(x / sqrt(2)))."""
    return 0.5 * (1 + math.erf(x / math.sqrt(2.0)))


# Sample listed in ascending order
# (NOTE(review): the function name suggests sorted input is required — confirm).
sorted_data = [-1.1, -0.5, 0.1, 0.2, 1.5]

# Call the Rust function, passing the Python function as an argument.
# The result unpacks into the one-sided statistics D+ and D- and their maximum.
d_plus, d_minus, d_max = powerlawrs.stats.ks.ks_1sam_sorted(sorted_data, norm_cdf)

print(f"D+: {d_plus}")
print(f"D-: {d_minus}")
print(f"D max: {d_max}")

D+: 0.22074029056089706
D-: 0.13982783727702897
D max: 0.22074029056089706


## Util module

In [16]:
# 5 evenly spaced values from 0 to 10, both endpoints included (see output).
powerlawrs.util.linspace(0,10,5)

[0.0, 2.5, 5.0, 7.5, 10.0]

In [17]:
# Same call as in the hypothesis-test section, with the values written out literally:
# 0.01 is the p-value precision and 230000 is the fitted x_min.
simparams_dict = powerlawrs.util.sim.calculate_sim_params(0.01, data, 230000)
simparams_dict

{'num_sims_m': 2500,
 'sim_len_n': 211,
 'n_tail': 59,
 'p_tail': 0.2796208530805687}

In [18]:
# convert simparams dict to rust struct (the dict entries are passed as keyword arguments)
simparams_struct = powerlawrs.util.sim.PySimParams(**simparams_dict)

# use the struct as an argument
# NOTE(review): 230000 matches the fitted x_min, and 1.27 looks like the fitted alpha
# (~1.2726) rounded to two decimals — confirm the rounding is intentional.
sim_data = powerlawrs.util.sim.generate_synthetic_datasets(data, 230000, simparams_struct, 1.27)

In [19]:
# Note: the library does not yet implement the zeta distribution for discrete data.
# Wrap the simulated datasets in a polars DataFrame for display (211 columns in the output below).
pl.from_numpy(np.array(sim_data))

column_0,column_1,column_2,column_3,column_4,column_5,column_6,column_7,column_8,column_9,column_10,column_11,column_12,column_13,column_14,column_15,column_16,column_17,column_18,column_19,column_20,column_21,column_22,column_23,column_24,column_25,column_26,column_27,column_28,column_29,column_30,column_31,column_32,column_33,column_34,column_35,column_36,…,column_174,column_175,column_176,column_177,column_178,column_179,column_180,column_181,column_182,column_183,column_184,column_185,column_186,column_187,column_188,column_189,column_190,column_191,column_192,column_193,column_194,column_195,column_196,column_197,column_198,column_199,column_200,column_201,column_202,column_203,column_204,column_205,column_206,column_207,column_208,column_209,column_210
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
100000.0,122000.0,331840.363386,46000.0,133000.0,30001.0,81000.0,145000.0,70000.0,32000.0,60000.0,18351.0,66005.0,29000.0,53000.0,94285.0,15000.0,1.7205e6,683031.323883,100000.0,332550.63628,219000.0,26334.0,75000.0,230450.952094,48000.0,46000.0,262085.884775,29900.0,50000.0,238973.828376,40000.0,203000.0,106850.0,1646.0,270952.440511,160000.0,…,48000.0,25000.0,242388.504578,10000.0,1646.0,60000.0,7500.0,75000.0,40000.0,18351.0,18351.0,92000.0,80000.0,58000.0,10300.0,113200.0,1.9365e6,19000.0,18351.0,122000.0,383487.78015,50000.0,191000.0,388664.690528,390360.652242,7500.0,315683.113762,51000.0,372984.365063,40000.0,9000.0,909117.267408,379865.179761,40911.0,844723.114882,1646.0,163000.0
10000.0,120000.0,7500.0,53000.0,145000.0,377406.499392,160000.0,40911.0,106850.0,160000.0,762851.426482,122000.0,3.3534e6,160000.0,11529.0,100000.0,283116.975577,166000.0,32000.0,59000.0,30000.0,417725.581866,478902.48035,106850.0,33000.0,124000.0,24000.0,249067.468354,10000.0,58000.0,100000.0,25000.0,12000.0,60000.0,260113.483068,160000.0,60000.0,…,160000.0,50000.0,71000.0,100000.0,43000.0,71000.0,18000.0,190000.0,92000.0,120000.0,7500.0,1.3379e6,91000.0,542092.377332,12000.0,530725.012462,360133.80759,63500.0,10300.0,203000.0,56000.0,80000.0,458060.083224,70000.0,440119.298963,32000.0,296421.09999,449373.047109,25000.0,50000.0,58000.0,50000.0,1.4254e6,5.4478e6,50462.0,43000.0,473169.066342
288465.143882,30001.0,18000.0,1.7893e6,249492.29766,59000.0,443253.483935,51000.0,18000.0,163000.0,100000.0,624114.397016,219000.0,70000.0,32000.0,428952.343356,88000.0,45000.0,24000.0,100000.0,18819.0,46000.0,310536.879565,17000.0,757255.076504,857860.457568,311782.262354,26334.0,190000.0,60000.0,60000.0,582826.734536,261567.39745,60000.0,541728.865643,266478.82326,43000.0,…,25000.0,207200.0,503867.41711,210882.0,39500.0,120000.0,418040.943635,71000.0,106850.0,591007.903487,1.0864e6,36073.0,160000.0,10000.0,207200.0,26334.0,518765.957022,18000.0,640975.608514,74000.0,512912.812656,48000.0,120000.0,39500.0,100000.0,71000.0,1.0276e7,94285.0,40000.0,9000.0,70000.0,58000.0,59000.0,25000.0,418744.993101,15000.0,207200.0
839176.445631,71000.0,63500.0,575152.863177,50000.0,11000.0,4.9364e6,234016.312714,80000.0,142000.0,30500.0,43696.0,88000.0,394874.734362,142000.0,32000.0,115000.0,645798.596759,106850.0,256536.790341,8000.0,100000.0,100000.0,233148.651382,523408.467873,15000.0,281473.788318,130000.0,145000.0,14273.0,238648.423964,245032.720822,70000.0,548482.434108,160000.0,60000.0,1.2799e6,…,20000.0,775003.360584,635875.909853,252337.867939,70000.0,497395.249396,122000.0,70000.0,50000.0,48000.0,80000.0,258201.207025,35000.0,377026.801246,452424.116601,313997.438219,476619.446095,446124.266765,1.1040e6,145000.0,18351.0,18000.0,71000.0,24506.0,82500.0,160000.0,961641.260911,124000.0,25000.0,357491.211036,17000.0,120000.0,203000.0,130000.0,100000.0,20000.0,56000.0
60000.0,379877.400734,5300.0,75000.0,50000.0,71000.0,53000.0,344716.649886,75000.0,53000.0,12000.0,413980.802105,32000.0,25000.0,55000.0,354545.109081,56000.0,492935.526535,120000.0,56000.0,1.0926e6,25000.0,24506.0,29000.0,40000.0,48000.0,548189.380961,128000.0,142000.0,8.1762e6,596686.835835,1.4441e6,328749.387079,36073.0,26334.0,113200.0,3.1163e6,…,145000.0,56000.0,9000.0,50000.0,288570.529765,661463.221515,312284.729146,18000.0,10000.0,283427.291189,349421.201744,94285.0,292510.424908,291914.247624,308202.545051,19000.0,25000.0,304790.513931,18000.0,30000.0,166000.0,328485.072626,18000.0,1.5492e6,241967.728473,269782.262041,191000.0,741329.442381,25000.0,515077.168188,19000.0,95000.0,18819.0,51000.0,472261.639257,74000.0,374515.118178
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
704779.337999,252297.647573,1.0964e6,50000.0,247050.863339,100000.0,29900.0,9000.0,424432.711537,115000.0,593255.132357,1.6707e6,100000.0,18351.0,694557.729772,130000.0,17000.0,474116.877979,20000.0,75000.0,120000.0,771906.808257,50000.0,20000.0,100000.0,62000.0,15000.0,50462.0,254373.679582,25000.0,90000.0,115000.0,75000.0,95000.0,56000.0,133000.0,160000.0,…,160000.0,25000.0,19000.0,35000.0,114000.0,275544.434428,112000.0,219000.0,122000.0,55000.0,130000.0,821943.18612,130000.0,254154.875419,71000.0,9000.0,306200.100002,55000.0,100000.0,95630.0,859430.585481,337137.333463,166000.0,60000.0,2900.0,130000.0,120000.0,65000.0,29000.0,29000.0,160000.0,50000.0,12000.0,604442.10288,25000.0,51000.0,173000.0
7500.0,18000.0,25000.0,88000.0,88000.0,92000.0,29900.0,24000.0,7.3347e6,25000.0,191000.0,65000.0,24000.0,271192.513607,106850.0,1646.0,256256.687549,272688.02354,70000.0,25000.0,257852.334399,147000.0,26334.0,805961.602071,200000.0,5.7963e6,50000.0,63500.0,1800.0,100000.0,59000.0,146000.0,130000.0,53000.0,200000.0,599784.404653,29900.0,…,50000.0,74000.0,50000.0,50000.0,115000.0,32000.0,18000.0,25000.0,75000.0,43000.0,456440.315142,46000.0,158000.0,173000.0,9000.0,11529.0,55000.0,367299.544547,71000.0,122000.0,130000.0,1.0119e6,203000.0,822618.833143,11000.0,207200.0,100000.0,231427.040586,236635.580162,1000.0,130000.0,20000.0,166000.0,120000.0,160000.0,56000.0,1.8714e6
70000.0,15000.0,40000.0,26334.0,241837.146265,145000.0,7500.0,100000.0,50000.0,548824.977187,245998.524658,33000.0,1.1010e6,304241.749576,38500.0,80000.0,203000.0,25000.0,95630.0,233679.818411,60000.0,452811.734516,18000.0,5300.0,65000.0,91000.0,50462.0,512510.882437,60000.0,343072.963245,38500.0,20000.0,289489.329069,1646.0,2000.0,50000.0,474932.499226,…,789753.260739,326637.797559,1.2448e6,718398.959226,25000.0,1000.0,91000.0,20000.0,398835.000612,55000.0,50000.0,305087.162756,1.2407e6,92000.0,53000.0,9.7733e6,60000.0,1.2890e6,29900.0,160000.0,148000.0,164500.0,40911.0,29000.0,191000.0,7500.0,200000.0,651858.926249,160000.0,142000.0,71000.0,50000.0,1.9212e6,5300.0,394207.311118,206000.0,30500.0
130000.0,100000.0,264756.776792,30500.0,1.8593e7,14273.0,5300.0,80000.0,382146.592379,313507.884809,50000.0,300957.57869,252339.400957,2.0149e6,310062.384439,100000.0,514836.442509,10000.0,389941.95227,74000.0,60000.0,527164.324788,112000.0,219000.0,30000.0,173000.0,320879.008981,113200.0,1.9044e6,50000.0,55000.0,133000.0,50000.0,2.8812e6,50000.0,398013.726668,70000.0,…,669536.423127,379989.871164,190000.0,207200.0,247951.691924,74000.0,258405.847912,287898.394699,50000.0,382935.697121,60000.0,207200.0,114500.0,158000.0,51000.0,472759.999343,459437.170532,112000.0,160000.0,32000.0,51000.0,264761.2505,90000.0,95630.0,1000.0,126000.0,71000.0,40000.0,200000.0,353272.318943,133000.0,7500.0,95000.0,90000.0,62000.0,33000.0,92000.0


# Distributions
## Generic Power-Law

In [20]:
# instantiate the class
# NOTE(review): the arguments look like (alpha, x_min). The Pareto example later in this
# notebook uses 1.2726 with the same 230000 and produces matching pdf/cdf/ccdf outputs
# (to floating-point precision), so the two classes presumably parameterize the exponent
# differently — confirm against the library docs.
pl_class = powerlawrs.dist.powerlaw.Powerlaw(2.2726, 230000)

In [21]:
# pdf: density evaluated at x = 500000
pl_class.pdf(500000)

9.47430869971139e-07

In [22]:
# cdf: cumulative distribution function evaluated at x = 500000
pl_class.cdf(500000)

0.627757791147596

In [23]:
# ccdf: complementary cdf at x = 500000 (the output equals 1 - cdf from the previous cell)
pl_class.ccdf(500000)

0.372242208852404

In [23]:
# rv: turn a uniform draw into a random variate of this distribution
# generate random U(0,1)
# NOTE: no seed is set in the visible cells, so the value below changes on every run.
u = np.random.rand()
pl_class.rv(u)

454936.6539476187

In [26]:
# Log-likelihood of the first 10 data points (the output has one value per observation)
pl_class.loglikelihood(data[:10])

[-14.167286692248991,
 -11.907555004554373,
 -11.670580405021845,
 -8.447163936291041,
 -6.18495414433208,
 -13.122953520503547,
 -9.52761182944572,
 -6.437725522683576,
 -13.680018818629309,
 -9.232906008194748]

## Pareto Type I

In [27]:
# instantiate the class
# NOTE(review): the arguments look like (alpha, x_min), using the rounded fitted values — confirm.
pareto_class = powerlawrs.dist.pareto.Pareto(1.2726, 230000)

In [28]:
# pdf: density evaluated at x = 500000
pareto_class.pdf(500000)

9.474308699711417e-07

In [29]:
# cdf: cumulative distribution function evaluated at x = 500000
pareto_class.cdf(500000)

0.6277577911475959

In [30]:
# ccdf: complementary cdf at x = 500000 (the output equals 1 - cdf from the previous cell)
pareto_class.ccdf(500000)

0.3722422088524041

In [31]:
# rv: turn a uniform draw into a random variate of this distribution
# generate random U(0,1)
# NOTE: no seed is set in the visible cells, so the value below changes on every run.
u = np.random.rand()
pareto_class.rv(u)

231387.9586762427

In [39]:
# Log-likelihood of the first 10 data points (one value per observation)
# Note the -inf wherever x < pareto_class.x_min
pareto_class.loglikelihood(data[:10])

[-14.167286692248988,
 -inf,
 -inf,
 -inf,
 -inf,
 -13.122953520503545,
 -inf,
 -inf,
 -13.680018818629307,
 -inf]

In [40]:
# The first 10 observations, for comparison with the log-likelihoods above:
# the entries below 230000 are the ones that produced -inf.
data[:10]

column_1
i64
570000
210882
190000
46000
17000
360000
74000
19000
460000
65000


## Exponential

In [41]:
# instantiate the class
# The outputs in the following cells are consistent with 1.5 being the rate parameter
# (e.g. pdf(2) = 1.5 * exp(-3)).
expo_class = powerlawrs.dist.exponential.Exponential(1.5)

In [42]:
# pdf: density evaluated at x = 2
expo_class.pdf(2)

0.07468060255179593

In [43]:
# cdf: cumulative distribution function evaluated at x = 2
expo_class.cdf(2)

0.950212931632136

In [44]:
# ccdf: complementary cdf at x = 2 (the output equals 1 - cdf from the previous cell)
expo_class.ccdf(2)

0.04978706836786395

In [45]:
# rv: turn a uniform draw into a random variate of this distribution
# generate random U(0,1)
# NOTE: no seed is set in the visible cells, so the value below changes on every run.
u = np.random.rand()
expo_class.rv(u)

1.8669079353153712

In [51]:
# Log-likelihood of 10 random variates:
# draw 10 uniforms, map each through rv(), then score the resulting sample.
X = [expo_class.rv(np.random.rand()) for _ in range(10)]
expo_class.loglikelihood(X)

[0.26167033599080436,
 -1.4083147212477003,
 -0.0727862507877217,
 0.34638240517676494,
 -1.1097103090018494,
 0.3534068016885221,
 0.1911279110963032,
 -0.3664573406051153,
 -4.2959656711159795,
 -0.06231217448637833]