In [None]:
import numpy as np
from scipy import stats
import math
from collections import defaultdict, Counter
from tqdm import tqdm
import pandas as pd
import os  
import time
import matplotlib.pyplot as plt

In [None]:
# Notebook-wide configuration.
PAIRS = 6                # largest digit that still gets its own (a, b) pair category
MILLION = 1_000_000
BLOCKSIZE = MILLION      # number of digits per block handed to the chi-square test

In [None]:
from os import walk
mypath = '../data/'
# os.walk reports file names in arbitrary (filesystem) order; sort them so the
# digit chunks are concatenated in a reproducible order.
# NOTE(review): lexicographic order matches numeric chunk order only if the
# chunk numbers in the file names are zero-padded -- confirm against the data.
filenames = sorted(next(walk(mypath), (None, None, []))[2])

pi_cf_digits = []
for filename in tqdm(filenames):
    # np.load on an .npz archive returns an NpzFile that holds the file open;
    # the context manager closes it as soon as the array has been read.
    with np.load(os.path.join(mypath, filename)) as pi_digits:
        pi_cf_digits.extend(int(x) for x in pi_digits['arr_0'].tolist())

# Trim to a whole number of blocks ONCE, after every file has been loaded.
# Trimming inside the loop discarded the tail of each file (creating gaps in
# the digit sequence) and emptied the list whenever the data loaded so far was
# shorter than BLOCKSIZE.
usable_length = (len(pi_cf_digits) // BLOCKSIZE) * BLOCKSIZE
pi_cf_digits = pi_cf_digits[:usable_length]
len(pi_cf_digits)

In [None]:
def ChiSquare(data, blocksize=100, pairs=5):
    """Block-wise chi-square goodness-of-fit test on neighbouring digit pairs.

    ``data`` is cut into ``len(data) // blocksize`` nearly equal blocks with
    ``np.array_split``.  Inside each block every overlapping pair of adjacent
    digits ``(a, b)`` is tallied: a pair with both digits ``<= pairs`` gets its
    own category, all remaining pairs share the single category ``'else'``.
    The observed counts are compared with the expected counts derived from

        P(a, b) = log2( (ab + a + b + 2)(ab + 1) / ((ab + a + 1)(ab + b + 1)) )

    which is the Gauss-measure probability that two consecutive continued-
    fraction partial quotients equal ``a`` and ``b``.

    Parameters
    ----------
    data : sequence of int
        Digit sequence to test.
    blocksize : int, default 100
        Target number of digits per block; must be at least 2.
    pairs : int, default 5
        Largest digit value that is tracked individually; must be at least 1.

    Returns
    -------
    (p_val_arr, q_res_arr) : tuple of lists
        One p-value and one chi-square statistic per block, in block order.

    Raises
    ------
    ValueError
        If ``blocksize < 2``, ``pairs < 1`` or ``data`` is shorter than one block.

    Side effects
    ------------
    Writes ``../results/chi-square-pair-results-<len(data)>-pi-digits-<blocks>-blocks.csv``
    (the ``../results`` directory is created when missing).
    """
    if blocksize < 2:
        raise ValueError('blocksize must be at least 2 to form a digit pair')
    if pairs < 1:
        raise ValueError('pairs must be at least 1')
    number_of_blocks = int(len(data) // blocksize)
    if number_of_blocks < 1:
        raise ValueError('data must contain at least one block of blocksize digits')

    groups = np.array_split(data, number_of_blocks)

    def P(pairs):
        """Theoretical probability of every tracked pair plus the 'else' remainder."""
        log_two = math.log(2)
        p_dict = {}
        for a in range(1, pairs + 1):
            for b in range(1, pairs + 1):
                ratio = ((a*b + a + b + 2)*(a*b + 1))/((a*b + a + 1)*(a*b + b + 1))
                p_dict[(a, b)] = math.log(ratio)/log_two
        p_dict['else'] = 1 - sum(p_dict.values())
        return p_dict

    def Y(pi_digits, pairs):
        """Observed count of every overlapping adjacent pair in ``pi_digits``."""
        y_dict = defaultdict(int)
        for a, b in zip(pi_digits[:-1], pi_digits[1:]):
            if a <= pairs and b <= pairs:
                y_dict[(a, b)] += 1
            else:
                y_dict['else'] += 1
        return y_dict

    def chisquare_test(y_array, p_array, n_pairs):
        """Pearson statistic over ALL categories, including 'else'."""
        q_val = 0
        for category, prob in p_array.items():
            expected = n_pairs * prob
            q_val += (y_array.get(category, 0) - expected)**2 / expected
        return q_val

    p_array = P(pairs)
    # k categories whose probabilities sum to one leave k - 1 degrees of freedom.
    degrees_of_freedom = len(p_array) - 1
    # NOTE(review): overlapping pairs share digits and are therefore not
    # independent, so the chi-square reference distribution is only an
    # approximation here -- confirm this is acceptable for the analysis.

    p_val_arr = []
    q_res_arr = []
    for block in tqdm(groups):
        pi_digits = block.tolist()
        # A block of n digits yields n - 1 adjacent pairs; the expected counts
        # must be scaled by that number, not by the nominal blocksize.
        n_pairs = len(pi_digits) - 1

        y_array = Y(pi_digits, pairs)
        q_res = chisquare_test(y_array, p_array, n_pairs)
        # sf() is the upper tail 1 - cdf() without the cancellation error that
        # the subtraction suffers for very small p-values.
        p_val = stats.chi2.sf(q_res, degrees_of_freedom)

        p_val_arr.append(p_val)
        q_res_arr.append(q_res)

    df = pd.DataFrame(data={'p-value': p_val_arr, 'chi-square value': q_res_arr})
    os.makedirs('../results', exist_ok=True)
    filename = '../results/chi-square-pair-results-' + str(len(data)) + '-pi-digits-' + \
                str(int(number_of_blocks)) + '-blocks'
    df.to_csv(filename + '.csv')
    return p_val_arr, q_res_arr

In [None]:
# Run the block-wise pair test over the whole loaded digit sequence.
p_val_arrV1, q_res_arrV1 = ChiSquare(pi_cf_digits, blocksize=BLOCKSIZE, pairs=PAIRS)

In [None]:
# Display the per-block p-values returned by ChiSquare above.
p_val_arrV1

In [None]:
def Y(pi_digits, pairs):
    """Tally every overlapping pair of adjacent entries in ``pi_digits``.

    A pair whose two entries are both ``<= pairs`` is counted under the key
    ``(first, second)``; every other pair is counted under the key ``'else'``.

    Returns a ``defaultdict(int)`` mapping each key to its number of occurrences.
    """
    tally = defaultdict(int)
    for position in range(len(pi_digits) - 1):
        first, second = pi_digits[position], pi_digits[position + 1]
        key = (first, second) if (first <= pairs and second <= pairs) else 'else'
        tally[key] += 1
    return tally

In [None]:
# Pair counts over (at most) the first 100,000,000 digits, tracking digits 1..3 individually.
test = Y(pi_cf_digits[:100_000_000], 3)

In [None]:
# Same counts, re-inserted in ascending order of frequency.
test_test = {
    category: count
    for category, count in sorted(test.items(), key=lambda entry: entry[1])
}

In [None]:
# The keys mix (a, b) tuples with the string 'else'. matplotlib broadcasts
# ``tick_label`` through NumPy, which cannot build a regular array from such a
# ragged list, so the labels are converted to plain strings first.
names = [str(category) for category in test_test]
# Natural log of each count so rare and frequent categories fit on one axis.
values = [math.log(count) for count in test_test.values()]
plt.figure(figsize=(20,10))
plt.bar(range(len(test_test)), values, tick_label=names)
plt.xticks(rotation = 90)
plt.xlabel('digit pair (a, b)')
plt.ylabel('ln(count)')
plt.title('Adjacent digit-pair counts, ascending')
plt.show()

In [None]:
# Show the pair counts ordered from least to most frequent.
test_test