In [76]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# OBIPath = '/media/ak/Data/InterestRateFuturesData/ReconstructedLOB/OrderBookImbalance/'
# OBIMFDFAResults = '/media/ak/Data/InterestRateFuturesData/ReconstructedLOB/OrderBookImbalance/OBIMFDFAResults'

In [3]:
OBIPath = '/media/ak/LaCie/InterestRateFuturesData/ReconstructedLOB/OrderBookImbalance'

OBIMFDFAResults = os.path.join(OBIPath, 'OBIMFDFAResults')

In [4]:
resultsPath = os.path.join(OBIMFDFAResults, 'TwoSampleDataFrames')

In [5]:
def process_dict(input_dict):
    """
    Process a given input dictionary to transform and restructure its data.

    This function iterates through each key-value pair in the input dictionary. 
    It processes the dictionary in the following ways:
    
    1. If the value is a list of dictionaries, each sub-dictionary is flattened. 
       For each key-value pair in the sub-dictionary, a new key is created in the 
       format "original_key_index_sub_key", and the sub-value is stored as a list.
       If the sub-value is a list or a numpy array, the median of the array is calculated 
       and stored in a list. If the median calculation raises a TypeError, the original 
       sub-value is stored as is.

    2. If the key is one of ['list_gwidth', 'list_kernels', 'list_permuted_mmd2'], 
       the value is stored directly without transformation.

    3. For other values, if they are lists or numpy arrays, the function tries to calculate 
       the median and store it in a list. If the median calculation raises a TypeError, 
       or if the array is empty, the original value is stored as is.

    4. For all other types of values, they are stored in a list.

    Parameters:
    - input_dict (dict): The input dictionary to be processed.

    Returns:
    - dict: A new dictionary with processed key-value pairs.
    """
    processed_dict = {}
    for key, value in input_dict.items():
        # Check if the value is a list of dictionaries
        if isinstance(value, list) and all(isinstance(item, dict) for item in value):
            for i, sub_dict in enumerate(value):
                for sub_key, sub_value in sub_dict.items():
                    new_key = f"{key}_{i}_{sub_key}"
                    # Process the sub_value similar to how other values are processed
                    if isinstance(sub_value, list) or (type(sub_value).__module__ == 'numpy' and isinstance(sub_value, np.ndarray)):
                        if len(sub_value) > 0:
                            try:
                                processed_dict[new_key] = [np.median(sub_value)]
                            except TypeError:
                                processed_dict[new_key] = sub_value
                        else:
                            processed_dict[new_key] = sub_value
                    else:
                        processed_dict[new_key] = [sub_value]
        elif key in ['list_gwidth', 'list_kernels', 'list_permuted_mmd2']:
            processed_dict[key] = value
        elif isinstance(value, list) or (type(value).__module__ == 'numpy' and isinstance(value, np.ndarray)):
            if len(value) > 0:
                try:
                    processed_dict[key] = [np.median(value)]
                except TypeError:
                    processed_dict[key] = value
            else:
                processed_dict[key] = value
        else:
            processed_dict[key] = [value]

    return processed_dict


In [58]:
def proportion_second_value(final_df, column):
    total_count = final_df[column].count()
    if total_count > 0 and len(final_df[column].value_counts()) > 1:
        second_value_count = final_df[column].value_counts()[1]
        proportion = (second_value_count / total_count) * 100
    else:
        proportion = 0
    return f"{proportion:.2f}%"

# Calculate the proportion of the second value (which is 'True' in your case)
# true_proportion_XZ = proportion_second_value('XZ_test_h0_rejected')
# true_proportion_YZ = proportion_second_value('YZ_test_h0_rejected')

# # If you need to round numeric columns to two decimals
# median_values = final_df.median().round(3)

# # Store the formatted proportions in the median_values Series
# median_values['XZ_test_h0_rejected'] = true_proportion_XZ
# median_values['YZ_test_h0_rejected'] = true_proportion_YZ


In [71]:
def create_results_table(df):
    # format the columns 'pvalue_1', 'test_stat_1', 'pvalue_2', and 'test_stat_2'
    df[['pvalue_1', 'test_stat_1', 'pvalue_2', 'test_stat_2']] = df[['pvalue_1', 'test_stat_1', 'pvalue_2', 'test_stat_2']].applymap(lambda x: f'{float(x):.2e}'.replace('e-', 'x10^-') if pd.notna(x) else 'N/A')

    # replace the NaN values with 'N/A'
    df.fillna('N/A', inplace=True)

    # create the LaTeX table
    latex = df.to_latex(column_format='lcccccccc', escape=False, header=['', 'pvalue_1', 'test_stat_1', 'pvalue_2', 'test_stat_2', 'h0_rejected_1', 'Percentage', 'Median Window', 'Median Shift'], bold_rows=[0, len(df)-1], multicolumn_format='c')

    # return the LaTeX table
    return latex

In [6]:
def flatten_dict(d):
    def expand(key, value):
        if isinstance(value, dict):
            return [(str(key) + '_' + str(k), v) for k, v in flatten_dict(value).items()]
        elif isinstance(value, list) or (type(value).__module__ == 'numpy' and isinstance(value, np.ndarray)):
            if len(value) > 0:
                try:
                    return [(str(key), np.median(value))]
                except TypeError:
                    return [(str(key), value)]
            else:
                return [(str(key), value)]
        else:
            return [(str(key), value)]
    
    items = [item for k, v in d.items() for item in expand(k, v)]
    
    return dict(items)

# flattened_dict = flatten_dict(input_dict)


In [7]:
import pandas as pd

def process_and_flatten(file_path):
    """
    Process and flatten the contents of a pickle file.

    This function reads a pickle file specified by file_path, extracts
    and processes its contents (assuming a specific structure of the
    data), then flattens the dictionary structure into a single level. 
    Certain keys are removed from the final flattened dictionary as 
    part of data cleaning.

    Parameters:
    file_path (str): The path to the pickle file to be processed.

    Returns:
    dict: A flattened dictionary with specified keys removed.

    Note:
    - The function assumes the existence of 'process_dict' and 
      'flatten_dict' functions which are not defined in this snippet.
    - The structure of the pickle file and the dictionary keys are 
      assumed based on the function's implementation.
    """

    # Load the pickle file
    unPickledFile = pd.read_pickle(file_path)

    # Extract the first item's value from the dictionary
    resultsDict = unPickledFile[next(iter(unPickledFile))]

    # Process the dictionary (process_dict needs to be defined elsewhere)
    process_dict(resultsDict)

    # Flatten the dictionary (flatten_dict needs to be defined elsewhere)
    flattened_dict = flatten_dict(resultsDict)
    
    # List of keys to be removed from the flattened dictionary
    keys_to_remove = ['list_gwidth', 'list_kernels', 'best_ker']

    # Remove specified keys
    for key in keys_to_remove:
        flattened_dict.pop(key, None)
    
    return flattened_dict


In [8]:
symbolIDX = 3
symbols = os.listdir(OBIMFDFAResults)
symbolPath = os.path.join(OBIMFDFAResults, symbols[symbolIDX])
os.listdir(symbolPath)
keys = ['alpha', 'Spectrum', 'tau', 'Hlist']

In [9]:
keyIDX = 3
symbolKeyPath =  os.path.join(symbolPath, keys[keyIDX])
files = os.listdir(symbolKeyPath)
files

['FV1_list_H_shift_1_wind_1_OBI_quad_MMD_test.pkl',
 'FV1_list_H_shift_1_wind_4_OBI_quad_MMD_test.pkl',
 'FV1_list_H_shift_1_wind_5_OBI_quad_MMD_test.pkl',
 'FV1_list_H_shift_1_wind_6_OBI_quad_MMD_test.pkl',
 'FV1_list_H_shift_1_wind_8_OBI_quad_MMD_test.pkl',
 'FV1_list_H_shift_1_wind_10_OBI_quad_MMD_test.pkl',
 'FV1_list_H_shift_2_wind_1_OBI_quad_MMD_test.pkl',
 'FV1_list_H_shift_2_wind_4_OBI_quad_MMD_test.pkl',
 'FV1_list_H_shift_2_wind_5_OBI_quad_MMD_test.pkl',
 'FV1_list_H_shift_2_wind_6_OBI_quad_MMD_test.pkl',
 'FV1_list_H_shift_2_wind_8_OBI_quad_MMD_test.pkl',
 'FV1_list_H_shift_2_wind_10_OBI_quad_MMD_test.pkl',
 'FV1_list_H_shift_3_wind_1_OBI_quad_MMD_test.pkl',
 'FV1_list_H_shift_3_wind_4_OBI_quad_MMD_test.pkl',
 'FV1_list_H_shift_3_wind_5_OBI_quad_MMD_test.pkl',
 'FV1_list_H_shift_3_wind_6_OBI_quad_MMD_test.pkl',
 'FV1_list_H_shift_3_wind_8_OBI_quad_MMD_test.pkl',
 'FV1_list_H_shift_3_wind_10_OBI_quad_MMD_test.pkl']

In [10]:
# fileIdxPath 

In [11]:
# print([f for f in enumerate(resultsKeys) ])

In [59]:

# fileIdx = 1
for fileIdx, _ in enumerate(files):
    fileIdxPath = os.path.join(symbolKeyPath, files[fileIdx])
    shift_str, window_str = files[fileIdx].split("_shift_")[1].split("_OBI")[0].split("_wind_")
    # Convert the extracted strings to integers (or floats if needed)
    shift = int(shift_str)  # Use float(shift_str) if the value can be a decimal
    window = int(window_str)  # Use float(window_str) if the value can be a decimal

    #####################################################
    unPickledFile = pd.read_pickle(fileIdxPath)
    idXkeys = list(unPickledFile.keys())
    resultsDict = unPickledFile[idXkeys[0]]
    resultsKeys = list(resultsDict.keys())
    process_dict(resultsDict)
    flattened_dict = flatten_dict(resultsDict)
    # List of keys to remove
    keys_to_remove = ['list_gwidth', 'list_kernels', 'best_ker']

    # Remove the specified keys from the flattened dictionary
    for key in keys_to_remove:
        flattened_dict.pop(key, None)  # Use pop to avoid KeyError if key is not present

    # Now you can convert this flattened dictionary into a DataFrame
    df = pd.DataFrame([flattened_dict])
    df['window']= window
    df['shift']= shift
    print(df)

ModuleNotFoundError: No module named 'freqopttest'

In [None]:

# symbols = os.listdir(OBIMFDFAResults)
# for symbolIDX in range(0,13):
#     symbolPath = os.path.join(OBIMFDFAResults, symbols[symbolIDX])
#     os.listdir(symbolPath)
#     keys = ['alpha', 'Spectrum', 'tau', 'Hlist']
#     keyIDX = 3

#     symbolKeyPath =  os.path.join(symbolPath, keys[keyIDX])
#     files = os.listdir(symbolKeyPath)
#     # Using list comprehension to process each file and store the flattened dict
#     dfs = [pd.DataFrame([process_and_flatten(os.path.join(symbolKeyPath, file))])
#            .assign(**dict(zip(['window', 'shift'], map(int, file.split("_shift_")[1].split("_OBI")[0].split("_wind_")))))
#            for file in files]

#     # Concatenate all dataframes
#     final_df = pd.concat(dfs, ignore_index=True)

#     # final_df now contains all the data stacked together
#     # Assuming symbols, symbolIDX, keys, and keyIDX are already defined in your code
#     fileName = os.path.join(resultsPath,"_".join((symbols[symbolIDX], keys[keyIDX], "twoSampleResults.pkl")))

#     # Save the DataFrame to a pickle file
#     final_df.to_pickle(fileName)


In [13]:
#### Making Latex Tables 

In [14]:
resultsFiles = os.listdir(resultsPath)
# keys = ['tau', 'alpha', 'Spectrum', 'listH']
tauResultsFiles = os.path.join(resultsPath, 'tau')
alphaResultsFiles = os.path.join(resultsPath, 'alpha')


In [15]:
alphaFiles = os.listdir(alphaResultsFiles)
fileIdx = 1
filePath = os.path.join(alphaResultsFiles, alphaFiles[fileIdx])

In [32]:
list_dfs = []
for fileIdx in range(len(alphaFiles)):
    filePath = os.path.join(alphaResultsFiles, alphaFiles[fileIdx])
    list_dfs.append(pd.read_pickle(filePath))



In [44]:
final_df = pd.concat(list_dfs, axis=1).median(axis=0)

In [43]:
list_dfs[1]

Unnamed: 0,perm_mmds1,chi2_weights,sim_mmds,sig2,Kxy,mean,var,Kxx,Kyy,mean_gram,...,XZ_test_test_stat,XZ_test_h0_rejected,XZ_test_list_permuted_mmd2,YZ_test_alpha,YZ_test_pvalue,YZ_test_test_stat,YZ_test_h0_rejected,YZ_test_list_permuted_mmd2,window,shift
0,-0.005058,2.165511e-20,-0.002703,0.505663,0.615778,0.006041,1.022819e-07,0.60419,0.637628,0.006041,...,0.099276,True,-0.002658,0.05,0.008,0.043991,True,-0.002855,1,10
1,-0.005058,2.165511e-20,-0.002703,0.505663,0.615778,0.006041,1.022819e-07,0.60419,0.637628,0.006041,...,0.023075,True,-0.002253,0.05,0.0,0.126276,True,-0.002482,1,1
2,-0.005058,2.165511e-20,-0.002703,0.505663,0.615778,0.006041,1.022819e-07,0.60419,0.637628,0.006041,...,0.111461,True,-0.002667,0.05,0.0,0.185769,True,-0.00275,1,4
3,-0.005058,2.165511e-20,-0.002703,0.505663,0.615778,0.006041,1.022819e-07,0.60419,0.637628,0.006041,...,0.085999,True,-0.002462,0.05,0.0,0.194587,True,-0.002531,1,5
4,-0.005058,2.165511e-20,-0.002703,0.505663,0.615778,0.006041,1.022819e-07,0.60419,0.637628,0.006041,...,0.053124,True,-0.002589,0.05,0.0,0.172872,True,-0.002488,1,6
5,-0.005058,2.165511e-20,-0.002703,0.505663,0.615778,0.006041,1.022819e-07,0.60419,0.637628,0.006041,...,0.085088,True,-0.002565,0.05,0.011,0.042174,True,-0.003027,1,8
6,-0.003459,8.578479e-20,-0.002061,0.597476,0.487778,0.11323,0.001260073,0.456199,0.735149,0.11323,...,0.122525,True,-0.003335,0.05,0.0,0.134229,True,-0.003567,2,10
7,-0.003459,8.578479e-20,-0.002061,0.597476,0.487778,0.11323,0.001260073,0.456199,0.735149,0.11323,...,0.129429,True,-0.002975,0.05,0.0,0.186401,True,-0.003317,2,1
8,-0.003459,8.578479e-20,-0.002061,0.597476,0.487778,0.11323,0.001260073,0.456199,0.735149,0.11323,...,0.119388,True,-0.003263,0.05,0.995,-0.01897,False,-0.004269,2,4
9,-0.003459,8.578479e-20,-0.002061,0.597476,0.487778,0.11323,0.001260073,0.456199,0.735149,0.11323,...,0.091474,True,-0.003232,0.05,0.0,0.097113,True,-0.003687,2,5


In [16]:
# pd.read_pickle(filePath)

In [17]:
import pickle

# Replace 'your_pickle_file.pkl' with the path to your pickle file
pickle_file_path = filePath

# Attempt to open the pickle file
try:
    with open(pickle_file_path, 'rb') as file:
        data = pickle.load(file)
    print("Pickle file loaded successfully.")
except Exception as e:
    print(f"An error occurred: {e}")


Pickle file loaded successfully.


In [20]:

# Assuming final_df is your DataFrame
final_df= data

# Step 1: Calculate median for numeric columns
median_values = final_df.median()

# Step 2: Calculate proportion of 'True' for specified columns
def true_proportion(column):
    true_count = final_df[column].value_counts().get('True', 0)
    total_count = final_df[column].count()
    return true_count / total_count if total_count else 0

true_proportion_XZ = true_proportion('XZ_test_h0_rejected')
true_proportion_YZ = true_proportion('YZ_test_h0_rejected')

# Adding these proportions to the median_values Series for completeness
median_values['XZ_test_h0_rejected'] = true_proportion_XZ
median_values['YZ_test_h0_rejected'] = true_proportion_YZ

# median_values now contains the medians of numeric columns and the proportions of 'True' for the specified columns


In [21]:
median_values

perm_mmds1                   -4.471654e-03
chi2_weights                  8.578479e-20
sim_mmds                     -2.416498e-03
sig2                          5.974756e-01
Kxy                           4.877782e-01
mean                          5.086719e-02
var                           1.401101e-05
Kxx                           4.561986e-01
Kyy                           6.376281e-01
mean_gram                     5.086719e-02
var_gram                      1.401101e-05
med                           5.974756e-01
besti                         0.000000e+00
powers                        1.791948e+00
XZ_test_alpha                 5.000000e-02
XZ_test_pvalue                0.000000e+00
XZ_test_test_stat             1.070532e-01
XZ_test_h0_rejected           0.000000e+00
XZ_test_list_permuted_mmd2   -2.576960e-03
YZ_test_alpha                 5.000000e-02
YZ_test_pvalue                0.000000e+00
YZ_test_test_stat             1.623211e-01
YZ_test_h0_rejected           0.000000e+00
YZ_test_lis

In [None]:
fileName = "_".join((symbols[symbolIDX], keys[keyIDX], "twoSampleResults.pkl"))

In [None]:
final_df['XZ_test_h0_rejected'].value_counts()

In [50]:
df_for_latex = pd.DataFrame(median_values)

In [64]:
df_for_latex = df_for_latex.rename(columns={0:'alpha'})

In [70]:
df_for_latex.to_latex(column_format='lcccccccc', escape=False, multicolumn_format='c')

'\\begin{tabular}{lcccccccc}\n\\toprule\n & alpha \\\\\n\\midrule\nperm_mmds1 & -0.004000 \\\\\nchi2_weights & 0.000000 \\\\\nsim_mmds & -0.002000 \\\\\nsig2 & 0.597000 \\\\\nKxy & 0.488000 \\\\\nmean & 0.051000 \\\\\nvar & 0.000000 \\\\\nKxx & 0.456000 \\\\\nKyy & 0.638000 \\\\\nmean_gram & 0.051000 \\\\\nvar_gram & 0.000000 \\\\\nmed & 0.597000 \\\\\nbesti & 0.000000 \\\\\npowers & 1.792000 \\\\\nXZ_test_alpha & 0.050000 \\\\\nXZ_test_pvalue & 0.000000 \\\\\nXZ_test_test_stat & 0.107000 \\\\\nXZ_test_h0_rejected & 0.00% \\\\\nXZ_test_list_permuted_mmd2 & -0.003000 \\\\\nYZ_test_alpha & 0.050000 \\\\\nYZ_test_pvalue & 0.000000 \\\\\nYZ_test_test_stat & 0.162000 \\\\\nYZ_test_h0_rejected & 11.11% \\\\\nYZ_test_list_permuted_mmd2 & -0.003000 \\\\\nwindow & 2.000000 \\\\\nshift & 5.500000 \\\\\n\\bottomrule\n\\end{tabular}\n'