In [1]:
import os
import re
import pickle
import pandas as pd
from collections import defaultdict
import tabulate


In [2]:
!pip install tabulate



In [25]:
class ResultsProcessor:
    """Load pickled hypothesis-test results for one symbol/variable and summarise them.

    Result files are looked up in ``folder_path``; a file is considered relevant
    when its name contains both ``symbol`` and ``variable``.  Each relevant file
    is expected to carry ``shift_<int>`` and ``wind_<int>`` tokens in its name
    and to hold a mapping whose values look like::

        {"med_on_test_data": ..., "test_result": {"alpha": ..., "pvalue": ...,
                                                  "test_stat": ..., "h0_rejected": ...}}

    Parameters
    ----------
    folder_path : str
        Directory that is scanned by :meth:`process_files`.
    symbol : str
        Substring that must appear in a file name for it to be processed.
    variable : str
        Second substring that must appear in a file name for it to be processed.
    """

    def __init__(self, folder_path, symbol, variable):
        self.folder_path = folder_path
        self.symbol = symbol
        self.variable = variable

    @staticmethod
    def _load_pickle(file_path):
        """Unpickle and return the object stored at ``file_path``."""
        # NOTE(security): pickle.load can execute arbitrary code; only use it on
        # result files produced by trusted code.
        with open(file_path, "rb") as file:
            return pickle.load(file)

    def _read_pickle_file(self, symbol, bar_choice, variable, results_directory):
        """Load ``<symbol>_<bar_choice>_<variable>_results`` from ``results_directory``.

        The ``.pickle`` extension is tried first, then ``.pkl``.

        Raises
        ------
        FileNotFoundError
            If neither file exists in ``results_directory``.
        """
        new_filename = f"{symbol}_{bar_choice}_{variable}_results.pickle"
        original_filename = f"{symbol}_{bar_choice}_{variable}_results.pkl"

        for candidate in (new_filename, original_filename):
            candidate_path = os.path.join(results_directory, candidate)
            if os.path.exists(candidate_path):
                return ResultsProcessor._load_pickle(candidate_path)

        raise FileNotFoundError(f"Neither '{new_filename}' nor '{original_filename}' exist in the directory '{results_directory}'.")

    def _get_shift_and_window(self, file_name):
        """Extract the integers following the ``shift`` and ``wind`` tokens of ``file_name``.

        Tokens are ``_``-separated; the number may be followed by another ``_``,
        by a ``.`` (file extension) or by the end of the name.

        Raises
        ------
        ValueError
            If either token (with an integer after it) is missing.
        """
        values = []
        for token in ("shift", "wind"):
            # The look-ahead accepts "..._wind_2.pkl" as well as "..._wind_2_results.pkl".
            match = re.search(rf"(?:^|_){token}_([-+]?\d+)(?=[_.]|$)", file_name)
            if match is None:
                raise ValueError(f"Could not find '{token}_<int>' in file name '{file_name}'.")
            values.append(int(match.group(1)))
        shift, window = values
        return shift, window

    def _process_defaultdict(self, default_dict):
        """Flatten a results mapping into a DataFrame with one row per key.

        Columns: ``Key``, ``Med_on_test_data``, ``Alpha``, ``P-value``,
        ``Test_stat``, ``H0_rejected``.
        """
        columns = [
            "Key",
            "Med_on_test_data",
            "Alpha",
            "P-value",
            "Test_stat",
            "H0_rejected",
        ]
        data = [
            [
                key,
                value["med_on_test_data"],
                value["test_result"]["alpha"],
                value["test_result"]["pvalue"],
                value["test_result"]["test_stat"],
                value["test_result"]["h0_rejected"],
            ]
            for key, value in default_dict.items()
        ]
        return pd.DataFrame(data, columns=columns)

    def process_files(self):
        """Load every matching file in ``folder_path`` into a DataFrame.

        Returns
        -------
        dict
            ``{(shift, window): DataFrame}``, ordered by ``(shift, window)`` so the
            result does not depend on the directory listing order.

        Raises
        ------
        ValueError
            If a matching file name lacks the ``shift``/``wind`` tokens.
        """
        dfs = {}
        for file_name in os.listdir(self.folder_path):
            if self.symbol not in file_name or self.variable not in file_name:
                continue
            file_path = os.path.join(self.folder_path, file_name)
            if not os.path.isfile(file_path):
                continue  # sub-directories cannot be unpickled
            shift, window = self._get_shift_and_window(file_name)
            # Load by path: _read_pickle_file builds its own file name from
            # symbol/bar_choice/variable and cannot be given a ready-made path.
            default_dict = ResultsProcessor._load_pickle(file_path)
            dfs[(shift, window)] = self._process_defaultdict(default_dict)
        return dict(sorted(dfs.items(), key=lambda item: item[0]))

    def filter_and_print_results(self, dfs):
        """Print the NaN-free rows of every frame and the overall non-rejection ratio.

        The ratio is the share of NaN-free rows (over all frames) whose
        ``H0_rejected`` is False.

        Returns
        -------
        dict
            The ``dfs`` argument itself, unchanged.  The NaN filtering is used
            only for printing and counting; the returned frames are NOT filtered.
        """
        h0_rejected_false_count = 0
        total_count = 0

        for (shift, window), df in dfs.items():
            df_filtered = df.dropna()
            h0_rejected_false_count += (df_filtered["H0_rejected"] == False).sum()
            total_count += len(df_filtered)

            print(f"Shift: {shift}, Window: {window}")
            print(df_filtered)
            print("\n")

        if total_count > 0:
            h0_rejected_false_ratio = h0_rejected_false_count / total_count
            print(f"H0_rejected as False ratio: {h0_rejected_false_ratio:.2%}")
        else:
            print("No numerical results found.")
        return dfs

    def summary_statistics(self, dfs):
        """Median and standard deviation of ``Test_stat`` and ``Med_on_test_data`` per frame.

        Rows containing NaN are dropped before the statistics are computed.

        Returns
        -------
        pandas.DataFrame
            One row per ``(shift, window)`` with columns ``Shift``, ``Window``,
            ``Med_Test_stat``, ``Std_Test_stat``, ``Med_Med_on_test_data``,
            ``Std_Med_on_test_data``.
        """
        summary_columns = ["Shift", "Window", "Med_Test_stat", "Std_Test_stat", "Med_Med_on_test_data", "Std_Med_on_test_data"]
        summary_data = []

        for (shift, window), df in dfs.items():
            df_filtered = df.dropna()
            test_stat = df_filtered["Test_stat"]
            med_on_test = df_filtered["Med_on_test_data"]
            summary_data.append(
                [shift, window, test_stat.median(), test_stat.std(), med_on_test.median(), med_on_test.std()]
            )

        return pd.DataFrame(summary_data, columns=summary_columns)

    @staticmethod
    def extract_shifts_and_windows(dict_keys):
        """Split an iterable of ``(shift, window)`` pairs into two parallel lists.

        Declared as a static method so it can be called on the class as well as
        on an instance.
        """
        shifts = []
        windows = []

        for shift, window in dict_keys:
            shifts.append(shift)
            windows.append(window)

        return shifts, windows

    def rank_h0_rejections(self, dfs):
        """Rank ``(shift, window)`` pairs by the proportion of rows with H0 rejected.

        Returns
        -------
        pandas.DataFrame
            Columns ``Shift``, ``Window``, ``Rejection_Proportion``, sorted in
            descending order of proportion.  An empty frame yields NaN (sorted
            last) instead of a division by zero.
        """
        h0_rejections_columns = ["Shift", "Window", "Rejection_Proportion"]
        h0_rejections_data = []

        for (shift, window), df in dfs.items():
            total_rows = len(df)
            if total_rows:
                rejection_proportion = df["H0_rejected"].sum() / total_rows
            else:
                rejection_proportion = float("nan")
            h0_rejections_data.append([shift, window, rejection_proportion])

        h0_rejections_df = pd.DataFrame(h0_rejections_data, columns=h0_rejections_columns)
        return h0_rejections_df.sort_values("Rejection_Proportion", ascending=False).reset_index(drop=True)

    def create_summary_table(self, dfs):
        """Count H0 rejections and total rows per ``(shift, window)``.

        Returns
        -------
        pandas.DataFrame
            Columns ``Shift``, ``Window``, ``H0_rejected_count``, ``Total_rows``.
        """
        summary_columns = ["Shift", "Window", "H0_rejected_count", "Total_rows"]
        summary_data = [
            (shift, window, df["H0_rejected"].sum(), len(df))
            for (shift, window), df in dfs.items()
        ]
        return pd.DataFrame(summary_data, columns=summary_columns)
    








In [4]:
# Where the result pickles live and which symbol/variable to analyse.
folder_path = "/media/ak/T71/August11th2022Experiments/ExperimentOne/LinearMMDOutputFiles"
symbol, variable = "KE1", "alpha"

# Build one DataFrame per (shift, window) pair found in the folder.
processor = ResultsProcessor(folder_path, symbol, variable)
dfs = processor.process_files()

# Dump every DataFrame, preceded by the pair it belongs to.
for shift, window in dfs:
    df = dfs[(shift, window)]
    print(f"Shift: {shift}, Window: {window}")
    print(df)
    print("\n")

Shift: 1, Window: 1
     Key  Med_on_test_data  Alpha  P-value  Test_stat  H0_rejected
0      0               NaN   0.01      NaN        NaN        False
1      1               NaN   0.01      NaN        NaN        False
2      2               NaN   0.01      NaN        NaN        False
3      3               NaN   0.01      NaN        NaN        False
4      4               NaN   0.01      NaN        NaN        False
..   ...               ...    ...      ...        ...          ...
119  119               NaN   0.01      NaN        NaN        False
120  120               NaN   0.01      NaN        NaN        False
121  121               NaN   0.01      NaN        NaN        False
122  122               NaN   0.01      NaN        NaN        False
123  123               NaN   0.01      NaN        NaN        False

[124 rows x 6 columns]


Shift: 1, Window: 2
     Key  Med_on_test_data  Alpha  P-value  Test_stat  H0_rejected
0      0          1.751910   0.01      NaN        0.0        Fa

In [5]:
 # Print the NaN-free rows of every (shift, window) frame plus the overall non-rejection ratio.
 # The call's return value (the dfs dict it was given) is also echoed as the cell output.
 processor.filter_and_print_results(dfs)

Shift: 1, Window: 1
    Key  Med_on_test_data  Alpha   P-value  Test_stat  H0_rejected
18   18          1.217856   0.01  0.000038   0.588348         True
19   19          1.068112   0.01  0.000074   0.326038         True
20   20          1.056599   0.01  0.000050   0.593986         True
21   21          0.970018   0.01  0.000058   0.609645         True
37   37          1.537628   0.01  0.000065   0.724482         True
87   87          1.612587   0.01  0.000039   0.884001         True


Shift: 1, Window: 2
Empty DataFrame
Columns: [Key, Med_on_test_data, Alpha, P-value, Test_stat, H0_rejected]
Index: []


Shift: 1, Window: 3
    Key  Med_on_test_data  Alpha   P-value  Test_stat  H0_rejected
17   17          1.217856   0.01  0.000038   0.588348         True
18   18          1.068112   0.01  0.000074   0.326038         True
19   19          1.056599   0.01  0.000050   0.593986         True
20   20          0.970018   0.01  0.000058   0.609645         True
36   36          1.537628   0.01 

{(1,
  1):      Key  Med_on_test_data  Alpha  P-value  Test_stat  H0_rejected
 0      0               NaN   0.01      NaN        NaN        False
 1      1               NaN   0.01      NaN        NaN        False
 2      2               NaN   0.01      NaN        NaN        False
 3      3               NaN   0.01      NaN        NaN        False
 4      4               NaN   0.01      NaN        NaN        False
 ..   ...               ...    ...      ...        ...          ...
 119  119               NaN   0.01      NaN        NaN        False
 120  120               NaN   0.01      NaN        NaN        False
 121  121               NaN   0.01      NaN        NaN        False
 122  122               NaN   0.01      NaN        NaN        False
 123  123               NaN   0.01      NaN        NaN        False
 
 [124 rows x 6 columns],
 (1,
  2):      Key  Med_on_test_data  Alpha  P-value  Test_stat  H0_rejected
 0      0          1.751910   0.01      NaN        0.0        False
 

In [6]:
# NOTE: filter_and_print_results returns the dict it was given, so `filtered_dfs`
# still holds the unfiltered frames; summary_statistics applies dropna() itself.
filtered_dfs = processor.filter_and_print_results(dfs)
# Median / standard deviation of Test_stat and Med_on_test_data per (shift, window).
summary_df = processor.summary_statistics(filtered_dfs)
print(summary_df)

Shift: 1, Window: 1
    Key  Med_on_test_data  Alpha   P-value  Test_stat  H0_rejected
18   18          1.217856   0.01  0.000038   0.588348         True
19   19          1.068112   0.01  0.000074   0.326038         True
20   20          1.056599   0.01  0.000050   0.593986         True
21   21          0.970018   0.01  0.000058   0.609645         True
37   37          1.537628   0.01  0.000065   0.724482         True
87   87          1.612587   0.01  0.000039   0.884001         True


Shift: 1, Window: 2
Empty DataFrame
Columns: [Key, Med_on_test_data, Alpha, P-value, Test_stat, H0_rejected]
Index: []


Shift: 1, Window: 3
    Key  Med_on_test_data  Alpha   P-value  Test_stat  H0_rejected
17   17          1.217856   0.01  0.000038   0.588348         True
18   18          1.068112   0.01  0.000074   0.326038         True
19   19          1.056599   0.01  0.000050   0.593986         True
20   20          0.970018   0.01  0.000058   0.609645         True
36   36          1.537628   0.01 

In [7]:
# Re-prints the per-pair tables; the returned dict is the same object as `dfs` (rows with NaN included).
filtered_dfs = processor.filter_and_print_results(dfs)
# Rejection proportion per (shift, window), sorted from highest to lowest.
h0_rejections_df = processor.rank_h0_rejections(filtered_dfs)
print(h0_rejections_df)

Shift: 1, Window: 1
    Key  Med_on_test_data  Alpha   P-value  Test_stat  H0_rejected
18   18          1.217856   0.01  0.000038   0.588348         True
19   19          1.068112   0.01  0.000074   0.326038         True
20   20          1.056599   0.01  0.000050   0.593986         True
21   21          0.970018   0.01  0.000058   0.609645         True
37   37          1.537628   0.01  0.000065   0.724482         True
87   87          1.612587   0.01  0.000039   0.884001         True


Shift: 1, Window: 2
Empty DataFrame
Columns: [Key, Med_on_test_data, Alpha, P-value, Test_stat, H0_rejected]
Index: []


Shift: 1, Window: 3
    Key  Med_on_test_data  Alpha   P-value  Test_stat  H0_rejected
17   17          1.217856   0.01  0.000038   0.588348         True
18   18          1.068112   0.01  0.000074   0.326038         True
19   19          1.056599   0.01  0.000050   0.593986         True
20   20          0.970018   0.01  0.000058   0.609645         True
36   36          1.537628   0.01 

In [8]:
# Number of H0 rejections and total row count for each (shift, window) pair.
summary_table = processor.create_summary_table(filtered_dfs)
print(summary_table)

    Shift  Window  H0_rejected_count  Total_rows
0       1       1                  6         124
1       1       2                  0         123
2       1       3                  6         122
3       1       4                  4         121
4       1       5                  5         120
5       1       6                 10         119
6       1       7                  5         118
7       1       8                  5         117
8       1       9                  9         116
9       1      10                  6         115
10      1      11                  6         114
11      1      12                  3         113
12      1      13                  5         112
13      1      14                  8         111
14      1      15                  7         110
15      1      16                  5         109
16      1      17                  4         108
17      1      18                  7         107
18      1      19                  5         106
19      1      20   

In [9]:
def summary_statistics_short(dfs):
    """Build a compact per-(shift, window) summary table.

    Parameters
    ----------
    dfs : dict
        ``{(shift, window): DataFrame}`` where each frame has the columns
        ``H0_rejected``, ``Test_stat`` and ``P-value``.

    Returns
    -------
    pandas.DataFrame
        One row per pair with the columns ``Shift``, ``Window``,
        ``H0_rejected_percentage`` (string such as ``"4.84%"``),
        ``Median_test_stat`` and ``Median_p_value`` (both rounded to 2 decimals).
        An empty ``dfs`` gives an empty frame that still has these columns; an
        empty input frame gives NaN statistics (``"nan%"``) instead of a
        division by zero.
    """
    columns = ["Shift", "Window", "H0_rejected_percentage", "Median_test_stat", "Median_p_value"]
    summary_data = []

    for (shift, window), df in dfs.items():
        total_rows = len(df)
        if total_rows:
            h0_rejected_percentage = (df["H0_rejected"].sum() / total_rows) * 100
            median_test_stat = df["Test_stat"].median()
            median_p_value = df["P-value"].median()
        else:
            # Nothing to summarise: report NaN rather than dividing by zero.
            h0_rejected_percentage = median_test_stat = median_p_value = float("nan")

        summary_data.append({
            "Shift": shift,
            "Window": window,
            "H0_rejected_percentage": h0_rejected_percentage,
            "Median_test_stat": median_test_stat,
            "Median_p_value": median_p_value
        })

    if not summary_data:
        # Keep the schema stable so callers can still select columns.
        return pd.DataFrame(columns=columns)

    summary_df = pd.DataFrame(summary_data, columns=columns)
    # Format the H0_rejected_percentage column as a percentage string.
    summary_df["H0_rejected_percentage"] = summary_df["H0_rejected_percentage"].apply(lambda x: f"{x:.2f}%")
    summary_df["Median_test_stat"] = summary_df["Median_test_stat"].round(2)
    summary_df["Median_p_value"] = summary_df["Median_p_value"].round(2)
    return summary_df

In [10]:
# Compact summary (rejection percentage, median test statistic, median p-value) per (shift, window).
# `filtered_dfs` still contains rows with NaN, so a frame with no numeric values yields a NaN median.
short_df = summary_statistics_short(filtered_dfs)

  return np.nanmean(a, axis, out=out, keepdims=keepdims)


In [11]:
from tabulate import tabulate
# Render `short_df` as a booktabs-style LaTeX tabular, using the column names as headers
# and omitting the index. H0_rejected_percentage is already a formatted string in short_df.
latex_table = tabulate(short_df, tablefmt='latex_booktabs', headers='keys', showindex=False, floatfmt=".2f")

print(latex_table)

\begin{tabular}{rrlrr}
\toprule
   Shift &   Window & H0\_rejected\_percentage   &   Median\_test\_stat &   Median\_p\_value \\
\midrule
       1 &        1 & 4.84\%                    &               0.60 &             0.00 \\
       1 &        2 & 0.00\%                    &               0.00 &           nan    \\
       1 &        3 & 4.92\%                    &               0.60 &             0.00 \\
       1 &        4 & 3.31\%                    &               0.00 &             0.00 \\
       1 &        5 & 4.17\%                    &               0.61 &             0.00 \\
       1 &        6 & 8.40\%                    &               0.53 &             0.00 \\
       1 &        7 & 4.24\%                    &               0.75 &             0.00 \\
       1 &        8 & 4.27\%                    &               0.77 &             0.00 \\
       1 &        9 & 7.76\%                    &               0.41 &             0.00 \\
       1 &       10 & 5.22\%                

In [12]:
# Wrap the tabular in a centred table environment and put a heavier
# \midrule[1.5pt] directly after every \toprule.
modified_latex_table = "".join((
    "\\begin{table}\n",
    "\\centering\n",
    latex_table.replace('\\toprule', '\\toprule\n\\midrule[1.5pt]'),
    "\n\\end{table}",
))
print(modified_latex_table)

\begin{table}
\centering
\begin{tabular}{rrlrr}
\toprule
\midrule[1.5pt]
   Shift &   Window & H0\_rejected\_percentage   &   Median\_test\_stat &   Median\_p\_value \\
\midrule
       1 &        1 & 4.84\%                    &               0.60 &             0.00 \\
       1 &        2 & 0.00\%                    &               0.00 &           nan    \\
       1 &        3 & 4.92\%                    &               0.60 &             0.00 \\
       1 &        4 & 3.31\%                    &               0.00 &             0.00 \\
       1 &        5 & 4.17\%                    &               0.61 &             0.00 \\
       1 &        6 & 8.40\%                    &               0.53 &             0.00 \\
       1 &        7 & 4.24\%                    &               0.75 &             0.00 \\
       1 &        8 & 4.27\%                    &               0.77 &             0.00 \\
       1 &        9 & 7.76\%                    &               0.41 &             0.00 \\
   

In [15]:
def save_summary_table_to_unique_directory(summary_df, unique_dir, filename="summary_table.csv"):
    """Write ``summary_df`` as a CSV file inside ``unique_dir``.

    Parameters
    ----------
    summary_df : pandas.DataFrame
        Table to save; the index is not written.
    unique_dir : str or path-like
        Target directory. It is created, including missing parents, if it does
        not exist yet.
    filename : str, optional
        Name of the CSV file inside ``unique_dir``.
    """
    # Create the unique directory if it doesn't exist
    os.makedirs(unique_dir, exist_ok=True)

    # Save the summary DataFrame as a CSV file in the unique directory
    summary_df.to_csv(os.path.join(unique_dir, filename), index=False)

['DU1',
 'FB1',
 'FV1',
 'G_1',
 'JB1',
 'KE1',
 'RX1',
 'TU1',
 'TY1',
 'US1',
 'XM1',
 'YM1']

In [18]:
# Path of a single results pickle inside `folder_path`, for manual inspection.
file_name = 'FV1_volume_tau_results.pickle'
file_loc = os.path.join(folder_path, file_name)

In [20]:
# Load the pickled results object; unpickling can run arbitrary code, so only open trusted files.
fvdefdict = pd.read_pickle(file_loc)

In [22]:
# Inspect the keys of the loaded object (presumably (shift, window) tuples, given the recorded output; verify).
fvdefdict.keys()

dict_keys([(1, 1), (1, 2), (1, 3), (1, 4), (1, 5), (1, 6), (1, 7), (1, 8), (1, 9), (1, 10), (2, 3), (2, 4), (2, 5), (2, 6), (2, 7), (2, 8), (2, 9), (2, 10), (3, 5), (3, 6), (3, 7), (3, 8), (3, 9), (3, 10), (4, 7), (4, 8), (4, 9), (4, 10), (5, 9), (5, 10)])