# Import Libraries

In [1]:
import os

import numpy as np
import pandas as pd

# Some simple functions to generate synthetic or dummy data

In [2]:
def produce_synthetic_game_log(T, L, q_max):
    """Generate one synthetic game log filled with uniformly random integers.

    Args:
        T: Number of time steps; the log has one row per step ``0 .. T-1``.
        L: Number of levels; the log has one column ``q_<l>`` per level
            ``0 .. L-1``.
        q_max: Exclusive upper bound of the sampled values, so every entry
            lies in ``{0, ..., q_max - 1}``.

    Returns:
        A ``pandas.DataFrame`` whose first column ``t`` holds the time step,
        followed by the level columns ``q_0 .. q_<L-1>``.
    """
    game_log = pd.DataFrame({'t': range(T)})
    for level in range(L):
        # One vectorised draw per level instead of T scalar randint calls.
        game_log['q_' + str(level)] = np.random.randint(q_max, size=T)
    return game_log

def produce_database(number_of_games):
    """Stack ``number_of_games`` independent synthetic game logs.

    Every log is generated with the module-level ``T``, ``L`` and ``q_max``.
    The logs are concatenated without resetting the index, so the row labels
    of each game are kept as they were in that game's own log.
    """
    game_logs = []
    for _ in range(number_of_games):
        game_logs.append(produce_synthetic_game_log(T, L, q_max))
    return pd.concat(game_logs)

## Parameters of the synthetic log we want to produce

In [None]:
# Synthetic-log dimensions: T time steps, L levels, values drawn below q_max.
T, L, q_max = 10, 5, 5

# Simulate 10000 independent games and stack their logs into one table.
database = produce_database(number_of_games=10000)
database

## Now, we fix a level and a time step, and clean the data accordingly

In [None]:
# Level under study. The level just below it is used as well, so the column
# q_<current_level - 1> has to exist (i.e. current_level >= 1).
current_level = 3
curr_level_col_name = 'q_{}'.format(current_level)
prev_level_col_name = 'q_{}'.format(current_level - 1)
cols = set('q_{}'.format(level) for level in range(L))
drop_cols = cols.difference([curr_level_col_name, prev_level_col_name])

# Time step under study, together with the step right before it.
current_time = 1
time_steps = set(range(T))
drop_time_steps = time_steps.difference([current_time, current_time - 1])

# Keep only the two time steps and the two level columns needed below.
# Rows are selected by index label, which is the time step inside each game.
rows_of_interest = database.drop(index=drop_time_steps)
y = rows_of_interest.drop(columns=drop_cols)
y

## Generate 3 columns: Q_{t-1}^{l-1}, Q_{t-1}^{l}, and Q_{t}^{l}

In [None]:
# Column names for the three mesh points we care about:
#   Q_{t-1}^{l-1} -> previous level, previous time step
#   Q_{t-1}^{l}   -> current level,  previous time step
#   Q_{t}^{l}     -> current level,  current time step
prev_time_suffix = '_t_' + str(current_time - 1)
curr_time_suffix = '_t_' + str(current_time)
prev_level_prev_time_col_name = prev_level_col_name + prev_time_suffix
curr_level_prev_time_col_name = curr_level_col_name + prev_time_suffix
curr_level_curr_time_col_name = curr_level_col_name + curr_time_suffix

# Rows of the previous time step, relabelled 0..n-1 and renamed per the mesh.
prev_step_rows = y.drop(current_time).reset_index(drop=True)
prev_step_rows = prev_step_rows.rename(columns={
    prev_level_col_name: prev_level_prev_time_col_name,
    curr_level_col_name: curr_level_prev_time_col_name,
})

# Rows of the current time step; only the current-level column is renamed.
curr_step_rows = y.drop(current_time - 1).reset_index(drop=True)
curr_step_rows = curr_step_rows.rename(columns={
    curr_level_col_name: curr_level_curr_time_col_name,
})

# Horizontal merge: after the index reset the two frames pair up row by row
# (presumably the same game, as long as every game logs both time steps).
w = pd.concat([prev_step_rows, curr_step_rows], axis=1)
# Remove both copies of 't' and the un-renamed previous-level column that
# came along with the current-step rows.
w = w.drop(['t', prev_level_col_name], axis=1)
w

# For each possible value q \in {0, ..., q_max - 1} of Q_{t}^{l}, count its occurrences

In [None]:
# Build one indicator column per value q in {0, ..., q_max - 1}: the column
# holds 1 on rows where Q_{t}^{l} equals q and 0 elsewhere. Summing these
# columns later yields the occurrence counts.
observed_col = curr_level_col_name + '_t_' + str(current_time)
observed_values = w[observed_col]
for q in range(q_max):
    w[observed_col + '=' + str(q)] = np.where(observed_values == q, 1, 0)

# The raw value column is no longer needed once the indicators exist.
w = w.drop(columns=observed_col)
w

## Final step: group by the two conditioning variables, Q_{t-1}^{l-1} and Q_{t-1}^{l}, and sum the indicator columns

In [None]:
# Histogram of Q_{t}^{l} for every (Q_{t-1}^{l-1}, Q_{t-1}^{l}) pair:
# summing the 0/1 indicator columns inside each group gives the counts.
conditioning_cols = [prev_level_prev_time_col_name, curr_level_prev_time_col_name]
grouped = w.groupby(conditioning_cols)
z = grouped.sum()
z

# Normalization into a probability distribution

In [None]:
# Divide every row by its own total so the counts of each
# (Q_{t-1}^{l-1}, Q_{t-1}^{l}) group sum to one.
row_totals = z.sum(axis=1)
final = z.div(row_totals, axis=0)
final

# Save to .csv file. This saves a SINGLE conditional probability table...

In [None]:
# Write the table to data/T_<T>_L_<L>_<column name>.csv. The output directory
# is created first because to_csv does not create missing parent directories.
output_dir = 'data'
os.makedirs(output_dir, exist_ok=True)
csv_name = 'T_' + str(T) + '_L_' + str(L) + '_' + curr_level_curr_time_col_name + '.csv'
final.to_csv(os.path.join(output_dir, csv_name))