# Import Libraries

In [1]:
import os

import numpy as np
import pandas as pd

# Some simple functions to generate synthetic or dummy data

In [2]:
def produce_synthetic_game_log(T, L, q_max):
    """Build one synthetic game log as a DataFrame.

    The frame has T rows. Column 't' holds 0 .. T-1, followed by L columns
    'q_0' .. 'q_{L-1}', each filled with independent draws from
    np.random.randint(q_max), i.e. integers in [0, q_max).

    Draws are taken from NumPy's global random state, one scalar call per
    entry, level by level.
    """
    columns = {'t': list(range(T))}
    for level in range(L):
        draws = []
        for _ in range(T):
            draws.append(np.random.randint(q_max))
        columns['q_' + str(level)] = draws
    return pd.DataFrame(columns)

def produce_database(number_of_games):
    """Stack `number_of_games` independent synthetic game logs vertically.

    Every log is generated with the notebook-level T, L and q_max, so those
    names must be defined before this function is called.

    The per-game index is kept as-is (no ignore_index), so index labels
    repeat across games in the returned frame.
    """
    game_logs = []
    for _ in range(number_of_games):
        game_logs.append(produce_synthetic_game_log(T, L, q_max))
    return pd.concat(game_logs)

## Parameters of the synthetic log we want to produce

In [3]:
# Dimensions of each synthetic game log.
T = 10      # number of time steps per game (rows t = 0 .. T-1)
L = 5       # number of levels (columns q_0 .. q_{L-1})
q_max = 5   # exclusive upper bound of the draws: values lie in {0, ..., q_max - 1}

# Seed NumPy's global generator so that Restart & Run All reproduces the
# same database (and therefore the same tables below) on every run.
SEED = 0
np.random.seed(SEED)

# 10000 independent games, stacked vertically.
database = produce_database(10000)
# Show only the first rows instead of dumping the whole frame.
database.head(10)

Unnamed: 0,t,q_0,q_1,q_2,q_3,q_4
0,0,2,1,4,2,2
1,1,4,2,0,4,3
2,2,0,2,1,4,0
3,3,3,4,0,1,0
4,4,0,1,3,1,1
5,5,4,3,2,3,1
6,6,3,1,4,0,1
7,7,3,1,3,2,2
8,8,4,4,1,3,4
9,9,0,4,1,2,4


## Now, we fix a level and a time step, and clean the data accordingly

In [4]:
# Level and time step for which the table is computed.
current_level = 3
current_time = 1

# The computation also needs the previous level and the previous time step,
# so 0 is not a valid choice for either (it would produce 'q_-1' / t = -1).
assert 1 <= current_level < L, "current_level must satisfy 1 <= current_level < L"
assert 1 <= current_time < T, "current_time must satisfy 1 <= current_time < T"

# Level columns: keep only the current and the previous level.
cols = {'q_' + str(i) for i in range(0, L)}
curr_level_col_name = 'q_' + str(current_level)
prev_level_col_name = 'q_' + str(current_level - 1)
drop_cols = cols - {curr_level_col_name, prev_level_col_name}

# Time steps: keep only the current and the previous one.
time_steps = {t for t in range(0, T)}
drop_time_steps = time_steps - {current_time, current_time - 1}

# Rows are dropped by index label below; that selects the right time steps
# only when every row's index label equals its 't' value.
assert (database.index.to_numpy() == database['t'].to_numpy()).all(), \
    "database index must equal the 't' column"

# Select only the data that we need for this computation.
# sorted() turns the sets into ordered lists of labels for drop().
y = database.drop(sorted(drop_time_steps), axis=0).drop(sorted(drop_cols), axis=1)
y

Unnamed: 0,t,q_2,q_3
0,0,4,2
1,1,0,4
0,0,2,1
1,1,1,2
0,0,4,3
1,1,1,2
0,0,0,2
1,1,4,4
0,0,0,2
1,1,3,3


## Generate 3 columns: $Q_{t-1}^{l-1}$, $Q_{t-1}^{l}$, and $Q_{t}^{l}$

In [5]:
# Column names of the three quantities that make up the mesh:
#   previous level, previous time step -> Q_{t-1}^{l-1}
#   current level,  previous time step -> Q_{t-1}^{l}
#   current level,  current time step  -> Q_{t}^{l}
prev_time_suffix = '_t_' + str(current_time - 1)
curr_time_suffix = '_t_' + str(current_time)
prev_level_prev_time_col_name = prev_level_col_name + prev_time_suffix
curr_level_prev_time_col_name = curr_level_col_name + prev_time_suffix
curr_level_curr_time_col_name = curr_level_col_name + curr_time_suffix

# Rows of the previous time step (dropping the label `current_time` leaves
# only them), renumbered 0..n-1 and with both level columns renamed.
prev_step_rows = y.drop(current_time).reset_index(drop=True)
prev_step_rows = prev_step_rows.rename(columns={
    prev_level_col_name: prev_level_prev_time_col_name,
    curr_level_col_name: curr_level_prev_time_col_name,
})

# Rows of the current time step, renumbered the same way; only the
# current-level column is renamed because the other one is discarded below.
curr_step_rows = y.drop(current_time - 1).reset_index(drop=True)
curr_step_rows = curr_step_rows.rename(columns={
    curr_level_col_name: curr_level_curr_time_col_name,
})

# Place the two halves side by side (rows are matched by position), then
# discard both 't' columns and the leftover previous-level column.
side_by_side = pd.concat([prev_step_rows, curr_step_rows], axis=1)
w = side_by_side.drop(['t', prev_level_col_name], axis=1)
w

Unnamed: 0,q_2_t_0,q_3_t_0,q_3_t_1
0,4,2,4
1,2,1,2
2,4,3,2
3,0,2,4
4,0,2,3
5,0,4,0
6,0,3,4
7,0,4,1
8,2,2,3
9,2,2,1


# For each possible value $Q_{t}^{l} \in \{0, \dots, q_{\max} - 1\}$, add a 0/1 indicator column so that occurrences can be counted

In [6]:
# One-hot encode the current-step column.
# For every value q in {0, ..., q_max - 1} a column '<name>=q' is added that
# holds 1 where the row's value equals q and 0 elsewhere; the original column
# is then removed.
# Note: `w` is rebound here, so re-running this cell without re-running the
# previous one raises a KeyError (the source column is already gone).
target_col = curr_level_col_name + '_t_' + str(current_time)
indicator_cols = {
    target_col + '=' + str(value): np.where(w[target_col] == value, 1, 0)
    for value in range(q_max)
}
w = w.assign(**indicator_cols).drop(target_col, axis=1)
w

Unnamed: 0,q_2_t_0,q_3_t_0,q_3_t_1=0,q_3_t_1=1,q_3_t_1=2,q_3_t_1=3,q_3_t_1=4
0,4,2,0,0,0,0,1
1,2,1,0,0,1,0,0
2,4,3,0,0,1,0,0
3,0,2,0,0,0,0,1
4,0,2,0,0,0,1,0
5,0,4,1,0,0,0,0
6,0,3,0,0,0,0,1
7,0,4,0,1,0,0,0
8,2,2,0,0,0,1,0
9,2,2,0,1,0,0,0


## The final magic here... group by the two variables: Q_{t-1}^{l-1}, Q_{t-1}^{l} and sum up

In [7]:
# Histograms: for every observed pair of previous-step values, add up the
# indicator columns, which gives the number of rows per current-step value.
conditioning_cols = [prev_level_prev_time_col_name, curr_level_prev_time_col_name]
z = w.groupby(conditioning_cols).sum()
z

Unnamed: 0_level_0,Unnamed: 1_level_0,q_3_t_1=0,q_3_t_1=1,q_3_t_1=2,q_3_t_1=3,q_3_t_1=4
q_2_t_0,q_3_t_0,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0,83,74,70,83,80
0,1,84,85,73,86,91
0,2,77,73,81,83,73
0,3,69,79,78,66,79
0,4,87,82,78,72,85
1,0,80,72,95,72,83
1,1,77,82,71,77,73
1,2,85,75,78,103,82
1,3,80,81,90,90,91
1,4,76,79,77,70,75


# Normalization into a probability distribution

In [8]:
# Divide every row by its own total so that each row sums to 1.
row_totals = z.sum(axis=1)
final = z.divide(row_totals, axis=0)
final

Unnamed: 0_level_0,Unnamed: 1_level_0,q_3_t_1=0,q_3_t_1=1,q_3_t_1=2,q_3_t_1=3,q_3_t_1=4
q_2_t_0,q_3_t_0,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0,0.212821,0.189744,0.179487,0.212821,0.205128
0,1,0.200477,0.202864,0.174224,0.205251,0.217184
0,2,0.198966,0.18863,0.209302,0.21447,0.18863
0,3,0.185984,0.212938,0.210243,0.177898,0.212938
0,4,0.215347,0.20297,0.193069,0.178218,0.210396
1,0,0.199005,0.179104,0.236318,0.179104,0.206468
1,1,0.202632,0.215789,0.186842,0.202632,0.192105
1,2,0.200946,0.177305,0.184397,0.243499,0.193853
1,3,0.185185,0.1875,0.208333,0.208333,0.210648
1,4,0.201592,0.209549,0.204244,0.185676,0.198939


# Save to .csv file. This saves a SINGLE conditional probability table...

In [9]:
# Write the table for this (level, time step) pair to data/.
output_path = 'data/T_' + str(T) + '_L_' + str(L) + '_' + curr_level_curr_time_col_name + '.csv'
# to_csv does not create missing directories, so make sure data/ exists first.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
final.to_csv(output_path)