# Analyzing results

In [None]:
import sys,os
# Make the parent of the working directory importable so that the
# project-local `lib` package resolves when the notebook runs from a subfolder.
root_path = os.path.abspath(os.pardir)
if sys.path.count(root_path) == 0:
    sys.path.append(root_path)
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
from lib.utils import utils
import pandas as pd
import json
import numpy as np
import matplotlib.pyplot as plt

## Computing Triples

### Get a number of games

In [None]:
# Source file and how many games to read from it.
path = "../data/diplomacy-v1-27k-msgs/standard_no_press.jsonl"
num_games = 100
start_index = 0  # NOTE(review): not passed to the loader below.

# Load the games through the project helper; the flags are forwarded as given.
games_jsons = utils.load_jsonl(
    path,
    num_games=num_games,
    mmap=False,
    completed_only=True,
)

In [None]:
# Wrap the loaded game records in a DataFrame; the cells below read its
# 'phases' column.
df = pd.DataFrame(games_jsons)

In [None]:
# Series with one entry per game, each holding that game's sequence of phases.
games = df['phases']
# Cell output: the first phase of every game.
games.apply(lambda x: x[0])

In [None]:
# Stamp every phase with its position inside its game. The phase objects are
# mutated in place, so `games` (and the frame it came from) sees the new key.
for game in games:
    for phase_index, phase in enumerate(game):
        phase['phase_id'] = phase_index

### Flatten json records

In [None]:
# Flatten every phase of every game into row dicts collected in one list.
all_records = []
for game in games:
    for phase in game:
        row_generator = utils.flatten_json(phase)
        assert row_generator is not None, "utils.flatten_json returned None for a phase"
        # Copy each row so the stored records do not alias whatever object the
        # generator yields.
        all_records.extend(row.copy() for row in row_generator)

### Get the complete dataframe with an added column

In [None]:
# One row per flattened record, plus a `unique_unit_id` column initialised to
# the placeholder -1 (filled in by the temporal-binding step).
complete_df = pd.DataFrame.from_records(all_records).assign(unique_unit_id=-1)

### Get the unique games in the dataframe

In [None]:
# Distinct game ids, in order of first appearance; drives the per-game loops below.
unique_games = complete_df["game_id"].unique()

### Do temporal binding of units

In [None]:
# For each game, walk its phases in order and let utils.assign_unit_id process
# each phase frame, then patch dislodged units and keep the per-game result.
game_phase_df_list = []
for idx, game_id in enumerate(unique_games):
    print(idx, game_id, flush=True)

    # State handed to and returned by utils.assign_unit_id for every phase.
    # NOTE(review): presumably carries unit identities across phases; confirm
    # against the helper.
    s_dict = {}
    d_dict = {}
    _id = 1

    # Vectorized comparison instead of a per-element Python lambda.
    game_df = complete_df.loc[complete_df["game_id"] == game_id]

    phases_df_list = []
    for phase in game_df['phase_id'].unique():
        phase_df = game_df.loc[game_df["phase_id"] == phase]
        s_dict, d_dict, _id = utils.assign_unit_id(phase_df, s_dict, d_dict, _id)
        phases_df_list.append(phase_df)

    phases_cdf = pd.concat(phases_df_list)
    # Rows with action == -2 are handed to the dislodged-unit helper as a copy.
    dislodged_df = phases_cdf.loc[phases_cdf['action'] == -2].copy()
    utils.replace_dislodged_units(phases_cdf, dislodged_df)
    game_phase_df_list.append(phases_cdf)

In [None]:
# Stack the per-game frames into a single frame covering all games.
cdf = pd.concat(game_phase_df_list)

### Assert no invalid rows are left

In [None]:
# Sanity check: no row may still carry the 'RA' coordinator value.
assert not (cdf['coordinator'] == 'RA').any()

In [None]:
# Cell output: first rows whose unit id is 1.
cdf.loc[cdf['unique_unit_id'] == 1].head()

### Filter out adjustment and retreat seasons

In [None]:
def _is_movement_phase(phase_name):
    """Return True when the name does not start with 'W' and ends with 'M'."""
    # Ending in 'M' already excludes names ending in 'R', so the separate
    # "!= 'R'" test of the original was redundant.
    return phase_name[0] != 'W' and phase_name[-1] == 'M'


def _phase_to_num(phase_name):
    """Turn e.g. 'S1901M' into 1901.0; any non-'S' first letter gives .5."""
    return float(phase_name[1:-1] + ('.0' if phase_name[0] == 'S' else '.5'))


# Single pass over the phase names; astype(bool) keeps the mask boolean even
# when the frame is empty.
spring_fall_phases = cdf['phase_name'].apply(_is_movement_phase).astype(bool)
cdf_sf = cdf.loc[spring_fall_phases].copy()
cdf_sf['phase_num'] = cdf_sf.phase_name.apply(_phase_to_num)

In [None]:
# Cell output: first rows of the filtered frame.
cdf_sf.head()

### Get the triples' presence based on the filtered dataframe

In [None]:
# Compute triples per game and keep them for every game on which
# utils.get_triples_presence does not fail its assertions.
game_triple_presence = {}
for idx, game_id in enumerate(unique_games):
    assert isinstance(game_id, str), (game_id, "is not a string")
    print(idx, game_id)
    game_df = cdf_sf.loc[cdf_sf['game_id'] == game_id]

    # Only handle games where the number of distinct unit ids equals the
    # largest id; other games are skipped.
    unit_ids = game_df.unique_unit_id
    if unit_ids.nunique() != unit_ids.max():
        continue

    triples = utils.get_triples(game_df)
    try:
        emp = utils.get_triples_presence(game_df, triples)
    except AssertionError as msg:
        # Best effort: report the failed check and move on to the next game.
        print(msg)
    else:
        print(emp, len(triples))
        # NOTE(review): the triples (not `emp`) are stored; presumably the
        # helper annotates them in place. Confirm against utils.
        game_triple_presence[game_id] = triples

## Get n-tuples

In [None]:
# Compute n-tuples per game, sample from them, and store the presence result.
# `lens` collects, per processed game, the size of each entry of `tuples`.
game_tuple_presence = {}
lens = []
for idx, game_id in enumerate(unique_games):
    assert isinstance(game_id, str), (game_id, "is not a string")
    print(idx, game_id)
    game_df = cdf_sf.loc[cdf_sf['game_id'] == game_id]

    # Only handle games where the number of distinct unit ids equals the
    # largest id.
    unit_ids = game_df.unique_unit_id
    if unit_ids.nunique() == unit_ids.max():
        tuples = utils.get_n_tuples(game_df)
        random_tuples = utils.get_random_samples(tuples, 5)
        tuple_lens = [len(tuples[n]) for n in tuples]
        lens.append(tuple_lens)
        try:
            ret_dict = utils.get_tuples_presence(game_df, random_tuples)
        except Exception as msg:
            # Best effort: report the failure and keep going.
            print(msg)
        else:
            # Print this game's own result. The original printed `emp`, a
            # leftover from the triples cell; when `emp` was undefined the
            # NameError was swallowed and the result was never stored.
            print(ret_dict)
            game_tuple_presence[game_id] = ret_dict
    # NOTE(review): stops after the first game; looks like a debugging
    # leftover. Remove to process all games.
    if idx == 0:
        break

In [None]:
# Cell output: the collected per-game n-tuple presence results.
game_tuple_presence

In [None]:
# Cell output: keys of entry 0 of the `tuples` left over from the last processed game.
tuples[0].keys()

In [None]:
# Cell output: total count across all entries for the first processed game.
sum(lens[0])


In [None]:
# Materialise the rows produced for the n-tuple results, copying each row so
# the list does not alias the object yielded by the generator.
all_records = [row.copy() for row in utils.gen_tuple_rows(game_tuple_presence)]

In [None]:
# Build the frame through the class rather than through whatever object `df`
# currently holds; the result no longer depends on an earlier cell having run.
df = pd.DataFrame.from_records(all_records)

In [None]:
# Drop every row that has a missing value in any column.
df = df.dropna()

In [None]:
# Per-row product of the entries of `unit_counts`. The lambda is deliberate:
# it keeps the product per element rather than over the whole column.
df["mult"] = df["unit_counts"].apply(lambda counts: np.prod(counts))

In [None]:
# Cell output: the n-tuple frame including the new `mult` column.
df

In [None]:
# Cell output: how many rows have each `max_min_diff` value, ordered by value.
df.max_min_diff.value_counts().sort_index()

In [None]:
# Persist the frame to HDF5 under key 'df'; mode='w' replaces any existing file.
df.to_hdf('../data/neww.h5', key='df', mode='w')

In [None]:
# Cell output: the file read back from disk (the result is not assigned).
pd.read_hdf('../data/neww.h5', key='df')

In [None]:
# Cell output: rows whose `k` equals 5.
df.loc[df.k == 5]

In [None]:
# Array form of `lens` (one inner list per processed game).
# NOTE(review): assumes every inner list has the same length, giving a 2-D
# array; confirm.
np_lens = np.array(lens)

In [None]:
# NOTE: rebinds `df`; the n-tuple frame built above is no longer reachable by
# that name.
df = pd.DataFrame(np_lens)

In [None]:
# Sum over axis 0, i.e. across the rows of `np_lens`.
np_sum = np_lens.sum(axis=0)

In [None]:
import seaborn as sns
# Point plot of the summed counts, one point per position of `np_sum`.
# NOTE(review): the title hard-codes "100 games" and "k = 0..4"; confirm these
# still match how `lens` was produced.
sns.pointplot(x=np.arange(len(np_sum)), y=np_sum).set_title('total number of 5-tuples for 100 games, k = 0,1,2,3,4')

In [None]:
# Number of stored triples per game. `lens` cannot be used here: it holds one
# list per game from the n-tuple cell, so `sum(lens)` raises TypeError.
# NOTE(review): this counts only games kept in `game_triple_presence`.
triple_lens = [len(game_triples) for game_triples in game_triple_presence.values()]
sns.lineplot(x=np.arange(len(triple_lens)), y=triple_lens).set_title(
    f'triples lens over {len(triple_lens)} games with sum = {sum(triple_lens)}'
)

### Save json file if necessary

In [None]:
# Write the triples dictionary to '<name>.json', pretty-printed with sorted keys.
name = "whatever"
output_filename = f'{name}.json'
with open(output_filename, 'w') as file:
    json.dump(game_triple_presence, file, sort_keys=True, indent=4)

In [None]:
# Cell output: first 20 filtered rows whose unit id is 1.
cdf_sf[cdf_sf["unique_unit_id"] == 1].head(20)

## Analyze the gathered triples

In [None]:
# Location of a previously saved triples JSON file. The empty string is a
# placeholder and must be filled in before this cell can run.
path = ""
with open(path) as file:
    game_triple_presence = json.load(file)

In [None]:
# Cell output: element 0 of the first game's stored value.
list(game_triple_presence.values())[0][0]

In [None]:
# First key of the dictionary (raises IndexError when it is empty).
game_id = list(game_triple_presence)[0]

In [None]:
# Materialise the rows produced for the triple results, copying each row so
# the list does not alias the object yielded by the generator.
all_records = [row.copy() for row in utils.gen_triple_rows(game_triple_presence)]

In [None]:
# One row per generated triple record. Rebinds `df`.
df = pd.DataFrame.from_records(all_records)

In [None]:
# Drop every row that has a missing value in any column.
df = df.dropna()

In [None]:
# Renumber rows after the drop; the old index is kept as an `index` column
# because `drop=True` is not passed.
df.reset_index(inplace=True)

In [None]:
# Cell output: first rows of the triples frame.
df.head()

### Plotting

In [None]:
# Pair of value arrays: (`factor_same` column, `factor_diff` column).
samediff_count_list = (df.factor_same.values, df.factor_diff.values)

In [None]:
# Row-wise (same, diff) pairs.
# NOTE: zip returns a one-shot iterator; it is empty after the first full pass.
a = zip(samediff_count_list[0],samediff_count_list[1])

In [None]:
# Draw one histogram outline per series (`factor_same`, then `factor_diff`).
# The arrays are read directly: the original went through the one-shot
# iterator `a`, so re-running the cell alone produced an empty figure, and its
# filter(None, ...) never removed anything (2-tuples are always truthy).
fig, ax = plt.subplots()
for data in samediff_count_list:
    counts, bins = np.histogram(np.asarray(data, dtype=float))
    # Plot each count against the left edge of its bin.
    ax.plot(bins[:-1], counts)

In [None]:
import seaborn as sns

In [None]:
# Load the per-chunk HDF5 result files that can be read and stack them.
h5_array = []
for i in range(11):
    # Chunk 9 is skipped on purpose (the reason is not recorded here).
    if i == 9:
        continue
    path = f"path_{i*100}_{(i+1)*100 - 10}_{10}.h5"

    try:
        df = pd.read_hdf(path, key='df')
    except Exception as e:
        # Best effort: any failure (missing file, missing key, unreadable
        # store) skips the chunk, so the message no longer claims that the
        # path is missing.
        print("could not load", path, e)
    else:
        h5_array.append(df)
        print("appended", path)

# Fail with a clear message instead of pd.concat's generic error.
if not h5_array:
    raise ValueError("no HDF5 chunk could be loaded; nothing to concatenate")
cdf = pd.concat(h5_array)
# Release the list of partial frames.
del h5_array

In [None]:
# Keep only the five columns used by the factor analysis below.
sliced_df = cdf[["min_phase_num" , "max_min_diff", "unit_counts", "joint", "k"]]

In [None]:
# Unbind the full frame; only `sliced_df` is used until `cdf` is re-read later.
del cdf

In [None]:
# Cell output: number of non-missing values per column.
sliced_df.count()

In [None]:
# Per-row product of the entries of `unit_counts`. The lambda is deliberate:
# it keeps the product per element rather than over the whole column.
sliced_df['prod'] = sliced_df['unit_counts'].apply(lambda counts: np.prod(counts))

In [None]:
# Factor = product of the unit counts divided by the joint count, row by row.
sliced_df['factor'] = sliced_df['prod'].div(sliced_df['joint'])

In [None]:
# Cell output: first rows, now including `prod` and `factor`.
sliced_df.head()

In [None]:
def dummy_func(x):
    """Return the share of +1 among the +1/-1 entries of ``x.diff_sign``.

    Args:
        x: object exposing a ``diff_sign`` Series (e.g. one groupby group).

    Returns:
        ``count(+1) / (count(+1) + count(-1))``. Entries that are neither +1
        nor -1 are ignored. Returns NaN when there is no +1 or -1 entry at
        all. A group lacking one of the two signs yields 1.0 or 0.0 instead
        of raising KeyError.
    """
    signs = x.diff_sign
    positive = (signs == 1).sum()
    negative = (signs == -1).sum()
    total = positive + negative
    if total == 0:
        return np.nan
    return positive / total

In [None]:
# Mean factor per (year, window) group, using the built-in grouped mean rather
# than a Python-level apply. The resulting Series is named 'factor'.
gp_df = sliced_df.groupby(['min_phase_num', 'max_min_diff'])['factor'].mean()

In [None]:
# Mean factor per (k, min_phase_num) group, using the built-in grouped mean
# rather than a Python-level apply. The resulting Series is named 'factor'.
gp_df_k = sliced_df.groupby(['k', 'min_phase_num'])['factor'].mean()

In [None]:
# Mean factor per (k, max_min_diff) group, using the built-in grouped mean
# rather than a Python-level apply. The resulting Series is named 'factor'.
gp_df_k_diff = sliced_df.groupby(['k', 'max_min_diff'])['factor'].mean()

In [None]:
# Mean factor per (k, min_phase_num, max_min_diff) group, using the built-in
# grouped mean rather than a Python-level apply. The Series is named 'factor'.
gp_df_kw = sliced_df.groupby(['k', 'min_phase_num', 'max_min_diff'])['factor'].mean()

In [None]:
# Cell output: the sliced frame.
sliced_df

In [None]:
import seaborn as sns

In [None]:
def dummy_func(x):
    """Return the share of +1 among the +1/-1 entries of ``x.diff_sign``.

    Args:
        x: object exposing ``diff_sign`` and ``max_min_diff`` (e.g. one
            groupby group).

    Returns:
        ``count(+1) / (count(+1) + count(-1))``, or NaN when the group has
        neither a +1 nor a -1 entry. In that case ``x.max_min_diff`` is
        printed as a diagnostic.
    """
    counts_df = x.diff_sign.value_counts()
    one_counts = counts_df.loc[1] if 1 in counts_df.index else 0
    minus_one_counts = counts_df.loc[-1] if -1 in counts_df.index else 0

    total = one_counts + minus_one_counts
    if total == 0:
        # Diagnostic output kept from the original implementation.
        print(x.max_min_diff)
        # np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0.
        return np.nan
    return one_counts / total

In [None]:
# Re-read a stored result frame; 'path' is a literal placeholder filename.
cdf = pd.read_hdf('path', key='df')

In [None]:
# Difference between the two factors and its sign (-1, 0 or +1) per row.
factor_gap = cdf['factor_same'].sub(cdf['factor_diff'])
cdf['diff'] = factor_gap
cdf['diff_sign'] = np.sign(factor_gap)

# Reduce each (min_phase_num, max_min_diff) group with dummy_func and discard
# the groups for which it returned NaN.
gp_df = (
    cdf.groupby(['min_phase_num', 'max_min_diff'])
    .apply(dummy_func)
    .dropna()
)

In [None]:
# Heatmap of `gp_df`: min_phase_num along x, max_min_diff along y, colour
# range clipped to [0.4, 0.65], every 6th tick labelled.
fig, ax = plt.subplots(figsize=(6, 6))
heat_data = gp_df.unstack().T
sns.heatmap(heat_data, xticklabels=6, yticklabels=6, ax=ax, vmin=0.4, vmax=0.65)
ax.set_xlabel("Year")
ax.set_ylabel("Window Size, w")
ax.set_title("Mean factor")

# Cut the last two characters off every x tick label.
x_labels = [tick.get_text()[:-2] for tick in ax.get_xticklabels()]
ax.set_xticklabels(x_labels)
ax.set_yticklabels(ax.get_yticklabels(), rotation=0)

# Flip the y axis so that it increases upwards.
ax.invert_yaxis()
fig.tight_layout()
fig.savefig('factor_heatmap.pdf', dpi=300, bbox_inches='tight')

In [None]:
# Heatmap of the mean factor: k along x, min_phase_num along y.
# The stray argument-less `sns.lineplot()` call was removed; it had no data to
# draw.
fig = plt.figure(figsize=(6, 6))
ax = fig.add_subplot(111)
sns.heatmap(gp_df_k.unstack().T, xticklabels=2, yticklabels=6, ax=ax)
ax.set_xlabel("k")
ax.set_ylabel("Year")
ax.set_title("Mean factor")
# Flip the y axis so that it increases upwards.
ax.invert_yaxis()
fig.tight_layout()
fig.savefig('factor_heatmap_k.pdf', dpi=300, bbox_inches='tight')

In [None]:
# Line plot of the mean factor over min_phase_num, one line per k.
fig, ax = plt.subplots(figsize=(3.5, 2.5))
factor_by_year = gp_df_k.unstack().T
factor_by_year.plot(ax=ax, cmap="viridis_r")

# Tick every third index value and turn the labels vertical.
ax.set_xticks(factor_by_year.index.tolist()[::3])
for label in ax.get_xticklabels():
    label.set_rotation(90)

ax.set_xlabel(r"Year")
ax.set_ylabel(r"Compression Factor, $Q$")

# Re-create the legend from the existing labels, frameless and placed outside
# the axes.
legend_labels = list(ax.get_legend_handles_labels()[1])
ax.legend(legend_labels, prop={'size': 9}, frameon=False, bbox_to_anchor=(1.05, 1))

fig.tight_layout()
fig.savefig('factor_heatmap_k_year.pdf', dpi=300, bbox_inches='tight')

In [None]:
# Mean factor per max_min_diff value, averaged over all k, as one line.
fig, ax = plt.subplots(figsize=(2, 2))
mean_factor_by_window = gp_df_k_diff.unstack().T.mean(axis=1)
mean_factor_by_window.plot(ax=ax, legend=False)

# Fixed axes: ticks every 5 units on x, x in [0, 20], y in [0, 15].
ax.set_xticks(np.arange(0, 21, 5))
ax.set_xlim(0, 20)
ax.set_ylim(0, 15)
ax.set_xlabel("Window Size, w")
ax.set_ylabel("Mean Factor")
ax.grid(True)

fig.tight_layout()
fig.savefig('factor_heatmap_k_window.pdf', dpi=300, bbox_inches='tight')

In [None]:
# Cell output: row index of the unstacked, transposed `gp_df_k` (its min_phase_num values).
gp_df_k.unstack().T.index

In [None]:
# Heatmap of the mean factor: k along x, max_min_diff along y.
# The y axis is labelled "Window Size, w", so the data is `gp_df_k_diff`
# (grouped by k and max_min_diff). The original plotted the year-indexed
# `gp_df_k`, duplicating the k/year heatmap under the wrong label.
# The stray `plt.cm.inferno_r(0.5)` expression (result unused) was removed.
fig = plt.figure(figsize=(6, 6))
ax = fig.add_subplot(111)
sns.heatmap(gp_df_k_diff.unstack().T, xticklabels=2, yticklabels=6, ax=ax)
ax.set_xlabel("k")
ax.set_ylabel("Window Size, w")
ax.set_title("Mean factor")
# Flip the y axis so that it increases upwards.
ax.invert_yaxis()
fig.tight_layout()
# NOTE(review): same output file as the k/year heatmap cell, so whichever cell
# runs last overwrites the other. Pick a distinct filename if both are needed.
fig.savefig('factor_heatmap_k.pdf', dpi=300, bbox_inches='tight')