In [1]:
import pandas as pd
from joblib import Parallel, delayed

In [2]:
# Directory that read_df loads the numbered "<num>.csv" / "<num>.txt" pairs
# from. Keep exactly one assignment active; the commented-out lines are
# alternative directories (presumably other experiment runs — confirm).
# DATA_DIRECTORY = "data/test_1_10"
# DATA_DIRECTORY = "data/test_3_10"
# DATA_DIRECTORY = "data/test_7_10"
DATA_DIRECTORY = "data/test_10_10"

In [3]:
def read_df(num, data_directory=None):
    """Load one numbered run and tag every row with its iteration count.

    Reads ``<data_directory>/<num>.csv`` into a DataFrame and
    ``<data_directory>/<num>.txt`` as a single integer, which is stored in a
    new ``iterations`` column on every row of the frame.

    Parameters
    ----------
    num : int or str
        Stem shared by the run's ``.csv`` and ``.txt`` files.
    data_directory : str or path-like, optional
        Directory that holds the two files. When omitted, the notebook-level
        ``DATA_DIRECTORY`` is used, so existing ``read_df(num)`` calls behave
        as before.

    Returns
    -------
    pandas.DataFrame
        The CSV contents plus the constant ``iterations`` column.

    Raises
    ------
    FileNotFoundError
        If either file is missing.
    ValueError
        If the ``.txt`` file does not contain a single integer.
    """
    if data_directory is None:
        # Resolved at call time so a later change of the global is honoured.
        data_directory = DATA_DIRECTORY

    df = pd.read_csv(f"{data_directory}/{num}.csv")

    with open(f"{data_directory}/{num}.txt") as iter_file:
        # int() tolerates surrounding whitespace such as a trailing newline.
        df["iterations"] = int(iter_file.read())

    return df

In [4]:
from os import listdir
from os.path import isfile, join
# Names of the regular files directly inside DATA_DIRECTORY (sub-directories are left out).
onlyfiles = list(
    filter(lambda entry: isfile(join(DATA_DIRECTORY, entry)), listdir(DATA_DIRECTORY))
)

In [5]:
# Highest run number found among the file names (the part before the first
# dot). Names whose stem is not a plain decimal number, e.g. hidden files such
# as ".DS_Store", are skipped instead of making int() raise.
# Still raises ValueError when the directory contains no numbered file at all.
max_file = max(
    int(stem)
    for stem in (f.split('.')[0] for f in onlyfiles)
    if stem.isdecimal()
)

In [6]:
# Run numbers to load: every integer from 0 up to and including max_file.
nums = range(0, max_file + 1)

In [7]:
%%time
dfs = Parallel(n_jobs=10)(delayed(read_df)(num) for num in nums)

Wall time: 17.8 s


In [8]:
# Stack the per-run frames row-wise and renumber the rows 0..n-1.
all_together = pd.concat(dfs, axis=0, ignore_index=True)

In [9]:
all_together

Unnamed: 0,used_algo,time_interval,local_traffic0,local_traffic1,local_traffic2,local_traffic3,expected_traffic0,expected_traffic1,expected_traffic2,expected_traffic3,nearby_algos0,nearby_algos1,nearby_algos2,nearby_algos3,iterations
0,1,10,0,1,0,0,2,0,0,0,1,1,0,0,53
1,1,10,1,0,0,0,2,2,0,0,1,1,0,0,53
2,1,10,0,0,0,0,2,0,0,0,1,0,0,0,53
3,1,10,2,0,0,0,2,0,0,0,1,1,0,0,53
4,1,10,2,0,0,0,1,3,0,0,1,1,0,0,53
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25666,6,10,0,0,0,0,1,2,0,0,6,6,0,0,38
25667,6,10,0,0,0,0,2,0,0,0,6,0,0,0,38
25668,6,10,0,0,0,0,0,0,0,0,6,6,0,0,38
25669,6,10,0,0,0,0,0,0,0,0,6,6,0,0,38


In [10]:
# Every column except "iterations" is used as a grouping key.
data_cols = all_together.drop(columns=["iterations"]).columns
groupby_stats = all_together.groupby(data_cols.tolist())
# Mean of the one remaining column ("iterations") per distinct key combination;
# the grouping keys become the index of the result.
avg_iterations = groupby_stats.mean()

In [11]:
# Turn the grouping keys from index levels back into ordinary columns.
avg_iterations = avg_iterations.reset_index()

In [12]:
# Number of non-null "iterations" values per group, renumbered 0..n_groups-1
# (same group order as the grouped mean).
counts = groupby_stats["iterations"].count().reset_index(drop=True)

In [13]:
counts

0        30
1       527
2        64
3         8
4         4
       ... 
3125      1
3126      1
3127      1
3128      1
3129      1
Name: iterations, Length: 3130, dtype: int64

In [14]:
# Repeat each averaged row as many times as its group had observations, so the
# frame regains one row per original sample, then renumber the rows.
repeated_labels = avg_iterations.index.repeat(counts)
avg_iterations = avg_iterations.loc[repeated_labels].reset_index(drop=True)

In [15]:
avg_iterations

Unnamed: 0,used_algo,time_interval,local_traffic0,local_traffic1,local_traffic2,local_traffic3,expected_traffic0,expected_traffic1,expected_traffic2,expected_traffic3,nearby_algos0,nearby_algos1,nearby_algos2,nearby_algos3,iterations
0,1,10,0,0,0,0,0,0,0,0,1,0,0,0,68.966667
1,1,10,0,0,0,0,0,0,0,0,1,0,0,0,68.966667
2,1,10,0,0,0,0,0,0,0,0,1,0,0,0,68.966667
3,1,10,0,0,0,0,0,0,0,0,1,0,0,0,68.966667
4,1,10,0,0,0,0,0,0,0,0,1,0,0,0,68.966667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25666,6,10,2,1,0,0,0,1,0,0,2,0,0,0,68.000000
25667,6,10,2,1,0,0,0,2,0,0,5,0,0,0,78.000000
25668,6,10,2,1,0,0,2,0,0,0,2,0,0,0,35.000000
25669,6,10,3,0,0,0,2,1,0,0,5,0,0,0,35.000000


In [16]:
# Columns that remain once "used_algo" and "iterations" are removed.
input_cols = avg_iterations.drop(columns=["used_algo", "iterations"]).columns

In [17]:
# Smallest averaged iteration count for each distinct combination of the input
# columns, with those columns kept as regular columns.
best_algos_index_vals = avg_iterations.groupby(list(input_cols), as_index=False)["iterations"].min()

In [18]:
best_algos_index_vals

Unnamed: 0,time_interval,local_traffic0,local_traffic1,local_traffic2,local_traffic3,expected_traffic0,expected_traffic1,expected_traffic2,expected_traffic3,nearby_algos0,nearby_algos1,nearby_algos2,nearby_algos3,iterations
0,10,0,0,0,0,0,0,0,0,1,0,0,0,68.966667
1,10,0,0,0,0,0,0,0,0,1,1,0,0,38.000000
2,10,0,0,0,0,0,0,0,0,1,2,0,0,41.666667
3,10,0,0,0,0,0,0,0,0,1,4,0,0,49.600000
4,10,0,0,0,0,0,0,0,0,1,5,0,0,44.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1873,10,4,0,0,0,3,1,0,0,1,1,0,0,46.500000
1874,10,4,2,0,0,1,1,0,0,2,2,0,0,71.000000
1875,10,5,0,0,0,0,1,0,0,2,2,0,0,51.000000
1876,10,5,0,0,0,1,0,0,0,1,1,0,0,100.000000


In [19]:
# groupby_inp_cols = all_together.groupby(list(input_cols))

In [20]:
# best_algos_indices = groupby_inp_cols['iterations'].min()

In [21]:
# best_algos_indices

In [22]:
# best_algos_index_vals = [[*index, value] for index, value in best_algos_indices.items()]

In [23]:
# Key that identifies a "best" row: the input columns plus the minimal
# averaged iteration count found for that input combination.
index_cols = [*list(input_cols), "iterations"]

# Keep every row of avg_iterations whose key appears in best_algos_index_vals.
# A single inner merge replaces the former row-by-row scan of the whole frame
# (one full comparison per key row). With best_algos_index_vals on the left the
# result follows its key order, and the column selection restores the column
# order of avg_iterations. The rows get a fresh 0..n-1 index instead of the
# labels they had in avg_iterations.
# Wrapped in a list because the next cell passes this to pd.concat.
best_algos_entries = [
    best_algos_index_vals.merge(avg_iterations, on=index_cols, how="inner")[
        list(avg_iterations.columns)
    ]
]

In [24]:
# Combine the selected pieces row-wise into a single frame.
best_algos_df = pd.concat(objs=best_algos_entries, axis="index")

In [25]:
best_algos_df.reset_index(drop=True)

Unnamed: 0,used_algo,time_interval,local_traffic0,local_traffic1,local_traffic2,local_traffic3,expected_traffic0,expected_traffic1,expected_traffic2,expected_traffic3,nearby_algos0,nearby_algos1,nearby_algos2,nearby_algos3,iterations
0,1,10,0,0,0,0,0,0,0,0,1,0,0,0,68.966667
1,1,10,0,0,0,0,0,0,0,0,1,0,0,0,68.966667
2,1,10,0,0,0,0,0,0,0,0,1,0,0,0,68.966667
3,1,10,0,0,0,0,0,0,0,0,1,0,0,0,68.966667
4,1,10,0,0,0,0,0,0,0,0,1,0,0,0,68.966667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7934,1,10,4,0,0,0,3,1,0,0,1,1,0,0,46.500000
7935,2,10,4,2,0,0,1,1,0,0,2,2,0,0,71.000000
7936,2,10,5,0,0,0,0,1,0,0,2,2,0,0,51.000000
7937,1,10,5,0,0,0,1,0,0,0,1,1,0,0,100.000000


In [26]:
# Name the output after the input directory so that switching DATA_DIRECTORY
# switches the output file too, e.g.
# "data/test_10_10" -> "data/test_10_10_model_data.csv".
# NOTE(review): an earlier configuration for data/test_1_10 used the file name
# "data/test_1_10_model_data3.csv"; set the path by hand if that exact name is
# still required.
output_path = f"{DATA_DIRECTORY.rstrip('/')}_model_data.csv"

In [27]:
# Write the selected rows to output_path as CSV, leaving out the row index.
best_algos_df.to_csv(path_or_buf=output_path, index=False)