In [1]:
import pandas as pd
from joblib import Parallel, delayed

from os import listdir
from os.path import isfile, join

In [2]:
# Experiment settings; both values are interpolated into the data paths below.
cars = 30
time_interval = 40

In [3]:
# Folder holding the numbered <n>.csv / <n>.txt pairs for the chosen settings.
DATA_DIRECTORY = (
    f"algorithms/ml_algo_files/data/test_{cars}_{time_interval}_updated"
)

In [4]:
def read_df(num):
    """Load run ``num`` and tag every row with that run's iteration count.

    Reads ``{DATA_DIRECTORY}/{num}.csv`` into a DataFrame, then parses the
    entire content of ``{DATA_DIRECTORY}/{num}.txt`` as one integer and
    stores it in a new ``iterations`` column (same value on every row).

    Args:
        num: Number used as the file stem of the .csv / .txt pair.

    Returns:
        The DataFrame read from the CSV with the ``iterations`` column added.
    """
    frame = pd.read_csv(f"{DATA_DIRECTORY}/{num}.csv")

    with open(f"{DATA_DIRECTORY}/{num}.txt") as iter_file:
        iteration_count = int(iter_file.read())

    frame["iterations"] = iteration_count
    return frame

In [5]:
# Names of the regular files (sub-directories excluded) directly inside DATA_DIRECTORY.
onlyfiles = [
    entry
    for entry in listdir(DATA_DIRECTORY)
    if isfile(join(DATA_DIRECTORY, entry))
]

In [6]:
# Highest numeric file stem found in the data directory.
# Files whose stem is not a plain decimal number (for example OS metadata such
# as ".DS_Store", whose stem is the empty string) are skipped; previously any
# such file made int() raise ValueError and aborted the cell.
max_file = max(
    int(stem)
    for stem in (f.split('.')[0] for f in onlyfiles)
    if stem.isdecimal()
)

In [7]:
# Run numbers to load: 0 through max_file inclusive.
# NOTE(review): assumes every number in that range has both a .csv and a .txt
# file on disk — confirm against the data generator.
nums = range(0, max_file + 1)

In [8]:
%%time
dfs = Parallel(n_jobs=10)(delayed(read_df)(num) for num in nums)

Wall time: 15.5 s


In [9]:
# Stack the per-run frames row-wise into one table with a fresh 0..n-1 index.
all_together = pd.concat(dfs, ignore_index=True)

In [10]:
# Preview the combined frame: the columns read from the CSV files plus the
# per-run `iterations` value added by read_df.
all_together

Unnamed: 0,used_algo,time_interval,local_traffic0,local_traffic1,local_traffic2,local_traffic3,expected_traffic0,expected_traffic1,expected_traffic2,expected_traffic3,nearby_algos0,nearby_algos1,nearby_algos2,nearby_algos3,iterations
0,1,10,0,0,0,0,5,3,0,0,1,1,0,0,72
1,1,10,0,1,0,0,3,3,0,0,1,1,0,0,72
2,1,10,1,2,0,0,3,5,0,0,1,0,0,0,72
3,1,10,2,0,0,0,6,3,0,0,1,1,0,0,72
4,1,10,3,0,0,0,4,8,0,0,1,1,0,0,72
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25912,6,10,0,0,0,0,2,1,0,0,6,6,0,0,51
25913,6,10,0,0,0,0,1,0,0,0,6,0,0,0,51
25914,6,10,0,0,0,0,0,0,0,0,6,6,0,0,51
25915,6,10,0,0,0,0,0,0,0,0,6,6,0,0,51


In [11]:
# Every column except the measured `iterations` acts as a grouping key, so each
# group is one distinct (used_algo, inputs...) combination.
data_cols = all_together.columns.drop("iterations")
groupby_stats = all_together.groupby(data_cols.tolist())

# Mean iteration count per combination (indexed by the grouping keys).
avg_iterations = groupby_stats.mean()

In [12]:
# Turn the grouping keys back into ordinary columns.
# Rebuilt from groupby_stats instead of mutating avg_iterations in place:
# the in-place reset_index was not idempotent — running the cell a second time
# inserted a spurious "index" column that then leaked into the input columns.
avg_iterations = groupby_stats.mean().reset_index()

In [13]:
# Number of non-null `iterations` observations in each group, re-indexed
# 0..n_groups-1 so it lines up positionally with the per-group means.
counts = groupby_stats["iterations"].count().reset_index(drop=True)

In [14]:
# Inspect how many observations fall into each group.
counts

0        29
1       357
2        37
3         7
4         4
       ... 
9691      1
9692      1
9693      1
9694      1
9695      1
Name: iterations, Length: 9696, dtype: int64

In [15]:
# Repeat each group's averaged row once per original observation, so the
# averaged frame has as many rows as the raw data.
# The per-group means are rebuilt from groupby_stats rather than read from
# avg_iterations itself: the old self-referential assignment failed when the
# cell was re-run, because the already-repeated frame no longer matched the
# length of `counts`.
group_means = groupby_stats.mean().reset_index()
avg_iterations = group_means.loc[group_means.index.repeat(counts)].reset_index(drop=True)

In [16]:
# Inspect the averaged frame after each group row was repeated `counts` times.
avg_iterations

Unnamed: 0,used_algo,time_interval,local_traffic0,local_traffic1,local_traffic2,local_traffic3,expected_traffic0,expected_traffic1,expected_traffic2,expected_traffic3,nearby_algos0,nearby_algos1,nearby_algos2,nearby_algos3,iterations
0,1,10,0,0,0,0,0,0,0,0,1,0,0,0,77.0
1,1,10,0,0,0,0,0,0,0,0,1,0,0,0,77.0
2,1,10,0,0,0,0,0,0,0,0,1,0,0,0,77.0
3,1,10,0,0,0,0,0,0,0,0,1,0,0,0,77.0
4,1,10,0,0,0,0,0,0,0,0,1,0,0,0,77.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25912,6,10,4,1,0,0,9,2,0,0,4,0,0,0,55.0
25913,6,10,4,2,0,0,1,3,0,0,2,4,0,0,64.0
25914,6,10,4,2,0,0,4,3,0,0,6,0,0,0,68.0
25915,6,10,5,1,0,0,7,0,0,0,2,0,0,0,57.0


In [17]:
# Columns describing the situation only: everything except the algorithm that
# was used and the averaged iteration count.
input_cols = avg_iterations.columns.drop(["iterations", "used_algo"])

In [18]:
# Lowest averaged iteration count for each input combination, with the
# grouping keys kept as regular columns.
best_algos_index_vals = avg_iterations.groupby(list(input_cols), as_index=False)["iterations"].min()

In [19]:
# Inspect the minimum averaged iteration count per input combination.
best_algos_index_vals

Unnamed: 0,time_interval,local_traffic0,local_traffic1,local_traffic2,local_traffic3,expected_traffic0,expected_traffic1,expected_traffic2,expected_traffic3,nearby_algos0,nearby_algos1,nearby_algos2,nearby_algos3,iterations
0,10,0,0,0,0,0,0,0,0,1,0,0,0,69.408451
1,10,0,0,0,0,0,0,0,0,1,1,0,0,63.950000
2,10,0,0,0,0,0,0,0,0,1,2,0,0,60.333333
3,10,0,0,0,0,0,0,0,0,1,4,0,0,60.434783
4,10,0,0,0,0,0,0,0,0,1,5,0,0,70.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7390,10,7,1,0,0,6,4,0,0,1,1,0,0,65.000000
7391,10,7,2,0,0,7,1,0,0,1,1,0,0,50.000000
7392,10,8,0,0,0,0,0,0,0,1,1,0,0,74.000000
7393,10,8,0,0,0,2,0,0,0,1,1,0,0,81.000000


In [20]:
# Kept for any later cell that refers to it; the selection below no longer needs it.
index_cols = [*list(input_cols), "iterations"]

# Select, for every input combination, the rows whose averaged iteration count
# equals the minimum for that combination (i.e. the best-performing algorithm
# rows, including ties).
#
# This replaces a Python loop that compared the whole frame against each best
# row in turn (rows x best-rows comparisons). The grouped transform computes
# the same minimum once per row; the minimum is an existing value of the
# column, so the exact equality test matches the same rows as before.
group_keys = list(input_cols)
group_minimum = avg_iterations.groupby(group_keys)["iterations"].transform("min")
best_rows = avg_iterations[avg_iterations["iterations"] == group_minimum]

# One frame per input combination, in sorted key order with the original row
# labels, which is the same list shape and ordering the loop produced.
best_algos_entries = [entry for _, entry in best_rows.groupby(group_keys)]

In [21]:
# Stack the per-combination selections row-wise; row labels from
# avg_iterations are kept as-is.
best_algos_df = pd.concat(best_algos_entries)

In [22]:
# Display only: the re-indexed copy is shown but not assigned back, so
# best_algos_df itself keeps the row labels it inherited from avg_iterations.
best_algos_df.reset_index(drop=True)

Unnamed: 0,used_algo,time_interval,local_traffic0,local_traffic1,local_traffic2,local_traffic3,expected_traffic0,expected_traffic1,expected_traffic2,expected_traffic3,nearby_algos0,nearby_algos1,nearby_algos2,nearby_algos3,iterations
0,4,10,0,0,0,0,0,0,0,0,1,0,0,0,69.408451
1,4,10,0,0,0,0,0,0,0,0,1,0,0,0,69.408451
2,4,10,0,0,0,0,0,0,0,0,1,0,0,0,69.408451
3,4,10,0,0,0,0,0,0,0,0,1,0,0,0,69.408451
4,4,10,0,0,0,0,0,0,0,0,1,0,0,0,69.408451
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12842,1,10,7,1,0,0,6,4,0,0,1,1,0,0,65.000000
12843,1,10,7,2,0,0,7,1,0,0,1,1,0,0,50.000000
12844,1,10,8,0,0,0,0,0,0,0,1,1,0,0,74.000000
12845,1,10,8,0,0,0,2,0,0,0,1,1,0,0,81.000000


In [23]:
# Destination CSV for the selected rows, named after the same
# cars / time_interval settings as the input directory.
output_path = (
    f"algorithms/ml_algo_files/data/test_{cars}_{time_interval}_model_data_updated.csv"
)

In [24]:
# Write the selected rows to disk; the row labels are not written.
best_algos_df.to_csv(path_or_buf=output_path, index=False)