In [1]:
# reload

%load_ext autoreload
%autoreload 2
from helper_func_create_dataset import *

In [2]:
# Increase plot font size from default for all plots instead of setting it in each plot 
plt.rcParams.update({'font.size': 22})
plt.rcParams["figure.autolayout"] = False

In [4]:
create_dataset = True
print_logfile_info = True
# When using time aggregation to smooth out plots and make them more visually interpretable what window size to use 
time_wind_str = '500ms'
time_wind_val = 500

n_step_history = True
num_steps = 5
small_wind_str = '100ms'
small_wind_val = 100

data_slice = 'all' #'macro' 'micro' 'fast' 'slow' 'all'

#===================================
# Data source
#=================================== 
data_dir = '../../../dataset_ver1/'
save_dir=data_dir+'parsed_data_5steps_debug/'

# NOTE: 'mobility_trace.txt' needs to be first in the list since it is the only file that has consisently periodic values for all UEs
# And this info is used to create the basic data structure 
files= ['mobility_trace.txt']+['dashClient_trace.txt', 'delay_trace.txt', 'vrFragment_trace.txt', 'httpClientRtt_trace.txt']+ran_files

print('Raw data folder: \n'+data_dir+'\n')

#=======================================
# Initilizing empty lists 
#=======================================
# Use 15 runs for pretraining and fine tuning and 5 runs for testing 
#runs = range(1, 20 + 1)
runs = range(1, 1 + 1)

Raw data folder: 
../../../dataset_ver1/



In [6]:
def setup_run(run, create_dataset, print_logfile_info, time_wind_str, time_wind_val, data_slice, save_dir, files):
    # constant multipliers
    M = (10**6)
    K = (10**3)

    print('============================================================')
    print (run)
    print('============================================================')
    only_run = run.split('/')[-1]
    print(only_run)

    sim_info = pd.read_csv(run+'/sim_info.txt', sep=',').set_index('parameter').to_dict('index')
    # Specify here topology details from the dataset we are using 
    # This can be infered from the files, but this is easier
    print(sim_info)
    separate_macro_micro = bool(int(sim_info['create_micro_layer']['value'])) 
    print('separate_macro_micro ', separate_macro_micro)
    total_num_cells=(int(sim_info['macro_num_bs']['value'])+int(sim_info['micro_num_bs']['value'])) if separate_macro_micro else int(sim_info['macro_num_bs']['value']) 
    print('total_num_cells', total_num_cells)
    total_num_ues=int(sim_info['macro_layer_ues']['value'])+int(sim_info['micro_layer_ues']['value']) if separate_macro_micro else int(sim_info['macro_layer_ues']['value'])
    print('total_num_ues', total_num_ues)
    sim_time = int(sim_info['simulation_time_seconds']['value']) # seconds
    print('sim_time', sim_time)

    dict_gnb_pos = pd.read_csv(run+'/gnb_locations.txt', sep=',').set_index('cellId').to_dict('index')
    dict_gnb_pos[0] = {'gnbpos_x': np.nan, 'gnbpos_y': np.nan, 'gnbpos_z': np.nan}
    #print(dict_gnb_pos)
    # read UE group list
    ue_groups = open(run+'/ue_gnb_groups.txt').readlines()
    macro_imsis = [int(i) for i in ue_groups[0].split(',')[1:-1]]
    print("Macro UE IMSIs: ", macro_imsis)
    micro_imsis = list(set(range(1, total_num_ues+1)) - set(macro_imsis))
    print("Micro UE IMSIs: ", micro_imsis)
    fast_imsis = [int(i) for i in ue_groups[1].split(',')[1:-1]]
    print("Macro fast UE IMSIs: ", fast_imsis)
    slow_imsis = list(set(range(1, total_num_ues+1)) - set(fast_imsis) - set(micro_imsis))
    print("Macro Slow UE IMSIs: ", slow_imsis)
    #only_delay_imsis=[3,5,7,9,13,15,17,19,23,55,57]
    only_delay_imsis=[int(i) for i in ue_groups[2].split(',')[1:-1]]
    print("Only delay IMSIs: ", only_delay_imsis)
    
    # Read these from the ue_groups file later and also rename ue_groups to something else 
    macro_cells = [1,2,3]
    print('Macro CellIds: ', macro_cells)
    micro_cells = [4,5,6]
    print('Micro CellIds: ', micro_cells)

    sim_start_time = pd.to_datetime(1.0, unit='s', origin='unix')
    sim_end_time = pd.to_datetime(sim_time, unit='s', origin='unix')

    vr_imsi=np.empty(0)
    dash_imsi=np.empty(0)
    http_imsi=np.empty(0)
    
    df_mi_cellId = None
    df_cellId = None
    
    # This dataframe shall aggregate the parsed features from all the different files 
    # before vertically concatenating to master_df 
    per_run_df = pd.DataFrame()
    # stored from the first RAN file processed 
    basic_info = pd.DataFrame(columns=['IMSI', 'cellId', 'conn_imsi_count'])
    first_ran_file = True 
    
    #chosen_imsis = None
#=================================================================
    for file in files:
#=================================================================
        print('--------------------------------------------')
        print(file)
        start_time = time.time()
        
        #=======================================
        # Preprocess logs 
        #=======================================
        
        ## Read file while fixing the tab issue
        if file in files_with_trailing_tab:
            df = pd.read_csv(run+'/'+file, sep='\t', usecols=range(0,18))
            print(run+'/'+file)
        else:
            df = pd.read_csv(run+'/'+file, sep='\t')
            print(run+'/'+file)
        
        print('time to read file: ', (time.time() - start_time))
        
        df = clean_up_logs(df, file, sim_start_time, sim_end_time)
        
        #=======================================
        # Print log file info  
        #=======================================
        if print_logfile_info:
            ## Display info about the UEs who have made entries in this file to make sure that all the UEs who should be here are here. 
            print('ueIds: min:', min(df['IMSI'].value_counts().index), 'max:', max(df['IMSI'].value_counts().index), 
                  'count:', len(df['IMSI'].value_counts().index))
            if (file in ran_files) and (len(df['IMSI'].value_counts().index) < total_num_ues):
                print('WARNING: Fewer UEs in this file than the total number in the simulation')
                print(df['IMSI'].value_counts())

            ## Total runtime of log
            print('log time (start, end): (', np.nanmin(df['tstamp_us']), ', ' ,np.nanmax(df['tstamp_us']), ')')
            print('log runtime:', (np.nanmax(df['tstamp_us']) - np.nanmin(df['tstamp_us'])), ' seconds')

        # set the name of the index before doing all the index based processing that lies ahead 
        df.index.set_names('wind_tstamp', inplace=True)
        #==========================================================================
        # File specific extraction and aggregation of metrics for dataset creation
        #==========================================================================
        
        if create_dataset:
            
            # this must come first, so make sure it is the first file being read 
            # by putting it first in the files list
            
            if file == 'mobility_trace.txt':
                
                # Need to take unique for each IMSI list
                df_mi_cellId_imsi_conn = df[['IMSI', 'cellId']].sort_values(by=['cellId']).groupby(by=['cellId'])
                df_mi_cellId_imsi_conn = df_mi_cellId_imsi_conn.resample(time_wind_str).agg(list)
                # This slices away any extras. It does not take care if there is a window missing i.e. the sequence is shorter 
                # lets just slice off a second since I goofed up in the beginning and dont want to include extra samples for each cell 
                df_mi_cellId_imsi_conn = df_mi_cellId_imsi_conn.drop(['cellId'], axis=1).loc[pd.IndexSlice[:, sim_start_time : sim_end_time], :]  
                df_mi_cellId_imsi_conn.index = df_mi_cellId_imsi_conn.index.set_names(['cellId', 'wind_tstamp'])


            elif file == 'httpClientRtt_trace.txt':

                chosen_imsis = filtered_imsis(data_slice, sorted(df['IMSI'].unique()))                    
                print('Web browsing IMSIs: ', chosen_imsis)

                this_log_df = pd.DataFrame()
                tmp_this_log_df = pd.DataFrame()

                # IMSI perspective 
                for imsi, group1 in df.sort_values(by=['IMSI']).groupby(by=['IMSI']):
                    imsi = imsi[0]
                    
                    if imsi in chosen_imsis:
                            
                        page_load_time_df = list()
                        webpage_size_df = list()
                        http_index = list()
                        
                        for pageId, group2 in group1.groupby(by=['webpageId']):

                            # this takes the receive timestamp
                            http_index = http_index + [group2.index[-1]] # list append 
                            page_load_time_df = page_load_time_df + [group2['delay'].sum()] # list append 
                            webpage_size_df = webpage_size_df + [group2['objectSize'].sum()/K] # list append                            
                            
                        page_load_time_df = pd.DataFrame(index=pd.to_datetime(http_index), columns=['page_load_time'], data=page_load_time_df)
                        page_load_time_df = page_load_time_df.resample(time_wind_str).mean().loc[sim_start_time : sim_end_time]
                        
                        webpage_size_df = pd.DataFrame(index=http_index, columns=['webpage_size'], data=webpage_size_df)
                        webpage_size_df = webpage_size_df.resample(time_wind_str).mean().loc[sim_start_time : sim_end_time] # Mbps
                        
                        # concatenate
                        this_log_df_per_imsi = pd.concat([page_load_time_df, webpage_size_df], axis=1)
                        this_log_df_per_imsi.columns = ['page_load_time', 'webpage_size']
                        # Add IMSI as a column to use later for indexing
                        this_log_df_per_imsi['IMSI'] = [imsi] * this_log_df_per_imsi.shape[0]                        
                        tmp_this_log_df = pd.concat([tmp_this_log_df, this_log_df_per_imsi], axis=0)  

                        
                # Make sure that the order of IMSIs is the same as when we do the RAN logs 
                # set index name for the resampled time stamp
                print('---------------------------------------------------------')
                tmp_this_log_df.index = tmp_this_log_df.index.set_names('wind_tstamp')
                
                # Convert the IMSI column to an index
                tmp_this_log_df = tmp_this_log_df.set_index('IMSI', append=True)
                this_log_df = tmp_this_log_df.reorder_levels(['IMSI', 'wind_tstamp'])
                
                # add a prefix to the column names
                this_log_df = this_log_df.add_prefix(file+'_')
                
                # Concatenating along axis=1 and setting 'IMSI' and 'wind_tstamp' as the index again
                per_run_df = pd.concat([per_run_df, this_log_df], axis=1, sort=True)
                print('time to parse file: ', (time.time() - start_time))

            elif file == 'dashClient_trace.txt':
                
                chosen_imsis = filtered_imsis(data_slice, sorted(df['IMSI'].unique()))                    
                print('video streaming IMSIs: ', chosen_imsis)
                
                # IMSI perspective 
                
                # Create the UE and cell perspective for the dataset 
                # This has 2 levels of indexes, one is IMSI and the other is windowed timestep  
                df_mi_imsi_log = df.sort_values(by=['IMSI']).groupby(by=['IMSI'])
                df_mi_imsi_log = df_mi_imsi_log.resample(time_wind_str).mean()
        
                # This slices away any extras. It does not take care if there is a window missing i.e. the sequence is shorter 
                df_mi_imsi_log = df_mi_imsi_log.loc[pd.IndexSlice[:, sim_start_time : sim_end_time], :]
                # give a name to the resampled time index 
                df_mi_imsi_log.index = df_mi_imsi_log.index.set_names(['IMSI', 'wind_tstamp'])
                
                this_log_df = pd.DataFrame(columns=['newBitRate_bps', 'oldBitRate_bps'])
                # I want to drop all the rows that correspond to IMSIs that are not in chosen_imsis
                this_log_df['newBitRate_bps'] = df_mi_imsi_log[df_mi_imsi_log.index.get_level_values('IMSI').isin(chosen_imsis)]['newBitRate_bps']
                this_log_df['oldBitRate_bps'] = df_mi_imsi_log[df_mi_imsi_log.index.get_level_values('IMSI').isin(chosen_imsis)]['oldBitRate_bps']
                
                # add a prefix to the column names
                this_log_df = this_log_df.add_prefix(file+'_')
                
                # Concatenating along axis=1 and setting 'IMSI' and 'wind_tstamp' as the index again
                per_run_df = pd.concat([per_run_df, this_log_df], axis=1, sort=True)
                
                print('time to parse file: ', (time.time() - start_time)) 
                
            elif file == 'vrFragment_trace.txt':

                chosen_imsis = filtered_imsis(data_slice, sorted(df['IMSI'].unique()))
                print('VR IMSIs: ', chosen_imsis)
                
                this_log_df = pd.DataFrame()
                tmp_this_log_df = pd.DataFrame()

                for imsi, group1 in df.sort_values(by=['IMSI']).groupby(by=['IMSI']):
                    imsi = imsi[0]
                    
                    if imsi in chosen_imsis:
                        
                        vr_frag_time_df = (group1['delay']).resample(time_wind_str).mean().loc[sim_start_time : sim_end_time]
                        vr_frag_thput_df = (group1['burstSize']*8/K/group1['numFragsInBurst']/group1['delay']).resample(time_wind_str).mean().loc[sim_start_time : sim_end_time] #Mbps
                        
                        vr_burst_time_df = list()
                        vr_burst_thput_df = list()
                        vr_index = list()
                        
                        for burstSeqNum, group2 in group1.groupby(by=['burstSeqNum']):
                            # this takes the receive timestamp
                            vr_index = vr_index + [group2.index[-1]]
                            vr_burst_time_df = vr_burst_time_df + [group2['delay'].iloc[-1]]
                            vr_burst_thput_df = vr_burst_thput_df + [(group2['burstSize'].iloc[-1]*8/K)/group2['delay'].iloc[-1] ]# Mbps
                        
                        vr_burst_time_df = pd.DataFrame(index=pd.to_datetime(vr_index), columns=['vr_burst_time'], data=vr_burst_time_df)
                        vr_burst_time_df = vr_burst_time_df.resample(time_wind_str).mean().loc[sim_start_time : sim_end_time]
                        
                        vr_burst_thput_df = pd.DataFrame(index=vr_index, columns=['vr_burst_thput_mbps'], data=vr_burst_thput_df)
                        vr_burst_thput_df = vr_burst_thput_df.resample(time_wind_str).mean().loc[sim_start_time : sim_end_time] # Mbps
                        
                        # concatenate
                        this_log_df_per_imsi = pd.concat([vr_frag_time_df, vr_frag_thput_df, vr_burst_time_df, vr_burst_thput_df], axis=1)
                        this_log_df_per_imsi.columns = ['vr_frag_time', 'vr_frag_thput_mbps', 'vr_burst_time', 'vr_burst_thput_mbps']
                        # Add IMSI as a column to use later for indexing
                        this_log_df_per_imsi['IMSI'] = [imsi] * this_log_df_per_imsi.shape[0]                        
                        tmp_this_log_df = pd.concat([tmp_this_log_df, this_log_df_per_imsi], axis=0)  

                        
                # Make sure that the order of IMSIs is the same as when we do the RAN logs 
                # set index name for the resampled time stamp
                print('---------------------------------------------------------')
                tmp_this_log_df.index = tmp_this_log_df.index.set_names('wind_tstamp')
                
                # Convert the IMSI column to an index
                tmp_this_log_df = tmp_this_log_df.set_index('IMSI', append=True)
                this_log_df = tmp_this_log_df.reorder_levels(['IMSI', 'wind_tstamp'])
                
                # add a prefix to the column names
                this_log_df = this_log_df.add_prefix(file+'_')
                
                # Concatenating along axis=1 and setting 'IMSI' and 'wind_tstamp' as the index again
                per_run_df = pd.concat([per_run_df, this_log_df], axis=1, sort=True)
                
                print('time to parse file: ', (time.time() - start_time))
                
            elif file == 'delay_trace.txt': 
                # process the my_metrics part and skip the cell_metrics part 
                # just take the ul and dl delay cols and discard the rest 
                
                # IMSI perspective 
                
                # Create the UE and cell perspective for the dataset 
                # This has 2 levels of indexes, one is IMSI and the other is windowed timestep  
                # Also drop the UL and DL markers so that only numerica features are left
                df_ul = df[df['dir'] == 'UL'].drop(['dir'], axis=1)
                df_dl = df[df['dir'] == 'DL'].drop(['dir'], axis=1)

                df_mi_imsi_log_ul = df_ul.sort_values(by=['IMSI']).groupby(by=['IMSI'])
                
                df_mi_imsi_log_ul = df_mi_imsi_log_ul.resample(time_wind_str).mean()
                
                df_mi_imsi_log_dl = df_dl.sort_values(by=['IMSI']).groupby(by=['IMSI'])
                df_mi_imsi_log_dl = df_mi_imsi_log_dl.resample(time_wind_str).mean()
                
                # This slices away any extras. It does not take care if there is a window missing i.e. the sequence is shorter 
                df_mi_imsi_log_ul = df_mi_imsi_log_ul.loc[pd.IndexSlice[:, sim_start_time : sim_end_time], :]
                df_mi_imsi_log_dl = df_mi_imsi_log_dl.loc[pd.IndexSlice[:, sim_start_time : sim_end_time], :]
                # give a name to the resampled time index 
                df_mi_imsi_log_ul.index = df_mi_imsi_log_ul.index.set_names(['IMSI', 'wind_tstamp'])
                df_mi_imsi_log_dl.index = df_mi_imsi_log_dl.index.set_names(['IMSI', 'wind_tstamp'])
                
                this_log_df = pd.DataFrame(columns=['ul_delay', 'dl_delay'])
                
                chosen_imsis = filtered_imsis(data_slice, sorted(df['IMSI'].unique()))

                this_log_df['ul_delay'] = df_mi_imsi_log_ul[df_mi_imsi_log_ul.index.get_level_values('IMSI').isin(chosen_imsis)]['delay']
                this_log_df['dl_delay'] = df_mi_imsi_log_dl[df_mi_imsi_log_dl.index.get_level_values('IMSI').isin(chosen_imsis)]['delay']
            
                # Make sure that the order of IMSIs is the same as when we do the RAN logs
                # add a prefix to the column names
                this_log_df = this_log_df.add_prefix(file+'_')
                per_run_df = pd.concat([per_run_df, this_log_df], axis=1, sort=True)
                
                print('time to parse file: ', (time.time() - start_time)) 
            
            # ends up being all the RAN files 
            else:
                # identify the sum and mean columns from the set of columns in this log file
                sum_cols_feats = list(set(sum_feats) & set(df.columns))
                mean_cols_feats = list(set(df.columns) - set(sum_cols_feats))
                mean_cols_feats.remove('cellId')

                # IMSI perspective
                
                # Create the UE and cell perspective for the dataset 
                # This has 2 levels of indexes, one is IMSI and the other is windowed timestep
                df_mi_imsi_log = df.sort_values(by=['IMSI']).groupby(by=['IMSI']) # groupBy object 
                #resample into windows and either mean or sum based on which group the column belongs to
                if n_step_history:
                    sum_df_mi_imsi_tmp = df_mi_imsi_log[sum_cols_feats].resample(small_wind_str).sum()
                    sum_df_mi_imsi_log = sum_df_mi_imsi_tmp.groupby([pd.Grouper(level='IMSI'), pd.Grouper(level='wind_tstamp', freq=time_wind_str)]).agg(list)
                    # don't expand the cellId column                    
                    mean_df_mi_imsi_tmp = df_mi_imsi_log[mean_cols_feats].resample(small_wind_str).mean()
                    mean_df_mi_imsi_log = mean_df_mi_imsi_tmp.groupby([pd.Grouper(level='IMSI'), pd.Grouper(level='wind_tstamp', freq=time_wind_str)]).agg(list)
                    mean_df_mi_imsi_log['cellId'] = df_mi_imsi_log['cellId'].resample(time_wind_str).mean()
                else: # single step history 
                    sum_df_mi_imsi_log = df_mi_imsi_log[sum_cols_feats].resample(time_wind_str).sum()
                    mean_df_mi_imsi_log = df_mi_imsi_log[mean_cols_feats].resample(time_wind_str).mean()

                if not sum_cols_feats: # if sum cols is null
                    df_mi_imsi_log = mean_df_mi_imsi_log
                elif not mean_cols_feats:# if mean cols is null
                    df_mi_imsi_log = sum_df_mi_imsi_log
                else:
                    df_mi_imsi_log = pd.concat([mean_df_mi_imsi_log, sum_df_mi_imsi_log], axis=1)
                # This slices away any extras. It does not take care if there is a window missing i.e. the sequence is shorter 
                df_mi_imsi_log = df_mi_imsi_log.loc[pd.IndexSlice[:, sim_start_time : sim_end_time], :]
                # give a name to the resampled time index 
                df_mi_imsi_log.index = df_mi_imsi_log.index.set_names(['IMSI', 'wind_tstamp'])
                df_mi_imsi_log = df_mi_imsi_log.drop(drop_cols_before_sep, axis=1, errors='ignore')

                # Cell perspective

                # Create the UE abnd cell perspective for the dataset 
                # This has 2 levels of indexes, one is IMSI and the other is windowed timestep  
                df_mi_cell_log = df.sort_values(by=['cellId']).groupby(by=['cellId'])
                #resample into windows and either mean or sum based on which group the column belongs to 
                if n_step_history:
                    sum_df_mi_cell_tmp = df_mi_cell_log[sum_cols_feats].resample(small_wind_str).sum()
                    sum_df_mi_cell_log = sum_df_mi_cell_tmp.groupby([pd.Grouper(level='cellId'), pd.Grouper(level='wind_tstamp', freq=time_wind_str)]).agg(list)
                    # don't expand the cellId column
                    mean_df_mi_cell_tmp = df_mi_cell_log[mean_cols_feats].resample(small_wind_str).mean()
                    mean_df_mi_cell_log = mean_df_mi_cell_tmp.groupby([pd.Grouper(level='cellId'), pd.Grouper(level='wind_tstamp', freq=time_wind_str)]).agg(list)
                    mean_df_mi_cell_log['cellId'] = df_mi_cell_log['cellId'].resample(time_wind_str).mean()
                else:
                    sum_df_mi_cell_log = df_mi_cell_log[sum_cols_feats].resample(time_wind_str).sum()
                    mean_df_mi_cell_log = df_mi_cell_log[mean_cols_feats].resample(time_wind_str).mean()

                if not sum_cols_feats: # if sum cols is null
                    df_mi_cell_log = mean_df_mi_cell_log
                elif not mean_cols_feats:# if mean cols is null
                    df_mi_cell_log = sum_df_mi_cell_log
                else:
                    df_mi_cell_log = pd.concat([mean_df_mi_cell_log, sum_df_mi_cell_log], axis=1)

                # This slices away any extras. It does not take care if there is a window missing i.e. the sequence is shorter 
                df_mi_cell_log = df_mi_cell_log.loc[pd.IndexSlice[:, sim_start_time : sim_end_time], :]
                # give a name to the resampled time index 
                df_mi_cell_log.index = df_mi_cell_log.index.set_names(['cellId', 'wind_tstamp'])
                df_mi_cell_log = df_mi_cell_log.drop(drop_cols_before_sep, axis=1, errors='ignore')

                if n_step_history:
                    # Expecting that the columns in m_ulSched are the same as ld_ulSched 
                    # before they are concatenated with the prefix loadUe
                    for col in df_mi_imsi_log.columns:
                        if col == 'cellId': # because we did not expand this
                            continue
                        # add the list as multiple colums 
                        step_cols = [col+'_'+str(i) for i in range(num_steps)]
                        df_mi_imsi_log[step_cols] = pd.DataFrame(df_mi_imsi_log[col].to_list(), index=df_mi_imsi_log.index) 
                        df_mi_cell_log[step_cols] = pd.DataFrame(df_mi_cell_log[col].to_list(), index=df_mi_cell_log.index) 
                        # then delete the col with the list
                        df_mi_imsi_log = df_mi_imsi_log.drop([col], axis=1)
                        df_mi_cell_log = df_mi_cell_log.drop([col], axis=1)
                                        
                # Align the cell perspective to the IMSI perspective
                
                this_log_df = pd.DataFrame()
                
                # the imsis from whose application's perspective we are constructing the dataset 
                # make sure we are reading from a fixed list and not something like for imsi in all_imsis_in_this_file
                # we need the same order of IMSIs to be iterated thru so that the data frame for each file 
                # will have the same imsi order 
                chosen_imsis = filtered_imsis(data_slice, sorted(df['IMSI'].unique()))
                
                for i_imsi in chosen_imsis:
                    
                    num_rows = df_mi_imsi_log.loc[i_imsi].shape[0]
                    num_cols = df_mi_imsi_log.loc[i_imsi].shape[1]
                    basic_info_per_imsi = pd.DataFrame(index=df_mi_imsi_log.loc[i_imsi].index,
                                                   columns=['IMSI', 'cellId', 'conn_imsi_count'])
                    if first_ran_file:
                        # set the index for this as well 
                        basic_info_per_imsi['IMSI'] = [i_imsi] * num_rows
                        basic_info_per_imsi['cellId'] = [0] * num_rows #an array of the corresponding cellId
                        basic_info_per_imsi['conn_imsi_count'] = [0] * num_rows #as given by the imsis connected cell 
                        
                    # Get the metrics for this IMSI
                    my_metrics = df_mi_imsi_log.loc[i_imsi]

                    # Initialize a DataFrame to store metrics for other IMSIs
                    cell_metrics = pd.DataFrame(index=df_mi_imsi_log.loc[i_imsi].index,
                                                  columns=['cell_' + col for col in df_mi_imsi_log.loc[i_imsi].columns],
                                                  data=np.zeros((num_rows, num_cols)))
                    
                    # Iterate over each window for this IMSI
                    for wind in df_mi_imsi_log.loc[i_imsi].index:
                        # the cell that the UE was connected to
                        con_cell = np.round(df_mi_imsi_log.loc[i_imsi, wind]['cellId'])
                        if first_ran_file:
                            basic_info_per_imsi.loc[wind, 'cellId'] = con_cell
                        # Handle NaN case
                        if np.isnan(con_cell):
                            cell_metrics.loc[wind] = np.nan
                            continue

                        if first_ran_file:
                            basic_info_per_imsi.loc[wind, 'conn_imsi_count'] = len(np.unique(df_mi_cellId_imsi_conn.loc[con_cell, wind][0]))
                        cell_metrics.loc[wind] = df_mi_cell_log.loc[con_cell, wind].values
                        
                    # Combine metrics for the current IMSI and other IMSIs
                    this_imsi_df = pd.concat([my_metrics, cell_metrics], axis=1)

                    # Drop the columns that need to be removed
                    this_imsi_df = this_imsi_df.drop(drop_cols_after_sep, axis=1, errors='ignore')

                    # Append the metrics for this IMSI to the final DataFrame and include IMSI as a column to use as index later
                    this_imsi_df['IMSI'] = [i_imsi] * this_imsi_df.shape[0]
                    this_log_df = pd.concat([this_log_df, this_imsi_df], axis=0)
                    
                    if first_ran_file:
                        # combine the basic info 
                        basic_info = pd.concat([basic_info, basic_info_per_imsi], axis=0)
     
                # end of for over imsis

                # convert the IMSI column of this_log_df into an index
                this_log_df = this_log_df.set_index('IMSI', append=True)
                this_log_df = this_log_df.reorder_levels(['IMSI', 'wind_tstamp'])
                
                # add a prefix to the column names
                this_log_df = this_log_df.add_prefix(file+'_')
                
                # horz concatenate it to the per_run_df
                if first_ran_file:
                    basic_info.index = basic_info.index.set_names('wind_tstamp')                    
                    # Convert the IMSI column to an index
                    basic_info = basic_info.set_index('IMSI', append=True)
                    basic_info = basic_info.reorder_levels(['IMSI', 'wind_tstamp'])                 
                    per_run_df = pd.concat([basic_info, per_run_df, this_log_df], axis=1)
                
                per_run_df = pd.concat([per_run_df, this_log_df], axis=1)
                first_ran_file = False
                print(this_log_df.columns.to_list())
                print('Number of columns added from this log file', this_log_df.shape[1])
                print('time to parse file: ', (time.time() - start_time))   
            # end of elif file in ran_files:
        # end of if create_dataset:
    # end of for over files
    # each run is saved separately in a different file 
    # Save the dataset
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    per_run_df = per_run_df.reset_index(level=('IMSI',))    
    per_run_df.to_csv(save_dir+only_run+'_dataslice_'+data_slice+'_webpage_video_delay_vr_'+time_wind_str+'.csv', index=True)
    print('============================================================')   
    print('======================= RUN DONE =====================================')
    return per_run_df.columns.to_list()

# Process each run in parallel

In [7]:
pool = Pool(processes=20)
inputs = [(run, create_dataset, print_logfile_info, time_wind_str, time_wind_val, data_slice, save_dir, files) for run in [data_dir+'run'+str(r) for r in runs]]
outputs = pool.starmap(setup_run, inputs)  
#outputs = pool.map(setup_run, range(0,runs))
print(outputs)

../../../dataset_ver1/run1
run1
{'macro_rings': {'value': '0'}, 'macro_num_bs': {'value': '3'}, 'macro_layer_ues': {'value': '30'}, 'simulation_time_seconds': {'value': '1000'}, 'rand_seed': {'value': '0'}, 'create_micro_layer': {'value': '1'}, 'scheduler': {'value': 'PF'}, 'handover_algo': {'value': 'A2A4Rsrq'}, 'micro_num_bs': {'value': '3'}, 'micro_layer_ues': {'value': '60'}, 'delay_app_installed': {'value': '1'}, 'delay_pkt_interval_seconds': {'value': '+0.1s'}, 'rtt_app_installed': {'value': '0'}, 'http_app_installed': {'value': '1'}, 'dash_app_installed': {'value': '1'}, 'vr_app_installed': {'value': '1'}}
separate_macro_micro  True
total_num_cells 6
total_num_ues 90
sim_time 1000
Macro UE IMSIs:  [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30]
Micro UE IMSIs:  [31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 

Process ForkPoolWorker-15:
Process ForkPoolWorker-7:
Process ForkPoolWorker-11:
Process ForkPoolWorker-16:
Process ForkPoolWorker-20:
Process ForkPoolWorker-4:
Process ForkPoolWorker-12:
Process ForkPoolWorker-19:
Process ForkPoolWorker-13:
Process ForkPoolWorker-17:
Process ForkPoolWorker-14:
Process ForkPoolWorker-2:
Process ForkPoolWorker-5:
Process ForkPoolWorker-18:
Process ForkPoolWorker-9:
Process ForkPoolWorker-3:
Process ForkPoolWorker-6:
Process ForkPoolWorker-10:
Process ForkPoolWorker-8:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Trace

KeyboardInterrupt: 

Process ForkPoolWorker-31:
Process ForkPoolWorker-22:
Process ForkPoolWorker-21:
Process ForkPoolWorker-34:
Process ForkPoolWorker-36:
Process ForkPoolWorker-39:
Process ForkPoolWorker-37:
Process ForkPoolWorker-40:
Process ForkPoolWorker-28:
Process ForkPoolWorker-24:
Process ForkPoolWorker-32:
Process ForkPoolWorker-25:
Process ForkPoolWorker-26:
Process ForkPoolWorker-33:
Process ForkPoolWorker-35:
Process ForkPoolWorker-30:
Process ForkPoolWorker-29:
Process ForkPoolWorker-38:
Process ForkPoolWorker-23:
Process ForkPoolWorker-27:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Trace