In [1]:
# reload

%load_ext autoreload
%autoreload 2

In [2]:
# import  

#%matplotlib
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import glob
import os
import time
import numpy as np
import sys
import math
import scipy.stats as stats
#from tqdm import tqdm
import datetime as dt
from functions_dicts_to_parse_data import *
# Increase plot font size from default for all plots instead of setting it in each plot 
plt.rcParams.update({'font.size': 22})

In [3]:
# redirect all print statements to file instead of console

#file_path = 'logs_from_notebook.txt'
#sys.stdout = open(file_path, "w")

In [4]:
create_dataset = True
print_logfile_info = True
# When using time aggregation to smooth out plots and make them more visually interpretable what window size to use 
time_wind_str = '500ms'
time_wind_val = 500
data_slice = 'all' #'macro' 'micro' 'fast' 'slow' 'all'

#===================================
# Data source
#=================================== 
data_dir = '../../dataset_ver1/'

# This is a subset of log files that we would like to parse and plot at the moment depending on what is in the data 
#files = ['mobility_trace.txt']+ran_files+[
#                   #'dlThroughput_trace.txt',
#                   #'rtt_trace.txt', 
#                   'delay_trace.txt', 
#                   'handover_trace.txt',     
#                   'vrBurst_trace.txt', 'vrFragment_trace.txt',
#                   'dashClient_trace.txt', 'mpegPlayer_trace.txt', # video streaming
#                   'httpServerDelay_trace.txt', 'httpClientDelay_trace.txt', 'httpClientRtt_trace.txt'
#                    ]
# reduced version to test df buildup 

files= ['mobility_trace.txt']+['dashClient_trace.txt', 'delay_trace.txt', 'vrFragment_trace.txt']+ran_files
print('Raw data folder: \n'+data_dir+'\n')

Raw data folder: 
../../dataset_ver1/



In [5]:
plt.rcParams["figure.autolayout"] = False
#=======================================
# Initilizing empty lists 
#=======================================
# USe 15 runs for pretraining and fine tuning and 5 runs for testing 
runs = range(16, 20 + 1)
skip_runs = []

# constant multipliers
M = (10**6)
K = (10**3)


# This dataframe shall aggregate the logs over all the different files and horzcat it over files and then vertcat it over runs
master_df = pd.DataFrame()

#=======================================
# Iterate over runs and files 
#=======================================

#=================================================================
#use_runs = [13]
#use_runs = range(1,5)
#use_runs_paths = [(data_dir+'run'+str(r)) for r in use_runs]


#use_runs_paths = glob.glob(data_dir+'run*')
#for run in tqdm(use_runs_paths):

#for run in [data_dir+'run'+str(3)]:
for run in [data_dir+'run'+str(r) for r in runs]:
    
#=================================================================
    print('============================================================')
    print (run)
    print('============================================================')
    only_run = run.split('/')[-1]
    print(only_run)
    # check if run ran to completion
    # Don't need this anymore since we now have fixed the bug that would make it crash
    #with open(run+'/simulation_info.txt', "r") as sim_info_file:
    #    last_line = sim_info_file.readlines()[-1]
    #    print(last_line)
    #    if 'Elapsed wall clock' not in last_line:
    #        print('This run did not finish to completion, skipping it')
    #        skip_runs.append(only_run)
    #        print(skip_runs)
    #        continue
    
    sim_info = pd.read_csv(run+'/sim_info.txt', sep=',').set_index('parameter').to_dict('index')
    # Specify here topology details from the dataset we are using 
    # This can be infered from the files, but this is easier
    print(sim_info)
    separate_macro_micro = bool(int(sim_info['create_micro_layer']['value'])) 
    print('separate_macro_micro ', separate_macro_micro)
    total_num_cells=(int(sim_info['macro_num_bs']['value'])+int(sim_info['micro_num_bs']['value'])) if separate_macro_micro else int(sim_info['macro_num_bs']['value']) 
    print('total_num_cells', total_num_cells)
    total_num_ues=int(sim_info['macro_layer_ues']['value'])+int(sim_info['micro_layer_ues']['value']) if separate_macro_micro else int(sim_info['macro_layer_ues']['value'])
    print('total_num_ues', total_num_ues)
    sim_time = int(sim_info['simulation_time_seconds']['value']) # seconds
    print('sim_time', sim_time)
    
    sim_start_time = pd.to_datetime(1.0, unit='s', origin='unix')
    sim_end_time = pd.to_datetime(sim_time, unit='s', origin='unix')

    dict_gnb_pos = pd.read_csv(run+'/gnb_locations.txt', sep=',').set_index('cellId').to_dict('index')
    dict_gnb_pos[0] = {'gnbpos_x': np.nan, 'gnbpos_y': np.nan, 'gnbpos_z': np.nan}
    #print(dict_gnb_pos)
    # read UE group list
    ue_groups = open(run+'/ue_gnb_groups.txt').readlines()
    macro_imsis = [int(i) for i in ue_groups[0].split(',')[1:-1]]
    print("Macro UE IMSIs: ", macro_imsis)
    micro_imsis = list(set(range(1, total_num_ues+1)) - set(macro_imsis))
    print("Micro UE IMSIs: ", micro_imsis)
    fast_imsis = [int(i) for i in ue_groups[1].split(',')[1:-1]]
    print("Macro fast UE IMSIs: ", fast_imsis)
    slow_imsis = list(set(range(1, total_num_ues+1)) - set(fast_imsis) - set(micro_imsis))
    print("Macro Slow UE IMSIs: ", slow_imsis)
    #only_delay_imsis=[3,5,7,9,13,15,17,19,23,55,57]
    only_delay_imsis=[int(i) for i in ue_groups[2].split(',')[1:-1]]
    print("Only delay IMSIs: ", only_delay_imsis)

    
    # Read these from the ue_groups file later and also rename ue_groups to something else 
    macro_cells = [1,2,3]
    print('Macro CellIds: ', macro_cells)
    micro_cells = [4,5,6]
    print('Micro CellIds: ', micro_cells)
    
    vr_imsi=np.empty(0)
    dash_imsi=np.empty(0)
    http_imsi=np.empty(0)
    
    df_mi_cellId = None
    df_cellId = None
    
    # This dataframe shall aggregate the parsed features from all the different files 
    # before vertically concatenating to master_df 
    per_run_df = pd.DataFrame()
    # stored from the first RAN file processed 
    basic_info = pd.DataFrame(columns=['IMSI', 'cellId', 'conn_imsi_count'])
    first_ran_file = True 
    
    #chosen_imsis = None
#=================================================================
    for file in files:
#=================================================================
        print('--------------------------------------------')
        print(file)
        start_time = time.time()
        
        #=======================================
        # Preprocess logs 
        #=======================================
        
        ## Read file while fixing the tab issue
        if file in files_with_trailing_tab:
            #df = pd.read_csv(data_dir+run+'/'+file, sep='\t', usecols=range(0,18))
            df = pd.read_csv(run+'/'+file, sep='\t', usecols=range(0,18))
            print(run+'/'+file)
        else:
            #df = pd.read_csv(data_dir+run+'/'+file, sep='\t')
            df = pd.read_csv(run+'/'+file, sep='\t')
            print(run+'/'+file)
        
        print('time to read file: ', (time.time() - start_time))
        
        
        ## Do some file specific preprocessing
        ## Make uniform the timestamp units convert them all to micro seconds 
        if '% time' in df.columns:
            df.rename(columns = {'% time':'tstamp_us'}, inplace = True)
            if file_name_to_tstamp_unit[file] == 'ms':
                df['tstamp_us'] = df['tstamp_us']*K
            elif file_name_to_tstamp_unit[file] == 's':
                df['tstamp_us'] = df['tstamp_us']*M
        
        ## Make uniform the timestamp units        
        if '% start' in df.columns:
            ## TO DO: check if this is actually micro seconds. I think it is seconds   
            df.rename(columns = {'% start':'tstamp_us'}, inplace = True)
            df.rename(columns = {'end':'end_timeslot_us'}, inplace = True)
            df['tstamp_us'] = df['tstamp_us']*M
            df['end_timeslot_us'] = df['end_timeslot_us']*M
        
        ## Some internally generated logs use the naming 'CellId' replace that with 'cellId'
        if ('CellId' in df.columns):
            df.rename(columns = {'CellId':'cellId'}, inplace = True)
        if ('currentCellId' in df.columns):
            df.rename(columns = {'currentCellId':'cellId'}, inplace = True)    
        
        # Warning: cellId and IMSI here are IP addresses, but it should not matter since 
        # there will only be 1 UE doing these ul and dl throughput scans   
        if file == 'dlThroughput_trace.txt':
            df.rename(columns = {'toAddr':'IMSI', 'fromAddr': 'cellId'}, inplace = True)
            assert (df['IMSI'].nunique() == 1), "More than one throughput measurement UE is in the logs" 
        if file == 'ulThroughput_trace.txt':
            df.rename(columns = {'toAddr':'cellId', 'fromAddr': 'IMSI'}, inplace = True)
            assert (df['IMSI'].nunique() == 1), "More than one throughput measurement UE is in the logs" 
            
        ## Just for plotting change the timestamp_us to seconds and delay values to milli seconds 
        ## since I am mostly plotting directly from pandas and don't know how to add a multiplicative factor to a column    
        if 'tstamp_us' in df.columns:
            df['tstamp_us'] = df['tstamp_us']/M
            # Set datetime index for all files so that we can do series operations 
            datatime_timestamps = pd.to_datetime(df['tstamp_us'], unit='s', origin='unix')
            df = df.set_index(datatime_timestamps, inplace=False)
            
            # add a sample at the beginning and ending of every timeseries at sim_start_time and sim_end_time. 
            # This way the timeseries after resampling are all of the same length 
            if file in a_vs_b_files: 
                alignment_sample = np.empty(df.shape[1])
                alignment_sample[:] = np.nan
                start_sample = pd.DataFrame([alignment_sample], columns=df.columns, index=[sim_start_time])
                end_sample = pd.DataFrame([alignment_sample], columns=df.columns, index=[sim_end_time])
                if 'dir' in df.columns:
                    imsis_in_file = sorted(df['IMSI'].unique())
                    # all_imsis = range(1,total_num_ues+1)
                    for imsi in imsis_in_file:
                        for di in ['UL', 'DL']:
                            start_sample['IMSI'] = imsi
                            start_sample['dir'] = di
                            end_sample['IMSI'] = imsi
                            end_sample['dir'] = di
                            df = pd.concat([start_sample, df, end_sample])
                elif 'cellId' in df.columns:
                    #print('The issue is here. I am only initializing for cell Id 0 and not the others which is why those start from 0.5 and cellSI 0 starts from 0')
                    # Is there a better way to do this ? Like just slice them afterwards to something smaller? 
                    imsis_in_file = sorted(df['IMSI'].unique())
                    # all_imsis = range(1,total_num_ues+1)
                    for imsi in imsis_in_file:
                        start_sample['IMSI'] = imsi
                        start_sample['cellId'] = 0
                        end_sample['IMSI'] = imsi
                        end_sample['cellId'] = 0
                        df = pd.concat([start_sample, df, end_sample])
                else:
                    imsis_in_file = sorted(df['IMSI'].unique())
                    # all_imsis = range(1,total_num_ues+1)
                    for imsi in imsis_in_file:
                        start_sample['IMSI'] = imsi
                        end_sample['IMSI'] = imsi
                        df = pd.concat([start_sample, df, end_sample])
        
        if file == 'dlThroughput_trace.txt' or file == 'ulThroughput_trace.txt':
            df['IMSI'] = 1
            thput_meas_imsi = 1
        
        # converting all delay values to ms instead of us
        if 'delay' in df.columns:
            df['delay'] = df['delay']/K
        
        #=======================================
        # Print log file info  
        #=======================================
        if print_logfile_info:
            ## Display info about the UEs who have made entries in this file to make sure that all the UEs who should be here are here. 
            print('ueIds: min:', min(df['IMSI'].value_counts().index), 'max:', max(df['IMSI'].value_counts().index), 
                  'count:', len(df['IMSI'].value_counts().index))
            if (file in ran_files) and (len(df['IMSI'].value_counts().index) < total_num_ues):
                print('WARNING: Fewer UEs in this file than the total number in the simulation')
                print(df['IMSI'].value_counts())

            ## Display info about the Cells who have made entries in this file
            #print('cellIds: min:', min(df['cellId'].value_counts().index), 'max:', max(df['cellId'].value_counts().index),
            #     'count:', len(df['cellId'].value_counts().index))

            ## Total runtime of log
            print('log time (start, end): (', np.nanmin(df['tstamp_us']), ', ' ,np.nanmax(df['tstamp_us']), ')')
            print('log runtime:', (np.nanmax(df['tstamp_us']) - np.nanmin(df['tstamp_us'])), ' seconds')
           
        #==========================================================================
        # File specific extraction and aggregation of metrics for dataset creation
        #==========================================================================
        
        if create_dataset:
            
            # this must come first, so make sure it is the first file being read 
            # by putting it first in the files list
            
            if file == 'mobility_trace.txt':
                
                # Need to take unique for each IMSI list
                df_mi_cellId_imsi_conn = df[['IMSI', 'cellId']].sort_values(by=['cellId']).groupby(by=['cellId'])
                df_mi_cellId_imsi_conn = df_mi_cellId_imsi_conn.resample(time_wind_str).agg(list)
                # This slices away any extras. It does not take care if there is a window missing i.e. the sequence is shorter 
                # lets just slice off a second since I goofed up in the beginning and dont want to include extra samples for each cell 
                df_mi_cellId_imsi_conn = df_mi_cellId_imsi_conn.drop(['cellId'], axis=1).loc[pd.IndexSlice[:, sim_start_time : sim_end_time], :]  
                df_mi_cellId_imsi_conn.index = df_mi_cellId_imsi_conn.index.set_names(['cellId', 'wind_tstamp'])
             
            elif file == 'dashClient_trace.txt':
                
                chosen_imsis = filtered_imsis(data_slice, sorted(df['IMSI'].unique()))                    
                print('video streaming IMSIs: ', chosen_imsis)
                
                # IMSI perspective 
                
                # Create the UE and cell perspective for the dataset 
                # This has 2 levels of indexes, one is IMSI and the other is windowed timestep  
                df_mi_imsi_log = df.sort_values(by=['IMSI']).groupby(by=['IMSI'])
                df_mi_imsi_log = df_mi_imsi_log.resample(time_wind_str).mean()
        
                # This slices away any extras. It does not take care if there is a window missing i.e. the sequence is shorter 
                df_mi_imsi_log = df_mi_imsi_log.loc[pd.IndexSlice[:, sim_start_time : sim_end_time], :]
                # give a name to the resampled time index 
                df_mi_imsi_log.index = df_mi_imsi_log.index.set_names(['IMSI', 'wind_tstamp'])
                
                this_log_df = pd.DataFrame(columns=['newBitRate_bps', 'oldBitRate_bps'])
                # I want to drop all the rows that correspond to IMSIs that are not in chosen_imsis
                this_log_df['newBitRate_bps'] = df_mi_imsi_log[df_mi_imsi_log.index.get_level_values('IMSI').isin(chosen_imsis)]['newBitRate_bps']
                this_log_df['oldBitRate_bps'] = df_mi_imsi_log[df_mi_imsi_log.index.get_level_values('IMSI').isin(chosen_imsis)]['oldBitRate_bps']
                
                # add a prefix to the column names
                this_log_df = this_log_df.add_prefix(file+'_')
                                                
                # Make sure that the order of IMSIs is the same as when we do the RAN logs
                #print(this_log_df.shape)
                #print(this_log_df.columns)
                #print(this_log_df.head(n=2))
                
                # Concatenating along axis=1 and setting 'IMSI' and 'wind_tstamp' as the index again
                per_run_df = pd.concat([per_run_df, this_log_df], axis=1, sort=True)
                print(per_run_df.head(n=2))
                
                print('time to parse file: ', (time.time() - start_time)) 
                
            elif file == 'vrFragment_trace.txt':

                chosen_imsis = filtered_imsis(data_slice, sorted(df['IMSI'].unique()))
                print('VR IMSIs: ', chosen_imsis)
                
                this_log_df = pd.DataFrame()
                tmp_this_log_df = pd.DataFrame()

                for imsi, group1 in df.sort_values(by=['IMSI']).groupby(by=['IMSI']):
                    imsi = imsi[0]
                    if imsi in chosen_imsis:
                        #print(imsi)        
                        vr_frag_time_df = (group1['delay']).resample(time_wind_str).mean().loc[sim_start_time : sim_end_time]
                        #print('vr_frag_time_df')
                        #print(vr_frag_time_df.index)
                        #print(vr_frag_time_df.head(n=2))
                        vr_frag_thput_df = (group1['burstSize']*8/K/group1['numFragsInBurst']/group1['delay']).resample(time_wind_str).mean().loc[sim_start_time : sim_end_time] #Mbps
                        #print('vr_frag_thput_df')
                        #print(vr_frag_thput_df.index)
                        #print(vr_frag_thput_df.head(n=2))
                        
                        vr_burst_time_df = list()
                        vr_burst_thput_df = list()
                        vr_index = list()
                        
                        for burstSeqNum, group2 in group1.groupby(by=['burstSeqNum']):
                            # this takes the receive timestamp
                            vr_index = vr_index + [group2.index[-1]]
                            vr_burst_time_df = vr_burst_time_df + [group2['delay'].iloc[-1]]
                            vr_burst_thput_df = vr_burst_thput_df + [(group2['burstSize'].iloc[-1]*8/K)/group2['delay'].iloc[-1] ]# Mbps
                        
                        vr_burst_time_df = pd.DataFrame(index=pd.to_datetime(vr_index), columns=['vr_burst_time'], data=vr_burst_time_df)
                        vr_burst_time_df = vr_burst_time_df.resample(time_wind_str).mean().loc[sim_start_time : sim_end_time]
                        #print('vr_burst_time_df')
                        #print(vr_burst_time_df.index)
                        #print(vr_burst_time_df.head(n=2))
                        
                        vr_burst_thput_df = pd.DataFrame(index=vr_index, columns=['vr_burst_thput_mbps'], data=vr_burst_thput_df)
                        vr_burst_thput_df = vr_burst_thput_df.resample(time_wind_str).mean().loc[sim_start_time : sim_end_time] # Mbps
                        #print('vr_burst_thput_df')
                        #print(vr_burst_thput_df.index)
                        #print(vr_burst_thput_df.head(n=2))
                        
                        print('----------------------------------')
                        # concatenate
                        this_log_df_per_imsi = pd.concat([vr_frag_time_df, vr_frag_thput_df, vr_burst_time_df, vr_burst_thput_df], axis=1)
                        this_log_df_per_imsi.columns = ['vr_frag_time', 'vr_frag_thput_mbps', 'vr_burst_time', 'vr_burst_thput_mbps']
                        # Add IMSI as a column to use later for indexing
                        this_log_df_per_imsi['IMSI'] = [imsi] * this_log_df_per_imsi.shape[0]
                        #print('this_log_df_per_imsi')
                        #print(this_log_df_per_imsi.index)
                        #print(this_log_df_per_imsi.head(n=2))
                        
                        tmp_this_log_df = pd.concat([tmp_this_log_df, this_log_df_per_imsi], axis=0)  
                        #print('tmp_this_log_df')
                        #print(tmp_this_log_df.index)
                        #print(tmp_this_log_df.head(n=2))
                        
                # Make sure that the order of IMSIs is the same as when we do the RAN logs 
                # set index name for the resampled time stamp
                print('---------------------------------------------------------')
                tmp_this_log_df.index = tmp_this_log_df.index.set_names('wind_tstamp')
                #print('tmp_this_log_df')
                #print(tmp_this_log_df.head(n=2))
                
                # Convert the IMSI column to an index
                #tmp_this_log_df = tmp_this_log_df.sort_values(by=['IMSI']).groupby(by=['IMSI'])
                tmp_this_log_df = tmp_this_log_df.set_index('IMSI', append=True)
                this_log_df = tmp_this_log_df.reorder_levels(['IMSI', 'wind_tstamp'])
                #print('this_log_df')
                #print(this_log_df.head(n=2))
                #print(this_log_df.columns)
                
                #this_log_df.columns = ['vr_frag_time', 'vr_frag_thput', 'vr_burst_time', 'vr_burst_thput']
                # add a prefix to the column names
                this_log_df = this_log_df.add_prefix(file+'_')
                
                #print(this_log_df.shape)
                #print(this_log_df.columns)
                #print(this_log_df.head(n=2))
                
                # horz concatenate it to the per_run_df
                #per_run_df = pd.concat([per_run_df, this_log_df], axis=1)
                # Concatenating along axis=1 and setting 'IMSI' and 'wind_tstamp' as the index again
                per_run_df = pd.concat([per_run_df, this_log_df], axis=1, sort=True)
                #print(per_run_df.head(n=2))
                
                print('time to parse file: ', (time.time() - start_time))
                
            elif file == 'delay_trace.txt': 
                # process the my_metrics part and skip the cell_metrics part 
                # just take the ul and dl delay cols and discard the rest 
                
                # IMSI perspective 
                
                # Create the UE and cell perspective for the dataset 
                # This has 2 levels of indexes, one is IMSI and the other is windowed timestep  
                # Also drop the UL and DL markers so that only numerica features are left
                df_ul = df[df['dir'] == 'UL'].drop(['dir'], axis=1)
                df_dl = df[df['dir'] == 'DL'].drop(['dir'], axis=1)
                #print(df_ul.columns)
                #print(df_dl.columns)
                df_mi_imsi_log_ul = df_ul.sort_values(by=['IMSI']).groupby(by=['IMSI'])
                
                df_mi_imsi_log_ul = df_mi_imsi_log_ul.resample(time_wind_str).mean()
                
                df_mi_imsi_log_dl = df_dl.sort_values(by=['IMSI']).groupby(by=['IMSI'])
                df_mi_imsi_log_dl = df_mi_imsi_log_dl.resample(time_wind_str).mean()
                
                # This slices away any extras. It does not take care if there is a window missing i.e. the sequence is shorter 
                df_mi_imsi_log_ul = df_mi_imsi_log_ul.loc[pd.IndexSlice[:, sim_start_time : sim_end_time], :]
                df_mi_imsi_log_dl = df_mi_imsi_log_dl.loc[pd.IndexSlice[:, sim_start_time : sim_end_time], :]
                # give a name to the resampled time index 
                df_mi_imsi_log_ul.index = df_mi_imsi_log_ul.index.set_names(['IMSI', 'wind_tstamp'])
                df_mi_imsi_log_dl.index = df_mi_imsi_log_dl.index.set_names(['IMSI', 'wind_tstamp'])
                
                this_log_df = pd.DataFrame(columns=['ul_delay', 'dl_delay'])
                
                chosen_imsis = filtered_imsis(data_slice, sorted(df['IMSI'].unique()))

                this_log_df['ul_delay'] = df_mi_imsi_log_ul[df_mi_imsi_log_ul.index.get_level_values('IMSI').isin(chosen_imsis)]['delay']
                this_log_df['dl_delay'] = df_mi_imsi_log_dl[df_mi_imsi_log_dl.index.get_level_values('IMSI').isin(chosen_imsis)]['delay']
            
                # Make sure that the order of IMSIs is the same as when we do the RAN logs
                # add a prefix to the column names
                this_log_df = this_log_df.add_prefix(file+'_')
                
                #print(this_log_df.shape)
                #print(this_log_df.columns)
                
                # horz concatenate it to the per_run_df
                #print(per_run_df.head(2))
                #print(this_log_df.head(2))
                
                per_run_df = pd.concat([per_run_df, this_log_df], axis=1, sort=True)
                #print(per_run_df.head(n=2))
                
                print('time to parse file: ', (time.time() - start_time)) 
            
            # ends up being all the RAN files 
            else:
                # IMSI perspective 
                
                # Create the UE and cell perspective for the dataset 
                # This has 2 levels of indexes, one is IMSI and the other is windowed timestep  
                df_mi_imsi_log = df.sort_values(by=['IMSI']).groupby(by=['IMSI'])
                # identify the sum and mean columns from the set of columns in this log file
                sum_cols_feats = list(set(ran_sum_feats) & set(df.columns))
                mean_cols_feats = list(set(df.columns) - set(sum_cols_feats))
                #resample into windows and either mean or sum based on which group the column belongs to
                sum_df_mi_imsi_log = df_mi_imsi_log[sum_cols_feats].resample(time_wind_str).sum()
                mean_df_mi_imsi_log = df_mi_imsi_log[mean_cols_feats].resample(time_wind_str).mean()
                df_mi_imsi_log = pd.concat([mean_df_mi_imsi_log, sum_df_mi_imsi_log], axis=1)
                # This slices away any extras. It does not take care if there is a window missing i.e. the sequence is shorter 
                df_mi_imsi_log = df_mi_imsi_log.loc[pd.IndexSlice[:, sim_start_time : sim_end_time], :]
                # give a name to the resampled time index 
                df_mi_imsi_log.index = df_mi_imsi_log.index.set_names(['IMSI', 'wind_tstamp'])
                df_mi_imsi_log = df_mi_imsi_log.drop(drop_cols_before_sep, axis=1, errors='ignore')

                # Cell perspective
                
                # Create the UE abnd cell perspective for the dataset 
                # This has 2 levels of indexes, one is IMSI and the other is windowed timestep  
                df_mi_cell_log = df.sort_values(by=['cellId']).groupby(by=['cellId'])
                #resample into windows and either mean or sum based on which group the column belongs to 
                sum_df_mi_cell_log = df_mi_cell_log[sum_cols_feats].resample(time_wind_str).sum()
                mean_df_mi_cell_log = df_mi_cell_log[mean_cols_feats].resample(time_wind_str).mean()
                df_mi_cell_log = pd.concat([mean_df_mi_cell_log, sum_df_mi_cell_log], axis=1)
                # This slices away any extras. It does not take care if there is a window missing i.e. the sequence is shorter 
                df_mi_cell_log = df_mi_cell_log.loc[pd.IndexSlice[:, sim_start_time : sim_end_time], :]
                # give a name to the resampled time index 
                df_mi_cell_log.index = df_mi_cell_log.index.set_names(['cellId', 'wind_tstamp'])
                df_mi_cell_log = df_mi_cell_log.drop(drop_cols_before_sep, axis=1, errors='ignore')
                
                # Align the cell perspective to the IMSI perspective
                
                this_log_df = pd.DataFrame()
                # the imsis from whose application's perspective we are constructing the dataset 
                # make sure we are reading from a fixed list and not something like for imsi in all_imsis_in_this_file
                # we need the same order of IMSIs to be iterated thru so that the data frame for each file 
                # will have the same imsi order 

                # THIS WAS ADDED 27 Dec
                chosen_imsis = filtered_imsis(data_slice, sorted(df['IMSI'].unique()))
                
                for i_imsi in chosen_imsis:
                    
                    num_rows = df_mi_imsi_log.loc[i_imsi].shape[0]
                    num_cols = df_mi_imsi_log.loc[i_imsi].shape[1]
                    basic_info_per_imsi = pd.DataFrame(index=df_mi_imsi_log.loc[i_imsi].index,
                                                   columns=['IMSI', 'cellId', 'conn_imsi_count'])
                    if first_ran_file:
                        # set the index for this as well 
                        basic_info_per_imsi['IMSI'] = [i_imsi] * num_rows
                        basic_info_per_imsi['cellId'] = [0] * num_rows #an array of the corresponding cellId
                        basic_info_per_imsi['conn_imsi_count'] = [0] * num_rows #as given by the imsis connected cell 
                        
                    # Get the metrics for this IMSI
                    my_metrics = df_mi_imsi_log.loc[i_imsi]

                    # Initialize a DataFrame to store metrics for other IMSIs
                    cell_metrics = pd.DataFrame(index=df_mi_imsi_log.loc[i_imsi].index,
                                                  columns=['cell_' + col for col in df_mi_imsi_log.loc[i_imsi].columns],
                                                  data=np.zeros((num_rows, num_cols)))
                    
                    # Iterate over each window for this IMSI
                    for wind in df_mi_imsi_log.loc[i_imsi].index:
                        # the cell that the UE was connected to
                        con_cell = np.round(df_mi_imsi_log.loc[i_imsi, wind]['cellId'])
                        if first_ran_file:
                            basic_info_per_imsi.loc[wind, 'cellId'] = con_cell
                        # Handle NaN case
                        if np.isnan(con_cell):
                            cell_metrics.loc[wind] = np.nan
                            continue

                        if first_ran_file:
                            basic_info_per_imsi.loc[wind, 'conn_imsi_count'] = len(np.unique(df_mi_cellId_imsi_conn.loc[con_cell, wind][0]))
                        cell_metrics.loc[wind] = df_mi_cell_log.loc[con_cell, wind].values
                        
                    # Combine metrics for the current IMSI and other IMSIs
                    this_imsi_df = pd.concat([my_metrics, cell_metrics], axis=1)

                    # Drop the columns that need to be removed
                    this_imsi_df = this_imsi_df.drop(drop_cols_after_sep, axis=1, errors='ignore')

                    # Append the metrics for this IMSI to the final DataFrame and include IMSI as a column to use as index later
                    this_imsi_df['IMSI'] = [i_imsi] * this_imsi_df.shape[0]
                    this_log_df = pd.concat([this_log_df, this_imsi_df], axis=0)
                    
                    if first_ran_file:
                        # combine the basic info 
                        basic_info = pd.concat([basic_info, basic_info_per_imsi], axis=0)
     
                # end of for over imsis

                # convert the IMSI column of this_log_df into an index
                this_log_df = this_log_df.set_index('IMSI', append=True)
                this_log_df = this_log_df.reorder_levels(['IMSI', 'wind_tstamp'])
                
                # add a prefix to the column names
                this_log_df = this_log_df.add_prefix(file+'_')
                #print(this_log_df.shape)
                #print(this_log_df.columns)
                
                # horz concatenate it to the per_run_df
                if first_ran_file:
                    basic_info.index = basic_info.index.set_names('wind_tstamp')                    
                    # Convert the IMSI column to an index
                    basic_info = basic_info.set_index('IMSI', append=True)
                    basic_info = basic_info.reorder_levels(['IMSI', 'wind_tstamp'])                 
                    #print(basic_info.head)
                    #print(per_run_df.head)
                    #print(this_log_df.head)
                    per_run_df = pd.concat([basic_info, per_run_df, this_log_df], axis=1)
                
                per_run_df = pd.concat([per_run_df, this_log_df], axis=1)
                first_ran_file = False
                print('time to parse file: ', (time.time() - start_time))   
            # end of elif file in ran_files:
        # end of if create_dataset:
    # end of for over files
    # each run is saved separately in a different file 
    # Save the dataset
    save_dir=data_dir+'parsed_data/'
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    per_run_df = per_run_df.reset_index(level=('IMSI',))    
    per_run_df.to_csv(save_dir+only_run+'_dataslice_'+data_slice+'_video_delay_vr_'+time_wind_str+'.csv', index=True)
    print('============================================================')   
    print('======================= RUN DONE =====================================')
    #master_df = pd.concat([master_df, per_run_df], axis=0)   
    #num_runs=num_runs+1
# end of for over runs

#if create_dataset:
#    # Save the dataset
#    save_dir=data_dir+'parsed_data/'
#    if not os.path.exists(save_dir):
#        os.makedirs(save_dir)
#    master_df.to_csv(save_dir+'dataset_'+data_slice+'_video_delay_vr_'+time_wind_str+'.csv', index=True)

#print(master_df.shape)
#print(list(master_df.columns))
print('============================================================')   
print('======================= ALL DONE =====================================')

../../dataset_ver1/run16
run16
{'macro_rings': {'value': '0'}, 'macro_num_bs': {'value': '3'}, 'macro_layer_ues': {'value': '30'}, 'simulation_time_seconds': {'value': '1000'}, 'rand_seed': {'value': '15'}, 'create_micro_layer': {'value': '1'}, 'scheduler': {'value': 'PF'}, 'handover_algo': {'value': 'A2A4Rsrq'}, 'micro_num_bs': {'value': '3'}, 'micro_layer_ues': {'value': '60'}, 'delay_app_installed': {'value': '1'}, 'delay_pkt_interval_seconds': {'value': '+0.1s'}, 'rtt_app_installed': {'value': '0'}, 'http_app_installed': {'value': '1'}, 'dash_app_installed': {'value': '1'}, 'vr_app_installed': {'value': '1'}}
separate_macro_micro  True
total_num_cells 6
total_num_ues 90
sim_time 1000
Macro UE IMSIs:  [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30]
Micro UE IMSIs:  [31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 

# Do not know what this code does dfferently
# Seems like it tries to take each IMSI versus the rest instead of taking the whole cell as I have done in the previous cell 

In [None]:
        if create_dataset:
            # this must come first, so make sure it is the first file being read 
            # by putting it first in the files list
            if file == 'mobility_trace.txt':
                # Need to take unique for each IMSI list
                df_mi_cellId = df[['IMSI', 'cellId']].sort_values(by=['cellId']).groupby(by=['cellId'])
                df_mi_cellId = df_mi_cellId.resample(time_wind_str).agg(list)
                # This slices away any extras. It does not take care if there is a window missing i.e. the sequence is shorter 
                # lets just slice off a second since I goofed up in the beginning and dont want to include extra samples for each cell 
                df_mi_cellId = df_mi_cellId.drop(['cellId'], axis=1).loc[pd.IndexSlice[:, sim_start_time : sim_end_time], :]
                df_cellId = df_mi_cellId.reset_index(level='cellId')
                #for cell in (df_mi_cellId.index.get_level_values('cellId').unique()):
                #    print('cell ', cell)
                #    print(df_mi_cellId.loc[cell])
             
            elif file in ran_files:
            #elif False:    
                # Do the one versus rest thingy 
                # This has 2 levels of indexes, one is IMSI and the other is windowed timestep 
                df_mi_imsi_log = df.sort_values(by=['IMSI']).groupby(by=['IMSI'])
                df_mi_imsi_log = df_mi_imsi_log.resample(time_wind_str).mean()
                # This slices away any extras. It does not take care if there is a window missing i.e. the sequence is shorter 
                df_mi_imsi_log = df_mi_imsi_log.loc[pd.IndexSlice[:, sim_start_time : sim_end_time], :]
                #df_imsi_log = df_mi_imsi_log.droplevel('IMSI')
                # give a name to the resampled time index 
                df_mi_imsi_log.index = df_mi_imsi_log.index.set_names(['IMSI', 'wind_tstamp'])


                full_df = pd.DataFrame()
                # the imsis from whose application's perspective we are constructing the dataset 
                for i_imsi in macro_imsis:
                    # Get the metrics for this IMSI
                    my_metrics = df_mi_imsi_log.loc[i_imsi]

                    # the time over which the simulation ran 
                    num_rows = df_mi_imsi_log.loc[i_imsi].shape[0]
                    num_cols = df_mi_imsi_log.loc[i_imsi].shape[1]

                    # Initialize a DataFrame to store metrics for other IMSIs
                    others_metrics = pd.DataFrame(index=df_mi_imsi_log.loc[i_imsi].index,
                                                  columns=['others_' + col for col in df_mi_imsi_log.loc[i_imsi].columns],
                                                  data=np.zeros((num_rows, num_cols)))

                    # Iterate over each window for this IMSI
                    for wind in df_mi_imsi_log.loc[i_imsi].index:
                        # the cell that the UE was connected to
                        con_cell = np.round(df_mi_imsi_log.loc[i_imsi, wind]['cellId'])

                        # Handle NaN case
                        if np.isnan(con_cell):
                            others_metrics.loc[wind] = np.nan
                            continue

                        # Find other IMSIs for this window
                        other_imsis = np.unique(df_mi_cellId.loc[con_cell, wind][0])
                        other_imsis = other_imsis[other_imsis != i_imsi]  # Exclude the current IMSI
                        #others_windows = df_mi_imsi_log.index[level='wind_tstamp']

                        # Calculate the mean of other IMSIs' metrics for each window
                        if len(other_imsis) > 0:
                            others_metrics.loc[wind] = df_mi_imsi_log.loc[pd.IndexSlice[other_imsis, wind], :].mean().values
                        
                        print(others_metrics.loc[wind])
                        ggg
                    
                    print(others_metrics)
                    # Combine metrics for the current IMSI and other IMSIs
                    this_imsi_df = pd.concat([my_metrics, others_metrics], axis=1)

                    # Drop the columns that need to be removed
                    this_imsi_df = this_imsi_df.drop(drop_cols_after_sep, axis=1, errors='ignore')
                    #print(this_imsi_df)

                    # Append the metrics for this IMSI to the final DataFrame
                    full_df = pd.concat([full_df, this_imsi_df], axis=0)

                print(full_df.shape)
                print(full_df.columns)
                print('time to parse file: ', (time.time() - start_time))
                ggg

            
            #elif file in ran_files: 
            elif False:   # the one I wrote, unoptimised  
                # Do the one versus rest thingy 
                # This has 2 levels of indexes, one is IMSI and the other is windowed timestep 
                df_mi_imsi_log = df.sort_values(by=['IMSI']).groupby(by=['IMSI'])
                df_mi_imsi_log = df_mi_imsi_log.resample(time_wind_str).mean()
                # This slices away any extras. It does not take care if there is a window missing i.e. the sequence is shorter 
                df_mi_imsi_log = df_mi_imsi_log.loc[pd.IndexSlice[:, sim_start_time : sim_end_time], :]
                df_mi_imsi_log.index = df_mi_imsi_log.index.set_names(['IMSI', 'wind_tstamp'])
                #df_imsi_log = df_mi_imsi_log.droplevel('IMSI')

                full_df = pd.DataFrame()
                # the imsis from whose application's perspective we are constructing the dataset 
                for i_imsi in macro_imsis:
                    # The metrics from this imsi 
                    my_metrics = df_mi_imsi_log.loc[i_imsi]
                    # the time over which the simulation ran 
                    num_rows = df_mi_imsi_log.loc[i_imsi].shape[0]
                    num_cols = df_mi_imsi_log.loc[i_imsi].shape[1]
                    others_metrics = pd.DataFrame(np.zeros((num_rows, num_cols)))
                    others_metrics.index = df_mi_imsi_log.loc[i_imsi].index
                    others_metrics.columns = ['others_'+col for col in df_mi_imsi_log.loc[i_imsi].columns]
                    
                    for wind in df_mi_imsi_log.loc[i_imsi].index:
                        # the cell that the UE was connected to
                        # This can be Nan, which becomes a problem later on
                        # Why can this be nan ? because of the windowing ? I guess if there are no samples in this window, then it shall be nan  
                        con_cell = np.round(df_mi_imsi_log.loc[i_imsi, wind]['cellId'])
                        if np.isnan(con_cell):
                            others_metrics.loc[wind] = np.nan 
                            continue
                        #con_cell = min(max_cell_id, max(1, np.round(df_mi_imsi_log[i_imsi, wind] ['cellId'])))                        
                        # This can be nan and needs to be handled 
                        # nan was triggered by the file UlTxPhyStats.txt
                        other_imsis = np.unique(df_mi_cellId.loc[con_cell, wind][0])
                        other_imsis = other_imsis[other_imsis != i_imsi] 
                        for o_imsi in other_imsis:
                            # The metrics related to this other imsi for this window  
                            # I need to drop repeated cols and non metric type cols before I do this
                            # lets join them and then see whats we end up with and then decide what to drop 
                            others_metrics.loc[wind] = others_metrics.loc[wind] + df_mi_imsi_log.loc[o_imsi, wind].values # add row by row  
                        others_metrics.loc[wind] = others_metrics.loc[wind]/len(other_imsis)
                    print(others_metrics)
                    this_imsi_df = pd.concat([my_metrics, others_metrics], axis=1)
                    this_imsi_df = this_imsi_df.drop(drop_cols_after_sep, axis=1, errors='ignore')
                    #print(this_imsi_df)
                    full_df = pd.concat([full_df, this_imsi_df], axis=0)
                print(full_df.shape)
                print(full_df.columns)
                print('time to parse file: ', (time.time() - start_time))
                ggg
    # end of for over files 
        
    #num_runs=num_runs+1
# end of for over runs
print('============================================================')   
print('======================= DONE =====================================')

In [66]:
import os
os.system('cp parse_visualize_data.ipynb '+'./saved_notebooks/'+data_dir.split('/')[-2]+'.ipynb')

0