# Investigating data 

### Goal

- First simply look at data
- Then clean it removing all the node junk
- split into vector and scalar
- export to a csv file
- convert to a script.

In [2]:
import pandas as pd
import numpy as np

In [6]:
import os
os.getcwd()

'/home/brian/results-analysis/jupyter-notebooks'

In [7]:
def parse_if_number(s):
    try: return float(s)
    except: return True if s=="true" else False if s=="false" else s if s else None

def parse_ndarray(s):
    return np.fromstring(s, sep=' ') if s else None

In [8]:
mode4_raw = pd.read_csv("../data/raw_data/mode4.csv", converters = {
    'attrvalue': parse_if_number,
    'binedges': parse_ndarray,
    'binvalues': parse_ndarray,
    'vectime': parse_ndarray,
    'vecvalue': parse_ndarray})

In [9]:
mode4_raw.head()

Unnamed: 0,run,type,module,name,attrname,attrvalue,value,vectime,vecvalue
0,Mode4-0-20190323-12:03:13-23303,runattr,,,configname,Mode4,,,
1,Mode4-0-20190323-12:03:13-23303,runattr,,,datetime,20190323-12:03:13,,,
2,Mode4-0-20190323-12:03:13-23303,runattr,,,experiment,Mode4,,,
3,Mode4-0-20190323-12:03:13-23303,runattr,,,inifile,omnetpp.ini,,,
4,Mode4-0-20190323-12:03:13-23303,runattr,,,iterationvars,"$cqi=7, $numUEs=10",,,


In [10]:
mode4_raw.module.describe()

count                              68812
unique                               842
top       Mode4World.node[69].lteNic.mac
freq                                 230
Name: module, dtype: object

In [11]:
mode4_raw.module.unique()

array([nan, 'Mode4World.node[0].lteNic.pdcpRrc',
       'Mode4World.node[0].lteNic.rlc.um',
       'Mode4World.node[0].lteNic.rlc.am',
       'Mode4World.node[0].lteNic.phy', 'Mode4World.node[0].lteNic.mac',
       'Mode4World.node[6].lteNic.pdcpRrc',
       'Mode4World.node[6].lteNic.rlc.um',
       'Mode4World.node[6].lteNic.rlc.am',
       'Mode4World.node[6].lteNic.phy', 'Mode4World.node[6].lteNic.mac',
       'Mode4World.node[60].lteNic.pdcpRrc',
       'Mode4World.node[60].lteNic.rlc.um',
       'Mode4World.node[60].lteNic.rlc.am',
       'Mode4World.node[60].lteNic.phy', 'Mode4World.node[60].lteNic.mac',
       'Mode4World.node[66].lteNic.pdcpRrc',
       'Mode4World.node[66].lteNic.rlc.um',
       'Mode4World.node[66].lteNic.rlc.am',
       'Mode4World.node[66].lteNic.phy', 'Mode4World.node[66].lteNic.mac',
       'Mode4World.node[72].lteNic.pdcpRrc',
       'Mode4World.node[72].lteNic.rlc.um',
       'Mode4World.node[72].lteNic.rlc.am',
       'Mode4World.node[72].lteNic.phy',

In [12]:
broken_module = mode4_raw['module'].str.split('.', 3, expand=True)

In [13]:
mode4_raw["network"]   = broken_module[0]
mode4_raw["node"]      = broken_module[1]
mode4_raw["interface"] = broken_module[2]
mode4_raw["layer"]     = broken_module[3]

mode4_raw = mode4_raw.drop("module", axis=1)

In [14]:
mode4_raw.node = mode4_raw.node.str.replace("node", "")
mode4_raw.node = mode4_raw.node.str.replace("[", "")
mode4_raw.node = mode4_raw.node.str.replace("]", "")

# df['range'] = df['range'].str.replace(',','-')

In [15]:
mode4_raw.node.unique()

array([nan, '0', '6', '60', '66', '72', '78', '84', '90', '96', '102',
       '108', '114', '12', '18', '24', '30', '36', '42', '48', '54', '1',
       '7', '61', '67', '73', '79', '85', '91', '97', '103', '109', '115',
       '13', '19', '25', '31', '37', '43', '49', '55', '2', '8', '62',
       '68', '74', '80', '86', '92', '98', '104', '110', '116', '14',
       '20', '26', '32', '38', '44', '50', '56', '3', '9', '63', '69',
       '75', '81', '87', '93', '99', '105', '111', '117', '15', '21',
       '27', '33', '39', '45', '51', '57', '4', '10', '64', '70', '76',
       '82', '88', '94', '100', '106', '112', '118', '16', '22', '28',
       '34', '40', '46', '52', '58', '5', '11', '65', '71', '77', '83',
       '89', '95', '101', '107', '113', '119', '17', '23', '29', '35',
       '41', '47', '53', '59', 'radioMedium'], dtype=object)

In [16]:
# Need to remove radioMedium this row is definitely not something we expect.
broken_run = mode4_raw['run'].str.split('-', 4, expand=True)

In [17]:
broken_run[0].unique()

array(['Mode4'], dtype=object)

In [18]:
mode4_raw["scenario"]  = broken_run[0]
mode4_raw["run"]       = broken_run[1]
mode4_raw["date"]      = broken_run[2]
mode4_raw["time"]      = broken_run[3]
mode4_raw["processId"] = broken_run[4]

In [19]:
mode4_raw.head()

Unnamed: 0,run,type,name,attrname,attrvalue,value,vectime,vecvalue,network,node,interface,layer,scenario,date,time,processId
0,0,runattr,,configname,Mode4,,,,,,,,Mode4,20190323,12:03:13,23303
1,0,runattr,,datetime,20190323-12:03:13,,,,,,,,Mode4,20190323,12:03:13,23303
2,0,runattr,,experiment,Mode4,,,,,,,,Mode4,20190323,12:03:13,23303
3,0,runattr,,inifile,omnetpp.ini,,,,,,,,Mode4,20190323,12:03:13,23303
4,0,runattr,,iterationvars,"$cqi=7, $numUEs=10",,,,,,,,Mode4,20190323,12:03:13,23303


In [20]:
mode4_raw.type.unique()

array(['runattr', 'itervar', 'param', 'scalar', 'attr', 'vector'],
      dtype=object)

In [21]:
runattr_df = mode4_raw[mode4_raw["type"]=="runattr"]
runattr_df = runattr_df.dropna(axis=1, how="all")

In [22]:
itervar_df = mode4_raw[mode4_raw["type"]=="itervar"]
itervar_df = itervar_df.dropna(axis=1, how="all")

In [23]:
param_df = mode4_raw[mode4_raw["type"]=="param"]
param_df = param_df.dropna(axis=1, how="all")

In [24]:
attr_df = mode4_raw[mode4_raw["type"]=="attr"]
attr_df = attr_df.dropna(axis=1, how="all")

In [25]:
vector_df = mode4_raw[mode4_raw["type"]=="vector"]
vector_df = vector_df.dropna(axis=1, how="all")

In [26]:
scalar_df = mode4_raw[mode4_raw["type"]=="scalar"]
scalar_df = scalar_df.dropna(axis=1, how="all")

In [27]:
attr_df.columns

Index(['run', 'type', 'name', 'attrname', 'attrvalue', 'network', 'node',
       'interface', 'layer', 'scenario', 'date', 'time', 'processId'],
      dtype='object')

In [28]:
vector_df.name.unique()

array(['posX:vector', 'posY:vector', 'servingCell:vector',
       'transmission:vector(camStationId)',
       'transmission:vector(camGenerationDeltaTime)',
       'receivedPacketFromUpperLayer:vector(packetBytes)',
       'sentPacketToLowerLayer:vector(packetBytes)',
       'selectedNumSubchannels:vector', 'grantRequests:vector',
       'receivedPacketFromLowerLayer:vector(packetBytes)',
       'sentPacketToUpperLayer:vector(packetBytes)', 'selectedMCS:vector',
       'scisSent:vector', 'tbsSent:vector', 'txRxDistance:vector',
       'scisReceived:vector', 'scisNotDecoded:vector',
       'tbsReceived:vector', 'tbsFailedDueToNoSCI:vector',
       'scisDecoded:vector', 'tbsDecoded:vector', 'macDelayD2D:vector',
       'reception:vector(camStationId)',
       'reception:vector(camGenerationDeltaTime)',
       'grantBreakSize:vector', 'maximumCapacity:vector',
       'grantBreak:vector', 'tbFailedButSCIReceived:vector'], dtype=object)

In [29]:
scalar_df.name.unique()

array(['sentPacketToLowerLayer:count',
       'sentPacketToLowerLayer:sum(packetBytes)',
       'sentPacketToUpperLayer:count',
       'sentPacketToUpperLayer:sum(packetBytes)',
       'receivedPacketFromLowerLayer:count',
       'receivedPacketFromLowerLayer:sum(packetBytes)',
       'receivedPacketFromUpperLayer:count',
       'receivedPacketFromUpperLayer:sum(packetBytes)', 'pdcpdrop3:mean',
       'pdcpdrop2:mean', 'pdcpdrop1:mean', 'pdcpdrop0:mean',
       'rlcPduPacketLossD2D:mean', 'rlcPduPacketLossDl:mean',
       'rlcPduPacketLossUl:mean', 'rlcPacketLossTotal:mean',
       'rlcPacketLossD2D:mean', 'rlcPacketLossDl:mean',
       'rlcPacketLossUl:mean', 'rlcCellPacketLossD2D:mean',
       'rlcCellPacketLossUl:mean', 'rlcCellPacketLossDl:mean',
       'rlcCellThroughputD2D:mean', 'rlcCellThroughputDl:mean',
       'rlcCellThroughputUl:mean', 'rlcPduThroughputD2D:mean',
       'rlcPduDelayD2D:mean', 'rlcPduThroughputUl:mean',
       'rlcPduDelayUl:mean', 'rlcPduThroughputDl:mean',

In [31]:
vector_df.columns

Index(['run', 'type', 'name', 'vectime', 'vecvalue', 'network', 'node',
       'interface', 'layer', 'scenario', 'date', 'time', 'processId'],
      dtype='object')

In [34]:
vector_df.vectime

58531    [1.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, ...
58533    [1.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, ...
58535                                                [1.0]
58539    [1.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, ...
58541    [1.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, ...
58543                                                [1.0]
58547    [1.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, ...
58549    [1.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, ...
58551                                                [1.0]
58555    [1.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, ...
58557    [1.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, ...
58559                                                [1.0]
58563    [1.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, ...
58565    [1.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, ...
58567                                                [1.0]
58571    [1.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, ...
58573    [1.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, .

```
pdf = pd.read_csv("/home/brian/results-analysis/data/processed_data/2019-03-23_12:47-mode4/vector.csv",usecols=["node","name", "vectime", "vecvalue"])
pdf2 = pdf[pdf["node"] == 0]

tbs_recived = pdf2[pdf2["name"] == "tbsReceived:vector"]["vecvalue"].values
tbs_received_time = pdf2[pdf2["name"] == "tbsReceived:vector"]["vectime"].values

txrxdistance = pdf2[pdf2["name"] == "txRxDistance:vector"]["vecvalue"].values
txrxdistance_time = pdf2[pdf2["name"] == "txRxDistance:vector"]["vectime"].values

tbs_failbutrec = pdf2[pdf2["name"] == "tbFailedButSCIReceived:vector"]["vecvalue"].values
tbs_failbutrec_time = pdf2[pdf2["name"] == "tbFailedButSCIReceived:vector"]["vectime"].values

tbs_failduesci = pdf2[pdf2["name"] == "tbsFailedDueToNoSCI:vector"]["vecvalue"].values
tbs_failduesci_time = pdf2[pdf2["name"] == "tbsFailedDueToNoSCI:vector"]["vectime"].values
```

In [36]:
def create_bins(lower_bound, width, quantity):
    """ create_bins returns an equal-width (distance) partitioning. 
        It returns an ascending list of tuples, representing the intervals.
        A tuple bins[i], i.e. (bins[i][0], bins[i][1])  with i > 0 
        and i < quantity, satisfies the following conditions:
            (1) bins[i][0] + width == bins[i][1]
            (2) bins[i-1][0] + width == bins[i][0] and
                bins[i-1][1] + width == bins[i][1]
    """
    
    bins = []
    for low in range(lower_bound, 
                     lower_bound + quantity*width + 1, width):
        bins.append((low, low+width))
    return bins

In [45]:
dict_df = {}

In [83]:
temp_df = vector_df[vector_df["node"]=="0"]

In [84]:
temp_df.head()

Unnamed: 0,run,type,name,vectime,vecvalue,network,node,interface,layer,scenario,date,time,processId
58531,0,vector,posX:vector,"[1.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, ...","[2.6, 2.6, 17.266391688581, 31.114573689193, 4...",Mode4World,0,,,Mode4,20190323,12:03:13,23303
58533,0,vector,posY:vector,"[1.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, ...","[10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10....",Mode4World,0,,,Mode4,20190323,12:03:13,23303
58535,0,vector,servingCell:vector,[1.0],[0.0],Mode4World,0,lteNic,phy,Mode4,20190323,12:03:13,23303
58704,0,vector,transmission:vector(camStationId),"[1.181151847425, 1.281151847425, 1.38115184742...","[1804289383.0, 1804289383.0, 1804289383.0, 180...",Mode4World,0,middleware,CaService,Mode4,20190323,12:03:13,23303
58706,0,vector,transmission:vector(camGenerationDeltaTime),"[1.181151847425, 1.281151847425, 1.38115184742...","[232.0, 232.0, 232.0, 232.0, 232.0, 232.0, 232...",Mode4World,0,middleware,CaService,Mode4,20190323,12:03:13,23303


In [85]:
temp_df["vectime"] = temp_df["vectime"].apply(lambda x: np.around(x, decimals=2))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [None]:
for inst in tbs_received_time:
    itemindex_failbutrec = np.where(tbs_failbutrec_time==inst)
    tbs_failbutrec.append(len(itemindex_failbutrec[0]))
    itemindex_tbs_failduesci = np.where(tbs_failduesci_time==inst)
    tbs_failduesci.append(len(itemindex_tbs_failduesci[0]))

    itemindex_disttime = np.where(txrxdistance_time==inst)
    dist.append(tbs_txrxdistance[itemindex_disttime[0][0]])

In [86]:
temp_df.head()

Unnamed: 0,run,type,name,vectime,vecvalue,network,node,interface,layer,scenario,date,time,processId
58531,0,vector,posX:vector,"[1.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, ...","[2.6, 2.6, 17.266391688581, 31.114573689193, 4...",Mode4World,0,,,Mode4,20190323,12:03:13,23303
58533,0,vector,posY:vector,"[1.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, ...","[10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10....",Mode4World,0,,,Mode4,20190323,12:03:13,23303
58535,0,vector,servingCell:vector,[1.0],[0.0],Mode4World,0,lteNic,phy,Mode4,20190323,12:03:13,23303
58704,0,vector,transmission:vector(camStationId),"[1.18, 1.28, 1.38, 1.48, 1.58, 1.68, 1.78, 1.8...","[1804289383.0, 1804289383.0, 1804289383.0, 180...",Mode4World,0,middleware,CaService,Mode4,20190323,12:03:13,23303
58706,0,vector,transmission:vector(camGenerationDeltaTime),"[1.18, 1.28, 1.38, 1.48, 1.58, 1.68, 1.78, 1.8...","[232.0, 232.0, 232.0, 232.0, 232.0, 232.0, 232...",Mode4World,0,middleware,CaService,Mode4,20190323,12:03:13,23303


In [60]:
temp_df_1 = pd.DataFrame(temp_df.vectime.values.tolist(), index= temp_df.index)
temp_df_2 = pd.DataFrame(temp_df.vecvalue.values.tolist(), index= temp_df.index)

In [89]:
temp_df.vectime.values.tolist()
temp_df.vecvalue.values.tolist()

[array([  2.6       ,   2.6       ,  17.26639169,  31.11457369,
         45.43847788,  58.91799756,  73.32727908,  88.01417674,
        102.46118724, 115.9213422 , 130.19650584, 144.47908003,
        159.03382126, 173.64632376, 187.17564695, 201.28971153,
        215.9567632 , 230.00733537, 244.50400249, 258.56276897,
        273.11688779, 287.23760977, 301.79283214, 316.30700424,
        330.44711832, 344.27052862, 358.98871842, 373.19232098,
        387.36398663, 401.46461982, 415.26864388, 429.21860565,
        443.33632209, 457.24766739, 471.30852598, 484.79329733,
        499.32495518, 513.62278082, 527.43055393, 542.05167886,
        556.00314809, 570.39366204, 583.9239805 , 597.72170706,
        611.97420751, 626.58148357, 640.48231074, 655.00627032,
        669.01793899, 683.15337467, 697.72939512]),
 array([10., 10., 10., 10., 10., 10., 10., 10., 10., 10., 10., 10., 10.,
        10., 10., 10., 10., 10., 10., 10., 10., 10., 10., 10., 10., 10.,
        10., 10., 10., 10., 10., 1