In [1]:
import pandas as pd
import json
import os


# Replicates Java's hashCode function perfectly
def intify(i):
  # Python's int data type by default stores an unbounded amount of data, so we trim it to a signed 32-bit integer, which is used in Java's hashCode function
  return (i % 4294967296) - 2147483648
def hashCode(s):
  counter = 0
  for i in range(len(s)):
    counter += intify(ord(s[-i-1]) * (pow(31,i)))
    # The iterative algorithm that computes the hash in Java
  return intify(counter)


def create_start_df(lines):
    id_start = []
    # process it line to make first field the hash and second field the timestamp
    for line in lines:
        templ = []
        sepl = line.split(",")
        timestamp = sepl[-1]
        del sepl[-1]
        flow_data_record = json.loads(",".join(sepl))["netflow-v9:netflow"]["export-packet"]["flow-data-record"][0]
        fields = [
            str(flow_data_record["ipv4"]["src-address"]),
            str(flow_data_record["ipv4"]["dst-address"]),
            str(flow_data_record["src-port"]),
            str(flow_data_record["dst-port"]),
            str(flow_data_record["first-switched"]),
            str(flow_data_record["last-switched"])
        ]
        tempstr = "-".join(fields)
        templ.append(str(hashCode(tempstr)))
        templ.append(timestamp)
        id_start.append(templ)

    return pd.DataFrame(id_start, columns=["hash", "nano-end"])


def create_end_df(lines):
    id_end = []
# process it line to make first field the hash and second field the timestamp
    for line in lines:
        templ = []
        sepl = line.split(",")
        timestamp = sepl[-1]
        del sepl[0:8]
        del sepl[-1]
        tempstr = "-".join(sepl)
        templ.append(str(hashCode(tempstr)))
        templ.append(timestamp)
        id_end.append(templ)

    return pd.DataFrame(id_end, columns=["hash", "nanoTimestamp-End"])

In [None]:
raw_path = "../controlled_DATA/multiple-rates-driversapps/results_outputdriver/"

# Get the start and end files and create another file with the latency in milliseconds
iterations=[1, 2, 3]
delays=[0.016, 0.025, 0.05, 0.1, 0.2]

for delay in delays:

    for i in iterations:

        df_start = None
        df_end = None
        df = None

        ### READ AND PROCESS DRIVER INPUT START FILE
        with open(raw_path+"batch"+str(delay)+"_"+str(i)+"-start.txt") as file:
            start_lines = file.read().splitlines()

        with open(raw_path+"batch"+str(delay)+"_"+str(i)+"-end.txt") as file:
            end_lines = file.read().splitlines()

        df_start = create_start_df(start_lines)
        df_end = create_end_df(end_lines)

        ### MERGE DATAFRAMES id_start and id_end
        df = df_start.merge(df_end, left_on=['hash'], right_on=['hash'], how='outer')
        if not df.isnull().sum().sum() == 0:
            print("some hash do not coincide")
        df.dropna(inplace=True)

        df["processing_time"] = df["nano-end"].astype(float) - df["nano-start"].astype(float)
        df["processing_time_ms"] = df["processing_time"]/1000000
        if not os.path.exists(raw_path+"processed/"):
            os.makedirs(raw_path+"processed/")
        df.to_csv(raw_path+"processed/latency"+str(delay)+"_"+str(i)+".csv", index=False)

print("All processed files saved in:", raw_path+"processed/")

In [2]:
delays=[0.016, 0.025, 0.05, 0.1, 0.2]

processed_path = raw_path+"processed/"

for delay in delays:

    df1 = pd.read_csv(processed_path+"latency"+str(delay)+"_"+str(1)+".csv")
    df2 = pd.read_csv(processed_path+"latency"+str(delay)+"_"+str(2)+".csv")
    df3 = pd.read_csv(processed_path+"latency"+str(delay)+"_"+str(3)+".csv")

    df = df1.merge(df2[["hash", "processing_time_ms"]], left_on=['hash'], right_on=['hash'], how='outer')
    df = df.merge(df3[["hash", "processing_time_ms"]], left_on=['hash'], right_on=['hash'], how='outer')
    if not df.isnull().sum().sum() == 0:
        print("some hash do not coincide")
    df.dropna(inplace=True)

    df["processing_time_ms"] = (df["processing_time_ms"]+df["processing_time_ms_x"]+df["processing_time_ms_y"])/3
    df.drop(['nano-start', 'nano-end', 'processing_time', 'processing_time_ms_x', 'processing_time_ms_y'], axis='columns', inplace=True)

    if not os.path.exists(processed_path+"processed_averaged/"):
        os.makedirs(processed_path+"processed_averaged/")
    df.to_csv(processed_path+"processed_averaged/latency"+str(delay)+".csv", index=False)

print("All processed averaged files saved in:", processed_path+"processed_averaged/")

(1250, 2)


Unnamed: 0,hash,nanoTimestamp-Start
0,1499819829,989814904544581
1,-335479692,989815911313758
2,2004416604,989816918302764
3,1672878773,989817924437373
4,1999383356,989818931213247


In [3]:
### READ AND PROCESS DRIVER INPUT START FILE
with open("../controlled_DATA/latency-raw/consumerCDS-end.csv") as file:
    lines = file.read().splitlines()




(1250, 2)


Unnamed: 0,hash,nanoTimestamp-End
0,1499819829,989815014406968
1,-335479692,989816009863905
2,2004416604,989816941992649
3,1672878773,989818002618371
4,1999383356,989819005770104


In [4]:
### MERGE DATAFRAMES id_start and id_end
df = df_start.merge(df_end, left_on=['hash'], right_on=['hash'], how='outer')
if(not df.isnull().sum().sum()==0):
    print("some hash do not coincide")

df.head()

Unnamed: 0,hash,nanoTimestamp-Start,nanoTimestamp-End
0,1499819829,989814904544581,989815014406968
1,-335479692,989815911313758,989816009863905
2,2004416604,989816918302764,989816941992649
3,1672878773,989817924437373,989818002618371
4,1999383356,989818931213247,989819005770104


In [5]:
df.to_csv("../controlled_DATA/latency-parsed/consumerCDS-latency.csv", index=False)