In [1]:
# Replicates Java's hashCode function perfectly
def intify(i):
  # Python's int data type by default stores an unbounded amount of data, so we trim it to a signed 32-bit integer, which is used in Java's hashCode function
  return (i % 4294967296) - 2147483648
def hashCode(s):
  counter = 0
  for i in range(len(s)):
    counter += intify(ord(s[-i-1]) * (pow(31,i)))
    # The iterative algorithm that computes the hash in Java
  return intify(counter)

In [2]:
import pandas as pd
import json


### READ AND PROCESS DRIVER INPUT END FILE
with open("../controlled_DATA/latency-raw/consumerCDS-start.csv") as file:
    lines = file.read().splitlines()


id_start = []
# process it line to make first field the hash and second field the timestamp
for line in lines:
    templ = []
    sepl = line.split(",")
    timestamp = sepl[-1]
    del sepl[-1]
    flow_data_record = json.loads(",".join(sepl))["netflow-v9:netflow"]["export-packet"]["flow-data-record"][0]
    fields = [
        str(flow_data_record["ipv4"]["src-address"]),
        str(flow_data_record["ipv4"]["dst-address"]),
        str(flow_data_record["src-port"]),
        str(flow_data_record["dst-port"]),
        str(flow_data_record["first-switched"]),
        str(flow_data_record["last-switched"])
    ]
    tempstr = "-".join(fields)
    templ.append(str(hashCode(tempstr)))
    templ.append(timestamp)
    id_start.append(templ)

df_start = pd.DataFrame(id_start, columns=["hash", "nanoTimestamp-Start"])
print(df_start.shape)
df_start.head()

(1250, 2)


Unnamed: 0,hash,nanoTimestamp-Start
0,1499819829,989814904544581
1,-335479692,989815911313758
2,2004416604,989816918302764
3,1672878773,989817924437373
4,1999383356,989818931213247


In [3]:
### READ AND PROCESS DRIVER INPUT START FILE
with open("../controlled_DATA/latency-raw/consumerCDS-end.csv") as file:
    lines = file.read().splitlines()


id_end = []
# process it line to make first field the hash and second field the timestamp
for line in lines:
    templ = []
    sepl = line.split(",")
    timestamp = sepl[-1]
    del sepl[0:8]
    del sepl[-1]
    tempstr = "-".join(sepl)
    templ.append(str(hashCode(tempstr)))
    templ.append(timestamp)
    id_end.append(templ)

df_end = pd.DataFrame(id_end, columns=["hash", "nanoTimestamp-End"])
print(df_end.shape)
df_end.head()

(1250, 2)


Unnamed: 0,hash,nanoTimestamp-End
0,1499819829,989815014406968
1,-335479692,989816009863905
2,2004416604,989816941992649
3,1672878773,989818002618371
4,1999383356,989819005770104


In [4]:
### MERGE DATAFRAMES id_start and id_end
df = df_start.merge(df_end, left_on=['hash'], right_on=['hash'], how='outer')
if(not df.isnull().sum().sum()==0):
    print("some hash do not coincide")

df.head()

Unnamed: 0,hash,nanoTimestamp-Start,nanoTimestamp-End
0,1499819829,989814904544581,989815014406968
1,-335479692,989815911313758,989816009863905
2,2004416604,989816918302764,989816941992649
3,1672878773,989817924437373,989818002618371
4,1999383356,989818931213247,989819005770104


In [5]:
df.to_csv("../controlled_DATA/latency-parsed/consumerCDS-latency.csv", index=False)