In [1]:
# Replicates Java's hashCode function perfectly
def intify(i):
  # Python's int data type by default stores an unbounded amount of data, so we trim it to a signed 32-bit integer, which is used in Java's hashCode function
  return (i % 4294967296) - 2147483648
def hashCode(s):
  counter = 0
  for i in range(len(s)):
    counter += intify(ord(s[-i-1]) * (pow(31,i)))
    # The iterative algorithm that computes the hash in Java
  return intify(counter)

In [2]:

import pandas as pd
import json


### READ AND PROCESS DRIVER INPUT START FILE
with open("../controlled_DATA/latency-raw/input-driver-start.csv") as file:
    lines = file.read().splitlines()


id_start = []
# process it line to make first field the hash and second field the timestamp
for line in lines:
    templ = []
    sepl = line.split(",")
    timestamp = sepl[-1]
    del sepl[-1]
    goflow2 = json.loads(",".join(sepl))
    fields = [
        str(goflow2["SrcAddr"]),
        str(goflow2["DstAddr"]),
        str(goflow2["SrcPort"]),
        str(goflow2["DstPort"]),
        str(goflow2["TimeFlowStartMs"]),
        str(goflow2["TimeFlowEndMs"])
    ]
    tempstr = "-".join(fields)
    templ.append(str(hashCode(tempstr)))
    templ.append(timestamp)
    id_start.append(templ)

df_start = pd.DataFrame(id_start, columns=["hash", "nanoTimestamp-Start"])
print(df_start.shape)
df_start.head()

(2500, 2)


Unnamed: 0,hash,nanoTimestamp-Start
0,1499819829,981802224069956
1,-1334446317,981803230515733
2,-335479692,981804236255846
3,347162664,981805243306082
4,2004416604,981806250378378


In [3]:
### READ AND PROCESS DRIVER INPUT END FILE
with open("../controlled_DATA/latency-raw/input-driver-end.csv") as file:
    lines = file.read().splitlines()


id_end = []
# process it line to make first field the hash and second field the timestamp
for line in lines:
    templ = []
    sepl = line.split(",")
    timestamp = sepl[-1]
    del sepl[-1]
    flow_data_record = json.loads(",".join(sepl))["netflow-v9:netflow"]["export-packet"]["flow-data-record"][0]
    fields = [
        str(flow_data_record["ipv4"]["src-address"]),
        str(flow_data_record["ipv4"]["dst-address"]),
        str(flow_data_record["src-port"]),
        str(flow_data_record["dst-port"]),
        str(flow_data_record["first-switched"]),
        str(flow_data_record["last-switched"])
    ]
    tempstr = "-".join(fields)
    templ.append(str(hashCode(tempstr)))
    templ.append(timestamp)
    id_end.append(templ)

df_end = pd.DataFrame(id_end, columns=["hash", "nanoTimestamp-End"])
print(df_end.shape)
df_end.head()

(2500, 2)


Unnamed: 0,hash,nanoTimestamp-End
0,1499819829,981804507894434
1,-1334446317,981806307592963
2,-335479692,981808114281803
3,347162664,981809517056674
4,2004416604,981810821341114


In [4]:
### MERGE DATAFRAMES id_start and id_end
df = df_start.merge(df_end, left_on=['hash'], right_on=['hash'], how='outer')
if(not df.isnull().sum().sum()==0):
    print("some hash do not coincide")

df.head(20)

Unnamed: 0,hash,nanoTimestamp-Start,nanoTimestamp-End
0,1499819829,981802224069956,981804507894434
1,-1334446317,981803230515733,981806307592963
2,-335479692,981804236255846,981808114281803
3,347162664,981805243306082,981809517056674
4,2004416604,981806250378378,981810821341114
5,-1395350116,981807257382147,981811910892626
6,1672878773,981808264988852,981813021973845
7,-276277143,981809272322029,981814109936518
8,1999383356,981810279408275,981815013778387
9,-983806716,981811285969778,981816013717822


In [5]:
df.to_csv("../controlled_DATA/latency-parsed/input-driver-latency.csv", index=False)