In [5]:
# Replicates Java's hashCode function perfectly
def intify(i):
  # Python's int data type by default stores an unbounded amount of data, so we trim it to a signed 32-bit integer, which is used in Java's hashCode function
  return (i % 4294967296) - 2147483648
def hashCode(s):
  counter = 0
  for i in range(len(s)):
    counter += intify(ord(s[-i-1]) * (pow(31,i)))
    # The iterative algorithm that computes the hash in Java
  return intify(counter)

In [10]:

import pandas as pd
import json


filename = "netflowAgg"

### READ AND PROCESS DRIVER INPUT END FILE
with open("../controlled_DATA/latency-raw/"+filename+"-start.csv") as file:
    lines = file.read().splitlines()


id_start = []
# process it line to make first field the hash and second field the timestamp
for line in lines:
    templ = []
    sepl = line.split(",")
    timestamp = sepl[-1]
    del sepl[-1]
    flow_data_record = json.loads(",".join(sepl))["netflow-v9:netflow"]["export-packet"]["flow-data-record"][0]
    fields = [
        str(flow_data_record["ipv4"]["src-address"]),
        str(flow_data_record["ipv4"]["dst-address"]),
        str(flow_data_record["src-port"]),
        str(flow_data_record["dst-port"]),
        str(flow_data_record["first-switched"]),
        str(flow_data_record["last-switched"])
    ]
    tempstr = "-".join(fields)
    templ.append(str(hashCode(tempstr)))
    templ.append(timestamp)
    id_start.append(templ)

df_start = pd.DataFrame(id_start, columns=["hash", "nanoTimestamp-Start"])
print(df_start.shape)
df_start.head()

(1250, 2)


Unnamed: 0,hash,nanoTimestamp-Start
0,1499819829,988032744265024
1,-335479692,988033750205495
2,2004416604,988034757776690
3,1672878773,988035763820127
4,1999383356,988036770201814


In [11]:
### READ AND PROCESS DRIVER INPUT END FILE
with open("../controlled_DATA/latency-raw/"+filename+"-end.csv") as file:
    lines = file.read().splitlines()


id_end = []
# process it line to make first field the hash and second field the timestamp
for line in lines:
    templ = []
    sepl = line.split(",")
    timestamp = sepl[-1]
    del sepl[-1]
    flow_data_record = json.loads(",".join(sepl))["netflow-v9:netflow"]["export-packet"]["flow-data-record"][0]
    fields = [
        str(flow_data_record["ipv4"]["src-address"]),
        str(flow_data_record["ipv4"]["dst-address"]),
        str(flow_data_record["src-port"]),
        str(flow_data_record["dst-port"]),
        str(flow_data_record["first-switched"]),
        str(flow_data_record["last-switched"])
    ]
    tempstr = "-".join(fields)
    templ.append(str(hashCode(tempstr)))
    templ.append(timestamp)
    id_end.append(templ)

df_end = pd.DataFrame(id_end, columns=["hash", "nanoTimestamp-End"])
print(df_end.shape)
df_end.head()

(1250, 2)


Unnamed: 0,hash,nanoTimestamp-End
0,1499819829,988032768505755
1,-335479692,988033774229928
2,2004416604,988034782690163
3,1672878773,988035783864077
4,1999383356,988036790624808


In [12]:
### MERGE DATAFRAMES id_start and id_end
df = df_start.merge(df_end, left_on=['hash'], right_on=['hash'], how='outer')
if(not df.isnull().sum().sum()==0):
    print("some hash do not coincide")

df.head(20)

Unnamed: 0,hash,nanoTimestamp-Start,nanoTimestamp-End
0,1499819829,988032744265024,988032768505755
1,-335479692,988033750205495,988033774229928
2,2004416604,988034757776690,988034782690163
3,1672878773,988035763820127,988035783864077
4,1999383356,988036770201814,988036790624808
5,-1952606543,988037775982165,988037793723670
6,166934370,988038782255380,988038802026615
7,182334490,988039788081027,988039817639297
8,-847617856,988040794445086,988040814926398
9,797111117,988041800498038,988041819568666


In [13]:
df.to_csv("../controlled_DATA/latency-parsed/"+filename+"-latency.csv", index=False)