In [2]:
## Import necessary libraries

import ipaddress
import pandas as pd
import time
import bz2
import os
import sys
import json
import pickle 
import numpy as np

In [3]:
# DataSet is already sorted. Therefore we can use Binary search which has a complexity of O(logn)
   
def binarySearch(x, start):
    low = 0
    high = len(start)
    iterations = 0 
    while low <= high:
        mid = int((low + high)/2)
        if x >= start[mid] and x < start[mid+1]:
#             print(str(x) + " found at index " + str(mid) + ", within " + str(iterations) + " iterations.")
            return mid, iterations
        elif x < start[mid]:
            iterations += 1
            high = mid - 1
        else:
            iterations +=1
            low = mid + 1
#     return str(x) + " not found, " + str(iterations) + " iterations attempted."

In [4]:
#Obtaining necessary datasets
IPdata = pd.read_csv("Data/IP/IP2LOCATION-LITE-DB1.csv", header = None) 
IPdata.columns = ["startIP", "finalIP", "cc", "Country"]

In [6]:
#Obtaining necessary datasets
with open('Data/probe_dataset.pkl', 'rb') as probe_data:
    data = pickle.load(probe_data)
    

# Open AS dataset

with open('Data/AS_dataset.pkl', 'rb') as AS_data:
    data2 = pickle.load(AS_data)
    
# Filter AS dataset with only EU ASN
# Filter AS dataset on "hosting" at type

countryCodesEU = ["AT", "BE", "BG", "CY", "CZ", "DE", "DK", "EE", "ES", "FI",
"FR", "GB", "GR", "HR", "HU", "IE", "IT", "LT", "LU", "LV",
"MT", "NL", "PL", "PT", "RO", "SE", "SI", "SK"]

EUdata = data2[data2['Country'].isin(countryCodesEU)]
EUdata2 = EUdata[EUdata["type"] == "hosting"]


# Filter probes in probe dataset that occur in AS dataset

probs = data[data['ASN'].isin(EUdata2['ASN'])]

In [7]:
# Probe id is needed ("prb_id")
# Ip adress destination("dest_addr")
# Note that RIPE dataset also supports IPv6 adresses
# Filter only IPv4
# but first check if dst_addr exists in entry

filenr = '00'
testList = []
filename = "Data/RIPE/ping-2020-02-20T" + str(filenr) + "00.bz2"


with bz2.BZ2File(filename, "rb") as file:
    i = 0
    for line in file:
        if i < 100000:
            jsonline = json.loads(line)
            if 'dst_addr' in jsonline and ':' not in jsonline["dst_addr"] and jsonline["prb_id"] in probs["prb_id"]:
                testList.append(jsonline["dst_addr"])
            i += 1
        else:
            break
            

In [8]:
# # Show complexity of Binary Search Algorithm with iterations

size = 10000
j = 0
iterations = 0
estList = []

start = time.time()

for i in testList:
    if j < size:
        j += 1
        iterations = max(iterations, (binarySearch(int(ipaddress.ip_address(i)), list(IPdata["startIP"]))[1])) 
        index = binarySearch(int(ipaddress.ip_address(i)), list(IPdata["startIP"]))[0]
        if IPdata.iloc[index]['cc'] in countryCodesEU:
            estList.append([i, IPdata.iloc[index]['Country']])

end = time.time() 

print("Maximum amount of iterations needed: " + str(iterations))


# Print estimated time in minutes for binary search whole dataseet

nrValues = len(IPdata)
duration = ((end - start)/60)*(nrValues/size)
# time = (math.log2((nrValues - 2**duration)))

print("Estimated running time: " + str(duration) + " Minutes")

Maximum amount of iterations needed: 17
Estimated running time: 9.20374967171669 Minutes


In [9]:
#Check the IP list 
testList
from collections import Counter
Counter(testList)

Counter({'192.58.128.30': 4,
         '78.46.48.134': 3,
         '193.171.255.2': 3,
         '109.230.214.3': 2,
         '111.202.83.178': 1,
         '202.70.77.250': 1,
         '43.239.158.245': 1,
         '162.255.145.7': 1,
         '199.7.83.42': 6,
         '37.10.41.14': 3,
         '45.77.229.242': 2,
         '200.107.82.196': 2,
         '104.237.152.132': 4,
         '45.33.72.12': 2,
         '217.20.160.1': 2,
         '192.33.4.12': 5,
         '198.41.0.4': 8,
         '31.14.143.132': 2,
         '45.239.44.43': 3,
         '178.250.0.57': 3,
         '195.43.87.140': 3,
         '200.94.183.170': 2,
         '195.190.26.29': 3,
         '102.130.49.157': 5,
         '206.144.240.4': 2,
         '80.242.193.157': 2,
         '37.10.45.14': 1,
         '23.131.160.206': 3,
         '93.113.126.132': 5,
         '206.144.4.4': 2,
         '83.243.44.55': 5,
         '212.9.170.190': 4,
         '192.228.79.201': 4,
         '196.10.55.135': 1,
         '208.80.155.69

In [10]:
#Obtain example 
testList.index('193.171.255.2')

2

In [11]:
#Show corresponding country 
estList[1]

['193.171.255.2', 'Austria']

##We can see that the IP '193.171.255.2' repeats 3 times in testlist dataset and the country in estList is assigned with Austria