In [96]:
import ipinfo
from collections import defaultdict
import pickle
import requests
import pandas as pd 
import numpy as np
from geopy.geocoders import Nominatim
import csv
import multiprocessing
import json
import pycountry
import country_converter as coco  #Coco was found to be more accuracte. Pycountry had weird labels for e.g. russia


def get_ip():
    response = requests.get('https://api64.ipify.org?format=json').json()
    return response["ip"]


def get_location(ip):
    response = requests.get(f'https://ipapi.co/{ip}/json/').json()
    location_data = {
        "ip": ip,
        "city": response.get("city"),
        "region": response.get("region"),
        "country": response.get("country_name")
    }
    return location_data

def get_location_ipinfo(ip_address):
    ''' Best one of the three. However, IP requests is limited to the token
    '''
    try:
        access_token = '7dbb53d0419093'
        handler = ipinfo.getHandler(access_token)
        details = handler.getDetails(ip_address)
        if details.country == "US": 
            return details.region
        else:
            return details.country
    except:
        print("error")

In [97]:
def add_ip_locations(df):
    df["MinIP_loc"] = np.nan
    df["otherIP1_loc"] = np.nan 
    df["otherIP2_loc"] = np.nan 
    df["otherIP3_loc"] = np.nan 
    df["otherIP4_loc"] = np.nan 
    df["otherIP5_loc"] = np.nan 

    for index, row in df.iterrows():
        df.loc[index, "MinIP_loc"] = get_location_ipinfo(row["MinIP"])
        for i in range(1,6):  
            name = f"otherIP{i}"
            if pd.isna(row[name]):
                continue

            location = get_location_ipinfo(row[f"otherIP{i}"])
            df.loc[index, f"otherIP{i}_loc"] = location
    return df

In [98]:
def prb_location(df_prbs, prb_dic):
    '''Returns a dataframe with prb number and their respective country or state 
    '''
    geolocator = Nominatim(user_agent="geoapiExercises")
    df_prbs["prb_loc"] = ""
    for index, row in df_prbs.iterrows():
        prb_number = row["prb"]
        long,lat = tuple(prb_dic[prb_number]["geometry"]["coordinates"])
        location = ""
        try:
            location = geolocator.reverse(str(lat) + "," + str(long), language = 'en').raw
        except: 
            print("timeout at index: " + str(index))
            continue
        country = location["address"]["country"]
        
        if country == "Canada":
            state = location["address"]["state"]
            df_prbs.loc[index, "prb_loc"] = state
        elif country == "United States":
            state = str((location)).split(",")[-3]
            state = location["address"]["state"]
            df_prbs.loc[index, "prb_loc"] = state
        else:
            country_code = location["address"]["country_code"].upper() 
            df_prbs.loc[index, "prb_loc"] = country_code
        if index%1000 == 0:
            print("AT INDEX:   " + str(index))
    return df_prbs

In [99]:
def extract_prb_locs(df_edges,pickle_path):

    df_prbs_np_arr = df_edges["prb"].unique()
    df_prbs = pd.DataFrame(df_prbs_np_arr, columns=["prb"])

    with open(pickle_path, 'rb') as handle:
        prb_dic = pickle.load(handle)
        df_prb_loc = prb_location(df_prbs, prb_dic)
        df_prbs = df_prb_loc
        #df_prb_loc.to_csv(r"api\latencies\prbs_with_locations.csv", index = False)
    #return df_prbs
    return df_prb_loc

In [100]:
def merge_prb_locs_with_ip_locs(df_prb_loc, df_with_ip_locs):
    ''' Takes the csv with prbs and their respecitve locations to merge with the file containing ip locations
    ''' 
    prb_to_loc = {}
    for index, row in df_prb_loc.iterrows():
        prb_to_loc[row["prb"]] = row["prb_loc"]
    
    df_with_ip_locs["prb_loc"] = ""
    for index, row in df_with_ip_locs.iterrows(): 
        df_with_ip_locs.loc[index,"prb_loc"] = prb_to_loc[row["prb"]]

    return df_with_ip_locs

def reorder_columns_of_final_df(merged_df):
    ''' Reformats the file by column order
    '''
    cols = [
        "prb" , "prb_loc",
        "MinIP", "MinIP_loc", "Minlatency",
        "otherIP1", "otherIP1_loc", "otherlatency1",
        "otherIP2", "otherIP2_loc", "otherlatency2",
        "otherIP3", "otherIP3_loc", "otherlatency3",
        "otherIP4", "otherIP4_loc", "otherlatency4",
        "otherIP5", "otherIP5_loc", "otherlatency5",
        ]

    merged_df = merged_df[cols]
    
    merged_df = merged_df.rename({
        'otherIP1': 'IP1', 
        'otherIP2': 'IP2', 
        'otherIP3': 'IP3', 
        'otherIP4': 'IP4', 
        'otherIP5': 'IP5',
        'otherIP1_loc': 'IP1_loc', 
        'otherIP2_loc': 'IP2_loc', 
        'otherIP3_loc': 'IP3_loc', 
        'otherIP4_loc': 'IP4_loc', 
        'otherIP5_loc': 'IP5_loc',
        'otherlatency1': 'latency1', 
        'otherlatency2': 'latency2',
        'otherlatency3': 'latency3',
        'otherlatency4': 'latency4',
        'otherlatency5': 'latency5'
        }, axis='columns')

    return merged_df


In [101]:
def convert_country_code(df):
    ''' Converts non-'North American' countries to country code
    '''
    cc = coco.CountryConverter()
    #df = df.apply(lambda x: pycountry.countries.get(name=str(x)) if pycountry.countries.get(name=str(x)) != None else print(x))
    df = df.apply(lambda x: coco.convert(names=x, to='ISO2', not_found = x) if coco.convert(names=x, to='ISO2', not_found = None) != None else x)
    return df

In [102]:
def create_adj_mtx(edge_path, locations):
    '''Creates an adjacency matrix and returns it to later be saved with picke
    '''
    df_adj = pd.DataFrame(columns=locations, index=locations, dtype = object)

    df_edges = pd.read_csv(edge_path, index_col=False)
    cols = ["IP1_loc", "IP2_loc", "IP3_loc", "IP4_loc", "IP5_loc", "MinIP_loc"]
    
    for index, row in df_edges.iterrows():
        from_loc = row["prb_loc"]
        # skip if datacenter's location not sought after
        if from_loc not in locations:
            continue
        for col in cols: 
            to_loc = row[col]     
            # if user's request location not sought after  
            if to_loc not in locations:
                continue  
            col_idx = df_edges.columns.get_loc(col)
            latency = row.iloc[col_idx+1]    
            saved_latencies = df_adj.loc[from_loc,to_loc]
            if np.isnan(saved_latencies).all():
                df_adj.loc[from_loc,to_loc] = list([float(latency)])
                df_adj.loc[to_loc,from_loc] = list([float(latency)])
            else:
                df_adj.loc[from_loc,to_loc].append(float(latency))
                df_adj.loc[to_loc, from_loc].append(float(latency))
    return df_adj


In [103]:
def get_all_regions(df):
    ''' Returns all mentioned locations in edge latency file
    '''
    df_locs = pd.concat([df["MinIP_loc"], df["IP1_loc"], df["IP2_loc"], df["IP3_loc"], df["IP4_loc"], df["IP5_loc"]])
    return df_locs.unique().tolist()

In [104]:


def no_empty_cells(df):
    no = 0
    for index, row in df.iterrows():
        no += row.isna().sum()
    return no

def no_all_latencies(df):
    no = 0
    for index, row in df.iterrows():
        no += row.notna().count()
    return no

def no_all_latencies2(df):
    no = 0
    for index, row in df.iterrows():
        no += row.notna().astype(str).str.len().sum()
    return no

def all_latencies_dict(df):
    no = defaultdict(lambda:0)
    for index, row in df.iterrows():
        print(row.isna())
        no[index] += row.isna().count()
    return no

In [105]:
def get_prb_loc(prb_id):
    geolocator = Nominatim(user_agent="geoapiExercises")
    response = requests.get(f"https://atlas.ripe.net/api/v2/probes/{prb_id}/?format=json")
    resp_json = json.loads(response.content.decode("utf-8"))
    long, lat = tuple(resp_json["geometry"]["coordinates"])
    
    location = geolocator.reverse(str(lat) + "," + str(long), language = 'en')

    if location == None:
        print("None at location for prb: " + str(prb_id))
        return np.nan

    country = location.raw["address"]["country"]

    if country in ["United States", "Canada"]:
        state = location.raw["address"]["state"]
        return state
    else: 
        country_code = location.raw["address"]["country_code"].upper() 
        return country_code

In [106]:
def merge_locs(edge_df, cloud_df):
    """ As edge_df has prbs with corresponding locations, we merge them to avoid time-consuming queries on prb_id
    """
    edge_df = edge_df[["prb", "prb_loc"]]
    cloud_df = cloud_df.merge(edge_df, how="left", right_index = False, on = "prb")
    prb_locs = cloud_df.pop("prb_loc")
    cloud_df.insert(1, "prb_loc", prb_locs)
    return cloud_df

In [107]:
def translate_cloud():
    regions = pd.read_json(r"..\api\latencies\cloud_regions_na.json", orient= "records", dtype = dict)
    return regions["locations"].to_dict()

In [108]:
def values_flattened():
    locations = list(translate_cloud().values())
    # flatten 
    locations = [location for sub_locations in locations for location in sub_locations]
    # remove duplicates
    return list(set(locations))

In [109]:
def add_label_locations(cloud_df):
    # To run multiprocessing
    import pandas as pd
    region_cols = ["minLabel"] + [f"label.{i}" for i in range(1,68)]
    for col in region_cols:
        cloud_df[col + "_loc"] = pd.NA
    
    name_lookup = translate_cloud()
    print(name_lookup.keys())
    print(name_lookup.values())


    for index, row in cloud_df.iterrows():
        from_loc = row["prb_loc"]
        if not from_loc in values_flattened():
            continue
        print(from_loc)
        for col in region_cols:
            loc_str = col + "_loc"
            parsed_loc = row[loc_str]
            if not pd.isnull(parsed_loc) or pd.isnull(row[col]):
                continue
            unparsed_loc = row[col].replace(".csv", "")
            if unparsed_loc in name_lookup.keys():
                names = name_lookup[unparsed_loc]
                if len(names) == 1:
                    cloud_df.loc[index, loc_str] = names[0]
                else: 
                    cloud_df.loc[index, loc_str] = names[0] 

                    index_append = cloud_df.shape[0]
                    index_current_col = cloud_df.columns.get_loc(col) - 1
                    #print(str(index_current_col) + ","+ str(index))

                    print(index)
                    print(row["prb_loc"])
                    print(index_current_col)
                    print(index_append)
                    print(cloud_df.shape[0])
                    #print(cloud_df.loc[index, "prb"])
                    #print(cloud_df.loc[index, "prb_loc"])
                    
                    #latency_value = cloud_df.iloc[index, index_current_col]
                    latency_value = row.iloc[index_current_col]

                    # Split the names and append last string to last column to process later
                    cloud_df.loc[index_append] = pd.NA 
                    cloud_df.loc[index_append, "prb"] = row["prb"]
                    cloud_df.loc[index_append, "prb_loc"] = row["prb_loc"]
                    cloud_df.loc[index_append, "minLabel"] = unparsed_loc
                    cloud_df.loc[index_append, "minLabel_loc"] = names[1]
                    cloud_df.loc[index_append, "minMedian"] = latency_value

                    # No need for minLabel_loc, will be processed later
        if index%1000 == 0:
            print("AT INDEX:   " + str(index))
    return cloud_df


In [110]:
3
80
8569
8569
4
22
8569
8569
5
50
8569
8569
...
8570
50
8569
8569

8569

In [111]:
def create_adj_mtx2(cloud_path, locations):
    '''Creates an adjacency matrix and returns it to later be saved with picke
    '''
    df_adj = pd.DataFrame(columns=locations, index=locations, dtype = object)

    df_edges = pd.read_csv(cloud_path, index_col=False)
    cols = ["prb_loc", "minLabel_loc"] + [f"label.{i}_loc" for i in range(1,68)]
    
    for index, row in df_edges.iterrows():
        from_loc = row["prb_loc"]
        # skip if datacenter's location not sought after
        if from_loc not in locations:
            continue
        for col in cols: 
            to_loc = row[col]     
            # if user's request location not sought after  
            if to_loc not in locations:
                continue  
            label = col.replace("_loc", "")
            label_idx = df_edges.columns.get_loc(label)
            latency = row.iloc[label_idx-1]    
            saved_latencies = df_adj.loc[from_loc,to_loc]
            # true if saved latencies is empty list
            if not isinstance(saved_latencies, list):
                df_adj.loc[from_loc,to_loc] = list([float(latency)])
                df_adj.loc[to_loc,from_loc] = list([float(latency)])
            else:
                df_adj.loc[from_loc,to_loc].append(float(latency))
                df_adj.loc[to_loc, from_loc].append(float(latency))
    return df_adj

In [112]:
### This extracts probe id and maps it to a location

# df_edges = pd.read_csv(r"api\latencies\edge.csv", index_col= False)
# df_prb_locs = extract_prb_locs(df_edges, picke_path = r'api\latencies\probes_clean.pickle')
# df_prb_locs.to_csv(r"api\latencies\prbs_with_locations.csv", index = False)


# timeout at index: 7775
# AT INDEX:   8000
# timeout at index: 8215

### This assign a country code to non us/canada countries. they use states instead

#df_prbs = pd.read_csv(r"api\latencies\prbs_with_locations.csv", index_col= False)
#df_prbs["prb_loc"] = convert_country_code(df_prbs["prb_loc"])
#df_prbs.to_csv(r"api\latencies\prbs_with_locations_coco.csv", index = False)

### THIS adds location to the fields [MinIP, OtherIP{1,2,3,4,5}]

#df_with_ip_locs = add_ip_locations(df_edges)
#df_with_ip_locs.to_csv(r"api\latencies\edge_with_ip_locs.csv", index= False)


### BELOW Is to merge files and formatting

# df_prb_loc = pd.read_csv(r"api\latencies\prbs_with_locations.csv", index_col = False)
# df_with_ip_locs = pd.read_csv(r"api\latencies\edge_processed10.csv", index_col = False)

# df_merged = merge_prb_locs_with_ip_locs(df_prb_loc, df_with_ip_locs)

# df_merged_formated = reorder_columns_of_final_df(df_merged)
# df_merged_formated.to_csv(r"api\latencies\edge_feat_locations.csv", index = False)

### Creates adjacency matrix out of the edge latencies

df = pd.read_csv(r"..\api\latencies\edge_feat_locations.csv", index_col=False)
regions = get_all_regions(df)
# Filter out NaN and regions
na = [region for region in regions if type(region) == str and len(region) > 2]
eu = [region for region in regions if type(region) == str and len(region) < 3]
# na.remove("Oklahoma")
# na.remove("Alabama")
# na.remove("Mississippi")
# na.remove("South Dakota")
# na.remove("Nebraska")
# na.remove("Delaware")
# na.remove("Montana")
# na.remove("Alaska")
#na = [region for region in na if region in ["California", "Massachusetts", "Arizona"]]
# Is this number correct? 
print("Number of regions in NA: " + str(len(na)))
mtrx_df = create_adj_mtx(r"..\api\latencies\edge_feat_locations.csv", na)
mtrx_df.to_pickle(r"..\api\latencies\adjacency_mtrx.pickle")
print("N.o. all latencies: " + str(no_all_latencies2(mtrx_df)))
print(all_latencies_dict(mtrx_df))
print("N.o. missing datapoints: " + str(no_empty_cells(mtrx_df)) + " out of " + str(len(na)*len(na)))

# mtrx_df.to_csv(r"..\api\latencies\edge_feat_locations_na.csv")

### Add locations to cloud_data 

cloud_df = pd.read_csv(r"..\api\latencies\cloud.csv")
edge_df = pd.read_csv(r"..\api\latencies\edge_feat_locations.csv")
cloud_with_locs = merge_locs(edge_df, cloud_df)

print("This many prb_locs couldn't resolved by merge: " + str(cloud_with_locs["prb_loc"].isna().sum()))

#cloud_with_locs.apply(lambda x: x if pd.notna(x["prb_loc"]) else x["prb_loc"] = get_prb_loc(x["prb"]), axis= 1)
missing_locs = cloud_with_locs[cloud_with_locs["prb_loc"].isna()]
for index, row in missing_locs.iterrows():
    cloud_with_locs.loc[index, "prb_loc"] = get_prb_loc(row["prb"])

print("This many prb_locs couldn't resolved after applying lookups: " + str(cloud_with_locs["prb_loc"].isna().sum()))

cloud_with_locs = cloud_with_locs[cloud_with_locs["prb_loc"].notna()]
cloud_with_locs.to_csv(r"..\api\latencies\cloud_feat_locations.csv", index= False)



Number of regions in NA: 39
N.o. all latencies: 7069
Massachusetts       False
Washington          False
California          False
Texas               False
Ohio                False
Colorado            False
Illinois            False
Delaware            False
Georgia             False
Nebraska            False
Virginia            False
Alabama              True
Louisiana           False
Minnesota           False
Michigan            False
New York            False
Washington, D.C.    False
Idaho               False
Indiana             False
Florida             False
Utah                False
Kansas              False
New Jersey          False
Pennsylvania        False
Wisconsin           False
South Dakota         True
South Carolina      False
Alaska              False
Maryland            False
Hawaii              False
Vermont             False
Oregon              False
Montana              True
New Hampshire       False
Missouri            False
Arizona             False
Maine      

In [None]:
mtrx_df

In [113]:
#import multiprocesspandas

#cloud_with_locs.apply_parallel(add_label_locations, num_processes=4, axis=1)

In [114]:
cloud_df_with_labels = add_label_locations(cloud_with_locs)
cloud_df_with_labels.to_csv(r"..\api\latencies\cloud_feats_real_locations.csv")



dict_keys(['CanadaCentral', 'Chicago', 'Dallas', 'Dallas-DAL01', 'Montreal-MON01', 'NorthVirginia', 'Seattle-SEA01', 'Toronto-TOR01', 'USEastNVirginia', 'USEastOhio', 'USWestNCalifornia', 'USWestOregon', 'Us-central1 2', 'Us-east1 2', 'Us-east5', 'Us-south1', 'Us-west2', 'Us-west3', 'Us-west4', 'Washington-WDC01', 'northamerica-northeast1', 'northamerica-northeast2', 'us-ashburn', 'us-central1', 'us-east1', 'us-east4', 'us-phoenix', 'us-west1'])
dict_values([['Ontario', 'Quebec'], ['Illinois'], ['Texas'], ['Texas'], ['Quebec'], ['North Virginia'], ['Washington'], ['Ontario'], ['Virginia'], ['Ohio'], ['California'], ['Oregon'], ['Iowa'], ['South Carolina'], ['Ohio'], ['Texas'], ['California'], ['Utah'], ['Nevada'], ['Washington, D.C.'], ['Quebec'], ['Ontario'], ['Virginia'], ['Iowa'], ['South Carolina'], ['Virginia'], ['Arizona'], ['Oregon']])
Virginia
4
Virginia
22
8569
8569
South Carolina
40
South Carolina
24
8569
8569
Texas
41
Texas
32
8569
8569
Illinois
44
Illinois
20
8569
8569
Queb

In [115]:
locations = list(translate_cloud().values())
# flatten 
locations = [location for sub_locations in locations for location in sub_locations]
# remove duplicates
locations = list(set(locations))

mtx = create_adj_mtx2(r"..\api\latencies\cloud_feats_real_locations.csv", locations)


print("Number of regions in NA: " + str(len(locations)))
print("N.o. all latencies: " + str(no_all_latencies2(mtx)))
#print(all_latencies_dict(mtx))
print("N.o. missing datapoints: " + str(no_empty_cells(mtx)) + " out of " + str(len(locations)*len(locations)))


mtx

  import sys


Number of regions in NA: 16
N.o. all latencies: 1030
N.o. missing datapoints: 6 out of 256


Unnamed: 0,Illinois,Oregon,Nevada,Ontario,Virginia,Arizona,"Washington, D.C.",Texas,South Carolina,North Virginia,Iowa,Utah,Quebec,Ohio,Washington,California
Illinois,"[44.0, 13.476285, 13.476285, 91.0, 91.0, 35.25...","[61.58389, 68.098415, 83.036925, 91.00907, 69....","[75.65416, 55.913735, 60.681195, 50.642993, 56...","[36.774265, 53.27702, 57.06466, 57.681055, 23....","[44.298083, 31.805075, 32.63252, 54.064375, 58...","[56.28286, 84.911095, 75.139115, 75.79848, 59....","[36.344465, 55.86803, 30.80291, 34.3028, 29.18...","[36.029055, 42.35856, 52.943125, 44.692325, 60...","[46.909215, 37.694405, 43.897785, 64.56154, 65...","[37.31001, 54.195505, 36.0558125, 35.704245, 3...","[22.02003, 42.2921, 11.39893, 25.758385, 21.25...","[33.55977, 57.016735, 35.78654, 45.878485, 44....","[35.77399, 37.12223, 42.10619, 20.15081, 61.00...","[35.052545, 42.224715, 13.107085, 32.956575, 3...","[56.688255, 49.802315, 77.402495, 65.534545, 6...","[61.38206, 83.120005, 57.142165, 50.874415, 67..."
Oregon,"[61.58389, 68.098415, 83.036925, 91.00907, 69....","[779.0, 16.704655, 16.704655, 22.69529, 22.695...","[40.121745, 45.199805, 27.11825, 29.205515, 49...","[75.190325, 75.76591, 65.381715, 111.13775, 66...","[82.488042, 105.437333, 74.41192, 74.57822, 80...","[44.649435, 44.84669, 44.79237, 47.301655, 44....","[83.12892, 58.84308, 85.46613, 93.160195, 83.0...","[57.86651, 70.93145, 59.40553, 72.795315, 90.9...","[88.370535, 112.33778, 93.651015, 94.961285, 7...","[92.339355, 79.093955, 90.959185, 103.373505, ...","[54.782005, 56.4031, 59.910245, 67.156965, 63....","[33.03488, 36.49501, 45.54358, 45.9665425, 29....","[83.25652, 85.16506, 67.29274, 72.55196, 85.14...","[57.63365, 58.934675, 97.35732, 66.13762, 69.9...","[17.688305, 16.1814, 16.8845, 10.47993, 16.181...","[34.397245, 34.70684, 26.95198, 31.160255, 20...."
Nevada,"[75.65416, 55.913735, 60.681195, 50.642993, 56...","[40.121745, 45.199805, 27.11825, 29.205515, 49...","[2777.0, 3833.0, 3833.0, 5087.0, 5087.0, 6847....","[85.75513, 93.449775, 76.46502, 80.55496, 98.1...","[68.78059, 89.426815, 67.356655, 76.148585, 87...","[23.71405, 28.96592, 44.05112, 25.844436, 44.1...","[85.73756, 68.910545, 85.82197, 62.856998, 68....","[47.398205, 55.96463, 47.00003, 54.13785, 58.7...","[78.271375, 100.687225, 68.196175, 77.521845, ...","[73.92489, 74.53401, 79.09379, 67.79389, 75.04...","[82.315385, 87.89608, 50.72637, 52.615995, 70....",,"[87.803, 95.278305, 76.386375, 76.503195, 94.9...","[83.90852, 75.34189, 94.28823, 76.673739, 82.7...","[44.96273, 24.60512, 53.81, 23.905864, 25.9422...","[29.557885, 31.915135, 32.18151]"
Ontario,"[36.774265, 53.27702, 57.06466, 57.681055, 23....","[75.190325, 75.76591, 65.381715, 111.13775, 66...","[85.75513, 93.449775, 76.46502, 80.55496, 98.1...","[161.0, 10.82885, 10.82885, 24.79816, 24.79816...","[35.19625, 47.211041, 29.99473, 31.22278, 21.6...","[81.74662, 77.32533, 86.65878, 80.38572, 88.31...","[34.55486, 22.65031, 29.979065, 28.390545, 24....","[48.83172, 51.24197, 50.37617, 50.884585, 43.4...","[46.72073, 49.194, 37.580665, 43.277645, 27.03...","[34.42029, 25.41491, 24.61789, 27.407095, 22.5...","[21.13946, 27.77786, 31.610195, 37.47628, 37.3...","[56.310135, 57.6441, 73.25439, 85.68489, 73.36...","[24.34433, 45.78744, 9.39735, 21.32107, 39.329...","[34.822315, 22.95374, 27.21635, 34.649945, 31....","[61.11847, 67.357945, 64.459355, 78.92306, 92....","[89.485105, 89.60074, 77.310205, 85.507415, 58..."
Virginia,"[44.298083, 31.805075, 32.63252, 54.064375, 58...","[82.488042, 105.437333, 74.41192, 74.57822, 80...","[68.78059, 89.426815, 67.356655, 76.148585, 87...","[35.19625, 47.211041, 29.99473, 31.22278, 21.6...","[4.0, 9.527875, 9.527875, 17.795916, 17.795916...","[75.236459, 77.3327, 62.23959, 79.345415, 62.6...","[12.257166, 11.028055, 4.201585, 11.88441, 9.6...","[45.95525, 51.963917, 34.579945, 46.642255, 42...","[12.086958, 31.467208, 31.280345, 32.857215, 1...","[13.433125, 9.4397, 16.152855, 2.635205, 11.67...","[35.426833, 45.047417, 27.017795, 28.728105, 3...","[58.990825, 64.738505, 64.00839, 75.457655, 56...","[27.75525, 28.821666, 32.6279, 32.701415, 17.1...","[24.817875, 19.69237, 22.23409, 18.937055, 27....","[82.942292, 66.181285, 84.709775, 75.23258, 75...","[92.194041, 75.899475, 84.54834, 74.448065, 77..."
Arizona,"[56.28286, 84.911095, 75.139115, 75.79848, 59....","[44.649435, 44.84669, 44.79237, 47.301655, 44....","[23.71405, 28.96592, 44.05112, 25.844436, 44.1...","[81.74662, 77.32533, 86.65878, 80.38572, 88.31...","[75.236459, 77.3327, 62.23959, 79.345415, 62.6...","[204.0, 8.846565, 8.846565, 211.0, 211.0, 9.65...","[80.103125, 83.674365, 67.81904, 80.142865, 76...","[42.636045, 51.994045, 34.8675, 48.257245, 62....","[60.407165, 73.85149, 86.780255, 72.67193, 91....","[68.627955, 70.35748, 61.676255, 69.74566, 64....","[62.800265, 77.98344, 80.49902, 78.45939, 81.9...","[37.855135, 36.670245, 38.49908, 39.879225, 45...","[87.18111, 65.885185, 84.856605, 90.55393, 84....","[71.871625, 71.764625, 72.506425, 86.162915, 6...","[55.835005, 44.543235, 44.669385, 44.63635, 60...","[27.050275, 42.053375, 27.797885, 25.542695, 2..."
"Washington, D.C.","[36.344465, 55.86803, 30.80291, 34.3028, 29.18...","[83.12892, 58.84308, 85.46613, 93.160195, 83.0...","[85.73756, 68.910545, 85.82197, 62.856998, 68....","[34.55486, 22.65031, 29.979065, 28.390545, 24....","[12.257166, 11.028055, 4.201585, 11.88441, 9.6...","[80.103125, 83.674365, 67.81904, 80.142865, 76...",,"[38.24714, 40.03631, 44.369645, 40.81574, 44.9...","[33.46226, 23.535845, 34.28536, 23.24335, 41.7...",,"[30.519735, 43.11209, 43.671575, 32.94055, 49....","[60.7131, 80.447695, 77.996825, 62.938715, 65....","[30.609135, 17.121675, 38.444265, 16.771025, 3...","[23.010375, 34.769245, 38.911185, 29.1559, 28....","[64.185625, 85.30906, 83.17924, 68.371295, 81....","[82.28963, 78.79356, 62.062335, 68.0653, 65.47..."
Texas,"[36.029055, 42.35856, 52.943125, 44.692325, 60...","[57.86651, 70.93145, 59.40553, 72.795315, 90.9...","[47.398205, 55.96463, 47.00003, 54.13785, 58.7...","[48.83172, 51.24197, 50.37617, 50.884585, 43.4...","[45.95525, 51.963917, 34.579945, 46.642255, 42...","[42.636045, 51.994045, 34.8675, 48.257245, 62....","[38.24714, 40.03631, 44.369645, 40.81574, 44.9...","[41.0, 8.440455, 8.440455, 61.0, 61.0, 10.1163...","[40.09851, 57.77601, 34.65577, 51.50483, 42.10...","[38.42919, 39.519825, 45.18897, 44.32643, 45.7...","[33.220535, 62.638855, 34.91844, 62.971485, 25...","[34.066, 37.700485, 47.708245, 50.805325, 37.2...","[51.044255, 59.782565, 63.62611, 71.466595, 59...","[41.62104, 42.98251, 53.53678, 33.98193, 52.68...","[59.04227, 62.56105, 49.289265, 62.324165, 56....","[53.21958, 54.88945, 41.642655, 45.46812, 46.7..."
South Carolina,"[46.909215, 37.694405, 43.897785, 64.56154, 65...","[88.370535, 112.33778, 93.651015, 94.961285, 7...","[78.271375, 100.687225, 68.196175, 77.521845, ...","[46.72073, 49.194, 37.580665, 43.277645, 27.03...","[12.086958, 31.467208, 31.280345, 32.857215, 1...","[60.407165, 73.85149, 86.780255, 72.67193, 91....","[33.46226, 23.535845, 34.28536, 23.24335, 41.7...","[40.09851, 57.77601, 34.65577, 51.50483, 42.10...","[40.0, 35.6066775, 35.6066775, 45.067785, 45.0...","[33.80935, 34.64189, 19.32101, 12.25692]","[59.55886, 66.141325, 31.03814, 38.26014, 42.2...","[50.353055, 74.860645, 82.51022, 85.868305, 58...","[46.02569, 47.277935, 32.279775, 44.501725, 25...","[39.118095, 30.807235, 35.45095, 45.10884, 37....","[92.970985, 73.005435, 77.96182, 90.256035, 92...","[81.30987, 76.00457, 85.871275, 81.50452, 85.1..."
North Virginia,"[37.31001, 54.195505, 36.0558125, 35.704245, 3...","[92.339355, 79.093955, 90.959185, 103.373505, ...","[73.92489, 74.53401, 79.09379, 67.79389, 75.04...","[34.42029, 25.41491, 24.61789, 27.407095, 22.5...","[13.433125, 9.4397, 16.152855, 2.635205, 11.67...","[68.627955, 70.35748, 61.676255, 69.74566, 64....",,"[38.42919, 39.519825, 45.18897, 44.32643, 45.7...","[33.80935, 34.64189, 19.32101, 12.25692]",,"[30.386955, 59.21197, 33.123445, 35.72539, 49....","[76.34322, 80.571235, 54.834875, 69.31386, 67....","[39.612405, 22.38536, 17.3499, 27.000605, 16.9...","[29.99192, 36.29896, 35.98119, 33.19634, 21.98...","[78.82073, 89.709205, 90.809645, 76.14033, 90....","[67.468035, 80.563635, 61.13136, 70.36708, 69...."


In [120]:
mtx.to_pickle(r"..\api\latencies\adjacency_mtrx_cloud.pickle")


In [121]:
# cloud_df = pd.read_csv(r"..\api\latencies\cloud.csv")
# edge_df = pd.read_csv(r"..\api\latencies\edge_feat_locations.csv")
# cloud_with_locs = merge_locs(edge_df, cloud_df).tail(100)
# missing_locs = cloud_with_locs[cloud_with_locs["prb_loc"].isna()]
# for index, row in missing_locs.iterrows():
#     cloud_with_locs.loc[index, "prb_loc"] = get_prb_loc(row["prb"])

# cloud_with_locs.to_csv(r"..\api\latencies\testing4.csv")


In [122]:
cloud_with_locs["prb_loc"].unique()

array(['PL', 'NL', 'DE', 'KE', 'Virginia', 'GB', 'FR', 'AT', 'LK', 'CZ',
       'HT', 'AU', 'TT', 'FI', 'MX', 'Oklahoma', 'EE', 'CY', 'NO', 'SE',
       'LU', 'HU', 'RU', 'Michigan', 'South Carolina', 'Texas', 'LV',
       'Georgia', 'Illinois', 'NZ', 'New York', 'North Carolina',
       'Quebec', 'New Jersey', 'LT', 'Wisconsin', 'CL', 'Washington',
       'California', 'Wyoming', 'CN', 'SK', 'Tennessee', 'Maine', 'IN',
       'Massachusetts', 'Kentucky', 'Minnesota', 'IR', 'MA', 'New Mexico',
       'Delaware', 'Pennsylvania', 'BO', 'Iowa', 'New Brunswick',
       'Colorado', 'TG', 'JP', 'UA', 'Indiana', 'IE', 'DO', 'Ontario',
       'CH', 'Florida', 'Missouri', 'South Dakota', 'Arizona', 'Alabama',
       'TR', 'Ohio', 'Louisiana', 'Mississippi', 'BE', 'DK', 'Alberta',
       'GE', 'Connecticut', 'TW', 'KZ', 'JO', 'Saskatchewan', 'KY', 'SG',
       'LB', 'ID', 'IT', 'MY', 'Nebraska', 'IQ', 'SA', 'ME', 'VE', 'NP',
       'AR', 'IS', 'RS', 'PE', 'PH', 'BR', 'ET', 'KG', 'Idaho', 'AM',
 

In [123]:


#https://cloud.google.com/compute/docs/regions-zones

In [124]:
@DeprecationWarning
def prb_location_old(df_edges):
    geolocator = Nominatim(user_agent="geoapiExercises")
    df_edges["prb_loc"] = ""
    for index, row in df_edges.iterrows():
        prb_number = row["prb"]
        response = requests.get(f"https://atlas.ripe.net/api/v2/probes/{prb_number}/?format=json")
        resp_json = json.loads(response.content.decode("utf-8"))
        long, lat = tuple(resp_json["geometry"]["coordinates"])
        
        location = geolocator.reverse(str(lat) + "," + str(long), language = 'en')
        country = str((location)).split(",")[-1]
        
        if location == "United States":
            state = str((location)).split(",")[-3]
            df_edges.loc[index, "prb_loc"] = state
        else: 
            df_edges.loc[index, "prb_loc"] = country
    return df_edges

    
@DeprecationWarning
def intersects_of_prbs_to_csv():

    df = pd.read_csv(r"C:\Users\Admin\Documents\GitHub\umass\api\latencies\edge.csv", index_col= False)
    dfs = [df[df["prb"].isin(dic[region])] for region in dic.keys()]
    for df in dfs: 
        for index, row in df.iterrows():
            df["MinIP_loc"] = get_location_ipinfo(row["MinIP"])
            for i in range(1,6):  
                name = f"otherIP{i}"
                if pd.isna(row[name]):
                    continue
                location_data = get_location_ipinfo(row[f"otherIP{i}"])
                df[f"otherIP{i}_loc"] = location_data
    concated = pd.concat([df for df in dfs], ignore_index=True)
    concated.to_csv(r"C:\Users\Admin\Documents\GitHub\umass\api\latencies\edge_processed4.csv", index= False)

@DeprecationWarning
def singleprocess_prb_location():
    df_edges = pd.read_csv(r"api\latencies\edge.csv", index_col= False)
    df_prbs_np_arr = df_edges["prb"].unique()
    df_prbs = pd.DataFrame(df_prbs_np_arr, columns=["prb"])
    df_prbs = prb_location(df_prbs)
    df_prbs.to_csv(r"api\latencies\edge_processed_testing2.csv", index = False)

@DeprecationWarning
def multiprocess_prb_location():
    df_edges = pd.read_csv(r"api\latencies\edge.csv", index_col= False)
    df_prbs_np_arr = df_edges["prb"].unique()
    df_prbs = pd.DataFrame(df_prbs_np_arr, columns=["prb"])
    num_processes = multiprocessing.cpu_count() - 1
    chunk_size = int(df_prbs.shape[0]/num_processes)
    chunks = [df_prbs.iloc[df_prbs.index[i:i + chunk_size]] for i in range(0, df_prbs.shape[0], chunk_size)]
    df_prbs = prb_location(df_prbs)

    pool = multiprocessing.Pool(processes=num_processes)
    result = pool.map(prb_location, chunks)

    for i in range(len(result)):
    # we can reassign the original dataframe based on the index of each chunk
        df_prbs.iloc[result[i].index] = result[i]

    df_prbs.to_csv(r"api\latencies\edge_processed_testing2.csv", index = False)

@DeprecationWarning
def create_adj_matrix(df):
    regions_set = set()
    cols = ["otherIP1_loc", "otherIP2_loc", "otherIP3_loc", "otherIP4_loc", "otherIP5_loc", "MinIP_loc"]
    for index, row in df.iterrows():
        for col in cols:
            regions_set.add(str(row[col]))
    regions_set.remove("None")
    regions_set.remove("nan")

    adj = pd.DataFrame(columns = list(regions_set))

    for index, row in df.iterrows():
        for col in cols: 
            lat_col = col.replace("_loc", "")
            lat_col = lat_col.replace("IP", "latency")
            region_from = row[col]
            region_to = row["prb_loc"]
            region_from_idx = adj.columns.get_loc(region_from)
            region_to_idx = adj.columns.get_loc(region_to)
            
            latency = row.loc[lat_col]
            adj.iloc[region_from_idx, region_to_idx] = latency
            adj.iloc[region_to] = latency
    #adj.to_csv(r"api\latencies\adjance_latency.csv", index=False)
    return adj