In [346]:
import ipinfo
from collections import defaultdict
import pickle
import requests
import pandas as pd 
import numpy as np
from geopy.geocoders import Nominatim
import csv
import multiprocessing
import json
import pycountry
import country_converter as coco  #Coco was found to be more accuracte. Pycountry had weird labels for e.g. russia


def get_ip():
    response = requests.get('https://api64.ipify.org?format=json').json()
    return response["ip"]


def get_location(ip):
    response = requests.get(f'https://ipapi.co/{ip}/json/').json()
    location_data = {
        "ip": ip,
        "city": response.get("city"),
        "region": response.get("region"),
        "country": response.get("country_name")
    }
    return location_data

def get_location_ipinfo(ip_address):
    ''' Best one of the three. However, IP requests is limited to the token
    '''
    try:
        access_token = '7dbb53d0419093'
        handler = ipinfo.getHandler(access_token)
        details = handler.getDetails(ip_address)
        if details.country == "US": 
            return details.region
        else:
            return details.country
    except:
        print("error")

In [347]:
def add_ip_locations(df):
    df["MinIP_loc"] = np.nan
    df["otherIP1_loc"] = np.nan 
    df["otherIP2_loc"] = np.nan 
    df["otherIP3_loc"] = np.nan 
    df["otherIP4_loc"] = np.nan 
    df["otherIP5_loc"] = np.nan 

    for index, row in df.iterrows():
        df.loc[index, "MinIP_loc"] = get_location_ipinfo(row["MinIP"])
        for i in range(1,6):  
            name = f"otherIP{i}"
            if pd.isna(row[name]):
                continue

            location = get_location_ipinfo(row[f"otherIP{i}"])
            df.loc[index, f"otherIP{i}_loc"] = location
    return df

In [348]:
def prb_location(df_prbs, prb_dic):
    '''Returns a dataframe with prb number and their respective country or state 
    '''
    geolocator = Nominatim(user_agent="geoapiExercises")
    df_prbs["prb_loc"] = ""
    for index, row in df_prbs.iterrows():
        prb_number = row["prb"]
        long,lat = tuple(prb_dic[prb_number]["geometry"]["coordinates"])
        location = ""
        try:
            location = geolocator.reverse(str(lat) + "," + str(long), language = 'en').raw
        except: 
            print("timeout at index: " + str(index))
            continue
        country = location["address"]["country"]
        
        if country == "Canada":
            state = location["address"]["state"]
            df_prbs.loc[index, "prb_loc"] = state
        elif country == "United States":
            state = str((location)).split(",")[-3]
            state = location["address"]["state"]
            df_prbs.loc[index, "prb_loc"] = state
        else:
            country_code = location["address"]["country_code"].upper() 
            df_prbs.loc[index, "prb_loc"] = country_code
        if index%1000 == 0:
            print("AT INDEX:   " + str(index))
    return df_prbs

In [349]:
def extract_prb_locs(df_edges,pickle_path):

    df_prbs_np_arr = df_edges["prb"].unique()
    df_prbs = pd.DataFrame(df_prbs_np_arr, columns=["prb"])

    with open(pickle_path, 'rb') as handle:
        prb_dic = pickle.load(handle)
        df_prb_loc = prb_location(df_prbs, prb_dic)
        df_prbs = df_prb_loc
        #df_prb_loc.to_csv(r"api\latencies\prbs_with_locations.csv", index = False)
    #return df_prbs
    return df_prb_loc

In [350]:
def merge_prb_locs_with_ip_locs(df_prb_loc, df_with_ip_locs):
    ''' Takes the csv with prbs and their respecitve locations to merge with the file containing ip locations
    ''' 
    prb_to_loc = {}
    for index, row in df_prb_loc.iterrows():
        prb_to_loc[row["prb"]] = row["prb_loc"]
    
    df_with_ip_locs["prb_loc"] = ""
    for index, row in df_with_ip_locs.iterrows(): 
        df_with_ip_locs.loc[index,"prb_loc"] = prb_to_loc[row["prb"]]

    return df_with_ip_locs

def reorder_columns_of_final_df(merged_df):
    ''' Reformats the file by column order
    '''
    cols = [
        "prb" , "prb_loc",
        "MinIP", "MinIP_loc", "Minlatency",
        "otherIP1", "otherIP1_loc", "otherlatency1",
        "otherIP2", "otherIP2_loc", "otherlatency2",
        "otherIP3", "otherIP3_loc", "otherlatency3",
        "otherIP4", "otherIP4_loc", "otherlatency4",
        "otherIP5", "otherIP5_loc", "otherlatency5",
        ]

    merged_df = merged_df[cols]
    
    merged_df = merged_df.rename({
        'otherIP1': 'IP1', 
        'otherIP2': 'IP2', 
        'otherIP3': 'IP3', 
        'otherIP4': 'IP4', 
        'otherIP5': 'IP5',
        'otherIP1_loc': 'IP1_loc', 
        'otherIP2_loc': 'IP2_loc', 
        'otherIP3_loc': 'IP3_loc', 
        'otherIP4_loc': 'IP4_loc', 
        'otherIP5_loc': 'IP5_loc',
        'otherlatency1': 'latency1', 
        'otherlatency2': 'latency2',
        'otherlatency3': 'latency3',
        'otherlatency4': 'latency4',
        'otherlatency5': 'latency5'
        }, axis='columns')

    return merged_df


In [351]:
def convert_country_code(df):
    ''' Converts non-'North American' countries to country code
    '''
    cc = coco.CountryConverter()
    #df = df.apply(lambda x: pycountry.countries.get(name=str(x)) if pycountry.countries.get(name=str(x)) != None else print(x))
    df = df.apply(lambda x: coco.convert(names=x, to='ISO2', not_found = x) if coco.convert(names=x, to='ISO2', not_found = None) != None else x)
    return df

In [352]:
def create_adj_mtx(edge_path, locations):
    '''Creates an adjacency matrix and returns it to later be saved with picke
    '''
    df_adj = pd.DataFrame(columns=locations, index=locations, dtype = object)

    df_edges = pd.read_csv(edge_path, index_col=False)
    cols = ["IP1_loc", "IP2_loc", "IP3_loc", "IP4_loc", "IP5_loc", "MinIP_loc"]
    
    for index, row in df_edges.iterrows():
        from_loc = row["prb_loc"]
        # skip if datacenter's location not sought after
        if from_loc not in locations:
            continue
        for col in cols: 
            to_loc = row[col]     
            # if user's request location not sought after  
            if to_loc not in locations:
                continue  
            col_idx = df_edges.columns.get_loc(col)
            latency = row.iloc[col_idx+1]    
            saved_latencies = df_adj.loc[from_loc,to_loc]
            if np.isnan(saved_latencies).all():
                df_adj.loc[from_loc,to_loc] = list([float(latency)])
                df_adj.loc[to_loc,from_loc] = list([float(latency)])
            else:
                df_adj.loc[from_loc,to_loc].append(float(latency))
                df_adj.loc[to_loc, from_loc].append(float(latency))
    return df_adj


In [353]:
def get_all_regions(df):
    ''' Returns all mentioned locations in edge latency file
    '''
    df_locs = pd.concat([df["prb_loc"], df["MinIP_loc"], df["IP1_loc"], df["IP2_loc"], df["IP3_loc"], df["IP4_loc"], df["IP5_loc"]])
    return df_locs.unique().tolist()

In [354]:


def no_empty_cells(df):
    no = 0
    for index, row in df.iterrows():
        no += row.isna().sum()
    return no

def no_all_latencies(df):
    no = 0
    for index, row in df.iterrows():
        no += row.notna().count()
    return no

def no_all_latencies2(df):
    no = 0
    for index, row in df.iterrows():
        no += row.notna().astype(str).str.len().sum()
    return no

def all_latencies_dict(df):
    no = defaultdict(lambda:0)
    for index, row in df.iterrows():
        print(row.isna())
        no[index] += row.isna().count()
    return no

In [355]:
def get_prb_loc(prb_id):
    geolocator = Nominatim(user_agent="geoapiExercises")
    response = requests.get(f"https://atlas.ripe.net/api/v2/probes/{prb_id}/?format=json")
    resp_json = json.loads(response.content.decode("utf-8"))
    long, lat = tuple(resp_json["geometry"]["coordinates"])
    
    location = geolocator.reverse(str(lat) + "," + str(long), language = 'en')

    if location == None:
        print("None at location for prb: " + str(prb_id))
        return np.nan

    country = location.raw["address"]["country"]

    if country in ["United States", "Canada"]:
        state = location.raw["address"]["state"]
        return state
    else: 
        country_code = location.raw["address"]["country_code"].upper() 
        return country_code

In [356]:
def merge_locs(edge_df, cloud_df):
    """ As edge_df has prbs with corresponding locations, we merge them to avoid time-consuming queries on prb_id
    """
    edge_df = edge_df[["prb", "prb_loc"]]
    cloud_df = cloud_df.merge(edge_df, how="left", right_index = False, on = "prb")
    prb_locs = cloud_df.pop("prb_loc")
    cloud_df.insert(1, "prb_loc", prb_locs)
    return cloud_df

In [357]:
def translate_cloud():
    regions = pd.read_json(r"..\api\latencies\cloud_regions_na.json", orient= "records", dtype = dict)
    return regions["locations"].to_dict()

In [358]:
def values_flattened():
    locations = list(translate_cloud().values())
    # flatten 
    locations = [location for sub_locations in locations for location in sub_locations]
    # remove duplicates
    return list(set(locations))

In [359]:
def add_label_locations(cloud_df):
    # To run multiprocessing
    import pandas as pd
    region_cols = ["minLabel"] + [f"label.{i}" for i in range(1,68)]
    for col in region_cols:
        cloud_df[col + "_loc"] = pd.NA
    
    name_lookup = translate_cloud()

    for index, row in cloud_df.iterrows():
        from_loc = row["prb_loc"]
        if not from_loc in values_flattened():
            continue
        print(from_loc)
        for col in region_cols:
            loc_str = col + "_loc"
            parsed_loc = row[loc_str]
            if not pd.isnull(parsed_loc) or pd.isnull(row[col]):
                continue
            unparsed_loc = row[col].replace(".csv", "")
            if unparsed_loc in name_lookup.keys():
                names = name_lookup[unparsed_loc]
                if len(names) == 1:
                    cloud_df.loc[index, loc_str] = names[0]
                else: 
                    cloud_df.loc[index, loc_str] = names[0] 

                    index_append = cloud_df.shape[0]
                    index_current_col = cloud_df.columns.get_loc(col) - 1
                    #latency_value = cloud_df.iloc[index, index_current_col]
                    latency_value = row.iloc[index_current_col]

                    # Split the names and append last string to last column to process later
                    cloud_df.loc[index_append] = pd.NA 
                    cloud_df.loc[index_append, "prb"] = row["prb"]
                    cloud_df.loc[index_append, "prb_loc"] = row["prb_loc"]
                    cloud_df.loc[index_append, "minLabel"] = unparsed_loc
                    cloud_df.loc[index_append, "minLabel_loc"] = names[1]
                    cloud_df.loc[index_append, "minMedian"] = latency_value

                    # No need for minLabel_loc, will be processed later
        if index%1000 == 0:
            print("AT INDEX:   " + str(index))
    return cloud_df


In [360]:
3
80
8569
8569
4
22
8569
8569
5
50
8569
8569
...
8570
50
8569
8569

8569

In [361]:
def create_adj_mtx2(cloud_path, locations):
    '''Creates an adjacency matrix and returns it to later be saved with picke
    '''
    df_adj = pd.DataFrame(columns=locations, index=locations, dtype = object)

    df_edges = pd.read_csv(cloud_path, index_col=False)
    cols = ["prb_loc", "minLabel_loc"] + [f"label.{i}_loc" for i in range(1,68)]
    
    for index, row in df_edges.iterrows():
        from_loc = row["prb_loc"]
        # skip if datacenter's location not sought after
        if from_loc not in locations:
            continue
        for col in cols: 
            to_loc = row[col]     
            # if user's request location not sought after  
            if to_loc not in locations:
                continue  
            label = col.replace("_loc", "")
            label_idx = df_edges.columns.get_loc(label)
            latency = row.iloc[label_idx-1]    
            saved_latencies = df_adj.loc[from_loc,to_loc]
            # true if saved latencies is empty list
            if not isinstance(saved_latencies, list):
                df_adj.loc[from_loc,to_loc] = list([float(latency)])
                df_adj.loc[to_loc,from_loc] = list([float(latency)])
            else:
                df_adj.loc[from_loc,to_loc].append(float(latency))
                df_adj.loc[to_loc, from_loc].append(float(latency))
    return df_adj

In [362]:
def merge_edge_cloud_latencies(edge_df, cloud_df): 
    cloud_locs = list(cloud_df.index)
    edge_locs = list(edge_df.index)
    for loc1, row in cloud_df.iterrows():
        for loc2, col in zip(cloud_locs, row):
            if loc2 not in edge_locs: 
                edge_df[loc2] = ""
                #edge_df.reindex(edge_df.index.values.tolist()+[loc])
                edge_df.loc[loc2] = ""
                edge_df.loc[loc2, loc1] = cloud_df.loc[loc2, loc1]
                edge_df.loc[loc1, loc2] = cloud_df.loc[loc2, loc1]
            else:                 
                edge_df.loc[loc1, loc2] = cloud_df.loc[loc2, loc1]
                edge_df.loc[loc2, loc1] = cloud_df.loc[loc2, loc1]

    return edge_df

In [363]:
def print_mtx_info(df, regions = "", type = ""):
    print("Number of regions: " + str(len(regions)))

    print("N.o. all latencies: " + str(no_all_latencies2(df)))
    #print(all_latencies_dict(df))
    if regions != "":
        print("N.o. missing datapoints: " + str(no_empty_cells(df)) + " out of " + str(len(regions)*len(regions)))
    if type == "cloud":
        print(no_all_latencies2(df))
    if type == "edge":
        print(no_all_latencies(df))

In [364]:
### This extracts probe id and maps it to a location

# df_edges = pd.read_csv(r"api\latencies\edge.csv", index_col= False)
# df_prb_locs = extract_prb_locs(df_edges, picke_path = r'api\latencies\probes_clean.pickle')
# df_prb_locs.to_csv(r"api\latencies\prbs_with_locations.csv", index = False)


# timeout at index: 7775
# AT INDEX:   8000
# timeout at index: 8215

### This assign a country code to non us/canada countries. they use states instead

#df_prbs = pd.read_csv(r"api\latencies\prbs_with_locations.csv", index_col= False)
#df_prbs["prb_loc"] = convert_country_code(df_prbs["prb_loc"])
#df_prbs.to_csv(r"api\latencies\prbs_with_locations_coco.csv", index = False)

### THIS adds location to the fields [MinIP, OtherIP{1,2,3,4,5}]

#df_with_ip_locs = add_ip_locations(df_edges)
#df_with_ip_locs.to_csv(r"api\latencies\edge_with_ip_locs.csv", index= False)


### BELOW Is to merge files and formatting

# df_prb_loc = pd.read_csv(r"api\latencies\prbs_with_locations.csv", index_col = False)
# df_with_ip_locs = pd.read_csv(r"api\latencies\edge_processed10.csv", index_col = False)

# df_merged = merge_prb_locs_with_ip_locs(df_prb_loc, df_with_ip_locs)

# df_merged_formated = reorder_columns_of_final_df(df_merged)
# df_merged_formated.to_csv(r"api\latencies\edge_feat_locations.csv", index = False)

### Creates adjacency matrix out of the edge latencies

# df = pd.read_csv(r"..\api\latencies\edge_feat_locations.csv", index_col=False)
# regions = get_all_regions(df)
# # # Filter out NaN and regions
# print(regions)
# na = [region for region in regions if type(region) == str and len(region) > 2]
# na.remove("Guam")
# na.remove("Prince Edward Island")
# na.remove("United States Virgin Islands")

# eu = [region for region in regions if type(region) == str and len(region) < 3]

# edge_mtx = create_adj_mtx(r"..\api\latencies\edge_feat_locations.csv", na)
# edge_mtx.to_pickle(r"..\api\latencies\adjacency_mtrx.pickle")

# print_mtx_info(edge_mtx, na, type="edge")

# edge_mtx.to_csv(r"..\api\latencies\edge_feat_locations_na.csv")

# ## Add locations to cloud_data 

# cloud_df = pd.read_csv(r"..\api\latencies\cloud.csv")
# edge_df = pd.read_csv(r"..\api\latencies\edge_feat_locations.csv")
# cloud_with_locs = merge_locs(edge_df, cloud_df)

# print("This many prb_locs couldn't resolved by merge: " + str(cloud_with_locs["prb_loc"].isna().sum()))

# missing_locs = cloud_with_locs[cloud_with_locs["prb_loc"].isna()]
# for index, row in missing_locs.iterrows():
#     cloud_with_locs.loc[index, "prb_loc"] = get_prb_loc(row["prb"])

# print("This many prb_locs couldn't resolved after applying lookups: " + str(cloud_with_locs["prb_loc"].isna().sum()))

# cloud_with_locs = cloud_with_locs[cloud_with_locs["prb_loc"].notna()]
# cloud_with_locs.to_csv(r"..\api\latencies\cloud_feat_locations.csv", index= False)

# # Convert "blabla.csv" to real names
# cloud_df_with_labels = add_label_locations(cloud_with_locs)
# cloud_df_with_labels.to_csv(r"..\api\latencies\cloud_feats_real_locations.csv")

# locations = values_flattened()

# cloud_mtx = create_adj_mtx2(r"..\api\latencies\cloud_feats_real_locations.csv", locations)

# print_mtx_info(cloud_mtx, na, type="cloud")


# print("Number of regions in NA: " + str(len(locations)))
# print("N.o. all latencies: " + str(no_all_latencies2(cloud_mtx)))
# #print(all_latencies_dict(mtx))
# print("N.o. missing datapoints: " + str(no_empty_cells(cloud_mtx)) + " out of " + str(len(locations)*len(locations)))

# cloud_mtx.to_pickle(r"..\api\latencies\adjacency_mtrx_cloud.pickle")




In [365]:
#import multiprocesspandas

#cloud_with_locs.apply_parallel(add_label_locations, num_processes=4, axis=1)

In [366]:
# merge_edge_cloud_latencies(mtrx_df, cloud_mtx)
merge_edge_cloud_latencies(edge_mtx, cloud_mtx)

Index(['New Brunswick', 'Washington', 'California', 'Virginia', 'Ohio',
       'Louisiana', 'Wisconsin', 'Mississippi', 'Texas', 'Delaware', 'Ontario',
       'Alberta', 'North Carolina', 'Massachusetts', 'Connecticut', 'Florida',
       'Colorado', 'Michigan', 'Indiana', 'Iowa', 'Saskatchewan', 'Oklahoma',
       'Tennessee', 'Georgia', 'Maine', 'Nebraska', 'New York', 'Idaho',
       'British Columbia', 'Manitoba', 'New Jersey', 'Minnesota', 'Oregon',
       'Pennsylvania', 'Utah', 'Arizona', 'Rhode Island', 'Quebec',
       'Newfoundland and Labrador', 'New Mexico', 'Arkansas', 'Maryland',
       'Illinois', 'Kentucky', 'District of Columbia', 'New Hampshire',
       'Missouri', 'Nova Scotia', 'Nevada', 'Kansas', 'North Dakota',
       'Vermont', 'South Carolina', 'Puerto Rico', 'Alaska', 'Hawaii',
       'West Virginia', 'Montana', 'Northwest Territories', 'South Dakota',
       'Alabama', 'Wyoming', 'Yukon', 'Washington, D.C.', 'North Virginia'],
      dtype='object')


Unnamed: 0,New Brunswick,Washington,California,Virginia,Ohio,Louisiana,Wisconsin,Mississippi,Texas,Delaware,...,Hawaii,West Virginia,Montana,Northwest Territories,South Dakota,Alabama,Wyoming,Yukon,"Washington, D.C.",North Virginia
New Brunswick,,,,,,,,,,,...,,,,,,,,,,
Washington,,"[69.0, 10.139825, 10.139825, 180.0, 180.0, 9.1...","[29.202135, 35.159335, 27.645745, 19.37087, 36...","[82.942292, 66.181285, 84.709775, 75.23258, 75...","[60.256055, 94.605435, 94.780135, 52.91472, 60...",,,,"[59.04227, 62.56105, 49.289265, 62.324165, 56....",,...,[93.171645],"[29.628005, 44.27234]","[22.22458, 23.069665, 21.51524]",,,,"[23.67485, 23.714485, 24.20839, 24.5869, 23.20...",,"[64.185625, 85.30906, 83.17924, 68.371295, 81....","[78.82073, 89.709205, 90.809645, 76.14033, 90...."
California,,"[29.202135, 35.159335, 27.645745, 19.37087, 36...","[71.0, 84.0, 84.0, 90.0, 90.0, 107.0, 107.0, 2...","[92.194041, 75.899475, 84.54834, 74.448065, 77...","[82.364125, 85.271905, 70.71036, 70.85003, 67....",,"[13.396625, 9.066555]",,"[53.21958, 54.88945, 41.642655, 45.46812, 46.7...",,...,"[48.03675, 49.73371, 49.861415, 68.899845, 71....","[12.108515, 10.154895]","[75.05065, 75.1289925, 130.427795, 54.67809, 5...",,,,,,"[82.28963, 78.79356, 62.062335, 68.0653, 65.47...","[67.468035, 80.563635, 61.13136, 70.36708, 69...."
Virginia,,"[82.942292, 66.181285, 84.709775, 75.23258, 75...","[92.194041, 75.899475, 84.54834, 74.448065, 77...","[4.0, 9.527875, 9.527875, 17.795916, 17.795916...","[24.817875, 19.69237, 22.23409, 18.937055, 27....","[22.99056, 23.233545, 13.67115, 18.18783, 26.9...","[46.39941, 48.11897, 35.135105, 39.439455, 42....",,"[45.95525, 51.963917, 34.579945, 46.642255, 42...","[8.07943, 14.73068, 14.742405, 14.828805, 15.0...",...,,[34.85445],,,"[48.78525, 66.86637]",,,,"[12.257166, 11.028055, 4.201585, 11.88441, 9.6...","[13.433125, 9.4397, 16.152855, 2.635205, 11.67..."
Ohio,,"[60.256055, 94.605435, 94.780135, 52.91472, 60...","[82.364125, 85.271905, 70.71036, 70.85003, 67....","[24.817875, 19.69237, 22.23409, 18.937055, 27....","[224.0, 19.357735, 19.357735, 1191.0, 1191.0, ...","[32.96968, 54.99881, 32.89856]",,,"[41.62104, 42.98251, 53.53678, 33.98193, 52.68...",,...,,,,,,,,,"[23.010375, 34.769245, 38.911185, 29.1559, 28....","[29.99192, 36.29896, 35.98119, 33.19634, 21.98..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Alabama,,,,,,"[3.32026, 5.40437, 19.8191, 3.27144]",,,,,...,,,,,,,,,[12.588695],
Wyoming,,"[23.67485, 23.714485, 24.20839, 24.5869, 23.20...",,,,,,,,,...,,,,,,,,,,
Yukon,,,,,,,,,,,...,,,,,,,,,,
"Washington, D.C.",,"[64.185625, 85.30906, 83.17924, 68.371295, 81....","[82.28963, 78.79356, 62.062335, 68.0653, 65.47...","[12.257166, 11.028055, 4.201585, 11.88441, 9.6...","[23.010375, 34.769245, 38.911185, 29.1559, 28....",,,,"[38.24714, 40.03631, 44.369645, 40.81574, 44.9...",,...,,"[33.967305, 37.970095, 38.703295, 39.13436, 32...",,,,[12.588695],,,,


In [367]:
# cloud_df = pd.read_csv(r"..\api\latencies\cloud.csv")
# edge_df = pd.read_csv(r"..\api\latencies\edge_feat_locations.csv")
# cloud_with_locs = merge_locs(edge_df, cloud_df).tail(100)
# missing_locs = cloud_with_locs[cloud_with_locs["prb_loc"].isna()]
# for index, row in missing_locs.iterrows():
#     cloud_with_locs.loc[index, "prb_loc"] = get_prb_loc(row["prb"])

# cloud_with_locs.to_csv(r"..\api\latencies\testing4.csv")


In [368]:
cloud_with_locs["prb_loc"].unique()

array(['PL', 'NL', 'DE', 'KE', 'Virginia', 'GB', 'FR', 'AT', 'LK', 'CZ',
       'HT', 'AU', 'TT', 'FI', 'MX', 'Oklahoma', 'EE', 'CY', 'NO', 'SE',
       'LU', 'HU', 'RU', 'Michigan', 'South Carolina', 'Texas', 'LV',
       'Georgia', 'Illinois', 'NZ', 'New York', 'North Carolina',
       'Quebec', 'New Jersey', 'LT', 'Wisconsin', 'CL', 'Washington',
       'California', 'Wyoming', 'CN', 'SK', 'Tennessee', 'Maine', 'IN',
       'Massachusetts', 'Kentucky', 'Minnesota', 'IR', 'MA', 'New Mexico',
       'Delaware', 'Pennsylvania', 'BO', 'Iowa', 'New Brunswick',
       'Colorado', 'TG', 'JP', 'UA', 'Indiana', 'IE', 'DO', 'Ontario',
       'CH', 'Florida', 'Missouri', 'South Dakota', 'Arizona', 'Alabama',
       'TR', 'Ohio', 'Louisiana', 'Mississippi', 'BE', 'DK', 'Alberta',
       'GE', 'Connecticut', 'TW', 'KZ', 'JO', 'Saskatchewan', 'KY', 'SG',
       'LB', 'ID', 'IT', 'MY', 'Nebraska', 'IQ', 'SA', 'ME', 'VE', 'NP',
       'AR', 'IS', 'RS', 'PE', 'PH', 'BR', 'ET', 'KG', 'Idaho', 'AM',
 

In [369]:


#https://cloud.google.com/compute/docs/regions-zones

In [370]:
@DeprecationWarning
def prb_location_old(df_edges):
    geolocator = Nominatim(user_agent="geoapiExercises")
    df_edges["prb_loc"] = ""
    for index, row in df_edges.iterrows():
        prb_number = row["prb"]
        response = requests.get(f"https://atlas.ripe.net/api/v2/probes/{prb_number}/?format=json")
        resp_json = json.loads(response.content.decode("utf-8"))
        long, lat = tuple(resp_json["geometry"]["coordinates"])
        
        location = geolocator.reverse(str(lat) + "," + str(long), language = 'en')
        country = str((location)).split(",")[-1]
        
        if location == "United States":
            state = str((location)).split(",")[-3]
            df_edges.loc[index, "prb_loc"] = state
        else: 
            df_edges.loc[index, "prb_loc"] = country
    return df_edges

    
@DeprecationWarning
def intersects_of_prbs_to_csv():

    df = pd.read_csv(r"C:\Users\Admin\Documents\GitHub\umass\api\latencies\edge.csv", index_col= False)
    dfs = [df[df["prb"].isin(dic[region])] for region in dic.keys()]
    for df in dfs: 
        for index, row in df.iterrows():
            df["MinIP_loc"] = get_location_ipinfo(row["MinIP"])
            for i in range(1,6):  
                name = f"otherIP{i}"
                if pd.isna(row[name]):
                    continue
                location_data = get_location_ipinfo(row[f"otherIP{i}"])
                df[f"otherIP{i}_loc"] = location_data
    concated = pd.concat([df for df in dfs], ignore_index=True)
    concated.to_csv(r"C:\Users\Admin\Documents\GitHub\umass\api\latencies\edge_processed4.csv", index= False)

@DeprecationWarning
def singleprocess_prb_location():
    df_edges = pd.read_csv(r"api\latencies\edge.csv", index_col= False)
    df_prbs_np_arr = df_edges["prb"].unique()
    df_prbs = pd.DataFrame(df_prbs_np_arr, columns=["prb"])
    df_prbs = prb_location(df_prbs)
    df_prbs.to_csv(r"api\latencies\edge_processed_testing2.csv", index = False)

@DeprecationWarning
def multiprocess_prb_location():
    df_edges = pd.read_csv(r"api\latencies\edge.csv", index_col= False)
    df_prbs_np_arr = df_edges["prb"].unique()
    df_prbs = pd.DataFrame(df_prbs_np_arr, columns=["prb"])
    num_processes = multiprocessing.cpu_count() - 1
    chunk_size = int(df_prbs.shape[0]/num_processes)
    chunks = [df_prbs.iloc[df_prbs.index[i:i + chunk_size]] for i in range(0, df_prbs.shape[0], chunk_size)]
    df_prbs = prb_location(df_prbs)

    pool = multiprocessing.Pool(processes=num_processes)
    result = pool.map(prb_location, chunks)

    for i in range(len(result)):
    # we can reassign the original dataframe based on the index of each chunk
        df_prbs.iloc[result[i].index] = result[i]

    df_prbs.to_csv(r"api\latencies\edge_processed_testing2.csv", index = False)

@DeprecationWarning
def create_adj_matrix(df):
    regions_set = set()
    cols = ["otherIP1_loc", "otherIP2_loc", "otherIP3_loc", "otherIP4_loc", "otherIP5_loc", "MinIP_loc"]
    for index, row in df.iterrows():
        for col in cols:
            regions_set.add(str(row[col]))
    regions_set.remove("None")
    regions_set.remove("nan")

    adj = pd.DataFrame(columns = list(regions_set))

    for index, row in df.iterrows():
        for col in cols: 
            lat_col = col.replace("_loc", "")
            lat_col = lat_col.replace("IP", "latency")
            region_from = row[col]
            region_to = row["prb_loc"]
            region_from_idx = adj.columns.get_loc(region_from)
            region_to_idx = adj.columns.get_loc(region_to)
            
            latency = row.loc[lat_col]
            adj.iloc[region_from_idx, region_to_idx] = latency
            adj.iloc[region_to] = latency
    #adj.to_csv(r"api\latencies\adjance_latency.csv", index=False)
    return adj