In [4]:
import csv
import json
import multiprocessing
import os
import pickle
from collections import defaultdict

import country_converter as coco  #Coco was found to be more accuracte. Pycountry had weird labels for e.g. russia
import ipinfo
import numpy as np
import pandas as pd
import pycountry
import requests
from geopy.geocoders import Nominatim


def get_ip():
    """Return the ``"ip"`` field reported by the ipify JSON endpoint.

    Performs a GET request against ``https://api64.ipify.org?format=json`` and
    decodes the body as JSON. Network or decoding errors raised by ``requests``
    propagate to the caller.
    """
    payload = requests.get('https://api64.ipify.org?format=json').json()
    return payload["ip"]


def get_location(ip):
    """Look up ``ip`` on ipapi.co and return a small location record.

    Returns a dict with the keys ``"ip"``, ``"city"``, ``"region"`` and
    ``"country"``. Fields missing from the ipapi.co JSON answer come back as
    ``None`` (``"country"`` is read from the ``country_name`` field).
    """
    payload = requests.get(f'https://ipapi.co/{ip}/json/').json()

    location_data = {"ip": ip}
    # (output key, key in the ipapi.co response)
    field_map = (("city", "city"), ("region", "region"), ("country", "country_name"))
    for out_key, api_key in field_map:
        location_data[out_key] = payload.get(api_key)
    return location_data

def get_location_ipinfo(ip_address, access_token=None):
    """Resolve an IP address to a coarse location label through ipinfo.

    The author found this to be the best of the three lookup helpers in this
    notebook; the number of requests is limited by the token's quota.

    Parameters
    ----------
    ip_address : str
        Address to look up.
    access_token : str, optional
        ipinfo API token. When omitted, the ``IPINFO_TOKEN`` environment
        variable is used, and only if that is unset the legacy literal below.

    Returns
    -------
    str or None
        ``details.region`` when the reported country code is ``"US"``,
        otherwise ``details.country``. ``None`` if the lookup failed for any
        reason; the failure is printed instead of raised so that long batch
        runs keep going.
    """
    # NOTE(security): the literal fallback is a credential committed to the
    # notebook. It is kept only so existing runs do not change behaviour;
    # rotate it and rely on IPINFO_TOKEN / the access_token argument instead.
    token = access_token or os.environ.get("IPINFO_TOKEN") or '7dbb53d0419093'
    try:
        handler = ipinfo.getHandler(token)
        details = handler.getDetails(ip_address)
        if details.country == "US":
            return details.region
        return details.country
    except Exception as exc:
        # Deliberately best-effort, but no longer swallows KeyboardInterrupt /
        # SystemExit and reports what actually went wrong.
        print(f"error: ipinfo lookup failed for {ip_address!r}: {exc!r}")
        return None



In [5]:
def add_ip_locations(df):
    """Add a ``<column>_loc`` location label for every IP column of ``df``.

    Handles ``MinIP`` and ``otherIP1`` .. ``otherIP5``. Each non-missing
    address is resolved with ``get_location_ipinfo`` and the result is written
    to the matching ``*_loc`` column. Cells whose IP is missing stay NaN; a
    failed lookup stores whatever ``get_location_ipinfo`` returned (``None``).

    ``df`` is modified in place, row by row, and also returned, so partial
    results survive if a long run is interrupted.
    """
    ip_columns = ["MinIP"] + [f"otherIP{i}" for i in range(1, 6)]

    # Object dtype: the columns receive strings later. Creating them as float
    # NaN columns makes pandas warn (and eventually refuse) on the first
    # string assignment.
    for ip_col in ip_columns:
        df[f"{ip_col}_loc"] = pd.Series(np.nan, index=df.index, dtype=object)

    # The same address shows up on many rows and the API quota is limited, so
    # remember successful answers. Failures are not cached and get retried.
    resolved = {}

    for index, row in df.iterrows():
        for ip_col in ip_columns:
            ip = row[ip_col]
            if pd.isna(ip):
                continue

            if ip in resolved:
                location = resolved[ip]
            else:
                location = get_location_ipinfo(ip)
                if location is not None:
                    resolved[ip] = location
            df.loc[index, f"{ip_col}_loc"] = location
    return df

In [6]:
def prb_location(df_prbs, prb_dic):
    '''Returns a dataframe with prb number and their respective country or state.

    For every row of ``df_prbs`` the probe id in column ``prb`` is looked up in
    ``prb_dic``; its ``["geometry"]["coordinates"]`` entry is read as
    ``(longitude, latitude)`` and reverse-geocoded with Nominatim.

    The new ``prb_loc`` column holds the ``state`` field of the returned
    address for "Canada" and "United States", and the upper-cased
    ``country_code`` for every other country. Rows that cannot be resolved
    (geocoder error, no result, or an address lacking the needed field) keep
    the empty string and a message is printed.

    ``df_prbs`` is modified in place and returned. A probe id missing from
    ``prb_dic`` still raises ``KeyError``.
    '''
    geolocator = Nominatim(user_agent="geoapiExercises")
    df_prbs["prb_loc"] = ""
    for index, row in df_prbs.iterrows():
        prb_number = row["prb"]
        long, lat = tuple(prb_dic[prb_number]["geometry"]["coordinates"])

        try:
            location = geolocator.reverse(str(lat) + "," + str(long), language='en')
        except Exception as exc:
            # Best-effort: one failed request must not abort a run over
            # thousands of probes, but report the real cause (not every
            # failure is a timeout) and let Ctrl-C through.
            print("geocoding failed at index: " + str(index) + " (" + repr(exc) + ")")
            continue
        if location is None:
            # reverse() found nothing for these coordinates.
            print("no geocoding result at index: " + str(index))
            continue

        address = location.raw.get("address", {})
        if address.get("country") in ("Canada", "United States"):
            prb_loc = address.get("state")
        else:
            country_code = address.get("country_code")
            prb_loc = country_code.upper() if country_code else None

        if not prb_loc:
            # Address without the field we need: leave the row unresolved
            # instead of crashing the whole loop with a KeyError.
            print("incomplete address at index: " + str(index))
            continue

        df_prbs.loc[index, "prb_loc"] = prb_loc
        if index % 1000 == 0:
            print("AT INDEX:   " + str(index))
    return df_prbs

In [7]:
def extract_prb_locs(df_edges, pickle_path):
    """Geolocate every distinct probe that appears in ``df_edges``.

    Parameters
    ----------
    df_edges : pandas.DataFrame
        Must contain a ``prb`` column; its unique values are the probes to
        resolve.
    pickle_path : str or path-like
        Pickle file holding the probe dictionary that ``prb_location`` indexes
        by probe id.

    Returns
    -------
    pandas.DataFrame
        One row per unique probe with the columns ``prb`` and ``prb_loc``, as
        produced by ``prb_location``.
    """
    df_prbs = pd.DataFrame(df_edges["prb"].unique(), columns=["prb"])

    # NOTE(security): pickle.load executes arbitrary code embedded in the
    # file. Only point this at pickles you created yourself.
    with open(pickle_path, 'rb') as handle:
        prb_dic = pickle.load(handle)

    # Geocoding runs after the file is closed: the loop is slow and network
    # bound, so there is no reason to hold the handle open for its duration.
    return prb_location(df_prbs, prb_dic)

In [8]:
def merge_prb_locs_with_ip_locs(df_prb_loc, df_with_ip_locs):
    '''Attach each probe's location to the frame containing the IP locations.

    Parameters
    ----------
    df_prb_loc : pandas.DataFrame
        Lookup table with the columns ``prb`` and ``prb_loc``. If a probe is
        listed more than once, the last row wins.
    df_with_ip_locs : pandas.DataFrame
        Frame with a ``prb`` column. It receives a new ``prb_loc`` column in
        place.

    Returns
    -------
    pandas.DataFrame
        ``df_with_ip_locs`` itself, with ``prb_loc`` filled in.

    Raises
    ------
    KeyError
        If ``df_with_ip_locs`` references a probe missing from ``df_prb_loc``.
    '''
    prb_to_loc = dict(zip(df_prb_loc["prb"], df_prb_loc["prb_loc"]))

    # Assign positionally, one value per row. Writing through
    # .loc[index, ...] inside iterrows() overwrites every row that shares an
    # index label, which gives wrong locations when the index is not unique
    # (e.g. after a concat without ignore_index).
    df_with_ip_locs["prb_loc"] = [prb_to_loc[prb] for prb in df_with_ip_locs["prb"]]

    return df_with_ip_locs

def reorder_columns_of_final_df(merged_df):
    ''' Select the final columns in a fixed order and shorten their names.

    The result starts with the probe and ``MinIP`` columns, followed by one
    (IP, location, latency) triple per alternative address. The ``other``
    prefix of those triples is dropped (``otherIP1`` -> ``IP1``, ...).
    Columns not listed here are discarded; a missing one raises ``KeyError``.
    The input frame itself is left untouched.
    '''
    leading = ["prb", "prb_loc", "MinIP", "MinIP_loc", "Minlatency"]

    # (current name, final name), already in output order.
    renames = (
        ("otherIP1", "IP1"), ("otherIP1_loc", "IP1_loc"), ("otherlatency1", "latency1"),
        ("otherIP2", "IP2"), ("otherIP2_loc", "IP2_loc"), ("otherlatency2", "latency2"),
        ("otherIP3", "IP3"), ("otherIP3_loc", "IP3_loc"), ("otherlatency3", "latency3"),
        ("otherIP4", "IP4"), ("otherIP4_loc", "IP4_loc"), ("otherlatency4", "latency4"),
        ("otherIP5", "IP5"), ("otherIP5_loc", "IP5_loc"), ("otherlatency5", "latency5"),
    )

    ordered = leading + [old for old, _ in renames]
    return merged_df[ordered].rename(columns=dict(renames))


In [9]:
def convert_country_code(df):
    ''' Converts country names to ISO2 country codes, leaving other values as is.

    ``df`` is the column (pandas Series) of location labels. Every value that
    country_converter recognises is replaced by its ISO2 code; anything it
    does not recognise is passed through unchanged (``not_found`` is set to
    the value itself).

    NOTE(review): every value is offered to the converter, so a state or
    province label that also matches a country pattern would presumably be
    converted too - confirm against the data.
    '''
    # One converter for the whole column. The module-level coco.convert()
    # is a wrapper that sets up a CountryConverter on each call, and the old
    # lambda called it twice per element.
    cc = coco.CountryConverter()
    return df.apply(lambda name: cc.convert(names=name, to='ISO2', not_found=name))

In [10]:
def _append_latency(df_adj, row_label, col_label, latency):
    """Append ``latency`` to the list stored in one cell, creating it if needed."""
    cell = df_adj.at[row_label, col_label]
    if isinstance(cell, list):
        cell.append(latency)
    else:
        # .at stores the list object itself in the object-dtype cell.
        df_adj.at[row_label, col_label] = [latency]


def create_adj_mtx(edge_path, locations):
    '''Creates an adjacency matrix of latencies, to later be saved with pickle.

    Parameters
    ----------
    edge_path : str or path-like
        CSV with a ``prb_loc`` column and, for ``MinIP`` and ``IP1``..``IP5``,
        a ``*_loc`` column whose right-hand neighbour holds the latency. The
        latency is read by position (the column immediately after ``*_loc``),
        so the column order of the file matters.
    locations : list
        Locations of interest; they become both index and columns.

    Returns
    -------
    pandas.DataFrame
        Object-dtype frame. A cell is NaN when no measurement connects the
        two locations, otherwise a list of float latencies. The matrix is
        symmetric: each measurement is recorded under (from, to) and
        (to, from), and exactly once on the diagonal when both ends coincide.
        Missing (NaN) latencies are skipped.
    '''
    df_adj = pd.DataFrame(columns=locations, index=locations, dtype=object)

    df_edges = pd.read_csv(edge_path, index_col=False)
    cols = ["IP1_loc", "IP2_loc", "IP3_loc", "IP4_loc", "IP5_loc", "MinIP_loc"]
    # Position of the latency column belonging to each location column.
    latency_pos = {col: df_edges.columns.get_loc(col) + 1 for col in cols}

    for _, row in df_edges.iterrows():
        from_loc = row["prb_loc"]
        # skip if datacenter's location not sought after
        if from_loc not in locations:
            continue
        for col in cols:
            to_loc = row[col]
            # if user's request location not sought after
            if to_loc not in locations:
                continue
            latency = row.iloc[latency_pos[col]]
            if pd.isna(latency):
                continue
            latency = float(latency)

            _append_latency(df_adj, from_loc, to_loc, latency)
            # Mirror the measurement, but not onto itself: on the diagonal
            # both writes hit the same cell and would count the value twice.
            if to_loc != from_loc:
                _append_latency(df_adj, to_loc, from_loc, latency)
    return df_adj


In [11]:
def get_all_regions(df):
    ''' List every distinct location mentioned in the edge latency frame.

    Values are collected from ``MinIP_loc`` first and then ``IP1_loc`` ..
    ``IP5_loc``; the result keeps the order of first appearance in that
    stacking. A missing value, if present, shows up as one entry of the list.
    '''
    loc_columns = ["MinIP_loc", "IP1_loc", "IP2_loc", "IP3_loc", "IP4_loc", "IP5_loc"]
    stacked = pd.concat([df[name] for name in loc_columns])
    distinct = stacked.unique()
    return distinct.tolist()

In [12]:


def no_empty_cells(df):
    """Return the number of missing (NA) cells in ``df``, summed over all rows."""
    return sum(cells.isna().sum() for _, cells in df.iterrows())

def no_all_latencies(df):
    """Return the number of non-missing cells in ``df``.

    Every column is included, not only latency columns, so pass a frame that
    holds just the values you want counted.

    The earlier version used ``row.notna().count()``. ``count()`` on the
    boolean mask counts all of its entries (a mask never contains NA), so the
    result was always rows x columns, whatever was missing.
    """
    return int(df.notna().sum().sum())

def all_latencies_dict(df):
    """Map every index label of ``df`` to a per-row count, printing each row's NA mask.

    NOTE(review): the count is ``row.isna().count()``. ``count()`` on the
    boolean mask counts all of its entries, so each row contributes its full
    number of columns regardless of how many cells are missing. Confirm
    whether ``sum()`` was intended.
    """
    counts = defaultdict(lambda: 0)
    for label, cells in df.iterrows():
        na_mask = cells.isna()
        print(na_mask)
        counts[label] = counts[label] + na_mask.count()
    return counts

In [19]:
def get_prb_loc(prb_id):
    """Return the location label of a single RIPE Atlas probe.

    Fetches the probe record from the RIPE Atlas REST API, reads its
    ``["geometry"]["coordinates"]`` as ``(longitude, latitude)`` and
    reverse-geocodes that position with Nominatim.

    Returns the ``state`` field of the address for probes in the
    "United States" or "Canada", and the upper-cased ``country_code``
    otherwise.
    """
    geolocator = Nominatim(user_agent="geoapiExercises")
    response = requests.get(f"https://atlas.ripe.net/api/v2/probes/{prb_id}/?format=json")
    resp_json = json.loads(response.content.decode("utf-8"))
    long, lat = tuple(resp_json["geometry"]["coordinates"])

    location = geolocator.reverse(str(lat) + "," + str(long), language='en')

    address = location.raw["address"]
    country = address["country"]

    # Membership test: the previous `country is "United States" or "Canada"`
    # was always true, because the non-empty string "Canada" is truthy.
    if country in ("United States", "Canada"):
        return address["state"]

    # The address dict lives in location.raw; the Location object itself
    # cannot be indexed with a string key.
    return address["country_code"].upper()

In [20]:
# Ad-hoc check: look up the location label of RIPE Atlas probe 50050.
# NOTE(review): the NameError about `country` recorded under this cell does not
# match the current definition of get_prb_loc - presumably stale output from an
# earlier revision of that cell; re-run to confirm.
get_prb_loc(50050)

Poland


NameError: name 'country' is not defined

In [None]:
### This extracts probe id and maps it to a location

# df_edges = pd.read_csv(r"api\latencies\edge.csv", index_col= False)
# df_prb_locs = extract_prb_locs(df_edges, pickle_path = r'api\latencies\probes_clean.pickle')
# df_prb_locs.to_csv(r"api\latencies\prbs_with_locations.csv", index = False)


# timeout at index: 7775
# AT INDEX:   8000
# timeout at index: 8215

### This assigns a country code to locations outside the US/Canada; US and Canadian probes keep their state name instead

#df_prbs = pd.read_csv(r"api\latencies\prbs_with_locations.csv", index_col= False)
#df_prbs["prb_loc"] = convert_country_code(df_prbs["prb_loc"])
#df_prbs.to_csv(r"api\latencies\prbs_with_locations_coco.csv", index = False)

### THIS adds location to the fields [MinIP, OtherIP{1,2,3,4,5}]

#df_with_ip_locs = add_ip_locations(df_edges)
#df_with_ip_locs.to_csv(r"api\latencies\edge_with_ip_locs.csv", index= False)


### BELOW Is to merge files and formatting

# df_prb_loc = pd.read_csv(r"api\latencies\prbs_with_locations.csv", index_col = False)
# df_with_ip_locs = pd.read_csv(r"api\latencies\edge_processed10.csv", index_col = False)

# df_merged = merge_prb_locs_with_ip_locs(df_prb_loc, df_with_ip_locs)

# df_merged_formated = reorder_columns_of_final_df(df_merged)
# df_merged_formated.to_csv(r"api\latencies\edge_feat_locations.csv", index = False)

### Creates adjacency matrix out of the edge latencies

# df = pd.read_csv(r"..\api\latencies\edge_feat_locations.csv", index_col=False)
# regions = get_all_regions(df)
# # Filter out NaN and regions
# na = [region for region in regions if type(region) == str and len(region) > 2]
# eu = [region for region in regions if type(region) == str and len(region) < 3]
# na.remove("Oklahoma")
# na.remove("Alabama")
# na.remove("Mississippi")
# na.remove("South Dakota")
# na.remove("Nebraska")
# na.remove("Delaware")
# na.remove("Montana")
# na.remove("Alaska")
#na = [region for region in na if region in ["California", "Massachusetts", "Arizona"]]
# # Is this number correct? 
# print("Number of regions in NA: " + str(len(na)))
# print("N.o. all latencies: " + str(no_all_latencies(df)))
# mtrx_df = create_adj_mtx(r"..\api\latencies\edge_feat_locations.csv", na)
# mtrx_df.to_pickle(r"..\api\latencies\adjacency_mtrx.pickle")
# print(all_latencies_dict(mtrx_df))
# print("N.o. missing datapoints: " + str(no_empty_cells(mtrx_df)) + " out of " + str(len(na)*len(na)))

# mtrx_df.to_csv(r"..\api\latencies\edge_feat_locations_na.csv")

### Add locations to cloud_data

# Attach probe locations to the cloud latency data and write the result to CSV.
# NOTE(review): merge_locs and check_empty are not defined in the visible part
# of this notebook; on a fresh kernel this cell presumably fails with a
# NameError unless they are defined elsewhere - confirm.
cloud_df = pd.read_csv(r"..\api\latencies\cloud.csv")
edge_df = pd.read_csv(r"..\api\latencies\edge_feat_locations.csv")
cloud_with_locs = merge_locs(edge_df, cloud_df)
cloud_with_locs.to_csv(r"..\api\latencies\cloud_feat_locations.csv", index= False)

# str(): the helper presumably returns a count, and concatenating a number to
# a str raises TypeError. This also matches the other report prints in this
# notebook, which wrap their numbers in str().
print("This many prb_locs couldn't resolved by merge: " + str(check_empty(cloud_with_locs)))




  exec(code_obj, self.user_global_ns, self.user_ns)


119


In [None]:
@DeprecationWarning
# NOTE(review): used as a decorator, DeprecationWarning is simply called with
# the function as its argument, so the name `prb_location_old` ends up bound to
# an exception instance rather than a function. Calling it raises TypeError and
# no deprecation warning is ever emitted.
def prb_location_old(df_edges):
    """Superseded probe geolocation that queries the RIPE Atlas API per row.

    For every row of ``df_edges`` the probe record is fetched over HTTP, its
    coordinates are reverse-geocoded with Nominatim and the result is written
    to a new ``prb_loc`` column. ``df_edges`` is modified in place and returned.
    """
    geolocator = Nominatim(user_agent="geoapiExercises")
    df_edges["prb_loc"] = ""
    for index, row in df_edges.iterrows():
        prb_number = row["prb"]
        response = requests.get(f"https://atlas.ripe.net/api/v2/probes/{prb_number}/?format=json")
        resp_json = json.loads(response.content.decode("utf-8"))
        long, lat = tuple(resp_json["geometry"]["coordinates"])
        
        location = geolocator.reverse(str(lat) + "," + str(long), language = 'en')
        # Last comma-separated piece of the textual address; any whitespace
        # that follows the comma is kept.
        country = str((location)).split(",")[-1]
        
        # NOTE(review): this compares the geocoder result object, not
        # `country`, with the string - presumably never true, so the state
        # branch looks unreachable. Verify before reviving this function.
        if location == "United States":
            state = str((location)).split(",")[-3]
            df_edges.loc[index, "prb_loc"] = state
        else: 
            df_edges.loc[index, "prb_loc"] = country
    return df_edges

    
@DeprecationWarning
# NOTE(review): applying DeprecationWarning as a decorator binds this name to
# an exception instance, so the function below cannot be called as written.
def intersects_of_prbs_to_csv():
    """Superseded batch job: add IP locations per probe group and write one CSV.

    Reads the edge CSV, splits it into one frame per key of ``dic`` (rows whose
    ``prb`` is listed under that key), looks up locations for ``MinIP`` and
    ``otherIP1``..``otherIP5`` with ``get_location_ipinfo``, then concatenates
    the frames and writes them out.

    NOTE(review): ``dic`` is neither a parameter nor defined in the visible
    part of this notebook. Both file paths are absolute paths on one machine.
    """

    df = pd.read_csv(r"C:\Users\Admin\Documents\GitHub\umass\api\latencies\edge.csv", index_col= False)
    dfs = [df[df["prb"].isin(dic[region])] for region in dic.keys()]
    for df in dfs: 
        for index, row in df.iterrows():
            # NOTE(review): assigns the looked-up value to the whole column on
            # every iteration, so after the loop each row carries the value of
            # the last row processed - presumably a per-row write was intended.
            df["MinIP_loc"] = get_location_ipinfo(row["MinIP"])
            for i in range(1,6):  
                name = f"otherIP{i}"
                if pd.isna(row[name]):
                    continue
                location_data = get_location_ipinfo(row[f"otherIP{i}"])
                # Same whole-column assignment as for MinIP_loc above.
                df[f"otherIP{i}_loc"] = location_data
    concated = pd.concat([df for df in dfs], ignore_index=True)
    concated.to_csv(r"C:\Users\Admin\Documents\GitHub\umass\api\latencies\edge_processed4.csv", index= False)

@DeprecationWarning
# NOTE(review): applying DeprecationWarning as a decorator binds this name to
# an exception instance, so the function below cannot be called as written.
def singleprocess_prb_location():
    """Superseded driver: geolocate all unique probes of the edge CSV in one process.

    Builds a one-column frame of the unique ``prb`` values, passes it to
    ``prb_location`` and writes the result to CSV.

    NOTE(review): ``prb_location`` is called with a single argument, while the
    definition in this notebook also requires ``prb_dic``.
    """
    df_edges = pd.read_csv(r"api\latencies\edge.csv", index_col= False)
    df_prbs_np_arr = df_edges["prb"].unique()
    df_prbs = pd.DataFrame(df_prbs_np_arr, columns=["prb"])
    df_prbs = prb_location(df_prbs)
    df_prbs.to_csv(r"api\latencies\edge_processed_testing2.csv", index = False)

@DeprecationWarning
# NOTE(review): applying DeprecationWarning as a decorator binds this name to
# an exception instance, so the function below cannot be called as written.
def multiprocess_prb_location():
    """Superseded driver: geolocate the unique probes using a process pool.

    Splits the frame of unique ``prb`` values into chunks, maps
    ``prb_location`` over them with ``multiprocessing.Pool`` and writes the
    reassembled frame to CSV.

    NOTE(review): ``prb_location`` is called with one argument here (directly
    and through ``pool.map``), while the definition in this notebook also
    requires ``prb_dic``.
    """
    df_edges = pd.read_csv(r"api\latencies\edge.csv", index_col= False)
    df_prbs_np_arr = df_edges["prb"].unique()
    df_prbs = pd.DataFrame(df_prbs_np_arr, columns=["prb"])
    # One worker fewer than the number of CPUs reported by the OS.
    num_processes = multiprocessing.cpu_count() - 1
    # NOTE(review): evaluates to 0 when there are fewer probes than worker
    # processes; range() below then raises ValueError for a zero step.
    chunk_size = int(df_prbs.shape[0]/num_processes)
    chunks = [df_prbs.iloc[df_prbs.index[i:i + chunk_size]] for i in range(0, df_prbs.shape[0], chunk_size)]
    # NOTE(review): this call processes the complete frame before the pool is
    # even created - presumably a leftover from the single-process variant.
    df_prbs = prb_location(df_prbs)

    # The pool is never closed or joined within this function.
    pool = multiprocessing.Pool(processes=num_processes)
    result = pool.map(prb_location, chunks)

    for i in range(len(result)):
    # we can reassign the original dataframe based on the index of each chunk
        df_prbs.iloc[result[i].index] = result[i]

    df_prbs.to_csv(r"api\latencies\edge_processed_testing2.csv", index = False)

@DeprecationWarning
# NOTE(review): applying DeprecationWarning as a decorator binds this name to
# an exception instance, so the function below cannot be called as written.
def create_adj_matrix(df):
    """Superseded attempt at a region-by-region latency matrix.

    Collects the distinct values of the six ``*_loc`` columns as regions, then
    walks every row of ``df`` and tries to store the matching latency at the
    positions of the IP's region and the probe's region (``prb_loc``).
    """
    regions_set = set()
    cols = ["otherIP1_loc", "otherIP2_loc", "otherIP3_loc", "otherIP4_loc", "otherIP5_loc", "MinIP_loc"]
    for index, row in df.iterrows():
        for col in cols:
            # Values are stringified, so missing entries become the literal
            # strings "None" / "nan" that are removed right after the loop.
            regions_set.add(str(row[col]))
    # set.remove raises KeyError when the element is absent, i.e. when the
    # data happens to contain no "None" (or no "nan") entry.
    regions_set.remove("None")
    regions_set.remove("nan")

    # Only the columns are set here; the frame starts out without any rows.
    adj = pd.DataFrame(columns = list(regions_set))

    for index, row in df.iterrows():
        for col in cols: 
            # "otherIP1_loc" -> "otherIP1" -> "otherlatency1";
            # "MinIP_loc" -> "MinIP" -> "Minlatency".
            lat_col = col.replace("_loc", "")
            lat_col = lat_col.replace("IP", "latency")
            region_from = row[col]
            region_to = row["prb_loc"]
            region_from_idx = adj.columns.get_loc(region_from)
            region_to_idx = adj.columns.get_loc(region_to)
            
            latency = row.loc[lat_col]
            # NOTE(review): positional writes into a frame that has no rows,
            # and the second one passes the region label (not a position) to
            # .iloc - presumably both fail at runtime; verify before reuse.
            adj.iloc[region_from_idx, region_to_idx] = latency
            adj.iloc[region_to] = latency
    #adj.to_csv(r"api\latencies\adjance_latency.csv", index=False)
    return adj