# In [4]:
import json
import os
import time

import pandas as pd

# In [1]:
def ispByCountry(filename, mobile = None):
    """Count the IP addresses each ISP holds in each country and save them as JSON.

    The CSV at ``filename`` is read without a header row; its columns are
    named positionally from the 22 labels below, and only "IP From", "IP To",
    "Country Code", "ISP" and "Usage Type (Reserved)" are loaded. Rows with a
    missing value in any of those five columns are discarded. Each remaining
    row contributes ``IP To - IP From + 1`` addresses (the range is
    inclusive) to the total of its (country, ISP) pair.

    Args:
        filename: Path to the CSV file. The output file name is derived from
            the part after the last "/" and before the first ".".
        mobile: ``None`` (default) keeps every row; a truthy value keeps only
            rows whose usage type contains "MOB"; a falsy non-``None`` value
            keeps only rows whose usage type does not contain "MOB".

    Returns:
        dict mapping country code -> {ISP name -> address count (int)}.
        Countries, and ISPs within a country, appear in order of first
        occurrence in the file.

    Side effects:
        Prints the elapsed time and writes the result to
        ``Results/Countries/Splits/<name>.json`` (relative to the current
        working directory), creating that directory if needed.
    """
    start_tm = time.perf_counter()

    usage_col = "Usage Type (Reserved)"
    labels = ["IP From", "IP To", "Country Code", "Country Name", "Region Name", "City Name",
              "Latitude", "Longitude", "Zipcode", "Time Zone", "ISP", "Domain", "Net speed",
              "IDD Code", "Area Code", "Weather Station Code", "Weather Station Name",
              "MCC", "MNC", "Mobile Brand", "Elevation", usage_col]

    # pandas' default NA markers include the literal string "NA", which is
    # also the ISO country code for Namibia; with the defaults every Namibia
    # row would be parsed as missing and dropped. Treat only empty fields as
    # missing instead.
    data = pd.read_csv(
        filename,
        header=None,
        names=labels,
        usecols=["IP From", "IP To", "Country Code", "ISP", usage_col],
        keep_default_na=False,
        na_values=[""],
    )

    # Drop incomplete rows BEFORE filtering: str.contains() yields NaN for a
    # missing usage type, and a mask containing NaN cannot be used to index
    # (nor negated with ~).
    data = data.dropna()

    if mobile is not None:
        is_mobile = (
            data[usage_col].astype(str).str.contains("MOB", regex=False).astype(bool)
        )
        data = data.loc[is_mobile if mobile else ~is_mobile]

    # +1 because both ends of each range are part of the range.
    range_sizes = data["IP To"] - data["IP From"] + 1

    # One grouped sum replaces the per-country / per-ISP rescans of the frame.
    # sort=False keeps groups in first-appearance order.
    totals = range_sizes.groupby([data["Country Code"], data["ISP"]], sort=False).sum()

    distrib_counts = {}
    for (country, isp), total in totals.items():
        # int() turns the numpy scalar into a plain Python int so that
        # json can serialise it.
        distrib_counts.setdefault(country, {})[isp] = int(total)

    delta_tm = time.perf_counter() - start_tm
    m, s = divmod(delta_tm, 60)
    h, m = divmod(m, 60)
    print("%s finished counting IPs in %d:%02d:%02d" % (filename, h, m, s))

    out_dir = "Results/Countries/Splits/"
    # Make sure the target directory exists so the finished computation is
    # not lost to a FileNotFoundError at the very end.
    os.makedirs(out_dir, exist_ok=True)
    out_path = out_dir + filename.split("/")[-1].split(".")[0] + ".json"
    # Context manager guarantees the file is flushed and closed.
    with open(out_path, "w", encoding="utf-8") as out_file:
        json.dump(distrib_counts, out_file, indent=4)

    return distrib_counts