In [1]:
import pandas as pd
import pickle
import statistics as stat
import time

In [2]:
def ispByCountry(filename):
    """Count the IP addresses held by each ISP, grouped by country code.

    Reads a header-less IP-range CSV whose 22 columns are named by ``labels``
    below, sums the inclusive size of every ``[IP From, IP To]`` range per
    (country code, ISP) pair, pickles the result to
    ``Results/Country/<basename>.p`` and returns it.

    Parameters
    ----------
    filename : str
        Path to the CSV file. The part of the last ``/``-separated component
        before its first ``.`` is used as the pickle's base name.

    Returns
    -------
    dict
        ``{country_code: {isp: ip_count}}`` where ``ip_count`` is a plain
        Python ``int``.

    Raises
    ------
    OSError
        If the CSV cannot be read or ``Results/Country/`` is not writable
        (the directory is not created here).
    """
    start_tm = time.time()
    labels = ["IP From", "IP To", "Country Code", "Country Name", "Region Name", "City Name",
              "Latitude", "Longitude", "Zipcode", "Time Zone", "ISP", "Domain", "Net speed",
              "IDD Code", "Area Code", "Weather Station Code", "Weather Station Name",
              "MCC", "MNC", "Mobile Brand", "Elevation", "Usage Type (Reserved)"]
    # keep_default_na=False: by default pandas parses the literal string "NA"
    # (the ISO 3166 code for Namibia) as NaN, which silently dropped that
    # country's rows because `== NaN` never matches.
    data = pd.read_csv(filename, header=None, names=labels,
                       usecols=["IP From", "IP To", "Country Code", "ISP"],
                       keep_default_na=False)

    # Inclusive size of every range (+1 so that From == To counts one address).
    range_sizes = data["IP To"] - data["IP From"] + 1
    # One pass over the data instead of re-filtering the frame for every
    # country and every ISP. sort=False keeps first-appearance order.
    totals = range_sizes.groupby([data["Country Code"], data["ISP"]], sort=False).sum()

    distrib_counts = {}
    for (country, isp), ip_cnt in totals.items():
        # int(): store plain Python ints rather than numpy int64 scalars.
        distrib_counts.setdefault(country, {})[isp] = int(ip_cnt)

    delta_tm = time.time() - start_tm
    m, s = divmod(delta_tm, 60)
    h, m = divmod(m, 60)
    print(filename + " finished counting IPs in %d:%02d:%02d" % (h, m, s))

    out_path = "Results/Country/" + filename.split("/")[-1].split(".")[0] + ".p"
    # Context manager so the file is flushed and closed deterministically.
    with open(out_path, "wb") as out_file:
        pickle.dump(distrib_counts, out_file)
    return distrib_counts

In [5]:
def ispByCountryMobile(filename):
    """Count per-ISP IP addresses by country, restricted to mobile ranges.

    Reads a header-less IP-range CSV whose 22 columns are named by ``labels``
    below, keeps only rows whose ``Usage Type (Reserved)`` value contains the
    substring ``"MOB"``, sums the inclusive size of every
    ``[IP From, IP To]`` range per (country code, ISP) pair, pickles the
    result to ``Results/Country/mob_<basename>.p`` and returns it.

    Parameters
    ----------
    filename : str
        Path to the CSV file. The part of the last ``/``-separated component
        before its first ``.`` is used as the pickle's base name.

    Returns
    -------
    dict
        ``{country_code: {isp: ip_count}}`` where ``ip_count`` is a plain
        Python ``int``. Countries without any matching row are absent.

    Raises
    ------
    OSError
        If the CSV cannot be read or ``Results/Country/`` is not writable
        (the directory is not created here).
    """
    start_tm = time.time()
    labels = ["IP From", "IP To", "Country Code", "Country Name", "Region Name", "City Name",
              "Latitude", "Longitude", "Zipcode", "Time Zone", "ISP", "Domain", "Net speed",
              "IDD Code", "Area Code", "Weather Station Code", "Weather Station Name",
              "MCC", "MNC", "Mobile Brand", "Elevation", "Usage Type (Reserved)"]
    # keep_default_na=False: by default pandas parses the literal string "NA"
    # (the ISO 3166 code for Namibia) as NaN, which silently dropped that
    # country's rows, and turns empty usage-type fields into NaN, which makes
    # the boolean mask below unusable.
    data = pd.read_csv(filename, header=None, names=labels,
                       usecols=["IP From", "IP To", "Country Code", "ISP", "Usage Type (Reserved)"],
                       keep_default_na=False)
    # na=False: a missing usage type counts as "not mobile" instead of raising.
    is_mobile = data["Usage Type (Reserved)"].str.contains("MOB", na=False)
    mobile_rows = data.loc[is_mobile]

    # Inclusive size of every range (+1 so that From == To counts one address).
    range_sizes = mobile_rows["IP To"] - mobile_rows["IP From"] + 1
    # One pass over the data instead of re-filtering the frame for every
    # country and every ISP. sort=False keeps first-appearance order.
    totals = range_sizes.groupby(
        [mobile_rows["Country Code"], mobile_rows["ISP"]], sort=False
    ).sum()

    distrib_counts = {}
    for (country, isp), ip_cnt in totals.items():
        # int(): store plain Python ints rather than numpy int64 scalars.
        distrib_counts.setdefault(country, {})[isp] = int(ip_cnt)

    delta_tm = time.time() - start_tm
    m, s = divmod(delta_tm, 60)
    h, m = divmod(m, 60)
    print(filename + " finished counting IPs in %d:%02d:%02d" % (h, m, s))

    out_path = "Results/Country/mob_" + filename.split("/")[-1].split(".")[0] + ".p"
    # Context manager so the file is flushed and closed deterministically.
    with open(out_path, "wb") as out_file:
        pickle.dump(distrib_counts, out_file)
    return distrib_counts

In [1]:
def ispByCountryNoMobile(filename):
    """Count per-ISP IP addresses by country, excluding mobile ranges.

    Reads a header-less IP-range CSV whose 22 columns are named by ``labels``
    below, drops every row whose ``Usage Type (Reserved)`` value contains the
    substring ``"MOB"``, sums the inclusive size of every remaining
    ``[IP From, IP To]`` range per (country code, ISP) pair, pickles the
    result to ``Results/Country/no_mob_<basename>.p`` and returns it.

    Parameters
    ----------
    filename : str
        Path to the CSV file. The part of the last ``/``-separated component
        before its first ``.`` is used as the pickle's base name.

    Returns
    -------
    dict
        ``{country_code: {isp: ip_count}}`` where ``ip_count`` is a plain
        Python ``int``. Countries without any remaining row are absent.

    Raises
    ------
    OSError
        If the CSV cannot be read or ``Results/Country/`` is not writable
        (the directory is not created here).
    """
    start_tm = time.time()
    labels = ["IP From", "IP To", "Country Code", "Country Name", "Region Name", "City Name",
              "Latitude", "Longitude", "Zipcode", "Time Zone", "ISP", "Domain", "Net speed",
              "IDD Code", "Area Code", "Weather Station Code", "Weather Station Name",
              "MCC", "MNC", "Mobile Brand", "Elevation", "Usage Type (Reserved)"]
    # keep_default_na=False: by default pandas parses the literal string "NA"
    # (the ISO 3166 code for Namibia) as NaN, which silently dropped that
    # country's rows, and turns empty usage-type fields into NaN, which makes
    # the negated boolean mask below fail.
    data = pd.read_csv(filename, header=None, names=labels,
                       usecols=["IP From", "IP To", "Country Code", "ISP", "Usage Type (Reserved)"],
                       keep_default_na=False)
    # na=False: a missing usage type counts as "not mobile", so the row is kept.
    is_mobile = data["Usage Type (Reserved)"].str.contains("MOB", na=False)
    fixed_rows = data.loc[~is_mobile]

    # Inclusive size of every range (+1 so that From == To counts one address).
    range_sizes = fixed_rows["IP To"] - fixed_rows["IP From"] + 1
    # One pass over the data instead of re-filtering the frame for every
    # country and every ISP. sort=False keeps first-appearance order.
    totals = range_sizes.groupby(
        [fixed_rows["Country Code"], fixed_rows["ISP"]], sort=False
    ).sum()

    distrib_counts = {}
    for (country, isp), ip_cnt in totals.items():
        # int(): store plain Python ints rather than numpy int64 scalars.
        distrib_counts.setdefault(country, {})[isp] = int(ip_cnt)

    delta_tm = time.time() - start_tm
    m, s = divmod(delta_tm, 60)
    h, m = divmod(m, 60)
    print(filename + " finished counting IPs in %d:%02d:%02d" % (h, m, s))

    out_path = "Results/Country/no_mob_" + filename.split("/")[-1].split(".")[0] + ".p"
    # Context manager so the file is flushed and closed deterministically.
    with open(out_path, "wb") as out_file:
        pickle.dump(distrib_counts, out_file)
    return distrib_counts

In [4]:
def analyticSummary(data):
    """Summarise each country's per-ISP counts as ``(mean, sample stdev)``.

    Parameters
    ----------
    data : dict
        ``{country: {isp: count}}`` mapping, e.g. the structure returned by
        the ``ispByCountry*`` functions.

    Returns
    -------
    dict
        ``{country: (avg, std)}``. ``avg`` is 0 when the country has no
        values; ``std`` is 0 when there are fewer than two values, and
        ``None`` if ``statistics.stdev`` raised ``AssertionError``.
    """
    result = {}
    for country, isp_counts in data.items():
        val = list(isp_counts.values())
        try:
            avg = stat.mean(val)
        except stat.StatisticsError:
            # Raised for an empty list.
            avg = 0
        try:
            std = stat.stdev(val)
        except stat.StatisticsError:
            # Raised when fewer than two values are available.
            std = 0
        except AssertionError:
            # NOTE(review): presumably guards an internal assertion inside the
            # statistics module - confirm it is still reachable. The offending
            # values are dumped to "<country>.p" for later inspection.
            print(country)
            # Context manager so the debug dump is flushed and closed.
            with open(country + ".p", "wb") as dump_file:
                pickle.dump(val, dump_file)
            std = None
        result[country] = (avg, std)
    return result