In [None]:
import pandas as pd
import os
from multiprocessing import Pool
import merge
import pickle
import time
import statistics as stat

In [None]:
def ispByCountry(filename):
    """Count the IP addresses each ISP holds in each country of one file.

    The file is read as a header-less CSV laid out in the 22 columns listed
    in ``labels``; only "IP From", "IP To", "Country Name" and "ISP" are
    loaded.  Every row is treated as an inclusive address range, so it
    contributes ``IP To - IP From + 1`` addresses to its (country, ISP) pair.

    Side effects: prints the elapsed time and pickles the result to
    ``Results/<name>.p``, where ``<name>`` is the part of the file's base
    name before its first dot.  The ``Results`` directory must already exist.

    Rows whose country or ISP was parsed as missing (NaN) are left out of
    the result, because ``groupby`` skips NaN keys.

    Args:
        filename: path of the CSV file, written with "/" separators.

    Returns:
        dict mapping country name -> {ISP name -> number of addresses},
        with countries and ISPs in order of first appearance in the file.
    """
    start_tm = time.time()
    labels=["IP From","IP To","Country Code","Country Name","Region Name","City Name","Latitude","Longitude",
                "Zipcode","Time Zone","ISP","Domain","Net speed","IDD Code","Area Code","Weather Station Code",
                "Weather Station Name","MCC","MNC","Mobile Brand","Elevation","Usage Type (Reserved)"]
    data = pd.read_csv(filename,header=None,names=labels, usecols=["IP From","IP To","Country Name","ISP"])

    # Size of every range; +1 because both endpoints belong to the range.
    range_sizes = data["IP To"] - data["IP From"] + 1
    # One pass over the frame instead of re-filtering it per country and ISP.
    # sort=False keeps the groups in order of first appearance.
    totals = range_sizes.groupby([data["Country Name"], data["ISP"]], sort=False).sum()

    distrib_counts = {}
    # tolist() converts the numpy scalars to plain Python numbers.
    for (country, isp), ip_cnt in zip(totals.index, totals.tolist()):
        distrib_counts.setdefault(country, {})[isp] = ip_cnt

    delta_tm = time.time() - start_tm
    m, s = divmod(delta_tm, 60)
    h, m = divmod(m, 60)
    print(filename + " finished counting IPs in %d:%02d:%02d" % (h, m, s))

    # The with-block guarantees the pickle is flushed and the handle closed.
    with open("Results/"+filename.split("/")[-1].split(".")[0]+".p","wb") as out_file:
        pickle.dump(distrib_counts, out_file)
    return distrib_counts

In [None]:
def ispByCountryMobile(filename):
    """Count, per country and ISP, the IP addresses in mobile ranges of one file.

    The file is read as a header-less CSV laid out in the 22 columns listed
    in ``labels``.  Only rows whose "Usage Type (Reserved)" value is exactly
    "MOB" are kept.  Every kept row is treated as an inclusive address range
    and contributes ``IP To - IP From + 1`` addresses to its (country, ISP)
    pair.

    Side effects: prints the elapsed time and pickles the result to
    ``Results/mob_<name>.p``, where ``<name>`` is the part of the file's base
    name before its first dot.  The ``Results`` directory must already exist.

    Rows whose country or ISP was parsed as missing (NaN) are left out of
    the result, because ``groupby`` skips NaN keys.

    Args:
        filename: path of the CSV file, written with "/" separators.

    Returns:
        dict mapping country name -> {ISP name -> number of addresses},
        with countries and ISPs in order of first appearance among the
        kept rows.
    """
    start_tm = time.time()
    labels=["IP From","IP To","Country Code","Country Name","Region Name","City Name","Latitude","Longitude",
                "Zipcode","Time Zone","ISP","Domain","Net speed","IDD Code","Area Code","Weather Station Code",
                "Weather Station Name","MCC","MNC","Mobile Brand","Elevation","Usage Type (Reserved)"]
    data = pd.read_csv(filename,header=None,names=labels, usecols=["IP From","IP To","Country Name","ISP","Usage Type (Reserved)"])
    data = data.loc[data["Usage Type (Reserved)"] == "MOB"]

    # Size of every range; +1 because both endpoints belong to the range.
    range_sizes = data["IP To"] - data["IP From"] + 1
    # One pass over the frame instead of re-filtering it per country and ISP.
    # sort=False keeps the groups in order of first appearance.
    totals = range_sizes.groupby([data["Country Name"], data["ISP"]], sort=False).sum()

    distrib_counts = {}
    # tolist() converts the numpy scalars to plain Python numbers.
    for (country, isp), ip_cnt in zip(totals.index, totals.tolist()):
        distrib_counts.setdefault(country, {})[isp] = ip_cnt

    delta_tm = time.time() - start_tm
    m, s = divmod(delta_tm, 60)
    h, m = divmod(m, 60)
    print(filename + " finished counting IPs in %d:%02d:%02d" % (h, m, s))

    # The with-block guarantees the pickle is flushed and the handle closed.
    with open("Results/mob_"+filename.split("/")[-1].split(".")[0]+".p","wb") as out_file:
        pickle.dump(distrib_counts, out_file)
    return distrib_counts

In [None]:
def ispByCountryNoMobile(filename):
    """Count, per country and ISP, the IP addresses in non-mobile ranges of one file.

    The file is read as a header-less CSV laid out in the 22 columns listed
    in ``labels``.  Rows whose "Usage Type (Reserved)" value is exactly "MOB"
    are discarded; every other row is treated as an inclusive address range
    and contributes ``IP To - IP From + 1`` addresses to its (country, ISP)
    pair.

    Side effects: prints the elapsed time and pickles the result to
    ``Results/no_mob_<name>.p``, where ``<name>`` is the part of the file's
    base name before its first dot.  The ``Results`` directory must already
    exist.

    Rows whose country or ISP was parsed as missing (NaN) are left out of
    the result, because ``groupby`` skips NaN keys.

    Args:
        filename: path of the CSV file, written with "/" separators.

    Returns:
        dict mapping country name -> {ISP name -> number of addresses},
        with countries and ISPs in order of first appearance among the
        kept rows.
    """
    start_tm = time.time()
    labels=["IP From","IP To","Country Code","Country Name","Region Name","City Name","Latitude","Longitude",
                "Zipcode","Time Zone","ISP","Domain","Net speed","IDD Code","Area Code","Weather Station Code",
                "Weather Station Name","MCC","MNC","Mobile Brand","Elevation","Usage Type (Reserved)"]
    data = pd.read_csv(filename,header=None,names=labels, usecols=["IP From","IP To","Country Name","ISP","Usage Type (Reserved)"])
    data = data.loc[data["Usage Type (Reserved)"] != "MOB"]

    # Size of every range; +1 because both endpoints belong to the range.
    range_sizes = data["IP To"] - data["IP From"] + 1
    # One pass over the frame instead of re-filtering it per country and ISP.
    # sort=False keeps the groups in order of first appearance.
    totals = range_sizes.groupby([data["Country Name"], data["ISP"]], sort=False).sum()

    distrib_counts = {}
    # tolist() converts the numpy scalars to plain Python numbers.
    for (country, isp), ip_cnt in zip(totals.index, totals.tolist()):
        distrib_counts.setdefault(country, {})[isp] = ip_cnt

    delta_tm = time.time() - start_tm
    m, s = divmod(delta_tm, 60)
    h, m = divmod(m, 60)
    print(filename + " finished counting IPs in %d:%02d:%02d" % (h, m, s))

    # The with-block guarantees the pickle is flushed and the handle closed.
    with open("Results/no_mob_"+filename.split("/")[-1].split(".")[0]+".p","wb") as out_file:
        pickle.dump(distrib_counts, out_file)
    return distrib_counts

In [None]:
def analyticSummary(data):
    """Summarise the per-ISP address counts of every country.

    Args:
        data: dict mapping country -> {ISP -> address count}.

    Returns:
        dict mapping country -> ``(mean, stdev)`` of that country's counts,
        where ``stdev`` is the sample standard deviation.  A country with a
        single ISP gets a deviation of 0, and a country with no ISP entries
        gets ``(0, 0)`` instead of aborting the whole summary.
    """
    result = {}
    for country, isp_counts in data.items():
        val = list(isp_counts.values())
        if not val:
            # statistics.mean() raises on an empty sequence; fall back to
            # zeros, in line with the deviation fallback below.
            result[country] = (0, 0)
            continue
        avg = stat.mean(val)
        try:
            std = stat.stdev(val)
        except stat.StatisticsError:
            # stdev() needs at least two data points.
            std = 0
        result[country] = (avg,std)
    return result

In [None]:
# Trial run: apply the mobile and the non-mobile counters to the sample file.
# Each call prints its own run time and pickles its result under Results/.
ispByCountryMobile("Data/sample.txt")
ispByCountryNoMobile("Data/sample.txt")
#result = ispByCountry("Data/Split/data_1.csv")
# Completion signal issued through the shell's `say` command.
# NOTE(review): presumably the macOS text-to-speech tool; confirm it exists
# on the machine running this notebook.
os.system('say "your program has finished"')
#print("Done")

In [None]:
# Count every split file in parallel, combine the per-file results with
# merge.merge_dicts and pickle the combined result.
st_tm = time.time()
# os.listdir() returns bare file names, so the directory has to be put back
# in front of each one or the workers look for the files in the current
# directory.  "/" is written out because ispByCountry derives its output
# name with split("/").
csv_paths = ["Data/Split/" + f for f in os.listdir("Data/Split") if f.endswith(".csv")]
p = Pool()
try:
    parts = p.map(ispByCountry, csv_paths)
    print("Pool is done")
finally:
    # Shut the workers down even when one of them raised.
    p.close()
    p.join()
result = merge.merge_dicts(parts)
with open("Results/result.p","wb") as out_file:
    pickle.dump(result, out_file)
delta_tm = time.time() - st_tm
m, s = divmod(delta_tm, 60)
h, m = divmod(m, 60)
print("Full finished in %d:%02d:%02d" % (h, m, s))

In [None]:
# Count the mobile ranges of every split file in parallel, combine the
# per-file results with merge.merge_dicts and pickle the combined result.
st_tm = time.time()
# os.listdir() returns bare file names, so the directory has to be put back
# in front of each one or the workers look for the files in the current
# directory.  "/" is written out because ispByCountryMobile derives its
# output name with split("/").
csv_paths = ["Data/Split/" + f for f in os.listdir("Data/Split") if f.endswith(".csv")]
p = Pool()
try:
    parts = p.map(ispByCountryMobile, csv_paths)
    print("Pool is done")
finally:
    # Shut the workers down even when one of them raised.
    p.close()
    p.join()
result = merge.merge_dicts(parts)
with open("Results/mob_result.p","wb") as out_file:
    pickle.dump(result, out_file)
delta_tm = time.time() - st_tm
m, s = divmod(delta_tm, 60)
h, m = divmod(m, 60)
print("Mobile finished in %d:%02d:%02d" % (h, m, s))

In [None]:
# Count the non-mobile ranges of every split file in parallel, combine the
# per-file results with merge.merge_dicts and pickle the combined result.
st_tm = time.time()
# os.listdir() returns bare file names, so the directory has to be put back
# in front of each one or the workers look for the files in the current
# directory.  "/" is written out because ispByCountryNoMobile derives its
# output name with split("/").
csv_paths = ["Data/Split/" + f for f in os.listdir("Data/Split") if f.endswith(".csv")]
p = Pool()
try:
    parts = p.map(ispByCountryNoMobile, csv_paths)
    print("Pool is done")
finally:
    # Shut the workers down even when one of them raised.
    p.close()
    p.join()
result = merge.merge_dicts(parts)
with open("Results/no_mob_result.p","wb") as out_file:
    pickle.dump(result, out_file)
delta_tm = time.time() - st_tm
m, s = divmod(delta_tm, 60)
h, m = divmod(m, 60)
print("No Mobile finished in %d:%02d:%02d" % (h, m, s))

In [None]:
# Turn each merged result pickle into a per-country (mean, stdev) summary
# pickle.  Every file is opened in a with-block so its handle is closed (and
# written data flushed) as soon as it has been used.
st_tm = time.time()
for result_path, summary_path in (
    ("Results/result.p", "Results/AS-whole.p"),
    ("Results/mob_result.p", "Results/AS-mob.p"),
    ("Results/no_mob_result.p", "Results/AS-no_mob.p"),
):
    # NOTE: pickle.load executes whatever the file encodes; only feed it
    # files written by this notebook.
    with open(result_path, "rb") as in_file:
        data = pickle.load(in_file)
    # Compute before opening the output so that a failure in the summary
    # does not leave a truncated file behind.
    summary = analyticSummary(data)
    with open(summary_path, "wb") as out_file:
        pickle.dump(summary, out_file)
delta_tm = time.time() - st_tm
m, s = divmod(delta_tm, 60)
h, m = divmod(m, 60)
print("finished in %d:%02d:%02d" % (h, m, s))