In [1]:
import pandas as pd
import os
from multiprocessing import Pool
import merge
import pickle
import time

In [2]:
def ispByCountry(filename):
    """Count the IP addresses each ISP holds in each country of one CSV file.

    The file is read without a header row; its 22 columns are named by
    ``labels`` below, and only the range bounds, country name and ISP columns
    are loaded. Each row contributes ``IP To - IP From + 1`` addresses (both
    bounds are inclusive) to its (country, ISP) pair.

    Side effects: prints the elapsed time and pickles the result to
    ``Results/<name>.p``, where ``<name>`` is the text after the last "/" of
    ``filename`` up to its first dot. ``Results`` is created when missing.

    Parameters
    ----------
    filename : str
        Path of the CSV file to process.

    Returns
    -------
    dict
        ``{country name: {ISP: address count}}``, with countries and ISPs in
        order of first appearance in the file. Rows whose country name or ISP
        is missing (NaN) are skipped.
    """
    start_tm = time.time()
    labels = ["IP From", "IP To", "Country Code", "Country Name", "Region Name", "City Name",
              "Latitude", "Longitude", "Zipcode", "Time Zone", "ISP", "Domain", "Net speed",
              "IDD Code", "Area Code", "Weather Station Code", "Weather Station Name",
              "MCC", "MNC", "Mobile Brand", "Elevation", "Usage Type (Reserved)"]
    data = pd.read_csv(filename, header=None, names=labels,
                       usecols=["IP From", "IP To", "Country Name", "ISP"])

    # +1 because both ends of every range are inclusive.
    range_sizes = data["IP To"] - data["IP From"] + 1
    # One pass over the rows instead of re-filtering the frame per country
    # and per ISP; sort=False keeps first-appearance order.
    totals = range_sizes.groupby([data["Country Name"], data["ISP"]], sort=False).sum()

    distrib_counts = {}
    # tolist() yields plain Python numbers, so the pickle holds no numpy scalars.
    for (country, isp), ip_cnt in zip(totals.index, totals.tolist()):
        distrib_counts.setdefault(country, {})[isp] = ip_cnt

    delta_tm = time.time() - start_tm
    m, s = divmod(delta_tm, 60)
    h, m = divmod(m, 60)
    print(filename + " finished counting IPs in %d:%02d:%02d" % (h, m, s))

    # Do not lose a finished run to a missing output directory.
    os.makedirs("Results", exist_ok=True)
    out_path = "Results/" + filename.split("/")[-1].split(".")[0] + ".p"
    # The context manager closes the handle; the bare open() used to leak it.
    with open(out_path, "wb") as out_file:
        pickle.dump(distrib_counts, out_file)
    return distrib_counts

In [3]:
def ispByCountryMobile(filename):
    """Count per-country, per-ISP IP addresses for the mobile rows of a CSV file.

    The file is read without a header row; its 22 columns are named by
    ``labels`` below. Only rows whose "Usage Type (Reserved)" value is exactly
    ``"MOB"`` are kept. Each kept row contributes ``IP To - IP From + 1``
    addresses (both bounds are inclusive) to its (country, ISP) pair.

    Side effects: prints the elapsed time and pickles the result to
    ``Results/mob_<name>.p``, where ``<name>`` is the text after the last "/"
    of ``filename`` up to its first dot. ``Results`` is created when missing.

    Parameters
    ----------
    filename : str
        Path of the CSV file to process.

    Returns
    -------
    dict
        ``{country name: {ISP: address count}}``, with countries and ISPs in
        order of first appearance among the kept rows. Rows whose country name
        or ISP is missing (NaN) are skipped.
    """
    start_tm = time.time()
    labels = ["IP From", "IP To", "Country Code", "Country Name", "Region Name", "City Name",
              "Latitude", "Longitude", "Zipcode", "Time Zone", "ISP", "Domain", "Net speed",
              "IDD Code", "Area Code", "Weather Station Code", "Weather Station Name",
              "MCC", "MNC", "Mobile Brand", "Elevation", "Usage Type (Reserved)"]
    data = pd.read_csv(filename, header=None, names=labels,
                       usecols=["IP From", "IP To", "Country Name", "ISP", "Usage Type (Reserved)"])
    # Keep only the ranges tagged as mobile.
    data = data.loc[data["Usage Type (Reserved)"] == "MOB"]

    # +1 because both ends of every range are inclusive.
    range_sizes = data["IP To"] - data["IP From"] + 1
    # One pass over the rows instead of re-filtering the frame per country
    # and per ISP; sort=False keeps first-appearance order.
    totals = range_sizes.groupby([data["Country Name"], data["ISP"]], sort=False).sum()

    distrib_counts = {}
    # tolist() yields plain Python numbers, so the pickle holds no numpy scalars.
    for (country, isp), ip_cnt in zip(totals.index, totals.tolist()):
        distrib_counts.setdefault(country, {})[isp] = ip_cnt

    delta_tm = time.time() - start_tm
    m, s = divmod(delta_tm, 60)
    h, m = divmod(m, 60)
    print(filename + " finished counting IPs in %d:%02d:%02d" % (h, m, s))

    # Do not lose a finished run to a missing output directory.
    os.makedirs("Results", exist_ok=True)
    out_path = "Results/mob_" + filename.split("/")[-1].split(".")[0] + ".p"
    # The context manager closes the handle; the bare open() used to leak it.
    with open(out_path, "wb") as out_file:
        pickle.dump(distrib_counts, out_file)
    return distrib_counts

In [4]:
def ispByCountryNoMobile(filename):
    """Count per-country, per-ISP IP addresses for the non-mobile rows of a CSV file.

    The file is read without a header row; its 22 columns are named by
    ``labels`` below. Rows whose "Usage Type (Reserved)" value is exactly
    ``"MOB"`` are dropped; every other row (including one with a missing usage
    type) is kept. Each kept row contributes ``IP To - IP From + 1`` addresses
    (both bounds are inclusive) to its (country, ISP) pair.

    Side effects: prints the elapsed time and pickles the result to
    ``Results/no_mob_<name>.p``, where ``<name>`` is the text after the last
    "/" of ``filename`` up to its first dot. ``Results`` is created when
    missing.

    Parameters
    ----------
    filename : str
        Path of the CSV file to process.

    Returns
    -------
    dict
        ``{country name: {ISP: address count}}``, with countries and ISPs in
        order of first appearance among the kept rows. Rows whose country name
        or ISP is missing (NaN) are skipped.
    """
    start_tm = time.time()
    labels = ["IP From", "IP To", "Country Code", "Country Name", "Region Name", "City Name",
              "Latitude", "Longitude", "Zipcode", "Time Zone", "ISP", "Domain", "Net speed",
              "IDD Code", "Area Code", "Weather Station Code", "Weather Station Name",
              "MCC", "MNC", "Mobile Brand", "Elevation", "Usage Type (Reserved)"]
    data = pd.read_csv(filename, header=None, names=labels,
                       usecols=["IP From", "IP To", "Country Name", "ISP", "Usage Type (Reserved)"])
    # Drop the ranges tagged as mobile; everything else stays.
    data = data.loc[data["Usage Type (Reserved)"] != "MOB"]

    # +1 because both ends of every range are inclusive.
    range_sizes = data["IP To"] - data["IP From"] + 1
    # One pass over the rows instead of re-filtering the frame per country
    # and per ISP; sort=False keeps first-appearance order.
    totals = range_sizes.groupby([data["Country Name"], data["ISP"]], sort=False).sum()

    distrib_counts = {}
    # tolist() yields plain Python numbers, so the pickle holds no numpy scalars.
    for (country, isp), ip_cnt in zip(totals.index, totals.tolist()):
        distrib_counts.setdefault(country, {})[isp] = ip_cnt

    delta_tm = time.time() - start_tm
    m, s = divmod(delta_tm, 60)
    h, m = divmod(m, 60)
    print(filename + " finished counting IPs in %d:%02d:%02d" % (h, m, s))

    # Do not lose a finished run to a missing output directory.
    os.makedirs("Results", exist_ok=True)
    out_path = "Results/no_mob_" + filename.split("/")[-1].split(".")[0] + ".p"
    # The context manager closes the handle; the bare open() used to leak it.
    with open(out_path, "wb") as out_file:
        pickle.dump(distrib_counts, out_file)
    return distrib_counts

In [6]:
# Smoke-run both usage-type counters on the small sample file.
for sample_counter in (ispByCountryMobile, ispByCountryNoMobile):
    sample_counter("Data/sample.txt")
# Full-size alternative, currently disabled:
#result = ispByCountry("Data/Split/data_1.csv")
# Signal completion through the shell's `say` command; the call's return
# value is the last expression and therefore what the cell displays.
os.system('say "your program has finished"')
#print("Done")

Data/sample.txt finished counting IPs in 0:00:00
Data/sample.txt finished counting IPs in 0:00:00


0

In [None]:
# Chunk files handed to the workers, in the order their results are returned.
split_files = [
    'Data/Split/data_2.csv',
    'Data/Split/data_3.csv',
    'Data/Split/data_4.csv',
    'Data/Split/data_5.csv',
    'Data/Split/data_6.csv',
    'Data/Split/data_1.csv',
    'Data/Split/data_7.csv',
]
# The pool is left open on purpose: the following cell closes and joins it.
p = Pool()
parts = p.map(ispByCountry, split_files)
print("Pool is done")

In [None]:
# Shut down the worker pool created in the previous cell.
p.close()
p.join()

# Combine the per-file dictionaries with merge.merge_dicts and persist the
# outcome, timing only this merge-and-save step.
st_tm = time.time()
result = merge.merge_dicts(parts)
# The context manager closes the handle; the bare open() used to leak it.
with open("Results/result.p", "wb") as result_file:
    pickle.dump(result, result_file)
delta_tm = time.time() - st_tm
m, s = divmod(delta_tm, 60)
h, m = divmod(m, 60)
print(" finished in %d:%02d:%02d" % (h, m, s))

In [None]:
# Chunk files handed to the workers, in the order their results are returned.
split_files = [
    'Data/Split/data_2.csv',
    'Data/Split/data_3.csv',
    'Data/Split/data_4.csv',
    'Data/Split/data_5.csv',
    'Data/Split/data_6.csv',
    'Data/Split/data_1.csv',
    'Data/Split/data_7.csv',
]
p = Pool()
try:
    parts = p.map(ispByCountryMobile, split_files)
    print("Pool is done")
finally:
    # Release the worker processes even when a chunk raises inside map().
    p.close()
    p.join()

# Combine the per-file dictionaries with merge.merge_dicts and persist the
# outcome, timing only this merge-and-save step.
st_tm = time.time()
result = merge.merge_dicts(parts)
# The context manager closes the handle; the bare open() used to leak it.
with open("Results/mob_result.p", "wb") as result_file:
    pickle.dump(result, result_file)
delta_tm = time.time() - st_tm
m, s = divmod(delta_tm, 60)
h, m = divmod(m, 60)
print("Mobile finished in %d:%02d:%02d" % (h, m, s))

In [None]:
# Chunk files handed to the workers, in the order their results are returned.
split_files = [
    'Data/Split/data_2.csv',
    'Data/Split/data_3.csv',
    'Data/Split/data_4.csv',
    'Data/Split/data_5.csv',
    'Data/Split/data_6.csv',
    'Data/Split/data_1.csv',
    'Data/Split/data_7.csv',
]
p = Pool()
try:
    parts = p.map(ispByCountryNoMobile, split_files)
    print("Pool is done")
finally:
    # Release the worker processes even when a chunk raises inside map().
    p.close()
    p.join()

# Combine the per-file dictionaries with merge.merge_dicts and persist the
# outcome, timing only this merge-and-save step.
st_tm = time.time()
result = merge.merge_dicts(parts)
# The context manager closes the handle; the bare open() used to leak it.
with open("Results/no_mob_result.p", "wb") as result_file:
    pickle.dump(result, result_file)
delta_tm = time.time() - st_tm
m, s = divmod(delta_tm, 60)
h, m = divmod(m, 60)
print("No Mobile finished in %d:%02d:%02d" % (h, m, s))