In [1]:
import os

import joblib as jl
import pandas as pd

from src import graph
from src import file_io
from src import publish
from src import preprocess
from src.publish import convert_to_regex
from src.graph import build_graph, build_graph_mp
from src.preprocess import make_domain_path_split, ac_map_local
from src.file_io import load_urls, dump_urls, dump_edge_list, dump_node_list, dump_regex_list

In [2]:
# Output directory for this experiment's artifacts (graph dumps, CSVs, regex lists).
work_dir = "./data/EXP_2"
if not os.path.isdir(work_dir):
    # makedirs instead of mkdir: also creates the parent "./data" when it is
    # missing, where mkdir would raise OSError.
    os.makedirs(work_dir)
    print("mkdir %s" %work_dir)

In [3]:
# Source CSV holding the URL list for this experiment.
urls_csv_path = "../data/sangfor/safe_0419.csv"
urls = load_urls(urls_csv_path, csv=True)

# NOTE(review): presumably groups URL paths per domain; see
# src.preprocess.make_domain_path_split for the exact meaning of the threshold.
domain_path_map = make_domain_path_split(
    urls,
    path_level_thresh=1,
)

DEBUG:url_network:URLs Count:	1461039


In [6]:
# Categories kept for this run, in order: pornography, gambling,
# government agencies, education.
# NOTE(review): an earlier, wider list also contained search engines,
# legal information and online shopping.
category = ["色情", "赌博", "政府机构", "教育"]

ac_result = ac_map_local(
    domain_path_map,
    category=category,
    ac_root_path="../data/ac_white",
)
cate_domain_map, domain_cate_map, domain_path_map_sel = ac_result

DEBUG:url_network:URLs Count:	453745
INFO:url_network:色情	count:4547
DEBUG:url_network:URLs Count:	529136
INFO:url_network:赌博	count:3324
DEBUG:url_network:URLs Count:	138855
INFO:url_network:政府机构	count:2562
DEBUG:url_network:URLs Count:	441342
INFO:url_network:教育	count:3433


In [7]:
# Build the graph nodes from the category-filtered domain/path map.
# NOTE(review): this is the expensive step of the notebook (the captured log
# reports ~96M comparisons); tune batch_size / n_jobs to the machine.
node_list = build_graph_mp(
    domain_path_map_sel,
    thresh=6,
    batch_size=4000000,
    n_jobs=8,
)

INFO:url_network:Total Domain Count:13866
INFO:url_network:Total Comp Count:96132978
DEBUG:url_network:Params ready, time consume:24.976774


batch:0	0/4000000
batch:0	500000/4000000
batch:0	1000000/4000000
batch:1	0/4000000
batch:0	1500000/4000000
batch:1	500000/4000000
batch:0	2000000/4000000
batch:1	1000000/4000000
batch:2	0/4000000
batch:0	2500000/4000000
batch:2	500000/4000000
batch:1	1500000/4000000
batch:0	3000000/4000000
batch:1	2000000/4000000
batch:2	1000000/4000000
batch:0	3500000/4000000
batch:3	0/4000000
batch:1	2500000/4000000
batch:2	1500000/4000000
batch:3	500000/4000000
batch:2	2000000/4000000
batch:1	3000000/4000000
batch:3	1000000/4000000
batch:2	2500000/4000000
batch:4	0/4000000
batch:1	3500000/4000000
batch:2	3000000/4000000
batch:3	1500000/4000000
batch:4	500000/4000000
batch:3	2000000/4000000
batch:2	3500000/4000000
batch:4	1000000/4000000
batch:5	0/4000000
batch:3	2500000/4000000
batch:4	1500000/4000000
batch:5	500000/4000000
batch:3	3000000/4000000
batch:4	2000000/4000000
batch:3	3500000/4000000
batch:6	0/4000000
batch:5	1000000/4000000
batch:4	2500000/4000000
batch:6	500000/4000000
batch:5	1500000/4

In [14]:
node_dump_path = os.path.join(work_dir, "node.jl.z")

# Persist the graph, then read it straight back so the rest of the notebook
# works on exactly what is stored on disk.
jl.dump(node_list, node_dump_path)
node_list = jl.load(node_dump_path)

# Export the edge list first, then the node list, as CSV files in work_dir.
for dump_fn, csv_name in ((dump_edge_list, "edge.csv"), (dump_node_list, "node.csv")):
    dump_fn(node_list, domain_cate_map, output_path=os.path.join(work_dir, csv_name))

In [16]:
# Modularity result for the graph; passed on below as `input_path`.
df = graph.make_modularity(
    node_list,
    domain_cate_map,
    resolution=1,
)

# Thresholds for the community analysis, gathered in one place for easy tuning.
modularity_params = dict(
    input_path=df,
    min_cluster_thresh=2,
    mis_classify_count_thresh=1,
)
community_dict = graph.modularity_class_analyze(**modularity_params)

INFO:url_network:input community size:561
INFO:url_network:filter community size:501
INFO:url_network:partition acc 0.353470


In [17]:
# Extract paths per label from the edge list CSV written to work_dir.
edge_csv_path = os.path.join(work_dir, "edge.csv")
label_path_map = graph.path_extract(
    input_path=edge_csv_path,
    community_dict=community_dict,
)

INFO:url_network:label:0	path extract:179
INFO:url_network:label:1	path extract:205
INFO:url_network:label:2	path extract:98
INFO:url_network:label:3	path extract:157


In [22]:
jl.dump(label_path_map, os.path.join(work_dir, "label_path_map.jl.z"))

# Convert the extracted paths of labels 0..3 into regex lists.
# NOTE(review): labels are assumed to follow the order of `category`
# (porn, gambling, government, education) -- confirm against graph.path_extract.
porn_regex, gambing_regex, gov_regex, edu_regex = [
    publish.convert_to_regex(label_path_map[label], change_digit=True)
    for label in range(4)
]

# Write each regex list to its own raw text file inside work_dir.
regex_outputs = [
    (porn_regex, "porn_regex_raw.txt"),
    (gambing_regex, "gambing_regex_raw.txt"),
    (gov_regex, "gov_regex_raw.txt"),
    (edu_regex, "edu_regex_raw.txt"),
]
for regex_list, file_name in regex_outputs:
    dump_regex_list(regex_list, os.path.join(work_dir, file_name))

DEBUG:url_network:OLD DATA FIND! REMOVING	./data/EXP_2/porn_regex_raw.txt
DEBUG:url_network:Regex has been dump	./data/EXP_2/porn_regex_raw.txt
DEBUG:url_network:OLD DATA FIND! REMOVING	./data/EXP_2/gambing_regex_raw.txt
DEBUG:url_network:Regex has been dump	./data/EXP_2/gambing_regex_raw.txt
DEBUG:url_network:OLD DATA FIND! REMOVING	./data/EXP_2/gov_regex_raw.txt
DEBUG:url_network:Regex has been dump	./data/EXP_2/gov_regex_raw.txt
DEBUG:url_network:OLD DATA FIND! REMOVING	./data/EXP_2/edu_regex_raw.txt
DEBUG:url_network:Regex has been dump	./data/EXP_2/edu_regex_raw.txt


### Prepare webpages for downloading

In [5]:
import random
import urlparse
# Hostname of every loaded URL (urlparse yields None when a URL has no host part).
# The loop variable is named `url` rather than `_`: under Python 2 a list
# comprehension variable leaks into the notebook namespace, and `_` would
# shadow IPython's "last output" shortcut.
sangfor_domain = [urlparse.urlparse(url).hostname for url in urls]

# Reference domain lists read from the AC white-list directory.
ac_white_domain = file_io.load_ac_domain(ac_root_path="../data/ac_white")
ac_porn_domain = load_urls("../data/ac_white/色情", csv=False)

In [20]:
# Build the set of observed hostnames once; it is reused for both intersections.
sangfor_domain_set = set(sangfor_domain)

# Domains that appear both in the loaded URLs and in the respective AC list.
no_porn_domain = set(ac_white_domain) & sangfor_domain_set
porn_domain = set(ac_porn_domain) & sangfor_domain_set

# Call-form print: same output under Python 2, and matches the print(...) style
# already used when creating work_dir.
print(len(no_porn_domain))
print(len(porn_domain))

81056
8574


In [21]:
# Dedicated, seeded generator: the samples are reproducible across kernel
# restarts and the global `random` state is left untouched.
SAMPLE_SEED = 42
sample_rng = random.Random(SAMPLE_SEED)

# Sort into lists first: set iteration order is not guaranteed to be stable
# across runs, and random.sample() does not accept sets on newer Pythons.
no_porn_pool = sorted(no_porn_domain)
porn_pool = sorted(porn_domain)

# Cap each request at the pool size; random.sample() raises ValueError when
# asked for more items than the population holds.
sample_no_porn_domain = sample_rng.sample(no_porn_pool, min(80000, len(no_porn_pool)))
sample_porn_domain = sample_rng.sample(porn_pool, min(8000, len(porn_pool)))

In [22]:
# NOTE(review): absolute, user-specific output paths -- consider deriving them
# from a configurable data directory.
no_porn_sample_path = '/home/sparrow/data/sample_no_porn2.txt'
porn_sample_path = '/home/sparrow/data/sample_porn2.txt'
domain_queue_path = '/home/sparrow/data/domain_queue2.txt'

# The download queue is the non-porn sample followed by the porn sample.
domain_queue = sample_no_porn_domain + sample_porn_domain

file_io.dump_urls(sample_no_porn_domain, no_porn_sample_path, csv=False)
file_io.dump_urls(sample_porn_domain, porn_sample_path, csv=False)
file_io.dump_urls(domain_queue, domain_queue_path, csv=False)

DEBUG:url_network:URLs has been dump	/home/sparrow/data/sample_no_porn2.txt
DEBUG:url_network:URLs has been dump	/home/sparrow/data/sample_porn2.txt
DEBUG:url_network:URLs has been dump	/home/sparrow/data/domain_queue2.txt
