# 文件说明
根据 case_path，使用图表征方法获得 place 的表征；同时基于 city inmigration 数据，通过 CAE 方法获得 case_path 中 place 的表征；最后通过 AE 方法将这两个表征融合，得到最终的 place 表征。
得到以下文件：
1. poi_concatenate_embedding.pickle：将各层的 embedding 拼接后得到的 POI 向量字典，格式如下：

        {
            poiaddr1:emb1
            poiaddr2:emb2
        }

2. city_poi_emb_dict.pickle, 将city的embedding和POI的embedding进行拼接
3. autoae_poi_emb_dict.pickle, 经过autoAE之后的每个place的embedding

In [2]:
import pickle
import numpy as np
import pandas as pd
import sys
import difflib
import torch
import networkx as nx

sys.path.append('../data/pengpai')
import address
sys.path.append('../')

from argparse import *
from deepwalkpytorch.deepwalk import *
import os

In [3]:
# Directory holding the integer-indexed edge lists and the node index maps.
idx_data_base = '../data/pengpai/labeled_data/idx_address_network_no_edgedata/'

# Address levels, listed from the finest ('poi') to the coarsest, plus 'full'.
levels = [
    'poi',
    'village',
    'township',
    'county',
    'city',
    'province',
    'full',
]

# Embedding width learned for each level. The six non-'full' widths add up
# to 120, which is also the width configured for 'full'.
level2embeddingsize = {
    'full': 120,
    'province': 4,
    'city': 8,
    'county': 12,
    'township': 16,
    'village': 32,
    'poi': 48,
}

# 层次化地点表征 （HPRL）

In [4]:
def parser_init():
    """Create the argument parser that carries the DeepWalk settings.

    Returns:
        ArgumentParser: parser with all DeepWalk options registered.
        ``--input`` and ``--output`` are required; every other option has a
        default that is shown in the generated help text.
    """

    def _str2bool(text):
        # argparse passes the raw command-line string to ``type``. Plain
        # ``bool`` would turn every non-empty string (including "False")
        # into True, so the text is parsed explicitly instead.
        lowered = text.strip().lower()
        if lowered in ('true', 't', 'yes', 'y', '1'):
            return True
        if lowered in ('false', 'f', 'no', 'n', '0'):
            return False
        raise ArgumentTypeError('expected a boolean value, got %r' % text)

    # DeepWalk parameter settings
    parser = ArgumentParser("deepwalk",
                            formatter_class=ArgumentDefaultsHelpFormatter,
                            conflict_handler='resolve')

    parser.add_argument("--debug", dest="debug", action='store_true', default=False,
                        help="drop a debugger if an exception is raised.")

    parser.add_argument('--format', default='edgelist',
                        help='File format of input file')

    parser.add_argument('--input', nargs='?', required=True,
                        help='Input graph file')

    parser.add_argument("-l", "--log", dest="log", default="INFO",
                        help="log verbosity level")

    parser.add_argument('--matfile-variable-name', default='network',
                        help='variable name of adjacency matrix inside a .mat file.')

    parser.add_argument('--max-memory-data-size', default=1000000000, type=int,
                        help='Size to start dumping walks to disk, instead of keeping them in memory.')

    parser.add_argument('--number-walks', default=10, type=int,
                        help='Number of random walks to start at each node')

    parser.add_argument('--output', required=True,
                        help='Output representation file')

    parser.add_argument('--representation-size', default=64, type=int,
                        help='Number of latent dimensions to learn for each node.')

    parser.add_argument('--seed', default=0, type=int,
                        help='Seed for random walk generator.')

    # The default stays True; "--undirected False" now really yields False.
    parser.add_argument('--undirected', default=True, type=_str2bool,
                        help='Treat graph as undirected.')

    parser.add_argument('--vertex-freq-degree', default=False, action='store_true',
                        help='Use vertex degree to estimate the frequency of nodes '
                            'in the random walks. This option is faster than '
                            'calculating the vocabulary.')

    parser.add_argument('--walk-length', default=40, type=int,
                        help='Length of the random walk started at each node')

    parser.add_argument('--window-size', default=5, type=int,
                        help='Window size of skipgram model.')

    parser.add_argument('--workers', default=1, type=int,
                        help='Number of parallel processes.')

    return parser



In [None]:
# Load the POI-level address network from its integer-node edge list.
input_path = os.path.join(idx_data_base,'poi'+'_net_edgelist_intnode.txt')
g = nx.read_edgelist(input_path,nodetype = int)  # read the edge list, parsing node ids as int

In [None]:
def level_parser(level):
    """Return the parsed DeepWalk arguments for one address level.

    Args:
        level: key of ``level2embeddingsize``; also used as the prefix of the
            edge-list file name and of the output embedding file.

    Returns:
        The namespace produced by ``parser_init().parse_args(...)`` with
        ``--input``, ``--output`` and ``--representation-size`` filled in.
    """
    edgelist_path = os.path.join(idx_data_base, level + '_net_edgelist_intnode.txt')
    embedding_path = 'levels_embeddings/' + level + 'embeddings'
    cli_args = [
        '--input', edgelist_path,
        '--output', embedding_path,
        '--representation-size', str(level2embeddingsize[level]),
    ]
    return parser_init().parse_args(cli_args)

In [None]:
# Per-level processing: hierarchical embedding
# Run DeepWalk level by level
def level_dw():
    """Run DeepWalk on the network of every address level except 'poi'.

    For each level the arguments come from ``level_parser(level)`` and are
    handed to ``dw.process``; progress is reported on stdout.

    NOTE(review): 'poi' is skipped here although other cells load
    'levels_embeddings/poiembeddings.npy' -- presumably it is trained
    separately; confirm.
    NOTE(review): ``dw`` is not imported by name in this file; it presumably
    comes from ``from deepwalkpytorch.deepwalk import *`` -- confirm.
    """
    for level in levels:
        if level == 'poi':
            continue
        # The input path is built inside level_parser; the previous local copy
        # here was unused and joined the file name as a separate path segment.
        dw.process(level_parser(level))
        print(level+' representation learning done!')
    print('All levels representation learning done!')

In [None]:
def get_embdict(embedding):
    """Map every POI address to its row of ``embedding``.

    The address -> index map is read from ``nodeidxmaps.pickle`` (entry
    ``'poi'``) and inverted; row ``i`` of ``embedding`` is then stored under
    the address whose index is ``i``.

    Args:
        embedding: indexable sequence of vectors, one per POI node index.

    Returns:
        dict: address -> embedding vector.

    Raises:
        KeyError: if a row index has no address in the map.
    """
    maps_path = '../data/pengpai/labeled_data/idx_address_network_no_edgedata/nodeidxmaps.pickle'
    with open(maps_path, 'rb') as handle:
        node_maps = pickle.load(handle)

    # Invert address -> index into index -> address.
    index_to_addr = {index: addr for addr, index in node_maps['poi'].items()}
    return {index_to_addr[row]: vector for row, vector in enumerate(embedding)}


In [None]:
import pickle 
# Load the per-level address -> index maps and the case paths.
# NOTE(review): these two paths use '../data/pengpai/labeled/' while other cells in
# this file read from '../data/pengpai/labeled_data/' -- confirm which is correct.
with open('../data/pengpai/labeled/idx_address_network_no_edgedata/nodeidxmaps.pickle','rb') as file:
    addr2idxmaps = pickle.load(file)
    file.close()  # redundant: the with-block closes the file on exit
with open('../data/pengpai/labeled/case_path.pickle','rb') as file:
    case_path = pickle.load(file)
    file.close()  # redundant: the with-block closes the file on exit

In [None]:
# Build, for every level, the mapping address -> embedding vector.
# (The original note mentions "broadcasting"; the code is a per-address row lookup.)
addr2embedding = {}
for level in levels:
    # Embedding matrix saved for this level under levels_embeddings/.
    embeddings = np.load('levels_embeddings/'+level+'embeddings.npy')
    addr2idx =  addr2idxmaps[level]
    addr2embedding[level] = {addr:embeddings[idx] for addr,idx in addr2idx.items()} # row idx of the matrix belongs to addr

In [None]:
def find_most_sim_addr(address,level):
    """Return the address known at ``level`` that is most similar to ``address``.

    Similarity is ``difflib.SequenceMatcher(...).quick_ratio()``. When several
    candidates share the best score, the first one in iteration order wins.

    Args:
        address: address string to look up.
        level: key of the module-level ``addr2embedding`` dict.

    Returns:
        str: the best matching address, or ``''`` when no candidate scores
        above 0 (including when the level has no addresses).
    """
    scored = [
        (difflib.SequenceMatcher(None, candidate, address).quick_ratio(), candidate)
        for candidate in addr2embedding[level]
    ]
    if not scored:
        return ''
    # max() keeps the first of several equal maxima, like a strict '>' scan.
    best_score, best_addr = max(scored, key=lambda pair: pair[0])
    return best_addr if best_score > 0 else ''

In [None]:
# Concatenate the place embeddings of the different levels.
def get_hprl_con_emb():
    """Build one concatenated vector per POI address from the level embeddings.

    For every address of the 'poi' level, the embeddings of all levels except
    'full' are appended in the order given by ``levels``. When an address is
    missing at a level, the most similar address of that level (see
    ``find_most_sim_addr``) is used; if none is found, a random vector of
    that level's configured size is used instead.

    Returns:
        dict: POI address -> concatenated 1-D numpy array.
    """
    used_levels = [lvl for lvl in levels if lvl != 'full']
    poi_embedding = {}
    for addr in addr2embedding['poi']:
        # Start from an empty float array so the result dtype matches an
        # incremental concatenation that begins with float.
        parts = [np.array([], dtype=float)]
        for lvl in used_levels:
            table = addr2embedding[lvl]
            if addr in table:
                parts.append(table[addr])
                continue
            nearest = find_most_sim_addr(addr, lvl)
            if nearest != '':
                parts.append(table[nearest])
            else:
                parts.append(np.random.random((level2embeddingsize[lvl])))
        poi_embedding[addr] = np.concatenate(parts)
    return poi_embedding
# Build the concatenated per-level embedding for every POI address.
poi_concatenate_embedding = get_hprl_con_emb()

In [5]:
# Load the concatenated POI embedding dict from disk (rebinds poi_concatenate_embedding).
with open('poi_concatenate_embedding.pickle','rb') as file:
    poi_concatenate_embedding = pickle.load(file=file)
    file.close()  # redundant: the with-block closes the file on exit

# 加载城市的表征（CARL）

In [6]:
# Load the city representations
def get_CARL_emb():
    """Read the move-in and move-out city embeddings and join them column-wise.

    Both files are read as flat float32 data and reshaped to rows of 64
    values; the two matrices are then concatenated along axis 1.

    Returns:
        numpy.ndarray: float32 array with 128 columns (move-in, then move-out).
    """
    emb_files = (
        '../data/city_inmigration/data/move_in/2020-01-19/weighted_1.emb',
        '../data/city_inmigration/data/move_out/2020-01-19/weighted_1.emb',
    )
    # Read both files first, then reshape, in the same order as before.
    flat_vectors = [np.fromfile(path, dtype=np.float32) for path in emb_files]
    move_in, move_out = [flat.reshape((-1, 64)) for flat in flat_vectors]
    return np.concatenate([move_in, move_out], axis=1)
# City embedding matrix; city_poi_concat indexes its rows through city2idx.
city_img_emb = get_CARL_emb()

# 将HPRL地点的表征与城市的表征拼接

In [7]:
def city_poi_concat(city_emb,poi_embedding):
    """Prefix every POI embedding with the embedding of the city it mentions.

    The city -> row index map is read from ``city2idx.pickle``. A city
    matches when its name is a substring of the POI address; if several
    cities match, the last one in the map's iteration order is used. When no
    city matches, a random 128-value vector takes the place of the city
    embedding.

    Args:
        city_emb: city embeddings, indexed by the values of ``city2idx``.
        poi_embedding: dict POI address -> 1-D embedding.

    Returns:
        dict: POI address -> concatenation of city vector and POI vector.
    """
    with open('../data/city_inmigration/data/city2idx.pickle','rb') as handle:
        city2idx = pickle.load(handle)

    fused = {}
    for poi_addr, poi_vec in poi_embedding.items():
        # NOTE(review): some addresses only name a province; the original
        # author asks whether those could simply be dropped.
        city_rows = [city_emb[row] for city, row in city2idx.items() if city in poi_addr]
        if city_rows:
            prefix = city_rows[-1]
        else:
            # No city found: substitute a random 128-value vector.
            prefix = np.random.random(128)
        fused[poi_addr] = np.concatenate([prefix, poi_vec])
    return fused

In [8]:
# Concatenate the city embedding with the concatenated POI embedding of every place.
HPRL_CARL = city_poi_concat(city_img_emb,poi_concatenate_embedding)

In [15]:
def get_autoencoder_dict(concat_emb,patience):
    """Encode the concatenated embeddings with the autoencoder from ``core``.

    The dict values are stacked into one array, prepared with
    ``core.prepare_dataset`` and encoded by ``core.QuickEncode`` with
    ``embedding_dim=128``. Encoded row ``i`` is assigned to the ``i``-th key
    of ``concat_emb``.

    Args:
        concat_emb: dict place address -> vector.
        patience: forwarded to ``core.QuickEncode`` (presumably the
            early-stopping patience -- confirm against ``core``).

    Returns:
        dict: place address -> encoded vector as a numpy array.
    """
    stacked = np.array(list(concat_emb.values()))
    refined_input_data, _seq_len = core.prepare_dataset(stacked)
    encoded, _decoded, _final_loss = core.QuickEncode(
        refined_input_data, embedding_dim=128, patience=patience)
    # Keys and stacked rows share the dict's iteration order.
    return {poi_addr: np.array(encoded[row]) for row, poi_addr in enumerate(concat_emb)}

# 拼接后的表征通过autoencoder降维融合处理

In [9]:
from Autoencoder import core

In [16]:
# Replace the concatenated vectors by their autoencoder encodings (rebinds HPRL_CARL).
HPRL_CARL = get_autoencoder_dict(HPRL_CARL,patience=20)

epoch : 100, loss_sum : 116631.1718750
epoch : 200, loss_sum : 85295.8359375
epoch : 300, loss_sum : 68704.3828125
epoch : 400, loss_sum : 59088.7304688
epoch : 500, loss_sum : 52372.1835938
epoch : 600, loss_sum : 47671.8828125
epoch : 700, loss_sum : 43041.4492188
epoch : 800, loss_sum : 39330.9648438
epoch : 900, loss_sum : 35928.2812500
epoch : 1000, loss_sum : 34721.2851562
Early Stopping activated. Final validation loss : 33248.2617188


# Representation learning done

In [18]:
# Save the encoded HPRL+CARL place embeddings.
with open('poi_embeddings/hprl_carl_poi_embedding.pickle','wb') as file:
    pickle.dump(file=file,obj = HPRL_CARL)
    file.close()  # redundant: the with-block closes the file on exit

# Ablation EXP

## HPRL

In [128]:
# Rebuild the concatenated level embeddings for the HPRL-only ablation
# (the recorded run of this cell ended with KeyboardInterrupt).
poi_concatenate_embedding = get_hprl_con_emb()

KeyboardInterrupt: 

In [138]:
# Encode the level-only (no city) embeddings with the autoencoder.
HPRL = get_autoencoder_dict(poi_concatenate_embedding,patience=20)

  data_in_tensor = torch.tensor(sequential_data, dtype=torch.float)


epoch : 100, loss_sum : 60608.3906250
epoch : 200, loss_sum : 37221.5156250
epoch : 300, loss_sum : 25889.2480469
epoch : 400, loss_sum : 19105.4785156
epoch : 500, loss_sum : 14824.8564453
epoch : 600, loss_sum : 11998.0468750
epoch : 700, loss_sum : 10460.7089844
epoch : 800, loss_sum : 8345.3886719
epoch : 900, loss_sum : 7654.8154297
Early Stopping activated. Final validation loss : 7291.1884766


In [137]:
# Save the encoded HPRL-only place embeddings.
with open('poi_embeddings/hprl_poi_embedding.pickle','wb') as file:
    pickle.dump(file=file,obj=HPRL)

## LOC

In [146]:
# Load the embedding stored in poi_log_lat_embedding.pickle
# (presumably longitude/latitude based, judging by the file name -- confirm).
with open('../data/pengpai/labeled_data/poi_log_lat_embedding.pickle','rb') as file:
    poi_log_lat_embedding = pickle.load(file)

# Store an unchanged copy next to the other baseline embeddings.
with open('poi_embeddings/loc_poi_embedding.pickle','wb') as file:
    pickle.dump(file=file,obj=poi_log_lat_embedding)

In [147]:
# Read the file back to check that it loads.
with open('poi_embeddings/loc_poi_embedding.pickle','rb') as file:
    a = pickle.load(file=file)

## CARL_LOC

In [126]:
# Ablation: city embedding concatenated with the location embedding, then encoded.
CARL_LOC = city_poi_concat(city_img_emb,poi_log_lat_embedding)
# get_autoencoder_dict requires `patience`; use 20 like the other ablation runs.
CARL_LOC = get_autoencoder_dict(CARL_LOC,patience=20)

  data_in_tensor = torch.tensor(sequential_data, dtype=torch.float)


epoch : 100, loss_sum : 407557.8125000
epoch : 200, loss_sum : 389230.9062500
epoch : 300, loss_sum : 287570.5625000
epoch : 400, loss_sum : 109389.5312500
epoch : 500, loss_sum : 112824.1953125
Early Stopping activated. Final validation loss : 100848.5625000


In [127]:
# Save the encoded CARL+LOC place embeddings.
with open('poi_embeddings/carl_loc_poi_embedding.pickle','wb') as file:
    pickle.dump(file=file,obj = CARL_LOC)

In [123]:
# Sanity check: shape of one encoded place vector.
CARL_LOC['广西壮族自治区桂林市叠彩区'].shape

(128,)

# Baselines

In [None]:
from karateclub import DeepWalk,RandNE,GraRep,GLEE,NodeSketch,Node2Vec,BoostNE,HOPE

# Baseline name -> karateclub model class.
# NOTE(review): GLEE is imported above but has no entry here -- confirm whether intended.
m_dic = {
    'deepwalk': DeepWalk,
    'randne': RandNE,
    'grarep': GraRep,
    'nodesketch': NodeSketch,
    'node2vec': Node2Vec,
    'boostne': BoostNE,
    'hope': HOPE,
}

def get_save_emb(model_name,net,dimensions=128):
    """Train one baseline model on ``net`` and pickle its POI embeddings.

    The model class is looked up in ``m_dic``, fitted on the POI-level
    network, and the resulting embedding matrix is converted to an
    address -> vector dict with ``get_embdict``. The dict is written to
    ``poi_embeddings/<model_name>_poi_embedding.pickle``.

    Args:
        model_name: key of ``m_dic``.
        net: graph passed to the model's ``fit``.
        dimensions: embedding size passed to the 'nodesketch' model only.
            The default 128 comes from the commented-out setting in the
            previous version -- NOTE(review): confirm.

    Raises:
        KeyError: if ``model_name`` is not a key of ``m_dic``.
    """
    model_cls = m_dic[model_name]
    # Use elif so the specially configured models are not replaced by the
    # default-constructed one in the final branch.
    if model_name == 'boostne':
        model = model_cls(iterations = 15)
    elif model_name == 'nodesketch':
        model = model_cls(iterations = 20,dimensions = dimensions)
    else:
        model = model_cls()
    print('training ...')

    model.fit(net)
    embedding = get_embdict(model.get_embedding())

    save_path = 'poi_embeddings/'
    # Create the output directory if needed, without failing when it exists.
    os.makedirs(save_path, exist_ok=True)

    with open(os.path.join(save_path,model_name+'_poi_embedding.pickle'),'wb') as file:
        pickle.dump(file = file, obj = embedding)
    print('saved')

# Train and save every baseline on the POI-level graph g.
get_save_emb('hope',g)

get_save_emb('nodesketch',g)

get_save_emb('node2vec',g)

# The graph is too large; this model cannot be trained
get_save_emb('grarep',g)

get_save_emb('deepwalk',g)

get_save_emb('randne',g)

get_save_emb('grarep',g)

get_save_emb('boostne',g)

In [None]:
# Determine the type of every place, build the idx -> type mapping and save it as a numpy array.
def get_address_idx2type():
    """Write a one-hot "address type" feature row for every POI node.

    An address gets the type of the level whose address string equals its
    full address; when several levels match, the last one in ``levels``
    order wins. Addresses without such a match are typed 'poi'. The one-hot
    rows are written with ``np.savetxt`` to ``poi_feature.txt`` inside
    ``idx_data_base``.

    NOTE(review): rows follow the iteration order of ``addr2idxmaps['poi']``,
    not the numeric order of the indices -- confirm that the two agree.
    """
    # Collect the addresses that occur at each level.
    # NOTE(review): addrset is filled but never read in this function, while
    # get_net_tree reads a module-level ``addrset`` -- confirm whether this
    # was meant to be shared.
    addrset = {}
    for level in levels:
        addrset[level] = set()
        for case,path in case_path.items():
            for addr in path:
                if addr.get_addr(level) == '':
                    continue
                addrset[level].add(addr.get_addr(level))

    # Full address -> level name, for addresses that stop at that level.
    addr2type = {}
    for case, path in case_path.items():
        for addr in path:
            for level in levels[:-1]:
                if addr.get_addr(level) == addr.get_addr('full'):
                    addr2type[addr.get_addr('full')]=level

    # Unknown addresses default to 'poi'; only a missing key is tolerated
    # now, instead of swallowing every exception.
    idx2type = {idx: addr2type.get(addr, 'poi')
                for addr,idx in addr2idxmaps['poi'].items()}

    def one_hot(length,idx):
        arr = np.zeros(length)
        arr[idx]=1
        return arr

    typemap = {'province':0,'city':1,'county':2,'township':3,'village':4,'poi':5}
    # Build the rows directly rather than rewriting the dict during iteration.
    features = [one_hot(len(typemap),typemap[type_name]) for type_name in idx2type.values()]

    np.savetxt(os.path.join(idx_data_base,'poi'+'_feature.txt'),np.array(features))

In [None]:
# Build the tree-shaped hierarchy of the network, used by the hierarchical baselines.
def get_net_tree():
    """Build a two-layer directed tree and write it to 'node_tree.txt'.

    Node 10275 is linked to every province, and every POI address is linked
    from the first province whose name occurs in it. POI addresses without
    such a province are printed and attached to '黑龙江省'.

    NOTE(review): ``addrset`` and ``pro2idx`` are read as module-level names
    that are not defined in the visible part of this file; 10275 is
    presumably the id of the root node -- confirm.

    Returns:
        networkx.DiGraph: the tree that was written to disk.
    """
    tree = nx.DiGraph()

    for level in levels:
        if level == 'province':
            for province in addrset[level]:
                tree.add_edge(10275, pro2idx[province])
        if level == 'poi':
            for poi_addr in addrset[level]:
                for province in addrset['province']:
                    if province in poi_addr:
                        tree.add_edge(pro2idx[province], addr2idxmaps[level][poi_addr])
                        break
                else:
                    # No province name is contained in this address.
                    print(poi_addr)
                    tree.add_edge(pro2idx['黑龙江省'], addr2idxmaps[level][poi_addr])

    nx.write_edgelist(tree, 'node_tree.txt')
    return tree
                    
# tree.add_edge()