In [33]:
import requests
import json
import pandas as pd
import numpy as np
from urllib.parse import urljoin
import time
import logging
import csv
from tqdm import tqdm
import glob

GRAPHSQL_URL = "http://192.168.20.241:9000/query/OneMonthNet/"

############################ util functions ############################
def query_graphsql_ori(query_name, para_string):
    """
    Send one GET request to the graph endpoint and return the decoded reply.

    No error handling is done here: request failures and JSON decoding
    errors propagate to the caller, and the content of the reply is not
    inspected.

    paras: query_name: name of the query, resolved against GRAPHSQL_URL;
    paras: para_string: ready-made query string such as 'node=Device';
    return: the response body parsed with json.loads.
    """
    url = urljoin(GRAPHSQL_URL, "{}?{}".format(query_name, para_string))
    para_list = para_string.split('&')
    # Echo the request so that the notebook output records what was executed.
    print('query:{}'.format(query_name),
          'paras:{}'.format(para_list),
          'request for url: {}'.format(url),
          sep='\n')
    print('---------------------------------------------------------------------------------------------------')
    response = requests.get(url)
    return json.loads(response.text)


def query_graphsql(query_name, para_string):
    """
    Run a query against the graph endpoint, with timing and error reporting.

    paras: query_name: name of the query, resolved against GRAPHSQL_URL;
    paras: para_string: ready-made query string such as 'node=Device';
    return: the decoded JSON reply when the reply's 'error' field is falsy;
            None when the reply reports an error or when the request /
            decoding raised. Callers must therefore check for None.
    """
    remaining_url = "{}?{}".format(query_name, para_string)
    url = urljoin(GRAPHSQL_URL, remaining_url)
    st = time.time()
    print('query:{}\nparas:{}\nrequest for url: {}'.format(query_name, para_string.split('&'), url))
    print('---------------------------------------------------------------------------------------------------')
    try:
        result = requests.get(url)
        result_json = json.loads(result.text)
        if result_json['error']:
            logging.error(result_json['message'])
            print('run query failed')
            return None
        print('run query finish, use {} seconds\n\n'.format(time.time() - st))
        return result_json
    except Exception:
        # Stay best-effort (the notebook keeps running), but record which
        # query failed and why instead of discarding the exception.
        logging.exception('query %s failed (url: %s)', query_name, url)
        print('failed')
        return None
        
        
def _update_max_min_date(node_name):
    '''
    Run the 'update_max_min_date' query for one node type.

    According to the original notes, the query derives the max & min date of
    the node type from its history of update dates.

    paras: node_name: e.g. 'Device'; see the OneMonthNet graph schema for the
           available node types;
    return: whatever query_graphsql returns for this request.
    '''
    return query_graphsql('update_max_min_date', 'node={}'.format(node_name))


def _node_cutoff_filter(start_time, end_time, node_type):
    '''
    Run the 'node_cutoff_filter' query.

    According to the original notes, it selects the nodes of node_type that
    exist between start_time and end_time, and _update_max_min_date should be
    called before this query.

    paras: start_time: e.g. '2019-06-01 18:42:22'
    paras: end_time: e.g. '2019-07-01 18:42:22'
    paras: node_type: e.g. 'User'; see the OneMonthNet graph schema;
    return: whatever query_graphsql returns for this request.
    '''
    paras = '&'.join([
        'start_t={}'.format(start_time),
        'end_t={}'.format(end_time),
        'node={}'.format(node_type),
    ])
    return query_graphsql('node_cutoff_filter', paras)
    
    
########################################################################

# def label_get(start_date):
#     'Old version, now deprecated; please use the label_get2 query instead.'
#     query_name = 'label_get'
#     paras = 'start_date={}'.format(start_date) 
#     return query_graphsql(query_name, paras)


def label_get2(start_date, end_date, loanstyle):
    '''
    Run the 'label_get2' query and return its 'loanlabelSHOW' payload.

    According to the original notes, the query returns the loans of the given
    loanstyle whose fund time lies between start_date and end_date.

    paras: start_date: e.g. '2019-06-01 00:00:00'
    paras: end_date: e.g. '2019-07-01 00:00:00'
    paras: loanstyle: loan style name as stored in the graph,
           e.g. '绿卡30天1期' (sent as the loan_type parameter);
    return: results[0]['loanlabelSHOW'] of the reply, or None when the
            query failed (query_graphsql returned None).
    '''
    query_name = 'label_get2'
    paras = 'start_date={}&end_date={}&loan_type={}'.format(start_date, end_date, loanstyle)
    response = query_graphsql(query_name, paras)
    if response is None:
        # query_graphsql has already reported the failure; propagate its None
        # instead of raising "'NoneType' object is not subscriptable".
        return None
    return response['results'][0]['loanlabelSHOW']


#1
def reset(node):
    '''
    Run the 'reset' query for the given node type.

    paras: node: node type name, sent as the 'node' parameter;
    return: whatever query_graphsql returns for this request.
    '''
    return query_graphsql('reset', 'node={}'.format(node))


#2
def pageRank_train(start_t, end_t, node, maxChange, maxIter, damping, query_name):
    '''
    Run the query named query_name with the PageRank training parameters.

    paras: start_t, end_t: time window, sent as start_t / end_t;
    paras: node: node type name, sent as 'node';
    paras: maxChange, maxIter, damping: forwarded unchanged under the same
           names;
    paras: query_name: name of the query to call;
    return: whatever query_graphsql returns for this request.
    '''
    fields = (
        ('start_t', start_t),
        ('end_t', end_t),
        ('node', node),
        ('maxChange', maxChange),
        ('maxIter', maxIter),
        ('damping', damping),
    )
    paras = '&'.join('{}={}'.format(key, value) for key, value in fields)
    return query_graphsql(query_name, paras)


#3
def pageRank_appr_files(file_abs_path, query_name):
    '''
    Run the query named query_name, passing file_abs_path as 'file_path'.

    Notes kept from the original author (draft 1 of pageRank on the cashbus
    graph):
    - only works for the user-device relationship (a single edge type and
      two vertex types);
    - only nodes (user or device) with pg_score != 0 are output.

    return: whatever query_graphsql returns for this request.
    '''
    return query_graphsql(query_name, 'file_path={}'.format(file_abs_path))


def pyG_prepare_label(st, et, node):
    '''
    Run the 'get_default_now' query for the window [st, et] and node type.

    return: whatever query_graphsql returns; get_label parses this reply
            elsewhere in the notebook.
    '''
    paras = '&'.join([
        'start_t={}'.format(st),
        'end_t={}'.format(et),
        'node={}'.format(node),
    ])
    return query_graphsql('get_default_now', paras)

    
def pyG_prepare_edge(st, et, node):
    '''
    Run the 'pyG_pre' query for the window [st, et] and node type.

    return: whatever query_graphsql returns; get_edge parses this reply
            elsewhere in the notebook.
    '''
    paras = '&'.join([
        'start_t={}'.format(st),
        'end_t={}'.format(et),
        'node={}'.format(node),
    ])
    return query_graphsql('pyG_pre', paras)


def pyG_prepare_feat(st, et, node):
    '''
    Run the 'pyG_pre_ori' query for the window [st, et] and node type.

    return: whatever query_graphsql returns; get_feat parses this reply
            elsewhere in the notebook.
    '''
    paras = '&'.join([
        'start_t={}'.format(st),
        'end_t={}'.format(et),
        'node={}'.format(node),
    ])
    return query_graphsql('pyG_pre_ori', paras)


#2
def connected_comp_train(start_t, end_t, node):
    '''
    Run the 'conn_comp' query for the window [start_t, end_t] and node type.

    return: whatever query_graphsql returns for this request.
    '''
    paras = '&'.join([
        'start_t={}'.format(start_t),
        'end_t={}'.format(end_t),
        'node={}'.format(node),
    ])
    return query_graphsql('conn_comp', paras)


#3
def connected_comp_appr_files(file_abs_path):
    '''
    Run the 'conn_comp_check' query, passing file_abs_path as 'file_path'.

    NOTE(review): presumably the connected-component counterpart of
    pageRank_appr_files; what the query outputs is decided server-side and is
    not visible here - confirm against the query definition.

    return: whatever query_graphsql returns for this request.
    '''
    return query_graphsql('conn_comp_check', 'file_path={}'.format(file_abs_path))


def train(st, et, node_type, reset_bool, maxChange=10, maxIter=3, damping=0.6):
    """
    Optionally reset, then launch PageRank training for one node type.

    The query called is "pageRank_train_<node_type in lower case>".

    paras: st: start time of a period; per the original notes, only nodes
           within the period are trained and given a pgscore;
    paras: et: end time of the period;
    paras: node_type: which node type is trained; must be "Device" or
           "PhoneNumber";
    paras: reset_bool: when truthy, reset(node_type) is called first;
    paras: maxChange, maxIter, damping: forwarded to pageRank_train; the
           defaults are the values that used to be hard-coded here;
    return: None.
    raises: AssertionError if node_type is not one of the supported types.
    """
    assert node_type in ["Device", "PhoneNumber"]
    if reset_bool:
        reset(node_type)
    pageRank_train(st,
                   et,
                   node_type,
                   maxChange, maxIter, damping,
                   "pageRank_train_{}".format(node_type.lower()))
    return None


def test_with_local_files(nodetype, file_path):
    """
    Score the nodes listed in a file and tabulate the returned attributes.

    The query called is "pageRank_appr_files_<nodetype in lower case>".

    paras: nodetype: which node type is tested;
    paras: file_path: forwarded to the query as file_path; per the original
           notes it is the file holding the test node ids - TODO confirm;
    return: (raw reply, DataFrame built from the 'attributes' of every entry
            in results[0]['test_set'], with column 'prim_id' renamed to
            'username').
    """
    query_name = "pageRank_appr_files_{}".format(nodetype.lower())
    test_res2 = pageRank_appr_files(file_path, query_name)

    attribute_rows = []
    for entry in test_res2['results'][0]['test_set']:
        attribute_rows.append(entry['attributes'])

    user_pg_df2 = pd.DataFrame.from_dict(attribute_rows).rename(
        columns={'prim_id': 'username'})
    return test_res2, user_pg_df2


def get_edge(res):
    """
    Flatten a query reply into a list of (neighbour, vertex id) tuples.

    For every record in res['results'][0]['vv'], each element of its
    'vv.@node_all_neighs' attribute is paired with the record's 'v_id'.
    Records without neighbours contribute nothing.
    """
    pairs = []
    for vertex in res['results'][0]['vv']:
        vertex_id = vertex['v_id']
        neighbours = vertex['attributes']['vv.@node_all_neighs']
        pairs.extend((neighbour, vertex_id) for neighbour in neighbours)
    return pairs


def get_feat(res_ori, dates=('20190601', '20190602', '20190603', '20190604')):
    """
    Build one count vector per vertex from a query reply.

    For every record in res_ori['results'][0]['vv'], the
    'vv.@node_date_calls' attribute is read as a mapping keyed by date. The
    feature vector holds, for each entry of dates in order, the length of the
    value stored under that date, or 0 when the date is absent.

    paras: res_ori: decoded reply, as returned by pyG_prepare_feat;
    paras: dates: date keys to extract, in output order; the default is the
           four-day window that used to be hard-coded here;
    return: dict mapping 'v_id' to a list of len(dates) integers.
    """
    feat_DIC = {}
    for dic in res_ori['results'][0]['vv']:
        feat_dic = dic['attributes']['vv.@node_date_calls']
        v_id = dic['v_id']
        feat_DIC[v_id] = [
            len(feat_dic[date]) if date in feat_dic else 0
            for date in dates
        ]
    return feat_DIC


def get_label(res_label):
    """
    Map every vertex id in a query reply to its label attribute.

    For every record in res_label['results'][0]['vv'], the value of
    'vv.@node_all_labels' is stored under the record's 'v_id'. When a v_id
    occurs more than once, the last record wins.
    """
    labels_by_vertex = {}
    for vertex in res_label['results'][0]['vv']:
        labels_by_vertex[vertex['v_id']] = vertex['attributes']['vv.@node_all_labels']
    return labels_by_vertex
    

In [2]:
# Window passed as start_t / end_t to every query in this cell.
test_st, test_et = '20190601', '20190604'

# Raw replies for PhoneNumber vertices; they are parsed by get_edge,
# get_feat and get_label respectively.
res_edge = pyG_prepare_edge(test_st, test_et, "PhoneNumber")
res_feat = pyG_prepare_feat(test_st, test_et, "PhoneNumber")
res_label = pyG_prepare_label(test_st, test_et, "PhoneNumber")


query:pyG_pre
paras:['start_t=20190601', 'end_t=20190604', 'node=PhoneNumber']
request for url: http://192.168.20.241:9000/query/OneMonthNet/pyG_pre?start_t=20190601&end_t=20190604&node=PhoneNumber
---------------------------------------------------------------------------------------------------
run query finish, use 94.4853093624115 seconds


query:pyG_pre_ori
paras:['start_t=20190601', 'end_t=20190604', 'node=PhoneNumber']
request for url: http://192.168.20.241:9000/query/OneMonthNet/pyG_pre_ori?start_t=20190601&end_t=20190604&node=PhoneNumber
---------------------------------------------------------------------------------------------------
run query finish, use 115.0154185295105 seconds


query:get_default_now
paras:['start_t=20190601', 'end_t=20190604', 'node=PhoneNumber']
request for url: http://192.168.20.241:9000/query/OneMonthNet/get_default_now?start_t=20190601&end_t=20190604&node=PhoneNumber
-----------------------------------------------------------------------------------

In [4]:
# Parse the raw query replies into plain Python structures.
feat_DIC = get_feat(res_feat)
edge = get_edge(res_edge)
label = get_label(res_label)

# Vertex ids that have a feature vector.
source = set(feat_DIC.keys())

# Convert the (large) edge list to an array only once; row 0 holds the first
# element of every edge tuple and row 1 the second.
edge_arr = np.array(edge).transpose()
a1 = edge_arr[0]
a2 = edge_arr[1]
# Every vertex id that appears at either end of an edge.
edge_set = set(a1).union(set(a2))

In [5]:
# Directory receiving the raw JSON dumps (must already exist).
root ='/home/tigergraph/GraphProject/OneMonthGraph/Querys/pyG/data/raw/'

# Write features, edges and labels, in that order, one JSON file each.
for file_name, payload in (('feat.json', feat_DIC),
                           ('edge.json', edge),
                           ('label.json', label)):
    with open(root + file_name, 'w') as json_file:
        json.dump(payload, json_file)


In [None]:
# get feats;
# Re-runs the feature query for the same window and rebuilds feat_DIC.
test_st='20190601'
test_et='20190604'
res_ori = pyG_prepare_feat(test_st, test_et, "PhoneNumber")
feat_DIC = get_feat(res_ori)

        
# NOTE(review): `final` is not assigned in any visible cell, so this line
# raises NameError on a fresh kernel - confirm where it is supposed to come from.
final2 = final[['v_id', 'new_20190601','new_20190602','new_20190603','new_20190604']]
# NOTE(review): a bare `cd` is not Python; under IPython automagic it is
# presumably treated as `%cd` and changes the working directory - confirm it is intended.
cd
final2.head()

In [None]:
# get labels
# Loads a CSV from a hard-coded absolute path into `labels`.
# NOTE(review): nothing in the visible cells writes temp1.csv - confirm its origin.

labels = pd.read_csv('/home/tigergraph/GraphProject/OneMonthGraph/Querys/pyG/data/temp1.csv')

In [156]:
# get edges;
# NOTE(review): `res` is not assigned in any visible cell; it is presumably a
# reply kept in the kernel from an earlier run - confirm before re-running.

# Give every vertex record a flat list 'lis' that concatenates, in the
# record's own key order, the values stored under the four date keys.
for dic in tqdm(res['results'][0]['vv']):
    dic['lis'] = [
        entry
        for key, values in dic.items()
        if key in ['20190601','20190602','20190603','20190604']
        for entry in values
    ]

edges = pd.DataFrame.from_dict(res['results'][0]['vv'])

# Pair each collected entry with the id of the vertex it was found on.
EDGES = []
for a, b in tqdm(edges[['lis','v_id']].values):
    EDGES.extend((member, b) for member in a)

100%|██████████| 5938366/5938366 [00:10<00:00, 579205.73it/s]
100%|██████████| 5938366/5938366 [00:14<00:00, 404202.88it/s]


In [159]:
# Turn the list of 2-tuples into an array and transpose it, so that the first
# axis indexes the tuple position. This rebinds `edge`, which an earlier cell
# assigned from get_edge().
edge = np.array(EDGES).transpose()

In [161]:
# NOTE(review): `default_now` is not assigned in any visible cell, and the
# output recorded for this cell looks like an array shape - confirm what this
# cell was meant to display.
default_now

(2, 19482805)