In [1]:
import requests
import json
import pandas as pd
import numpy as np
from urllib.parse import urljoin
import time
import logging
import csv
from tqdm import tqdm
import glob

GRAPHSQL_URL = "http://192.168.20.241:9000/query/OneMonthNet/"

############################ util functions ############################
def query_graphsql_ori(query_name, para_string):
    remaining_url = "{}?{}".format(query_name, para_string)
    url=urljoin(GRAPHSQL_URL, remaining_url)
    print('query:{}\nparas:{}\nrequest for url: {}'.format(query_name, para_string.split('&'), url))
    print('---------------------------------------------------------------------------------------------------')
    result = requests.get(url)
    result_json = json.loads(result.text)
    return result_json


def query_graphsql(query_name, para_string):
    remaining_url = "{}?{}".format(query_name, para_string)
    url=urljoin(GRAPHSQL_URL, remaining_url)
    st = time.time()
    print('query:{}\nparas:{}\nrequest for url: {}'.format(query_name, para_string.split('&'), url))
    print('---------------------------------------------------------------------------------------------------')
    try:
        result = requests.get(url)
        result_json = json.loads(result.text)
        if result_json['error']:
            logging.error(result_json['message'])
            print('run query failed')
            return None
        print('run query finish, use {} seconds\n\n'.format(time.time() - st))
        return result_json
    except Exception as e:
        print('failed')
        
        
def _update_max_min_date(node_name):
    '''
    get the max & min date for the given node type based on its history update dates;
    para: node_name: 'Device'; check all the node_name type in gsql graph schema: OneMonthNet;
    '''
    query_name = 'update_max_min_date'
    paras = 'node={}'.format(node_name)
    return query_graphsql(query_name, paras)


def _node_cutoff_filter(start_time, end_time, node_type):
    '''
    note that call _update_max_min_date before run this query;
    select those nodes with given node_type exist between start_time and end_time;
    paras:start_time:'2019-06-01 18:42:22'
    paras:end_time:'2019-07-01 18:42:22'
    paras:node_type:'User'; check all the node_name type in gsql graph schema: OneMonthNet;
    '''
    query_name = 'node_cutoff_filter'
    paras = 'start_t={}&end_t={}&node={}'.format(start_time, end_time, node_type) 
    return query_graphsql(query_name, paras)
    
    
########################################################################

# def label_get(start_date):
#     'old version and has been deprecated now, pls use label_get2 query;'
#     query_name = 'label_get'
#     paras = 'start_date={}'.format(start_date) 
#     return query_graphsql(query_name, paras)


def label_get2(start_date, end_date, loanstyle):
    '''
    get loans with the given loanstyle whose fundtime between start_date & end_date;
    paras: start_date: '2019-06-01 00:00:00'
    paras: end_date: '2019-07-01 00:00:00'
    paras: loanstyle: '绿卡30天1期'
    '''
    query_name = 'label_get2'
    paras = 'start_date={}&end_date={}&loan_type={}'.format(start_date, end_date, loanstyle) 
    return query_graphsql(query_name, paras)['results'][0]['loanlabelSHOW']


#1
def reset(node):
    '''
    '''
    query_name = 'reset'
    paras = 'node={}'.format(node) 
    return query_graphsql(query_name, paras)


#2
def pageRank_train(start_t, end_t, node, maxChange, maxIter, damping, query_name):
    '''
    '''
    paras = 'start_t={}&end_t={}&node={}&maxChange={}&maxIter={}&damping={}'.format(start_t, end_t, node, maxChange,maxIter,damping) 
    return query_graphsql(query_name, paras)


#3
def pageRank_appr_files(file_abs_path, query_name):
    '''
    draft 1 for pageRank in cashbus graph; 
    Only works for user-device relationship(single edge type & double vertex type);
    Only output nodes(type: user or device) with pg_score !=0;
    '''
    paras = 'file_path={}'.format(file_abs_path) 
    return query_graphsql(query_name, paras) 

    
def pyG_prepare():
    query_name = 'pyG_pre'
    paras = ''
    return query_graphsql(query_name, paras) 


#2
def connected_comp_train(start_t, end_t, node):
    '''
    res = connected_comp_train('20190517', '20190610', 'PhoneNumber')
    '''
    query_name='conn_comp'
    paras = 'start_t={}&end_t={}&node={}'.format(start_t, end_t, node) 
    return query_graphsql(query_name, paras)


#3
def connected_comp_appr_files(file_abs_path):
    '''
    draft 1 for pageRank in cashbus graph; 
    Only works for user-device relationship(single edge type & double vertex type);
    Only output nodes(type: user or device) with pg_score != 0;
    '''
    paras = 'file_path={}'.format(file_abs_path)
    query_name = 'conn_comp_check'
    return query_graphsql(query_name, paras)


def train(st, et, node_type, reset_bool):
    """
    paras: st: start time of a period, only node within the restriction will be trained and given a pgscore;
    paras: et: end time of the period;
    paras: node_type: determine which nodetype will be trained;
    """
    assert node_type in ["Device", "PhoneNumber"]
    if reset_bool:
        reset(node_type)
    pageRank_train(st, 
                   et, 
                   node_type, 
                   10, 3, 0.6,
                   "pageRank_train_{}".format(node_type.lower()))
    return None


def test_with_local_files(nodetype, file_path):
    """
    paras: node_type: determine which nodetype will be test;
    paras: file_path: only provide a path to save test nodes id;
    """
    test_res2 = pageRank_appr_files(file_path, 
                                    "pageRank_appr_files_{}".format(nodetype.lower()))
    user_pg_res = [i['attributes'] for i in test_res2['results'][0]['test_set']]
    user_pg_df2 = pd.DataFrame.from_dict(user_pg_res)
    user_pg_df2.rename(columns={'prim_id': 'username'}, inplace=True)
    return test_res2, user_pg_df2


In [4]:
res = connected_comp_train('20190604', '20190610', 'PhoneNumber')

query:conn_comp
paras:['start_t=20190604', 'end_t=20190610', 'node=PhoneNumber']
request for url: http://192.168.20.241:9000/query/OneMonthNet/conn_comp?start_t=20190604&end_t=20190610&node=PhoneNumber
---------------------------------------------------------------------------------------------------
run query finish, use 514.5107645988464 seconds




In [9]:
test_day_file = '/home/tigergraph/GraphProject/OneMonthGraph/Querys/G_algorithm/pagerank/dir_phone/cc_test_{}_{}.txt'.format('2019-06-11 00:00:00', '2019-06-12 00:00:00')


In [12]:
test_cc = connected_comp_appr_files(test_day_file)

query:conn_comp_check
paras:['file_path=/home/tigergraph/GraphProject/OneMonthGraph/Querys/G_algorithm/pagerank/dir_phone/cc_test_2019-06-11 00:00:00_2019-06-12 00:00:00.txt']
request for url: http://192.168.20.241:9000/query/OneMonthNet/conn_comp_check?file_path=/home/tigergraph/GraphProject/OneMonthGraph/Querys/G_algorithm/pagerank/dir_phone/cc_test_2019-06-11 00:00:00_2019-06-12 00:00:00.txt
---------------------------------------------------------------------------------------------------
run query finish, use 0.4313061237335205 seconds




In [16]:
test_cc['results'][0]['vv']

[{'v_id': '18105972345',
  'v_type': 'PhoneNumber',
  'attributes': {'prim_id': '18105972345',
   'datetime_set': [20190611],
   'temp_save_bool': False,
   'temp_pgscore': 0,
   'cc_id': 0,
   '@node_cluster_id': [29403608,
    27293586,
    25213522,
    24165028,
    24165026,
    21019360,
    0,
    30456164,
    11549679]}},
 {'v_id': '13981682202',
  'v_type': 'PhoneNumber',
  'attributes': {'prim_id': '13981682202',
   'datetime_set': [20190611],
   'temp_save_bool': False,
   'temp_pgscore': 0,
   'cc_id': 0,
   '@node_cluster_id': [28361076, 10485822, 8408556, 0, 19957416]}},
 {'v_id': '18981847226',
  'v_type': 'PhoneNumber',
  'attributes': {'prim_id': '18981847226',
   'datetime_set': [20190611],
   'temp_save_bool': False,
   'temp_pgscore': 0,
   'cc_id': 0,
   '@node_cluster_id': [0]}},
 {'v_id': '18065666612',
  'v_type': 'PhoneNumber',
  'attributes': {'prim_id': '18065666612',
   'datetime_set': [20190611],
   'temp_save_bool': False,
   'temp_pgscore': 0,
   'cc_id'

In [17]:
label1 = pd.read_csv('/data-0/gsm/qb_one_month/loan_label_1_1901_1907.csv')
label2 = pd.read_csv('/data-0/gsm/qb_one_month/loan_label_2_new_1901_1907.csv')
label2.columns = list(label1.columns)+['nouse']
loan_detail_08_09_df = pd.read_csv('/data-0/qibo/Gdata/oneMonth/qb_temp_loan_detail_08_09.csv')
loan_detail_08_09_df.username = loan_detail_08_09_df.username.astype(str)

def test_loan_info(loanstyle, st, et):
    # 给定两个时间范围，给出所在时间内的 funded loan 信息；
    temp1 = label1[(label1.funddate<et)&(label1.funddate>=st)]
    temp2 = label2[(label2.funddate<et)&(label2.funddate>=st)]
    final = temp1.append(temp2)
    if loanstyle == 'all':
        return final
    return final[final.loanstyle == loanstyle]

def source_table(loanstyle, st, et):
    #xiaomajie biao;
    test_funded_loans = loan_detail_08_09_df[(loan_detail_08_09_df.fundtime>=st)&(loan_detail_08_09_df.fundtime<et)]
    #label 表;
    test_label = test_loan_info(loanstyle, st, et)
    #merge;
    table3 = pd.merge(test_funded_loans, test_label, on='loanid')
    return table3

def final_process(source_df, save_path=None):
    ori, RES_phone = test_with_local_files(nodetype = "PhoneNumber", file_path = test_day_file)
    final = pd.merge(source_df, RES_phone, on='username')
    final = final[~final.default_now.isnull()]
    if save_dir is not None:
        final.to_csv(save_path)
    return final

source_df = source_table('绿卡30天1期', test_st, test_et)


NameError: name 'test_st' is not defined

In [76]:
test_st='2019-06-11 00:00:00'
test_et='2019-06-12 00:00:00'
save_dir = '/home/tigergraph/GraphProject/OneMonthGraph/Querys/G_algorithm/pagerank/dir_phone/save_df/'
final_save_path = save_dir + "cc_{}_{}.csv".format(test_st, test_et)

####################### test step:   #####################
source_df = source_table('绿卡30天1期', test_st, test_et)

test_day_file = '/home/tigergraph/GraphProject/OneMonthGraph/Querys/G_algorithm/pagerank/dir_phone/cc_test_{}_{}.txt'.format(test_st, test_et)
with open(test_day_file, 'w') as f1:
    csv_out = csv.writer(f1)
    for i in tqdm(source_df.username.unique()):
        csv_out.writerow([i])

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)
100%|██████████| 4602/4602 [00:00<00:00, 972157.49it/s]


In [93]:
results = connected_comp_appr_files(test_day_file)['results'][0]['vv']

query:conn_comp_check
paras:['file_path=/home/tigergraph/GraphProject/OneMonthGraph/Querys/G_algorithm/pagerank/dir_phone/cc_test_2019-06-11 00:00:00_2019-06-12 00:00:00.txt']
request for url: http://192.168.20.241:9000/query/OneMonthNet/conn_comp_check?file_path=/home/tigergraph/GraphProject/OneMonthGraph/Querys/G_algorithm/pagerank/dir_phone/cc_test_2019-06-11 00:00:00_2019-06-12 00:00:00.txt
---------------------------------------------------------------------------------------------------
run query finish, use 0.2819373607635498 seconds




In [88]:
results = [[i['v_id'], i['attributes']['cc_id']] for i in connected_comp_appr_files(test_day_file)['results'][0]['vv']]

query:conn_comp_check
paras:['file_path=/home/tigergraph/GraphProject/OneMonthGraph/Querys/G_algorithm/pagerank/dir_phone/cc_test_2019-06-11 00:00:00_2019-06-12 00:00:00.txt']
request for url: http://192.168.20.241:9000/query/OneMonthNet/conn_comp_check?file_path=/home/tigergraph/GraphProject/OneMonthGraph/Querys/G_algorithm/pagerank/dir_phone/cc_test_2019-06-11 00:00:00_2019-06-12 00:00:00.txt
---------------------------------------------------------------------------------------------------
run query finish, use 0.2990140914916992 seconds




In [90]:
dic = {}
for i in res:
    if i[1] not in dic:
        dic[i[1]]=1
    else:
        dic[i[1]]+=1

In [91]:
dic

{0: 3654,
 15728849: 1,
 27263156: 1,
 20971862: 1,
 9437290: 1,
 26214740: 1,
 9437365: 1,
 26214992: 1,
 4195047: 1,
 4195696: 1,
 4196265: 1,
 24117626: 1,
 20972292: 1,
 15729711: 1,
 16778060: 1,
 2097831: 1,
 2097900: 1,
 24117902: 1,
 17826590: 1,
 11535777: 1,
 27263328: 1,
 23069412: 1,
 4720226: 1,
 11535888: 1,
 24118898: 1,
 4724682: 1,
 12584672: 1,
 26215708: 1,
 1049530: 1,
 4725873: 1,
 12584742: 1,
 3147246: 1,
 4759627: 1,
 2098240: 1,
 3147296: 1,
 16778970: 1,
 17827344: 1,
 29362372: 1,
 16779110: 1,
 2163: 1,
 6293254: 1,
 2768055: 1,
 22021900: 1,
 22021920: 1,
 13632986: 1,
 25167108: 1,
 9191799: 1,
 22022092: 1,
 1387879: 1,
 17827504: 1,
 19924704: 1,
 3570413: 1,
 3638895: 1,
 3676047: 1,
 3676120: 1,
 15742009: 1,
 26216228: 1,
 26216248: 1,
 28313588: 1,
 9438499: 1,
 5122674: 1,
 11017423: 1,
 11538342: 1,
 16153537: 1,
 16222511: 1,
 16223389: 1,
 26216512: 1,
 142551: 1,
 16259027: 1,
 11051883: 1,
 11053045: 1,
 11057601: 1,
 2630839: 1,
 2663151: 1,
 

In [None]:


####################### paras:       #####################
train_st = "2019-05-13 00:00:00"
train_et = "2019-06-12 00:00:00" 
test_st='2019-06-13 00:00:00'
test_et='2019-06-14 00:00:00'
save_dir = '/home/tigergraph/GraphProject/OneMonthGraph/Querys/G_algorithm/pagerank/dir_device/save_df/'
final_save_path = save_dir + "{}_{}.csv".format(test_st, test_et)

####################### train step:  #####################
train(st=train_st, et=train_et, node_type="Device", reset_bool=True)

####################### test step:   #####################
source_df = source_table('绿卡30天1期', test_st, test_et)
test_day_file = '/home/tigergraph/GraphProject/OneMonthGraph/Querys/G_algorithm/pagerank/dir_device/save_test_txt/test_{}_{}.txt'.format(test_st, test_et)
with open(test_day_file, 'w') as f1:
    csv_out = csv.writer(f1)
    for i in tqdm(source_df.username.unique()):
        csv_out.writerow([i])
        
####################### final step:  ######################   
result = final_process(source_df, final_save_path)
