In [15]:
import requests
import json
import pandas as pd
import numpy as np
from urllib.parse import urljoin
import time
import logging
import csv
from tqdm import tqdm
import glob

GRAPHSQL_URL = "http://192.168.20.241:9000/query/OneMonthNet/"

############################ util functions ############################
def query_graphsql_ori(query_name, para_string):
    remaining_url = "{}?{}".format(query_name, para_string)
    url=urljoin(GRAPHSQL_URL, remaining_url)
    print('query:{}\nparas:{}\nrequest for url: {}'.format(query_name, para_string.split('&'), url))
    print('---------------------------------------------------------------------------------------------------')
    result = requests.get(url)
    result_json = json.loads(result.text)
    return result_json


def query_graphsql(query_name, para_string):
    remaining_url = "{}?{}".format(query_name, para_string)
    url=urljoin(GRAPHSQL_URL, remaining_url)
    st = time.time()
    print('query:{}\nparas:{}\nrequest for url: {}'.format(query_name, para_string.split('&'), url))
    print('---------------------------------------------------------------------------------------------------')
    try:
        result = requests.get(url)
        result_json = json.loads(result.text)
        if result_json['error']:
            logging.error(result_json['message'])
            print('run query failed')
            return None
        print('run query finish, use {} seconds\n\n'.format(time.time() - st))
        return result_json
    except Exception as e:
        print('failed')
        
        
def _update_max_min_date(node_name):
    '''
    get the max & min date for the given node type based on its history update dates;
    para: node_name: 'Device'; check all the node_name type in gsql graph schema: OneMonthNet;
    '''
    query_name = 'update_max_min_date'
    paras = 'node={}'.format(node_name)
    return query_graphsql(query_name, paras)


def _node_cutoff_filter(start_time, end_time, node_type):
    '''
    note that call _update_max_min_date before run this query;
    select those nodes with given node_type exist between start_time and end_time;
    paras:start_time:'2019-06-01 18:42:22'
    paras:end_time:'2019-07-01 18:42:22'
    paras:node_type:'User'; check all the node_name type in gsql graph schema: OneMonthNet;
    '''
    query_name = 'node_cutoff_filter'
    paras = 'start_t={}&end_t={}&node={}'.format(start_time, end_time, node_type) 
    return query_graphsql(query_name, paras)
    
    
########################################################################

# def label_get(start_date):
#     'old version and has been deprecated now, pls use label_get2 query;'
#     query_name = 'label_get'
#     paras = 'start_date={}'.format(start_date) 
#     return query_graphsql(query_name, paras)


def label_get2(start_date, end_date, loanstyle):
    '''
    get loans with the given loanstyle whose fundtime between start_date & end_date;
    paras: start_date: '2019-06-01 00:00:00'
    paras: end_date: '2019-07-01 00:00:00'
    paras: loanstyle: '绿卡30天1期'
    '''
    query_name = 'label_get2'
    paras = 'start_date={}&end_date={}&loan_type={}'.format(start_date, end_date, loanstyle) 
    return query_graphsql(query_name, paras)['results'][0]['loanlabelSHOW']


#1
def reset(node):
    '''
    '''
    query_name = 'reset'
    paras = 'node={}'.format(node) 
    return query_graphsql(query_name, paras)


#2
def pageRank_train(start_t, end_t, node, maxChange, maxIter, damping, query_name):
    '''
    '''
    paras = 'start_t={}&end_t={}&node={}&maxChange={}&maxIter={}&damping={}'.format(start_t, end_t, node, maxChange,maxIter,damping) 
    return query_graphsql(query_name, paras)


#3
def pageRank_appr_files(file_abs_path, query_name):
    '''
    draft 1 for pageRank in cashbus graph; 
    Only works for user-device relationship(single edge type & double vertex type);
    Only output nodes(type: user or device) with pg_score !=0;
    '''
    paras = 'file_path={}'.format(file_abs_path) 
    return query_graphsql(query_name, paras) 

    
def pyG_prepare():
    query_name = 'pyG_pre'
    paras = ''
    return query_graphsql(query_name, paras) 


#2
def connected_comp_train(start_t, end_t, node):
    '''
    '''
    query_name='conn_comp'
    paras = 'start_t={}&end_t={}&node={}'.format(start_t, end_t, node) 
    return query_graphsql(query_name, paras)


#3
def connected_comp_appr_files(file_abs_path):
    '''
    draft 1 for pageRank in cashbus graph; 
    Only works for user-device relationship(single edge type & double vertex type);
    Only output nodes(type: user or device) with pg_score != 0;
    '''
    paras = 'file_path={}'.format(file_abs_path)
    query_name = 'conn_comp_check'
    return query_graphsql(query_name, paras)


def train(st, et, node_type, reset_bool):
    """
    paras: st: start time of a period, only node within the restriction will be trained and given a pgscore;
    paras: et: end time of the period;
    paras: node_type: determine which nodetype will be trained;
    """
    assert node_type in ["Device", "PhoneNumber"]
    if reset_bool:
        reset(node_type)
    pageRank_train(st, 
                   et, 
                   node_type, 
                   10, 3, 0.6,
                   "pageRank_train_{}".format(node_type.lower()))
    return None


def test_with_local_files(nodetype, file_path):
    """
    paras: node_type: determine which nodetype will be test;
    paras: file_path: only provide a path to save test nodes id;
    """
    test_res2 = pageRank_appr_files(file_path, 
                                    "pageRank_appr_files_{}".format(nodetype.lower()))
    user_pg_res = [i['attributes'] for i in test_res2['results'][0]['test_set']]
    user_pg_df2 = pd.DataFrame.from_dict(user_pg_res)
    user_pg_df2.rename(columns={'prim_id': 'username'}, inplace=True)
    return test_res2, user_pg_df2


In [16]:
glob.glob('/home/tigergraph/GraphProject/OneMonthGraph/Querys/G_algorithm/pagerank/dir_device/save_test_txt/*')

['/home/tigergraph/GraphProject/OneMonthGraph/Querys/G_algorithm/pagerank/dir_device/save_test_txt/test_2019-06-12 00:00:00_2019-06-13 00:00:00.txt',
 '/home/tigergraph/GraphProject/OneMonthGraph/Querys/G_algorithm/pagerank/dir_device/save_test_txt/test_2019-06-14 00:00:00_2019-06-15 00:00:00.txt',
 '/home/tigergraph/GraphProject/OneMonthGraph/Querys/G_algorithm/pagerank/dir_device/save_test_txt/test_2019-06-13 00:00:00_2019-06-14 00:00:00.txt',
 '/home/tigergraph/GraphProject/OneMonthGraph/Querys/G_algorithm/pagerank/dir_device/save_test_txt/test_2019-06-16 00:00:00_2019-06-17 00:00:00.txt',
 '/home/tigergraph/GraphProject/OneMonthGraph/Querys/G_algorithm/pagerank/dir_device/save_test_txt/test_2019-06-15 00:00:00_2019-06-16 00:00:00.txt']

In [18]:
connected_comp_appr_files(glob.glob('/home/tigergraph/GraphProject/OneMonthGraph/Querys/G_algorithm/pagerank/dir_device/save_test_txt/*')[-1])

query:conn_comp_check
paras:['file_path=/home/tigergraph/GraphProject/OneMonthGraph/Querys/G_algorithm/pagerank/dir_device/save_test_txt/test_2019-06-15 00:00:00_2019-06-16 00:00:00.txt']
request for url: http://192.168.20.241:9000/query/OneMonthNet/conn_comp_check?file_path=/home/tigergraph/GraphProject/OneMonthGraph/Querys/G_algorithm/pagerank/dir_device/save_test_txt/test_2019-06-15 00:00:00_2019-06-16 00:00:00.txt
---------------------------------------------------------------------------------------------------
run query finish, use 0.1433696746826172 seconds




{'version': {'api': 'v2', 'schema': 0},
 'error': False,
 'results': []}

In [6]:
RES = pyG_prepare()

query:pyG_pre
paras:['']
request for url: http://192.168.20.241:9000/query/OneMonthNet/pyG_pre
---------------------------------------------------------------------------------------------------
run query finish, use 179.02964043617249 seconds




In [14]:
pd.DataFrame.from_dict(RES['results'][0]['vv'])

Unnamed: 0,attributes,v_id,v_type
0,"{'prim_id': '18278150759', 'datetime_set': [20...",18278150759,PhoneNumber
1,"{'prim_id': '18270835097', 'datetime_set': [20...",18270835097,PhoneNumber
2,"{'prim_id': '18269112900', 'datetime_set': [20...",18269112900,PhoneNumber
3,"{'prim_id': '18089630838', 'datetime_set': [20...",18089630838,PhoneNumber
4,"{'prim_id': '18681189309', 'datetime_set': [20...",18681189309,PhoneNumber
5,"{'prim_id': '18272226449', 'datetime_set': [20...",18272226449,PhoneNumber
6,"{'prim_id': '057156107469', 'datetime_set': [2...",057156107469,PhoneNumber
7,"{'prim_id': '18268213630', 'datetime_set': [20...",18268213630,PhoneNumber
8,"{'prim_id': '15803205680', 'datetime_set': [20...",15803205680,PhoneNumber
9,"{'prim_id': '18022392043', 'datetime_set': [20...",18022392043,PhoneNumber


In [None]:
5月份; 训练前三周; 得到cc; 测试第四周; 
4月份; 训练前三周; 得到cc; 测试第四周; 

In [None]:
connected_comp_train()

In [None]:
需要拿到第四周test文件; 
  

In [19]:
601 602 603;

SyntaxError: invalid syntax (<ipython-input-19-05a94ea9b76d>, line 1)

In [21]:
label1 = pd.read_csv('/data-0/gsm/qb_one_month/loan_label_1_1901_1907.csv')
label2 = pd.read_csv('/data-0/gsm/qb_one_month/loan_label_2_new_1901_1907.csv')
label2.columns = list(label1.columns)+['nouse']
loan_detail_08_09_df = pd.read_csv('/data-0/qibo/Gdata/oneMonth/qb_temp_loan_detail_08_09.csv')
loan_detail_08_09_df.username = loan_detail_08_09_df.username.astype(str)

def test_loan_info(loanstyle, st, et):
    # 给定两个时间范围，给出所在时间内的 funded loan 信息；
    temp1 = label1[(label1.funddate<et)&(label1.funddate>=st)]
    temp2 = label2[(label2.funddate<et)&(label2.funddate>=st)]
    final = temp1.append(temp2)
    if loanstyle == 'all':
        return final
    return final[final.loanstyle == loanstyle]

def source_table(loanstyle, st, et):
    #xiaomajie biao;
    test_funded_loans = loan_detail_08_09_df[(loan_detail_08_09_df.fundtime>=st)&(loan_detail_08_09_df.fundtime<et)]
    #label 表;
    test_label = test_loan_info(loanstyle, st, et)
    #merge;
    table3 = pd.merge(test_funded_loans, test_label, on='loanid')
    return table3

def final_process(source_df, save_path=None):
    ori, RES_phone = test_with_local_files(nodetype = "PhoneNumber", file_path = test_day_file)
    final = pd.merge(source_df, RES_phone, on = 'username')
    final = final[~final.default_now.isnull()]
    if save_dir is not None:
        final.to_csv(save_path)
    return final


In [24]:

test_st='2019-06-01 00:00:00'
test_et='2019-06-04 00:00:00'
source_df = source_table('绿卡30天1期', test_st, test_et)


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


In [28]:
sum(source_df.dcs_username.astype('int')!=source_df.username.astype('int'))

282

In [None]:
source_df[['default_now', 'username', 'loanid','fundtime']]