
 ## 啟動luigi (Launching luigi)
 ## `luigi --module customerjourney customerjourney.HiveCustomerTask --interval 2016-09-01`

In [None]:
import os
import sys
import luigi
import datetime
 
from luigi import date_interval as d
from luigi.contrib.spark import PySparkTask
from pyspark.sql import HiveContext,DataFrameWriter
from utils import check_nameNode,check_partition,add_partition
from utils import COLUMN
import json
 
from luigi.contrib.hdfs import HdfsClient, HdfsTarget
from luigi.contrib.hive import HiveTableTarget, HivePartitionTarget, run_hive_cmd
 
# Parent of the directory that contains this file, kept as an un-normalised
# "<dir>/.." string.
BASEPATH = os.path.dirname(os.path.abspath(__file__)) + "/.."
 
class HiveCustomerTask(luigi.Task):
    """Schedule one ``SparkTask`` per customer-journey event source.

    Parameters:
        interval: luigi date interval; only its year and month are used, to
            build the ``yyyymm`` partition value.
        hive_cctxn: extra keyword arguments forwarded to the ``cctxn``
            ``SparkTask`` (by default only ``lib``).
        hive_cti: extra keyword arguments forwarded to the ``cti``
            ``SparkTask`` (by default only ``lib``).
    """
    task_namespace = "customerjourney"

    interval = luigi.DateIntervalParameter()
    hive_cctxn = luigi.DictParameter(default={"lib": "basic.hdfs.cctxn_hive"})
    # requires() unpacks self.hive_cti, but the parameter was never declared,
    # so resolving the requirements raised AttributeError.
    # NOTE(review): the default module path mirrors the cctxn naming pattern
    # and is an assumption -- confirm the real cti row-builder module.
    hive_cti = luigi.DictParameter(default={"lib": "basic.hdfs.cti_hive"})

    def requires(self):
        """Yield the Spark jobs for the month selected by ``interval``."""
        nameservice = check_nameNode()
        hdfs_path = 'hdfs://{}/bank/ap_chp/hive'.format(nameservice)
        out_path = '{}/customer_journey_event'.format(hdfs_path)
        # "2016-09-01" -> ["2016", "09", "01"] -> "201609"
        yyyymm = "".join(self.interval.to_string().split("-")[:2])

        yield SparkTask(
            yyyymm=yyyymm,
            event="cctxn",
            sql="select * from bap_chp.event_cc_txn where yyyymm={}".format(yyyymm),
            out_path=out_path,
            **self.hive_cctxn
        )
        yield SparkTask(
            yyyymm=yyyymm,
            event="cti",
            sql="select * from bap_chp.event_cti where yyyymm={}".format(yyyymm),
            out_path=out_path,
            **self.hive_cti
        )

## Require Task

In [None]:
class SparkTask(PySparkTask):
    """Run a Hive query, reshape each row and append the result to
    ``bap_chp.customer_journey_event``.

    Parameters:
        yyyymm: partition value passed to the row builder and used in the
            output path.
        event: event name passed to the row builder and used in the output
            path.
        sql: Hive SQL statement whose result rows are transformed.
        out_path: base path under which the partitioned parquet data is
            written.
        lib: dotted name of a module exposing
            ``row_create(line, yyyymm, event)``.
    """
    task_namespace = "customerjourney"
    yyyymm = luigi.Parameter()
    event = luigi.Parameter()
    sql = luigi.Parameter()
    out_path = luigi.Parameter()
    lib = luigi.Parameter()

    def main(self, sc, *arg):
        """Execute the job on the given SparkContext ``sc``."""
        # Function-scope import keeps the file's import block untouched.
        import importlib

        # NOTE: an earlier revision called check_partition()/add_partition()
        # from utils here before writing; that step is currently disabled.

        mod = importlib.import_module(self.lib)

        # Copy the values the mapper needs into locals so the closure shipped
        # to the executors does not capture (and serialize) the task instance.
        yyyymm = self.yyyymm
        event = self.event

        hc = HiveContext(sc)
        # Works around DataFrameWriter being unable to write partitions.
        hc.setConf("hive.exec.dynamic.partition.mode", "nonstrict")

        source_rows = hc.sql(self.sql).rdd
        event_rows = source_rows.map(
            lambda line: mod.row_create(line, yyyymm, event))

        event_frame = hc.createDataFrame(event_rows, COLUMN)

        writer = DataFrameWriter(event_frame)
        writer.partitionBy('yyyymm', 'event').saveAsTable(
            'bap_chp.customer_journey_event',
            mode='append',
            format='parquet',
            path=self.out_path,
        )

    def output(self):
        """Return the HDFS target matching files in this task's partition."""
        return HdfsTarget('{}/yyyymm={}/event={}/*'.format(
            self.out_path, self.yyyymm, self.event))


## Lib cctxn_hive.py 

In [None]:
import json
from utils import date_to_sec
import datetime
 
def row_create(line, yyyymm, event):
    """Build one output row (a list) from a credit-card transaction record.

    Args:
        line: record supporting ``line["column"]`` lookups.
        yyyymm: value placed in the second-to-last position of the row.
        event: value placed in the last position of the row.

    Returns:
        An 11-element list: three literal labels interleaved with stripped
        identifier columns, the transaction time converted by
        ``date_to_sec``, a JSON string with the "action" / "object" /
        "channel" attributes, then ``yyyymm`` and ``event``.
    """
    def text(column):
        # Column value with surrounding whitespace removed.
        return line[column].strip()

    def whole(column):
        # Column value coerced to int.
        return int(line[column])

    action = {
        "txn_amt": whole("txn_amt"),
        "original_currency_code": text("original_currency_code"),
        "total_installment_times": whole("total_installment_times"),
        "purchase_type_code": text("purchase_type_code"),
        "consumption_category_desc": text("consumption_category_desc"),
    }
    target = {
        "merchant_category_code": text("merchant_category_code"),
        "merchant_name": text("merchant_name"),
    }
    channel = {
        "card_type": line["kind1"],
        "card_level": line["kind2"],
    }
    attr = {"action": action, "object": target, "channel": channel}

    customer = text("customer_id")
    txn_code = text("txn_code")
    # The timestamp is formatted as text first because date_to_sec is fed a
    # "%Y-%m-%d %H:%M:%S" string.
    stamp = line["txn_date"].strftime("%Y-%m-%d %H:%M:%S")
    seconds = int(date_to_sec(stamp))
    merchant = text("merchant_nbr")
    card = text("card_nbr")

    return [
        'customer_id', customer, txn_code, seconds,
        'merchant_nbr', merchant,
        'credit_card', card,
        json.dumps(attr), yyyymm, event,
    ]

## Dev Server /cms/customerjourney/app