In [1]:
from itertools import chain,combinations
from pyspark.sql import HiveContext,DataFrameWriter
from pyspark.sql.types import *
from pyspark.sql.functions import udf,col

import time
import datetime
from functools import partial

# Hive-enabled SQL entry point used by every query below.
# NOTE(review): `sc` is not defined anywhere in this notebook; presumably the
# PySpark kernel/shell injects the SparkContext under that name -- confirm.
hc = HiveContext(sc)

In [2]:
# Session settings applied to the Hive context, in the order listed.
for conf_key, conf_value in (
    ("hive.exec.dynamic.partition.mode", "nonstrict"),
    ("spark.sql.parquet.compression.codec", "uncompressed"),
):
    hc.setConf(conf_key, conf_value)

In [3]:
# Read every column of the cookie-profile Hive table, then print the schema
# so the array-typed columns used further down are visible.
profile_query = '''
               SELECT   *
               FROM     vp_bank.adw_cookie_profile 
               '''
rowdf = hc.sql(profile_query)
rowdf.printSchema()

root
 |-- cookieuniquevisitortrackingid: string (nullable = true)
 |-- list_eventdt: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- max_eventdt: string (nullable = true)
 |-- min_eventdt: string (nullable = true)
 |-- list_eventdt_web: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- list_eventdt_mybank: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- list_eventdt_b2b: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- list_eventdt_koko: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- list_eventdt_app: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- list_customer_id: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- list_sessionnumber: array (nullable = true)
 |    |-- element: long (containsNull = true)



In [4]:
# Preview the raw profile rows (20 is the default row count of show()).
rowdf.show(n=20)

+-----------------------------+--------------------+-----------+-----------+--------------------+--------------------+----------------+-----------------+--------------------+--------------------+--------------------+
|cookieuniquevisitortrackingid|        list_eventdt|max_eventdt|min_eventdt|    list_eventdt_web| list_eventdt_mybank|list_eventdt_b2b|list_eventdt_koko|    list_eventdt_app|    list_customer_id|  list_sessionnumber|
+-----------------------------+--------------------+-----------+-----------+--------------------+--------------------+----------------+-----------------+--------------------+--------------------+--------------------+
|         000022c0986b41399...|        [2017-03-02]| 2017-03-02| 2017-03-02|                null|        [2017-03-02]|            null|             null|                null|[T26EF87FBF6218A5E9]|         [104065234]|
|         00002cbd3572465a9...|        [2016-09-04]| 2016-09-04| 2016-09-04|                null|        [2016-09-04]|            nu

In [5]:
def listoutput(dtlist, idx):
    """Return ``dtlist[idx]``, or None when that element is not available.

    Args:
        dtlist: An indexable collection (the notebook passes array columns,
            which may be null), or None.
        idx: Position to read.

    Returns:
        The element at ``idx``; None when ``dtlist`` is None, when ``idx`` is
        out of range, or when ``dtlist`` cannot be indexed with ``idx``.
    """
    if dtlist is None:
        return None
    try:
        return dtlist[idx]
    except (IndexError, KeyError, TypeError):
        # Only lookup failures are treated as "no value". A bare ``except``
        # would also hide KeyboardInterrupt/SystemExit and real bugs.
        return None

## 第一次和第二次回訪時間

In [8]:
# Column-level wrappers around listoutput: element 0 and element 1 of an
# array column, each yielding null when that element cannot be read.
list_udf_0 = udf(lambda dates: listoutput(dates, idx=0), returnType=StringType())
list_udf_1 = udf(lambda dates: listoutput(dates, idx=1), returnType=StringType())

In [7]:
# Flag UDF: "1" when the array column has a non-null first element, else "0".
# A null or empty array is handled explicitly; indexing it directly raises
# inside the UDF and fails the job. The flag is returned as a string so the
# Python value matches the declared StringType.
list_udf_X = udf(
    lambda c: "0" if (not c or c[0] is None) else "1",
    StringType(),
)

## 篩掉APP資料

In [9]:
# Preview the rows whose APP date list has no readable first element
# (list_udf_0 yields null for them), i.e. the rows kept after dropping APP data.
(
    rowdf
    .filter(list_udf_0("list_eventdt_app").isNull())
    .show(n=20)
)
# Alternative check kept for reference -- rows having both APP and web dates:
# rowdf.where(list_udf_0("list_eventdt_app").isNotNull())\
#      .where(list_udf_0("list_eventdt_web").isNotNull()).show()


+-----------------------------+--------------------+-----------+-----------+--------------------+--------------------+----------------+-----------------+----------------+--------------------+--------------------+
|cookieuniquevisitortrackingid|        list_eventdt|max_eventdt|min_eventdt|    list_eventdt_web| list_eventdt_mybank|list_eventdt_b2b|list_eventdt_koko|list_eventdt_app|    list_customer_id|  list_sessionnumber|
+-----------------------------+--------------------+-----------+-----------+--------------------+--------------------+----------------+-----------------+----------------+--------------------+--------------------+
|         000022c0986b41399...|        [2017-03-02]| 2017-03-02| 2017-03-02|                null|        [2017-03-02]|            null|             null|            null|[T26EF87FBF6218A5E9]|         [104065234]|
|         00002cbd3572465a9...|        [2016-09-04]| 2016-09-04| 2016-09-04|                null|        [2016-09-04]|            null|             

## 計算第一、二次回訪時間

In [10]:
# Cohort date: only cookies whose first listed event date equals this value
# are kept.
date0 = '2017-02-02'

# Drop rows with APP dates, expose elements 0 and 1 of list_eventdt as
# 1stVisit / 2ndVisit, then restrict to the cohort date.
row2df = (
    rowdf
    .filter(list_udf_0("list_eventdt_app").isNull())
    .select(
        rowdf["cookieuniquevisitortrackingid"].alias("cookie_id"),
        "list_eventdt",
        list_udf_0("list_eventdt").alias("1stVisit"),
        list_udf_1("list_eventdt").alias("2ndVisit"),
    )
    .filter(col("1stVisit") == date0)
)

In [11]:
# Preview the cohort rows (20 is the default row count of show()).
row2df.show(n=20)

+--------------------+--------------------+----------+----------+
|           cookie_id|        list_eventdt|  1stVisit|  2ndVisit|
+--------------------+--------------------+----------+----------+
|003a7bc7069f4883a...|        [2017-02-02]|2017-02-02|      null|
|003fb9966439406c9...|        [2017-02-02]|2017-02-02|      null|
|014dde5d40da43eab...|[2017-02-02, 2017...|2017-02-02|2017-02-03|
|0156e57a113c4f6bb...|[2017-02-02, 2017...|2017-02-02|2017-02-03|
|018eea782eb4482b8...|        [2017-02-02]|2017-02-02|      null|
|01db25d887274806a...|        [2017-02-02]|2017-02-02|      null|
|01f18ad06ca54fbba...|        [2017-02-02]|2017-02-02|      null|
|01f99fd605fe48fda...|[2017-02-02, 2017...|2017-02-02|2017-02-06|
|021b5fd6b40a43e88...|        [2017-02-02]|2017-02-02|      null|
|02b6e071ed6247dba...|        [2017-02-02]|2017-02-02|      null|
|030adf9b6b914b2ba...|[2017-02-02, 2017...|2017-02-02|2017-02-11|
|03276460d8374ff9b...|        [2017-02-02]|2017-02-02|      null|
|033069a98

## 計算拜訪時間差

In [38]:
def daydiff(firstday, secondday):
    """Return the number of days from ``firstday`` to ``secondday``.

    Args:
        firstday: Start date as a ``YYYY-MM-DD`` string, or None.
        secondday: End date as a ``YYYY-MM-DD`` string, or None.

    Returns:
        ``(secondday - firstday).days`` as an int (negative when ``secondday``
        is earlier). Returns 0 when either value is None, is not a string, or
        does not match ``%Y-%m-%d``.

    Note:
        The 0 fallback is kept for backward compatibility, so a missing second
        date is indistinguishable from a same-day pair in the result.
    """
    if firstday is None or secondday is None:
        return 0

    date_format = "%Y-%m-%d"
    try:
        start = datetime.datetime.strptime(firstday, date_format)
        end = datetime.datetime.strptime(secondday, date_format)
    except (TypeError, ValueError):
        # TypeError: non-string input; ValueError: string not in the format.
        # Anything else is unexpected and is allowed to propagate.
        return 0
    return (end - start).days

In [39]:
daydiff_udf = udf(daydiff, returnType=IntegerType())

# Add the gap between 1stVisit and 2ndVisit as `diff`, and expose the result
# to SQL under the temp-table name df_diff.
visit_gap_days = daydiff_udf("1stVisit", "2ndVisit").alias("diff")
(
    row2df
    .select("cookie_id", "list_eventdt", "1stVisit", "2ndVisit", visit_gap_days)
    .registerTempTable("df_diff")
)

# Spot-check the registered table.
hc.sql('''
          SELECT   *
          FROM     df_diff
          LIMIT    20
       ''').show(n=20)

+--------------------+--------------------+----------+----------+----+
|           cookie_id|        list_eventdt|  1stVisit|  2ndVisit|diff|
+--------------------+--------------------+----------+----------+----+
|00019ca6625b49ecb...|        [2017-02-02]|2017-02-02|      null|   0|
|0004d0a738444576a...|[2017-02-02, 2017...|2017-02-02|2017-02-17|  15|
|00505d6ee6404bc19...|        [2017-02-02]|2017-02-02|      null|   0|
|006ca3d618ab4efe8...|        [2017-02-02]|2017-02-02|      null|   0|
|00a23cd756ff46cbb...|[2017-02-02, 2017...|2017-02-02|2017-02-03|   1|
|0118a852128e43869...|[2017-02-02, 2017...|2017-02-02|2017-02-27|  25|
|012abbd4f9264d859...|[2017-02-02, 2017...|2017-02-02|2017-02-03|   1|
|018d44219db643e58...|        [2017-02-02]|2017-02-02|      null|   0|
|01ecc80975914d0e8...|[2017-02-02, 2017...|2017-02-02|2017-02-07|   5|
|02bbd5501a1b43979...|[2017-02-02, 2017...|2017-02-02|2017-02-23|  21|
|031d8202cd224b6e8...|[2017-02-02, 2017...|2017-02-02|2017-03-03|  29|
|0326c

## 計算各區間回訪人數

In [44]:
# Cohort size plus the number of cookies per gap bucket: one column for each
# gap of 1..7 days, and a single Day30 column covering gaps of 8..30 days.
# Rows with any other Diff value (including 0) are counted in no bucket.
retention_query = '''
          SELECT   COUNT(1stVisit), 
                   SUM(CASE WHEN Diff = 1 THEN 1 ELSE 0 END) AS Day1,
                   SUM(CASE WHEN Diff = 2 THEN 1 ELSE 0 END) AS Day2,
                   SUM(CASE WHEN Diff = 3 THEN 1 ELSE 0 END) AS Day3,
                   SUM(CASE WHEN Diff = 4 THEN 1 ELSE 0 END) AS Day4,
                   SUM(CASE WHEN Diff = 5 THEN 1 ELSE 0 END) AS Day5,
                   SUM(CASE WHEN Diff = 6 THEN 1 ELSE 0 END) AS Day6,
                   SUM(CASE WHEN Diff = 7 THEN 1 ELSE 0 END) AS Day7,
                   SUM(CASE WHEN Diff >= 8 AND Diff <= 30 THEN 1 ELSE 0 END) AS Day30
          FROM     df_diff
       '''
hc.sql(retention_query).show(n=20)

+---------------+----+----+----+----+----+----+----+-----+
|count(1stVisit)|Day1|Day2|Day3|Day4|Day5|Day6|Day7|Day30|
+---------------+----+----+----+----+----+----+----+-----+
|          67861|5003| 978| 569|1328| 777| 623| 573| 5494|
+---------------+----+----+----+----+----+----+----+-----+

