In [1]:
from pyspark.sql.functions import year, month, dayofmonth, hour, col, lit, desc, dayofweek, concat, to_date, date_trunc, countDistinct

In [2]:
TripsAtt = spark.read.format('parquet').load("/mnt/bronze/cab/operational/att/trips_client").withColumn('Instance', lit('ATT'))
FormHeadersAtt = spark.read.format('parquet').load("/mnt/bronze/cab/operational/att/form_headers_client").withColumn('Instance', lit('ATT'))
MessagesAtt = spark.read.format('parquet').load("/mnt/bronze/cab/operational/att/sent_messages_client").withColumn('Instance', lit('ATT'))
TimekeepingAtt = spark.read.format('parquet').load("/mnt/bronze/cab/operational/att/timekeeping_statuses_client").withColumn('Instance', lit('ATT'))
OrdersAtt = spark.read.format('parquet').load("/mnt/bronze/cab/operational/att/order_headers_client").withColumn('Instance', lit('ATT'))

In [3]:
TripsAtt.count()

In [4]:
#join all the other instances later into one object
#also reduce to only userId column
Trips = TripsAtt.select(col('CompanyId'), col('UserId')).distinct()
FormHeaders = FormHeadersAtt.select(col('CompanyId'), col('StartedByUserId').alias('UserId')).distinct()
Messages = MessagesAtt.select(col('CompanyId'), col('FromUserId').alias('UserId')).distinct()
Timekeeping = TimekeepingAtt.select(col('CompanyId'), col('UserId')).distinct()
Orders = OrdersAtt.select(col('CompanyId'), col('CreatedByUserId').alias('UserId')).distinct()

In [5]:
#join all the other instances later into one object
#also reduce to only userId column
TripsLastYear = TripsAtt.filter(col('Year') == 2019).select(col('CompanyId'), col('UserId')).distinct()
FormHeadersLastYear = FormHeadersAtt.filter(col('Year') == 2019).select(col('CompanyId'), col('StartedByUserId').alias('UserId')).distinct()
MessagesLastYear = MessagesAtt.filter(col('Year') == 2019).select(col('CompanyId'), col('FromUserId').alias('UserId')).distinct()
TimekeepingLastYear = TimekeepingAtt.filter(col('Year') == 2019).select(col('CompanyId'), col('UserId')).distinct()
OrdersLastYear = OrdersAtt.filter(col('Year') == 2019).select(col('CompanyId'), col('CreatedByUserId').alias('UserId')).distinct()

In [6]:
FormHeadersLastYear.count()

In [7]:
FormHeaders.count()

In [8]:
from functools import reduce 
from pyspark.sql import DataFrame

def unionAll(*dfs):
    return reduce(DataFrame.unionAll, dfs)
  
CombinedUserIdsLastYear = unionAll(TripsLastYear, FormHeadersLastYear, FormHeadersLastYear, MessagesLastYear, TimekeepingLastYear, OrdersLastYear).distinct()
CombinedUserIds = unionAll(Trips, FormHeaders, FormHeaders, Messages, Timekeeping, Orders).distinct()

In [9]:
display(CombinedUserIdsLastYear.describe())

summary,CompanyId,UserId
count,42273.0,41673.0
mean,6562.927400468384,42780.84397571569
stddev,3924.602799296175,21596.532267624592
min,134.0,12.0
max,18438.0,81182.0


In [10]:
display(CombinedUserIdsLastYear.agg(countDistinct('UserId')))

count(DISTINCT UserId)
41672


In [11]:
#pull in licenses and payment methods to get companies with active billing
#Accounts = spark.read.format('parquet').load("/mnt/bronze/cab/billing/accounts")
Licenses = spark.read.format('parquet').load("/mnt/bronze/cab/billing/licenses")
PaymentMethods = spark.read.format('parquet').load("/mnt/bronze/cab/billing/payment_methods")
PaymentSchedules = spark.read.format('parquet').load("/mnt/bronze/cab/billing/payment_schedules")

In [12]:
LicensesWPayment = Licenses.join(PaymentSchedules, 'PaymentScheduleId').join(PaymentMethods, "PaymentMethodId")

In [13]:
BillableComapnies = LicensesWPayment.filter(col('Billable') == False).select('CompanyId').distinct()

In [14]:
BillableComapnies.count()

In [15]:
BillableUserIdsLastYear = CombinedUserIdsLastYear.join(BillableComapnies, 'CompanyId')
BillableUserIdsAllTime = CombinedUserIds.join(BillableComapnies, 'CompanyId')

In [16]:
display(BillableUserIdsLastYear.agg(countDistinct('UserId')))

count(DISTINCT UserId)
39520


In [17]:
display(BillableUserIdsAllTime.agg(countDistinct('UserId')))

count(DISTINCT UserId)
45332
