In [1]:
%run ./BillableCompanies

In [2]:
# Load the FormHeader6Month dataset (Delta format) into a Spark DataFrame.
Forms = (
    spark.read
    .format('delta')
    .load("/mnt/sandboxes/SampleDataSets/All/FormHeader6Month")
)

In [3]:
# Total number of rows in the Forms DataFrame (runs a Spark job).
Forms.count()

### Forms Per Company

In [5]:
from pyspark.sql.functions import year, month, dayofmonth, hour, col, lit

# Add year / month / day-of-month columns derived from StartUtcTimeTag so that
# later cells can group forms by calendar day.
FormsWPartit = (
    Forms
    .withColumn("year", year(col("StartUtcTimeTag")))
    .withColumn("month", month(col("StartUtcTimeTag")))
    .withColumn("day", dayofmonth(col("StartUtcTimeTag")))
)

In [6]:
# Drop every form belonging to CompanyId 5788.
# NOTE(review): going by the variable name, 5788 is presumably the "Republic"
# account - confirm, and consider promoting it to a named constant.
FormsNotRepublic = FormsWPartit.where(col('CompanyId') != 5788)

In [7]:
# Keep only forms whose company appears in billableCompanies (inner join, the
# default join type).
# NOTE(review): the join condition references `Companies`, which is not defined
# in any visible cell - presumably it and `billableCompanies` come from the
# `%run ./BillableCompanies` notebook; confirm.
FormsNotRepublicBillable = FormsNotRepublic.join(
    billableCompanies,
    on=FormsNotRepublic.CompanyId == Companies.CompanyId,
)

In [8]:
# Number of forms per billable company.
formsByCompany = (
    FormsNotRepublicBillable
    .groupBy(Companies.CompanyId)
    .count()
)

# Average of those per-company form counts.
display(
    formsByCompany.agg({"count": 'avg'})
)

In [9]:
# Count forms per (year, month, day, company) and show the average of those
# counts, i.e. the mean number of forms per company per active day.
display(
    FormsNotRepublicBillable
    .groupBy(
        FormsNotRepublic.year,
        FormsNotRepublic.month,
        FormsNotRepublic.day,
        Companies.CompanyId,
    )
    .count()
    .agg({"count": 'avg'})
)

In [10]:
# Expose the billable, non-5788 forms to SQL cells as a temp view.
# NOTE(review): the view is called 'republicForms' although it is built from the
# DataFrame that *excludes* CompanyId 5788 - the name looks misleading; rename it
# together with the SQL cell that queries it if that is confirmed.
FormsNotRepublicBillable.createOrReplaceTempView('republicForms')

In [11]:
%sql

-- Show every column of the republicForms temp view.
SELECT *
FROM republicForms


### Modules per Company

In [13]:
# Load the CompanyModules and Modules sample datasets (Parquet).
CompanyModules = (
    spark.read
    .format('parquet')
    .load("/mnt/bronze/Samples/CompanyModules")
)
Modules = (
    spark.read
    .format('parquet')
    .load("/mnt/bronze/Samples/Modules")
)

In [14]:
# Restrict company modules to billable companies (inner join, the default).
# NOTE(review): `Companies` is not defined in any visible cell - presumably it
# comes from the `%run ./BillableCompanies` notebook; confirm.
CompanyModulesBillable = CompanyModules.join(
    billableCompanies,
    on=CompanyModules.CompanyId == Companies.CompanyId,
)

In [15]:
# Keep only rows with ModuleType == 1.
# NOTE(review): judging by the variable name, type 1 presumably means "form
# module" - confirm.
CompanyFormsModules = CompanyModulesBillable.where(col('ModuleType') == 1)

In [16]:
# Preview the billable company-modules join result.
display(CompanyModulesBillable)

In [17]:
# Among ModuleType == 1 rows that are not flagged Deleted, count modules per
# company and show the average of those per-company counts.
display(
    CompanyFormsModules
    .where(CompanyModules.Deleted == False)
    .groupBy(CompanyModules.CompanyId)
    .count()
    .agg({"count": 'avg'})
)

In [18]:
# Row count per ModuleType. CompanyFormsModules is already filtered to
# ModuleType == 1, so this acts as a sanity check / total for that type.
display(
    CompanyFormsModules
    .groupBy('ModuleType')
    .count()
)


### Custom vs. Pre-Built Modules

In [20]:
# Total number of company-module rows for billable companies.
CompanyModulesBillable.count()

In [21]:
# Split the billable company modules by the IsUserDefined flag.
display(
    CompanyModulesBillable
    .groupBy('IsUserDefined')
    .count()
)

In [22]:
# Spot-check: all module rows (unfiltered CompanyModules) for CompanyId 212.
display(
    CompanyModules.where(CompanyModules.CompanyId == 212)
)

### Modules that are still the base module

In [24]:
# Preview the Modules dataset.
display(Modules)

In [25]:
# Spot-check: billable company-module rows that reference ModuleId 22.
display(
    CompanyModulesBillable.where(col('ModuleId') == 22)
)

In [26]:
# Left-join each billable company module to the Modules row with the same
# ModuleId whose Version equals the company's ActiveVersion. Rows without a
# match keep null Modules columns, which the following cells use as a marker.
CompanyModulesWModules = CompanyModulesBillable.join(
    Modules,
    on=(
        (CompanyModulesBillable.ModuleId == Modules.ModuleId)
        & (CompanyModulesBillable.ActiveVersion == Modules.Version)
    ),
    how="left",
)

In [27]:
# ModuleType == 1 rows with no matching Modules row (null Modules.ModuleId
# after the left join) are treated as customized...
FormModulesCustomized = CompanyModulesWModules.filter(
    (CompanyModulesBillable.ModuleType == 1) & Modules.ModuleId.isNull()
)
# ...and those that did match a Modules row are treated as still default.
FormModulesStillDefault = CompanyModulesWModules.filter(
    (CompanyModulesBillable.ModuleType == 1) & Modules.ModuleId.isNotNull()
)

In [28]:
# Summary counts for ModuleType == 1 rows: overall total, customized rows split
# by the IsUserDefined flag, and rows still matching a Modules entry.
print(
    f'Total Form Modules: '
    f'{CompanyModulesWModules.where(CompanyModulesBillable.ModuleType == 1).count()}'
)
print(
    f'Customized(from scratch): '
    f'{FormModulesCustomized.where(col("IsUserDefined") == True).count()}'
)
print(
    f'Customized(from library): '
    f'{FormModulesCustomized.where(col("IsUserDefined") == False).count()}'
)
print(
    f'Default: '
    f'{FormModulesStillDefault.count()}'
)

In [29]:
# Total number of company-module rows for billable companies (all module types).
print(f'TotalBillableModules: {CompanyModulesBillable.count()}')
# NOTE(review): the line below is disabled; `ModulesStillDefault` is not assigned
# in any visible cell, so re-enabling it as-is would raise NameError.
#print(f'Modules Still Default {ModulesStillDefault.count()}')

In [30]:
# `ModulesStillDefault` was not assigned in any visible cell, so this cell failed
# with NameError on a fresh kernel (Restart & Run All). Define it here as every
# billable company module - all module types - that still matched a Modules row
# in the left join (non-null Modules.ModuleId).
# NOTE(review): the original definition is not visible; if only form modules
# were intended, use `FormModulesStillDefault` instead - confirm.
ModulesStillDefault = CompanyModulesWModules.filter(Modules.ModuleId.isNotNull())

# Split the still-default modules by the IsUserDefined flag.
display(ModulesStillDefault.groupBy('IsUserDefined').count())

### Report Jobs (Continue here)

In [32]:
# Load the ReportJobs sample dataset (Parquet).
ReportJobs = (
    spark.read
    .format('parquet')
    .load("/mnt/bronze/Samples/ReportJobs")
)

In [33]:
# Number of rows in billableCompanies, for comparison with the counts below.
billableCompanies.count()

In [34]:
# Left-join report jobs to billable companies: every report job is kept, and the
# billableCompanies columns are null when the job's company is not billable.
billableCompReportJobs = ReportJobs.join(
    billableCompanies,
    on=ReportJobs.CompanyId == billableCompanies.CompanyId,
    how="left",
)

In [35]:
# Number of report jobs that matched a billable company in the left join.
(
    billableCompReportJobs
    .where(billableCompanies.CompanyId.isNotNull())
    .count()
)

In [36]:
# Total number of report jobs, billable or not.
ReportJobs.count()

In [37]:
# Number of distinct Companies.CompanyId groups among the joined report jobs.
# Because the join is a left join, unmatched jobs form one extra null group.
(
    billableCompReportJobs
    .groupBy(Companies.CompanyId)
    .count()   # rows per company
    .count()   # number of company groups
)

In [38]:
# Number of distinct CompanyId groups across all report jobs.
(
    ReportJobs
    .groupBy(ReportJobs.CompanyId)
    .count()   # rows per company
    .count()   # number of company groups
)

In [39]:
from pyspark.sql.functions import from_json
from pyspark.sql.types import StructType, StructField, StringType, ArrayType, IntegerType

# Schema of the two fields extracted from the JSON in the `Scheduler` column;
# both fields are nullable.
schema = StructType([
    StructField('ScheduleType', StringType(), True),
    StructField('Interval', IntegerType(), True),
])

# Parse `Scheduler` with that schema into a `parsedPayload` struct, then lift
# its two fields to top-level columns next to all the original columns.
shreddedScheduleReportJobs = (
    ReportJobs
    .select('*', from_json('Scheduler', schema).alias('parsedPayload'))
    .withColumn('ScheduleType', col('parsedPayload.ScheduleType'))
    .withColumn('Interval', col('parsedPayload.Interval'))
)
                                  


In [40]:
# Preview the report jobs with the parsed ScheduleType / Interval columns.
display(shreddedScheduleReportJobs)

**TODO:** The report stats are not finished; come back to this section later.

### Timekeeping stats

In [43]:
# Load the timekeeping sample datasets (Parquet).
# NOTE(review): both paths spell "Timekeeing" without the "p" - presumably that
# matches the folder names on the mount; confirm before "fixing" them.
Timekeeping = (
    spark.read
    .format('parquet')
    .load("/mnt/bronze/Samples/Timekeeing1yr")
)
TimekeepingFormData = (
    spark.read
    .format('parquet')
    .load("/mnt/bronze/Samples/TimekeeingFormData1yr")
)

In [44]:
# Overview numbers: distinct CompanyId groups in billableCompanies, distinct
# CompanyId groups in Timekeeping, and the total number of Timekeeping rows.
# `groupBy(...).count().count()` counts the groups, i.e. the distinct key values.
print(f'Companies= {billableCompanies.groupBy("CompanyId").count().count()}')
# Label corrected from the misspelled "TimekeepingComanpanies".
print(f'TimekeepingCompanies= {Timekeeping.groupBy("CompanyId").count().count()}')
print(f'TimekeepingRecords= {Timekeeping.count()}')

In [45]:
# Inner-join timekeeping rows to billable companies, additionally requiring the
# company's InstanceId to be 50.
# NOTE(review): `Companies` is not defined in any visible cell - presumably it
# comes from the `%run ./BillableCompanies` notebook; the meaning of instance 50
# is not shown here either - confirm.
BillableTimekeeping = Timekeeping.join(
    billableCompanies,
    on=(
        (Timekeeping.CompanyId == Companies.CompanyId)
        & (Companies.InstanceId == 50)
    ),
    how="inner",
)

In [46]:
# Spot-check: raw Timekeeping rows for TimekeepingStatusId 838961463298.
display(
    Timekeeping.where(Timekeeping.TimekeepingStatusId == 838961463298)
)

In [47]:
# Same spot-check against the billable join result, to see whether the row for
# TimekeepingStatusId 838961463298 survives the join.
display(
    BillableTimekeeping.where(Timekeeping.TimekeepingStatusId == 838961463298)
)

In [48]:
# Number of timekeeping rows that belong to billable companies (InstanceId 50).
BillableTimekeeping.count()

In [49]:
# Number of billable timekeeping rows flagged IsRoot.
(
    BillableTimekeeping
    .where(col('IsRoot') == True)
    .count()
)

In [50]:
# Inner-join (default join type) timekeeping rows to their form data on both
# CompanyId and TimekeepingStatusId.
# NOTE(review): this starts from the unfiltered Timekeeping frame rather than
# BillableTimekeeping - confirm that is intended.
TimekeepingWData = Timekeeping.join(
    TimekeepingFormData,
    on=(
        (Timekeeping.CompanyId == TimekeepingFormData.CompanyId)
        & (Timekeeping.TimekeepingStatusId == TimekeepingFormData.TimekeepingStatusId)
    ),
)

In [51]:
# Number of rows produced by the Timekeeping x TimekeepingFormData join.
TimekeepingWData.count()

In [52]:
# Number of distinct companies that have timekeeping rows with form data.
(
    TimekeepingWData
    .groupBy(Timekeeping.CompanyId)
    .count()   # rows per company
    .count()   # number of company groups
)

In [53]:
# First collapse to one row per (CompanyId, TimekeepingStatusId) pair, then
# count those pairs per company and show the average across companies.
display(
    TimekeepingWData
    .groupBy(Timekeeping.CompanyId, Timekeeping.TimekeepingStatusId)
    .count()
    .groupBy('companyId')
    .count()
    .agg({'count': 'avg'})
)