In [5]:
import os
import requests
from pyspark.sql import SparkSession
from tmlt.analytics.keyset import KeySet
from tmlt.analytics.privacy_budget import PureDPBudget
from tmlt.analytics.query_builder import QueryBuilder
from tmlt.analytics.session import Session

# r = requests.get(
#     'https://tumult-public.s3.amazonaws.com/library-members.csv',
# )
# with open("members.csv", "w") as f:
#     f.write(r.text)
spark = SparkSession.builder.getOrCreate()
members_df = spark.read.csv("rie_survey2.csv", header=True, inferSchema=True)

session = Session.from_dataframe(
    privacy_budget=PureDPBudget(epsilon=float('inf')),
    source_id="members",
    dataframe=members_df,
)

In [6]:
itms_list = members_df.select('ITMs').collect()

In [7]:
itms_array = [str(row.ITMs) for row in itms_list]

In [8]:
itms_array

['Precision Engineering',
 'Marine and Offshore',
 'Aerospace',
 'Energy and Chemicals',
 'Food Manufacturing',
 'Wholesale Trade',
 'Logistics',
 'Sea Transport',
 'Construction',
 'Professional Services',
 'Information and Communications',
 'Financial Services',
 'Retail',
 'Non-ITM (Biomed Mfg)']

In [9]:
members_df.head(10)

[Row(ITMs='Precision Engineering', BERD=528.4),
 Row(ITMs='Marine and Offshore', BERD=34.2),
 Row(ITMs='Aerospace', BERD=81.8),
 Row(ITMs='Energy and Chemicals', BERD=300.8),
 Row(ITMs='Food Manufacturing', BERD=86.6),
 Row(ITMs='Wholesale Trade', BERD=940.2),
 Row(ITMs='Logistics', BERD=179.9),
 Row(ITMs='Sea Transport', BERD=180.9),
 Row(ITMs='Construction', BERD=14.7),
 Row(ITMs='Professional Services', BERD=295.0)]

In [10]:
itm_sectors = KeySet.from_dict({
    "ITMs": itms_array
})

In [11]:
edu_average_age_query = (
    QueryBuilder("members")
    .groupby(itm_sectors)
    .max("BERD", low=0, high=200)
)
edu_average_ages = session.evaluate(
    edu_average_age_query,
    privacy_budget=PureDPBudget(5),
)
edu_average_ages.sort("BERD_max").show(truncate=False)

22/06/13 23:54:22 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 96.54% for 7 writers
22/06/13 23:54:22 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 84.47% for 8 writers
22/06/13 23:54:22 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 75.08% for 9 writers
22/06/13 23:54:22 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 67.58% for 10 writers
22/06/13 23:54:22 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 61.43% for 11 writers
22/06/13 23:54:22 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 56.31% for 12 writers
22/06/13 23:54:22 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,

+------------------------------+------------------+
|ITMs                          |BERD_max          |
+------------------------------+------------------+
|Construction                  |14.758246114348793|
|Information and Communications|22.936700118585094|
|Professional Services         |57.15592762243958 |
|Sea Transport                 |59.88919052057159 |
|Precision Engineering         |63.68114641164536 |
|Retail                        |84.50334556798033 |
|Energy and Chemicals          |86.34806053778527 |
|Logistics                     |91.44488583616092 |
|Non-ITM (Biomed Mfg)          |146.62215472792917|
|Wholesale Trade               |173.29664214337276|
|Marine and Offshore           |173.8420811891271 |
|Aerospace                     |176.43483146668234|
|Food Manufacturing            |196.45775641161768|
|Financial Services            |198.51366620334719|
+------------------------------+------------------+

