In [2]:
from calitp import get_engine
import pandas as pd
import pyarrow

In [3]:
# Build the engine via calitp's `get_engine` helper and open one connection
# from it. `connection` is the handle every `pd.read_sql_query` call in this
# notebook passes as its second argument.
# NOTE(review): the connection is never explicitly closed in this notebook.
engine = get_engine()
connection = engine.connect()

## Duplicates in the `micropayment_device_transactions` table

We expect the number of duplicate `littlepay_transaction_id`, `micropayment_id` pairs to be `0`.

In [3]:
# Summarise the (littlepay_transaction_id, micropayment_id) pairs in the
# micropayment_device_transactions table:
#   * Total_Distinct_ID_Pairs - number of distinct pairs
#   * Duplicate_ID_Pairs      - number of pairs that occur on more than one row
# This is a plain string rather than an f-string: the query contains no Python
# placeholders, so the `f` prefix did nothing.
duplicate_micropayment_transaction_pair_ids_sql = """
with 

distinct_id_pairs as (
    select littlepay_transaction_id, micropayment_id, count(*) as cnt
    from `cal-itp-data-infra.payments.micropayment_device_transactions` 
    group by 1, 2
),

duplicate_id_pairs as (
    select littlepay_transaction_id, micropayment_id, cnt
    from distinct_id_pairs
    where cnt > 1
)

select
    (select count(*) from distinct_id_pairs) as `Total_Distinct_ID_Pairs`,
    (select count(*) from duplicate_id_pairs) as `Duplicate_ID_Pairs`
"""

pd.read_sql_query(duplicate_micropayment_transaction_pair_ids_sql, connection)

Unnamed: 0,Total_Distinct_ID_Pairs,Duplicate_ID_Pairs
0,22402,1


The duplicate data that we are seeing is:

In [4]:
# Show up to 10 of the (littlepay_transaction_id, micropayment_id) pairs that
# appear on more than one row of the join table.
pd.read_sql_query(
    sql="""
    select littlepay_transaction_id, micropayment_id
    from `cal-itp-data-infra.payments.micropayment_device_transactions` 
    group by 1, 2
    having count(*) > 1
    limit 10
""",
    con=connection,
)

Unnamed: 0,littlepay_transaction_id,micropayment_id
0,75358478-c900-4af0-816c-04df7bc1f2be,eba917b8-8a80-4886-b5a9-a27884712651


## Duplicate `littlepay_transaction_id` values in `device_transactions`

We expect the duplicate transaction ID counts for each agency to be `0`.

In [5]:
# Per-agency summary of littlepay_transaction_id values in device_transactions:
#   * Total_Unique_IDs - distinct transaction IDs for the agency
#   * Duplicate_IDs    - transaction IDs that occur on more than one row
# `%(agency)s` is a named bind parameter supplied through `params=` at call
# time. This is a plain string rather than an f-string, because there is
# nothing for Python to interpolate.
duplicate_transaction_ids_sql = """
with 

all_ids as (
    select littlepay_transaction_id
    from `cal-itp-data-infra.payments.device_transactions`
    where participant_id = %(agency)s
),

duplicate_ids as (
    select littlepay_transaction_id
    from all_ids 
    group by 1
    having count(*) > 1
)

select
    %(agency)s as `Agency`,
    (select count(distinct littlepay_transaction_id) from all_ids) as `Total_Unique_IDs`,
    (select count(*) from duplicate_ids) as `Duplicate_IDs`
"""

pd.read_sql_query(duplicate_transaction_ids_sql, connection, params={'agency': 'mst'})

Unnamed: 0,Agency,Total_Unique_IDs,Duplicate_IDs
0,mst,19582,1352


In [6]:
# Run the duplicate transaction-ID summary with the agency bound to 'sbmtd'.
pd.read_sql_query(
    sql=duplicate_transaction_ids_sql,
    con=connection,
    params={'agency': 'sbmtd'},
)

Unnamed: 0,Agency,Total_Unique_IDs,Duplicate_IDs
0,sbmtd,527,0


For MST, many of the duplicates have at least one entry where the route is `'Route Z'`.

In [7]:
# For every littlepay_transaction_id that occurs on more than one row for the
# agency, collect its route_id values and the number of distinct route_ids,
# then report:
#   * Total_Duplicate_IDs             - all duplicated transaction IDs
#   * Duplicate_IDs_With_Same_Route   - duplicated IDs with one distinct route_id
#   * Duplicate_IDs_With_Route_Z      - duplicated IDs where 'Route Z' is among the routes
#   * Duplicate_IDs_With_Only_Route_Z - one distinct route_id, and it is 'Route Z'
# This is a plain string rather than an f-string: `%(agency)s` is a bind
# parameter resolved from `params=`, not a Python placeholder.
count_duplicate_transaction_records_sql = """
with 

duplicate_ids as (
    select littlepay_transaction_id, array_agg(route_id) as route_ids, count(distinct route_id) cnt_route_ids
    from `cal-itp-data-infra.payments.device_transactions` 
    where participant_id = %(agency)s
    group by 1
    having count(*) > 1
)

select
    %(agency)s as Agency,
    (select count(*) from duplicate_ids) as `Total_Duplicate_IDs`,
    (select count(*) from duplicate_ids where cnt_route_ids = 1) as `Duplicate_IDs_With_Same_Route`,
    (select count(*) from duplicate_ids where 'Route Z' in unnest(route_ids)) as `Duplicate_IDs_With_Route_Z`,
    (select count(*) from duplicate_ids where cnt_route_ids = 1 and 'Route Z' in unnest(route_ids)) as `Duplicate_IDs_With_Only_Route_Z`
"""

pd.read_sql_query(count_duplicate_transaction_records_sql, connection, params={'agency': 'mst'})

Unnamed: 0,Agency,Total_Duplicate_IDs,Duplicate_IDs_With_Same_Route,Duplicate_IDs_With_Route_Z,Duplicate_IDs_With_Only_Route_Z
0,mst,1352,75,1330,53


In [8]:
# Pull a 10-row sample of the agency's device_transactions rows whose
# littlepay_transaction_id occurs more than once, ordered by that ID so the
# rows sharing an ID sit next to each other.
# This is a plain string rather than an f-string: `%(agency)s` is a bind
# parameter resolved from `params=`, not a Python placeholder.
sample_duplicate_transaction_records_sql = """
with 

duplicate_ids as (
    select littlepay_transaction_id
    from `cal-itp-data-infra.payments.device_transactions` 
    where participant_id = %(agency)s
    group by 1
    having count(*) > 1
)

select distinct *
from `cal-itp-data-infra.payments.device_transactions`
where participant_id = %(agency)s
    and littlepay_transaction_id in (select * from duplicate_ids)
order by littlepay_transaction_id
limit 10
"""

pd.read_sql_query(sample_duplicate_transaction_records_sql, connection, params={'agency': 'mst'})

Unnamed: 0,participant_id,customer_id,device_transaction_id,littlepay_transaction_id,device_id,device_id_issuer,type,transaction_outcome,transction_deny_reason,transaction_date_time_utc,...,zone_id,route_id,mode,direction,latitude,longitude,vehicle_id,granted_zone_ids,onward_zone_ids,calitp_extracted_at
0,mst,2e8bfdc1-ce64-4630-a6ff-3035b8e6581f,36481BC5-7633-A21C-3EB6-484093BD91D3,0004dabe-abde-4539-8c60-ba49ff7b8c25,17F1267C,Littlepay,single,allow,,2021-07-22T22:21:48.000Z,...,,016,BUS,inbound,36.65778350830078,-121.7696304321289,977,,,2021-09-27
1,mst,2e8bfdc1-ce64-4630-a6ff-3035b8e6581f,36481BC5-7633-A21C-3EB6-484093BD91D3,0004dabe-abde-4539-8c60-ba49ff7b8c25,17F1267C,Littlepay,single,allow,,2021-07-22T22:21:48.000Z,...,,Route Z,BUS,outbound,36.65778350830078,-121.7696304321289,977,,,2021-09-27
2,mst,e600de16-582c-4795-a433-7780adb4985d,C7741986-B015-55C4-B448-8DD5085D8499,000df49a-aff8-422a-abbe-7c6b97123b6c,17F12523,Littlepay,single,allow,,2021-08-30T23:26:35.000Z,...,,Route Z,BUS,outbound,36.41764831542969,-121.32093048095705,2103,,,2021-09-27
3,mst,e600de16-582c-4795-a433-7780adb4985d,C7741986-B015-55C4-B448-8DD5085D8499,000df49a-aff8-422a-abbe-7c6b97123b6c,17F12523,Littlepay,single,allow,,2021-08-30T23:26:35.000Z,...,,023,BUS,outbound,36.41764831542969,-121.32093048095705,2103,,,2021-09-27
4,mst,20962f6b-b7f1-4eac-9c94-71af37d000f8,14B37ADA-8D27-8181-B93B-7BD833A47676,0057ac79-a7f0-4314-93d2-9e53be215666,17F125BE,Littlepay,single,allow,,2021-08-16T23:39:15.000Z,...,,044,BUS,inbound,36.71390914916992,-121.65453338623048,2008,,,2021-09-27
5,mst,20962f6b-b7f1-4eac-9c94-71af37d000f8,14B37ADA-8D27-8181-B93B-7BD833A47676,0057ac79-a7f0-4314-93d2-9e53be215666,17F125BE,Littlepay,single,allow,,2021-08-16T23:39:15.000Z,...,,Route Z,BUS,outbound,36.71390914916992,-121.65453338623048,2008,,,2021-09-27
6,mst,eac3857e-ec07-4d86-9c7d-4bbc81305c24,78BA0E4E-7B1B-7099-89A5-AA141640E0AE,007ee236-05ee-49eb-a42b-90f5b284bf95,17F12881,Littlepay,single,allow,,2021-08-02T22:57:08.000Z,...,,018,BUS,inbound,36.59715270996094,-121.85679626464844,1724,,,2021-09-27
7,mst,eac3857e-ec07-4d86-9c7d-4bbc81305c24,78BA0E4E-7B1B-7099-89A5-AA141640E0AE,007ee236-05ee-49eb-a42b-90f5b284bf95,17F12881,Littlepay,single,allow,,2021-08-02T22:57:08.000Z,...,,Route Z,BUS,outbound,36.59715270996094,-121.85679626464844,1724,,,2021-09-27
8,mst,ab8bd3d6-6e5b-4026-a536-baf9fa19b7ef,00CB3E8B-1C0D-458D-0610-3235AD1A5DD1,0086e6ce-9e7d-43b6-b7f7-dee23c407102,17F1255D,Littlepay,single,allow,,2021-09-22T02:21:19.000Z,...,,049,BUS,inbound,36.676063537597656,-121.65709686279295,1721,,,2021-09-27
9,mst,ab8bd3d6-6e5b-4026-a536-baf9fa19b7ef,00CB3E8B-1C0D-458D-0610-3235AD1A5DD1,0086e6ce-9e7d-43b6-b7f7-dee23c407102,17F1255D,Littlepay,single,allow,,2021-09-22T02:21:19.000Z,...,,Route Z,BUS,outbound,36.676063537597656,-121.65709686279295,1721,,,2021-09-27


What is `Route Z`?

### Duplicate transactions across file partitions

We are loading these transactions from date-partitioned CSV files, and it appears that there aren't any duplicates within each partition -- only across partitions. Are there corrections being made to transactions that might appear in a later dated partition?

In [18]:
# Count littlepay_transaction_id values that repeat *within* a single source
# file: rows are grouped by (_FILE_NAME, littlepay_transaction_id), so an ID
# only counts as a duplicate here if the same file contains it more than once.
# This is a plain string rather than an f-string: `%(agency)s` is a bind
# parameter resolved from `params=`, not a Python placeholder.
duplicate_transaction_ids__within_partitions_sql = """
with 

all_ids as (
    select _FILE_NAME as fn, littlepay_transaction_id
    from `cal-itp-data-infra.payments.device_transactions`
    where participant_id = %(agency)s
),

duplicate_ids as (
    select fn, littlepay_transaction_id
    from all_ids 
    group by 1, 2
    having count(*) > 1
)

select
    %(agency)s as `Agency`,
    (select count(distinct littlepay_transaction_id) from all_ids) as `Total_Unique_IDs`,
    (select count(*) from duplicate_ids) as `Duplicate_IDs_within_partitions`
"""

pd.read_sql_query(duplicate_transaction_ids__within_partitions_sql, connection, params={'agency': 'mst'})

Unnamed: 0,Agency,Total_Unique_IDs,Duplicate_IDs_within_partitions
0,mst,19582,0


## Duplicate `micropayment_id`

We expect the number of duplicate micropayment IDs from each agency to be `0`.

In [9]:
# Per-agency summary of micropayment_id values in the micropayments table:
#   * Total_IDs     - distinct micropayment IDs for the agency
#   * Duplicate_IDs - micropayment IDs that occur on more than one row
# This is a plain string rather than an f-string: `%(agency)s` is a bind
# parameter resolved from `params=`, not a Python placeholder.
duplicate_micropayment_ids_sql = """
with 

all_ids as (
    select micropayment_id
    from `cal-itp-data-infra.payments.micropayments`
    where participant_id = %(agency)s
),

duplicate_ids as (
    select micropayment_id
    from all_ids 
    group by 1
    having count(*) > 1
)

select
    %(agency)s as `Agency`,
    (select count(distinct micropayment_id) from all_ids) as `Total_IDs`,
    (select count(*) from duplicate_ids) as `Duplicate_IDs`
"""

pd.read_sql_query(duplicate_micropayment_ids_sql, connection, params={'agency': 'mst'})

Unnamed: 0,Agency,Total_IDs,Duplicate_IDs
0,mst,13807,8112


In [10]:
# Run the duplicate micropayment-ID summary with the agency bound to 'sbmtd'.
pd.read_sql_query(
    sql=duplicate_micropayment_ids_sql,
    con=connection,
    params={'agency': 'sbmtd'},
)

Unnamed: 0,Agency,Total_IDs,Duplicate_IDs
0,sbmtd,527,0


For MST, many of the duplicates have different transaction times. 

In [11]:
# Pull a 10-row sample of the agency's micropayments rows whose micropayment_id
# occurs more than once, ordered by that ID so the rows sharing an ID sit next
# to each other. (Despite the `count_` prefix in the variable name, this query
# returns sample rows, not counts.)
#
# The outer select filters on participant_id as well as the CTE, matching the
# device_transactions sample query; without it, rows from any other agency
# that happened to share a micropayment_id would be included in the sample.
# This is a plain string rather than an f-string: `%(agency)s` is a bind
# parameter resolved from `params=`, not a Python placeholder.
count_duplicate_micropayment_records_sql = """
with 

duplicate_ids as (
    select micropayment_id
    from `cal-itp-data-infra.payments.micropayments` 
    where participant_id = %(agency)s
    group by 1
    having count(*) > 1
)

select *
from `cal-itp-data-infra.payments.micropayments`
where participant_id = %(agency)s
    and micropayment_id in (select * from duplicate_ids)
order by micropayment_id
limit 10
"""

pd.read_sql_query(count_duplicate_micropayment_records_sql, connection, params={'agency': 'mst'})

Unnamed: 0,micropayment_id,aggregation_id,participant_id,customer_id,funding_source_vault_id,transaction_time,payment_liability,charge_amount,nominal_amount,currency_code,type,charge_type,calitp_extracted_at
0,000317e7-220f-4c09-817c-b32a7b308812,08a13a93-f9fc-4d8f-bfc0-e57b7e6fab01,mst,3aec6af1-208d-4316-a254-98571fc79724,246574a7-8ac3-4a29-a1dc-7681fdc8dc78,2021-07-17 22:36:18+00:00,OPERATOR,2.5,2.5,840,DEBIT,complete_variable_fare,2021-09-27
1,000317e7-220f-4c09-817c-b32a7b308812,08a13a93-f9fc-4d8f-bfc0-e57b7e6fab01,mst,3aec6af1-208d-4316-a254-98571fc79724,246574a7-8ac3-4a29-a1dc-7681fdc8dc78,2021-07-17 22:29:19+00:00,OPERATOR,2.5,2.5,840,DEBIT,complete_variable_fare,2021-09-27
2,001393af-c1cf-4ff4-8edc-8ac34bb81cb9,e342a964-aed2-4131-80f1-3476cee01c59,mst,13d2f3e7-ca12-43f0-81c9-265efad6f84c,0eefbba9-5498-440e-b5f8-18fc86fbfe54,2021-07-31 03:23:24+00:00,OPERATOR,2.5,2.5,840,DEBIT,complete_variable_fare,2021-09-27
3,001393af-c1cf-4ff4-8edc-8ac34bb81cb9,e342a964-aed2-4131-80f1-3476cee01c59,mst,13d2f3e7-ca12-43f0-81c9-265efad6f84c,0eefbba9-5498-440e-b5f8-18fc86fbfe54,2021-07-31 03:47:16+00:00,OPERATOR,2.5,2.5,840,DEBIT,complete_variable_fare,2021-09-27
4,0014011f-f7b4-4bfa-b3b5-157b61b63611,ec04f603-30ac-473f-b355-36c25dcc9085,mst,3137815e-3376-4a77-98a4-56e8381a1cd2,a2be3cb3-23ac-4b12-b9f0-2bcf03226ecc,2021-08-09 13:48:33+00:00,OPERATOR,1.5,1.5,840,DEBIT,complete_variable_fare,2021-09-27
5,0014011f-f7b4-4bfa-b3b5-157b61b63611,ec04f603-30ac-473f-b355-36c25dcc9085,mst,3137815e-3376-4a77-98a4-56e8381a1cd2,a2be3cb3-23ac-4b12-b9f0-2bcf03226ecc,2021-08-09 13:58:46+00:00,OPERATOR,1.5,1.5,840,DEBIT,complete_variable_fare,2021-09-27
6,00257883-a117-47d6-ab85-482c11aab3e0,bae840f5-9ba8-4492-9941-0e615a2783ce,mst,ec9f103c-fe7e-46d9-b242-0d04002e280e,e9c39bc7-9786-40e4-8c55-ddcdf4c72ab7,2021-08-16 19:34:28+00:00,OPERATOR,3.5,3.5,840,DEBIT,complete_variable_fare,2021-09-27
7,00257883-a117-47d6-ab85-482c11aab3e0,bae840f5-9ba8-4492-9941-0e615a2783ce,mst,ec9f103c-fe7e-46d9-b242-0d04002e280e,e9c39bc7-9786-40e4-8c55-ddcdf4c72ab7,2021-08-16 18:28:29+00:00,OPERATOR,3.5,3.5,840,DEBIT,complete_variable_fare,2021-09-27
8,0026c9ef-22ba-412e-9fec-412db4cb5e4b,a660f8aa-4390-4692-9af0-72af4256a371,mst,d0fdfde6-3106-40eb-9677-2a22e7000c90,a803a01c-270e-4289-bedd-e43aeb6c0263,2021-08-30 15:01:17+00:00,OPERATOR,2.5,2.5,840,DEBIT,complete_variable_fare,2021-09-27
9,0026c9ef-22ba-412e-9fec-412db4cb5e4b,a660f8aa-4390-4692-9af0-72af4256a371,mst,d0fdfde6-3106-40eb-9677-2a22e7000c90,a803a01c-270e-4289-bedd-e43aeb6c0263,2021-08-30 15:40:30+00:00,OPERATOR,2.5,2.5,840,DEBIT,complete_variable_fare,2021-09-27


Unlike the transactions, the majority of duplicates for micropayments happen within partitions (as opposed to between partitions).

In [43]:
# Compare micropayment_id duplication overall against duplication within a
# single source file:
#   * Total_Unique_IDs                - distinct micropayment IDs for the agency
#   * Count_IDs_with_Duplicates       - IDs occurring on more than one row anywhere
#   * Duplicate_IDs_within_partitions - IDs occurring more than once in the same
#                                       _FILE_NAME
# The query gets its own variable name so it does not rebind the
# device_transactions query `duplicate_transaction_ids__within_partitions_sql`
# to a query over a different table. This is a plain string rather than an
# f-string, because there are no Python placeholders.
duplicate_micropayment_ids__within_partitions_sql = """
with 

all_ids as (
    select _FILE_NAME as fn, micropayment_id
    from `cal-itp-data-infra.payments.micropayments`
    where participant_id = %(agency)s
),

duplicate_ids as (
    select micropayment_id
    from all_ids 
    group by 1
    having count(*) > 1
),

duplicate_ids_by_partition as (
    select fn, micropayment_id
    from all_ids 
    group by 1, 2
    having count(*) > 1
)

select
    %(agency)s as `Agency`,
    (select count(distinct micropayment_id) from all_ids) as `Total_Unique_IDs`,
    (select count(distinct micropayment_id) from duplicate_ids) as `Count_IDs_with_Duplicates`,
    (select count(distinct micropayment_id) from duplicate_ids_by_partition) as `Duplicate_IDs_within_partitions`
"""

pd.read_sql_query(duplicate_micropayment_ids__within_partitions_sql, connection, params={'agency': 'mst'})

Unnamed: 0,Agency,Total_Unique_IDs,Count_IDs_with_Duplicates,Duplicate_IDs_within_partitions
0,mst,16588,10272,10220


## Inconsistent `customer_id` values

When we join `micropayments` to `device_transactions` through `micropayment_device_transactions` we end up with a lot of mismatching `customer_id` values. This is even after we control for duplicate records:

In [12]:
# For the given agency, count how many littlepay_transaction_id values (in
# device_transactions) and how many micropayment_id values (in micropayments)
# are associated with more than one distinct customer_id.
# This is a plain string rather than an f-string: `%(agency)s` is a bind
# parameter resolved from `params=`, not a Python placeholder.
unique_id_counts_sql = """
with

transaction_customer_counts as (
    select littlepay_transaction_id, count(distinct customer_id) as distinct_customer_cnt
    from payments.device_transactions
    where participant_id = %(agency)s
    group by 1
),

micropayment_customer_counts as (
    select micropayment_id, count(distinct customer_id) as distinct_customer_cnt
    from payments.micropayments
    where participant_id = %(agency)s
    group by 1
)

select
    %(agency)s as `Agency`,
    (select count(*) from transaction_customer_counts where distinct_customer_cnt > 1) as `Transaction_IDs_with_different_Customer_IDs`,
    (select count(*) from micropayment_customer_counts where distinct_customer_cnt > 1) as `Micropayment_IDs_with_different_Customer_IDs`
"""

pd.read_sql_query(unique_id_counts_sql, connection, params={'agency': 'mst'})

Unnamed: 0,Agency,Transaction_IDs_with_different_Customer_IDs,Micropayment_IDs_with_different_Customer_IDs
0,mst,0,0


Given that the same `littlepay_transaction_id` always has the same `customer_id` (even for duplicate `littlepay_transaction_id` records), and the same `micropayment_id` always has the same `customer_id` (even for duplicate `micropayment_id` records), we make the assumptions below that:
* If we select distinct `littlepay_transaction_id`/`customer_id` pairs, and
* We select distinct `micropayment_id`/`customer_id` pairs, and
* We join these through the distinct `micropayment_device_transactions` records, then
* We should have matching `customer_id` values.

However, this isn't the case (at least for MST):

In [13]:
# CTE prefix that other queries are appended to. `combined` joins the distinct
# (micropayment_id, customer_id) pairs to the distinct
# (littlepay_transaction_id, customer_id) pairs through the distinct rows of
# micropayment_device_transactions, exposing both customer IDs side by side.
inconsistent_customer_ctes = """
with

micropayments as (
    select distinct micropayment_id, customer_id as micropayment_customer_id
    from payments.micropayments
    where participant_id = %(agency)s
),

transactions as (
    select distinct littlepay_transaction_id, customer_id as transaction_customer_id
    from payments.device_transactions
    where participant_id = %(agency)s
),

through as (
    select distinct littlepay_transaction_id, micropayment_id
    from payments.micropayment_device_transactions
),

combined as (
    select *
    from micropayments
    join through using (micropayment_id)
    join transactions using (littlepay_transaction_id)
)

"""

# Count joined rows where the two customer IDs agree vs. disagree, for 'mst'.
pd.read_sql_query(
    sql=inconsistent_customer_ctes + """
    select
        %(agency)s as `Agency`,
        (select count(*) from combined where micropayment_customer_id = transaction_customer_id) as `Matching_Customer_ID_count`,
        (select count(*) from combined where micropayment_customer_id != transaction_customer_id) as `Mismatch_Customer_ID_count`
""",
    con=connection,
    params={'agency': 'mst'},
)

Unnamed: 0,Agency,Matching_Customer_ID_count,Mismatch_Customer_ID_count
0,mst,21706,144


In [14]:
# Count joined rows where the micropayment and transaction customer IDs agree
# vs. disagree, with the agency bound to 'sbmtd'.
pd.read_sql_query(
    sql=inconsistent_customer_ctes + """
    select
        %(agency)s as `Agency`,
        (select count(*) from combined where micropayment_customer_id = transaction_customer_id) as `Matching_Customer_ID_count`,
        (select count(*) from combined where micropayment_customer_id != transaction_customer_id) as `Mismatch_Customer_ID_count`
""",
    con=connection,
    params={'agency': 'sbmtd'},
)

Unnamed: 0,Agency,Matching_Customer_ID_count,Mismatch_Customer_ID_count
0,sbmtd,527,0


In [15]:
# List every joined row for 'mst' where the micropayment's customer_id differs
# from the transaction's customer_id, ordered by micropayment_id.
pd.read_sql_query(
    sql=inconsistent_customer_ctes + """
    select *
    from combined
    where micropayment_customer_id != transaction_customer_id
    order by micropayment_id
""",
    con=connection,
    params={'agency': 'mst'},
)

Unnamed: 0,littlepay_transaction_id,micropayment_id,micropayment_customer_id,transaction_customer_id
0,4fcf8757-c0d5-4df2-a0b2-b07e0bdb0037,004ca44f-4b35-446f-8d85-515bc938b167,4c8200ca-5e45-47b8-a13d-6b9b48607c29,2b800774-633e-4e87-9ac9-50de498abce3
1,edb76395-6b0b-4e7a-b35b-8f1ed59fe66e,05089d34-1e57-498e-b64a-e0f9a5f108c0,e074383d-779e-4378-ac61-7d9ef6422d91,59bcaa6c-2329-499f-91d7-67fc5c037712
2,4918c915-f30d-48a7-a52a-d022526c85be,051de55a-c69f-4314-975a-0143054183d5,d2978f9b-b7d0-4402-be25-55bf6c6bce13,87919b31-80db-467f-a690-e703253e366d
3,91a4e433-47c8-4a63-b34c-bb82ae597b32,05cff6bd-d616-429b-848f-11d9808e05f2,6b26b584-0133-40ea-9913-3386ca2dc608,b98a8401-a00f-4c0f-9ef5-4b884e46ba13
4,186cad2c-78e7-41e8-8691-257779a2de74,066fbc09-c2a5-4c48-be0d-6f5d2b172b2d,555e0c45-af79-4f40-83e8-09a958d167a0,d2a3c751-1beb-4995-a01a-28609c4c6f73
...,...,...,...,...
139,ea465579-61d6-48d1-a9fb-8d8504054d14,f7b4bf0b-05c6-4b85-a854-e4c699773363,f63d30ca-56d9-49ae-a72f-8478ff0250d4,af70f71d-dfdf-4fe6-b7f7-31d125d9e2f0
140,ec4b7f25-da2b-4b49-9971-4db4fb0c3bba,f953bb7e-05a3-4a4a-bc0e-6629fc0c2416,3f1bbc14-3786-4485-90b2-6e2ea685c79f,7f1ae877-00b7-460f-979c-b308468a75d7
141,1ca2935c-5d10-4a7e-b9f2-1ccb0f5f389b,f9cff78b-7478-4874-a387-17a2cc22cc32,54941ea1-46b3-4124-868a-83f37a2c6fa5,d7ac4c3e-84b0-469f-a491-0dfa2122f715
142,f370c84d-ddd3-4e7b-ab81-f145339ef9e7,fcd75fd3-95cb-4c58-8ac9-97a10c040c0b,a5f9e75e-ba8e-4e20-82fa-172fbf4eebaf,937a5a5c-c021-4313-a10b-689b59fa87ba


## Types of trips

At least for MST, we expect there to be trips that aren't just coded as `single`, since MST [charges by the mile](https://mst.org/fares/route-types-fares/#:~:text=Contactless%20Fares:%20How%20Calculated%20and%20Charged) when using contactless payment.

In [16]:
# List the distinct values of `type` on device_transactions rows for 'mst'.
pd.read_sql_query(
    sql="""
    select distinct type as `Distinct_Trip_Types_for_MST`
    from payments.device_transactions
    where participant_id = 'mst'
""",
    con=connection,
)

Unnamed: 0,Distinct_Trip_Types_for_MST
0,single


# Investigating changes across duplicate IDs

It may be reasonable for certain fields to change across records that share a duplicated transaction ID, but we should check whether the fields we see changing fall into that reasonable set. Based on the below, it looks like the `route_id`, `location_id`, `location_name`, and `direction` fields are the only ones that are inconsistent across instances of the same transaction ID.

For micropayments on the other hand, the only field that seems to change across duplicate `micropayment_id` values is the `transaction_time`. As was mentioned above though, these changes happen within the same partition files.

In [15]:
# Candidate device_transactions columns, and the subset that is left out of
# the distinct comparison below.
device_transaction_columns = [
    'participant_id',
    'customer_id',
    'device_transaction_id',
    'littlepay_transaction_id',
    'device_id',
    'device_id_issuer',
    'type',
    'transaction_outcome',
    'transction_deny_reason',
    'transaction_date_time_utc',
    'location_id',
    'location_scheme',
    'location_name',
    'zone_id',
    'route_id',
    'mode',
    'direction',
    'latitude',
    'longitude',
    'vehicle_id',
    'granted_zone_ids',
    'onward_zone_ids',
    'calitp_extracted_at',
]
device_transaction_excluded = {'location_id', 'location_name', 'route_id', 'direction'}

# Keep the original column order, dropping only the excluded fields.
consistent_fields = [
    column
    for column in device_transaction_columns
    if column not in device_transaction_excluded
]
device_transaction_select_list = ', '.join(consistent_fields)

# De-duplicate MST transactions on the retained columns only, then look for
# transaction IDs that still have more than one distinct row.
sql = f"""
    with
    
    deduped_transactions as (
        select distinct {device_transaction_select_list}
        from payments.device_transactions
        where participant_id = 'mst'
    ),
    
    dup_ids as (
        select littlepay_transaction_id
        from deduped_transactions
        group by littlepay_transaction_id
        having count(*) > 1
    )
    
    select distinct *
    from deduped_transactions
    join dup_ids using (littlepay_transaction_id)
    order by littlepay_transaction_id
    limit 2
"""

pd.read_sql_query(sql=sql, con=connection)

Unnamed: 0,littlepay_transaction_id,participant_id,customer_id,device_transaction_id,device_id,device_id_issuer,type,transaction_outcome,transction_deny_reason,transaction_date_time_utc,location_scheme,zone_id,mode,latitude,longitude,vehicle_id,granted_zone_ids,onward_zone_ids,calitp_extracted_at


In [38]:
# Candidate micropayments columns, and the subset that is left out of the
# distinct comparison below.
micropayment_columns = [
    'micropayment_id',
    'aggregation_id',
    'participant_id',
    'customer_id',
    'funding_source_vault_id',
    'transaction_time',
    'payment_liability',
    'charge_amount',
    'nominal_amount',
    'currency_code',
    'type',
    'charge_type',
    'calitp_extracted_at',
]
micropayment_excluded = {'transaction_time'}

# Keep the original column order, dropping only the excluded fields.
consistent_fields = [
    column
    for column in micropayment_columns
    if column not in micropayment_excluded
]
micropayment_select_list = ', '.join(consistent_fields)

# De-duplicate MST micropayments on the retained columns only, then look for
# micropayment IDs that still have more than one distinct row.
sql = f"""
    with
    
    deduped_micropayments as (
        select distinct {micropayment_select_list}
        from payments.micropayments
        where participant_id = 'mst'
    ),
    
    dup_ids as (
        select micropayment_id
        from deduped_micropayments
        group by micropayment_id
        having count(*) > 1
    )
    
    select distinct *
    from deduped_micropayments
    join dup_ids using (micropayment_id)
    order by micropayment_id
    limit 2
"""

pd.read_sql_query(sql=sql, con=connection)

Unnamed: 0,micropayment_id,aggregation_id,participant_id,customer_id,funding_source_vault_id,payment_liability,charge_amount,nominal_amount,currency_code,type,charge_type,calitp_extracted_at


# Investigating Customer duplication

We expect there to be some unique key or composite key in the `customer_funding_source` table, and that it will be made up of some combination of `funding_source_id`, `funding_source_vault_id`, and `customer_id`. However, there are a number of duplicates even when we use _all_ of these IDs as a composite key (even within a single date's data!):

In [33]:
# Find (funding_source_id, funding_source_vault_id, customer_id) keys that
# occur more than once inside a single source file, then report how many
# distinct keys are affected and the largest per-file repeat count.
#
# The distinct keys are counted as column tuples rather than as one
# concatenated string: `a || b || c` yields NULL when any part is NULL (and
# count(distinct ...) skips NULLs), and without a delimiter two different keys
# could concatenate to the same text.
sql = """
with dup_counts_within_partition_files as (
    select _FILE_NAME as fn, funding_source_id, funding_source_vault_id, customer_id, count(*) as num_dups
    from payments.customer_funding_source
    group by 1, 2, 3, 4
    having count(*) > 1
),

duplicated_composite_keys as (
    select distinct funding_source_id, funding_source_vault_id, customer_id
    from dup_counts_within_partition_files
)

select
    (select count(*) from duplicated_composite_keys) as num_duplicated_composite_keys,
    (select max(num_dups) from dup_counts_within_partition_files) as max_dups_per_key_per_file
"""

pd.read_sql_query(sql, connection)

Unnamed: 0,num_duplicated_composite_keys,max_dups_per_key_per_file
0,1216,2


Moreover, we expect that each `customer_id` is correlated with either a single `funding_source_vault_id` or a single `funding_source_id` (I wasn't sure which, so I checked both), however, that's also not the case, as can be seen with the two queries below.

In [35]:
# For each customer_id linked to more than one distinct funding_source_id,
# report how many it has, largest first.
sql = """
select customer_id, count(*) as count_of_distinct_funding_source_id
from (
    select distinct customer_id, funding_source_id
    from payments.customer_funding_source)
group by 1
having count(*) > 1
order by count(*) desc
"""

pd.read_sql_query(sql=sql, con=connection)

Unnamed: 0,customer_id,count_of_distinct_funding_source_id
0,56580229-a41b-416a-84df-3344c63d435d,5
1,b50875da-01b8-45c6-8509-d11056045554,4
2,cb35f24f-f3d4-433a-9c52-6e213aeca853,4
3,6563418a-d7fb-4328-9062-e52ac0b607e2,4
4,6d2a695e-feac-41fb-b5c8-16a60d82e272,3
...,...,...
364,9e1184e2-aa35-42ec-913a-6d55b9d33ea8,2
365,669dd84b-22dd-4faa-a018-ac364e6a5fa6,2
366,ffe21fbf-7b13-449d-b508-f506fb60cc11,2
367,2edd2ca8-ed22-4a74-bc12-322da867205d,2


In [36]:
# For each customer_id linked to more than one distinct
# funding_source_vault_id, report how many it has, largest first.
sql = """
select customer_id, count(*) as count_of_distinct_vault_id
from (
    select distinct customer_id, funding_source_vault_id
    from payments.customer_funding_source)
group by 1
having count(*) > 1
order by count(*) desc
"""

pd.read_sql_query(sql=sql, con=connection)

Unnamed: 0,customer_id,count_of_distinct_vault_id
0,cb35f24f-f3d4-433a-9c52-6e213aeca853,4
1,6563418a-d7fb-4328-9062-e52ac0b607e2,4
2,8eba2297-8c43-4063-9cc4-02f431ba29b9,3
3,13532d65-8ebd-4496-8412-e472491f0d58,3
4,7db5f9bf-da6f-4903-ac47-e26c1e2307d5,3
...,...,...
191,56662871-56da-4a07-b49e-d3985d3ea193,2
192,4b0c3469-2360-457b-9d11-9c153a2822c3,2
193,5696fe72-fa73-45ac-8509-e72f4b384311,2
194,37fe21e8-d5c6-4d08-8e5a-b5d7beb0fc96,2
