In [1]:
import pandas as pd

# Load EUTL dataset
- Source
    - The reason why a same transaction (with one TRANSACTION_ID) is described by multiple lines is because the file details transactions by ORIGINATING_REGISTRY, UNIT_TYPE_DESCRIPTION, SUPP_UNIT_TYPE_DESCRIPTION, ORIGINAL_PERIOD_CODE, LULUCF_CODE_DESCRIPTION, PROJECT_IDENTIFIER, TRACK and EXPIRY_DATE, which can be many for one transaction
- Information of EUTL: X EUAs are trasnferred from account A to account B on date T
- [Abrell](https://www.euets.info/) support tools:  
    - [scraper](https://github.com/jabrell/eutl_scraper): provides access to the Python routines for downloading and processing of the source data
    - [pyeutl](https://github.com/jabrell/pyeutl/tree/main): provides Python routines to access the data provided in a convenient way
    - [transactions_analysis](https://github.com/jabrell/transaction_eutl)

## Check optimal dtypes
- based on number of unique values, set as category or string

In [16]:
df_eutl_2022 = pd.read_csv(r'../data/raw/transactions_eutl_2025/transactions_EUTL_PUBLIC_NOTESD_20251031.csv', nrows = 10000)

for col in df_eutl_2022.select_dtypes(include="object"):
    print(f"{col}: {df_eutl_2022[col].nunique(dropna=False) / len(df_eutl_2022)}")

TRANSACTION_ID: 0.7958
TRANSACTION_TYPE: 0.0004
TRANSACTION_DATE: 0.79
TRANSACTION_STATUS: 0.0001
TRANSFERRING_REGISTRY_NAME: 0.0033
TRANSFERRING_ACCOUNT_TYPE2: 0.0004
TRANSFERRING_ACCOUNT_TYPE3: 0.0007
TRANSFERRING_ACCOUNT_OPEN_DT: 0.144
TRANSFERRING_ACCOUNT_END_OF_VALIDITY: 0.0655
TRANSFERRING_ACCOUNT_NAME: 0.2351
TRANSFERRING_ACCOUNT_HOLDER: 0.1922
TRANSFERRING_ACCOUNT_HOLDER_ADDRESS1: 0.1836
TRANSFERRING_ACCOUNT_HOLDER_ADDRESS2: 0.0475
TRANSFERRING_ACCOUNT_HOLDER_CITY: 0.1294
TRANSFERRING_ACCOUNT_HOLDER_POSTAL_CODE: 0.1552
TRANSFERRING_ACCOUNT_HOLDER_COUNTRY_CODE: 0.0045
TRANSFERRING_ACCOUNT_HOLDER_COMPANY_REGISTRATION_NUMBER: 0.1879
TRANSFERRING_ACCOUNT_HOLDER_LEI: 0.0574
TRANSFERRING_INSTALLATION_NAME: 0.1836
TRANSFERRING_INSTALLATION_PERMIT_IDENTIFIER: 0.1862
TRANSFERRING_INSTALLATION_PARENT_COMPANY: 0.0399
TRANSFERRING_INSTALLATION_SUBSIDIARY_COMPANY: 0.0111
TRANSFERRING_INSTALLATION_EPER_IDENTIFICATION: 0.0633
TRANSFERRING_INSTALLATION_CITY: 0.1476
TRANSFERRING_INSTALLATION_PO

  df_eutl_2022 = pd.read_csv(r'../data/raw/transactions_eutl_2025/transactions_EUTL_PUBLIC_NOTESD_20251031.csv', nrows = 10000)
See https://pandas.pydata.org/docs/user_guide/migration-3-strings.html#string-migration-select-dtypes for details on how to write code that works with pandas 2 and 3.
  for col in df_eutl_2022.select_dtypes(include="object"):


In [17]:
dtypes = {
    # IDs / keys
    "TRANSACTION_ID": "string",

    # Low-cardinality / codes
    "TRANSACTION_TYPE": "category",
    "TRANSACTION_STATUS": "category",

    "TRANSFERRING_REGISTRY_NAME": "category",
    "ACQUIRING_REGISTRY_NAME": "category",
    "ORIGINATING_REGISTRY": "category",

    "TRANSFERRING_ACCOUNT_TYPE2": "category",
    "TRANSFERRING_ACCOUNT_TYPE3": "category",
    "ACQUIRING_ACCOUNT_TYPE2": "category",
    "ACQUIRING_ACCOUNT_TYPE3": "category",

    "TRANSFERRING_ACCOUNT_HOLDER_COUNTRY_CODE": "category",
    "ACQUIRING_ACCOUNT_HOLDER_COUNTRY_CODE": "category",

    "TRANSFERRING_INSTALLATION_MAIN_ACTIVITY": "category",
    "ACQUIRING_INSTALLATION_MAIN_ACTIVITY": "category",

    "UNIT_TYPE_DESCRIPTION": "category",
    "SUPP_UNIT_TYPE_DESCRIPTION": "category",
    "LULUCF_CODE_DESCRIPTION": "string",  # force away mixed types; you can later .astype("category") if you want

    # Lower-cardinality text (often worth category)
    "TRANSFERRING_INSTALLATION_PARENT_COMPANY": "category",
    "TRANSFERRING_INSTALLATION_SUBSIDIARY_COMPANY": "category",
    "ACQUIRING_INSTALLATION_PARENT_COMPANY": "category",
    "ACQUIRING_INSTALLATION_SUBSIDIARY_COMPANY": "category",

    "TRANSFERRING_ACCOUNT_HOLDER_LEI": "category",
    "ACQUIRING_ACCOUNT_HOLDER_LEI": "category",

    "TRANSFERRING_ACCOUNT_HOLDER_ADDRESS2": "category",
    "ACQUIRING_ACCOUNT_HOLDER_ADDRESS2": "category",
    "TRANSFERRING_INSTALLATION_ADDRESS2": "category",
    "ACQUIRING_INSTALLATION_ADDRESS2": "category",

    "TRANSFERRING_INSTALLATION_EPER_IDENTIFICATION": "category",
    "ACQUIRING_INSTALLATION_EPER_IDENTIFICATION": "category",

    # High-ish cardinality text -> string (safe default)
    "TRANSFERRING_ACCOUNT_NAME": "string",
    "TRANSFERRING_ACCOUNT_HOLDER": "string",
    "TRANSFERRING_ACCOUNT_HOLDER_ADDRESS1": "string",
    "TRANSFERRING_ACCOUNT_HOLDER_CITY": "string",
    "TRANSFERRING_ACCOUNT_HOLDER_POSTAL_CODE": "string",
    "TRANSFERRING_ACCOUNT_HOLDER_COMPANY_REGISTRATION_NUMBER": "string",

    "TRANSFERRING_INSTALLATION_NAME": "string",
    "TRANSFERRING_INSTALLATION_PERMIT_IDENTIFIER": "string",
    "TRANSFERRING_INSTALLATION_CITY": "string",
    "TRANSFERRING_INSTALLATION_POSTAL_CODE": "string",
    "TRANSFERRING_INSTALLATION_ADDRESS1": "string",

    "ACQUIRING_ACCOUNT_NAME": "string",
    "ACQUIRING_ACCOUNT_HOLDER": "string",
    "ACQUIRING_ACCOUNT_HOLDER_ADDRESS1": "string",
    "ACQUIRING_ACCOUNT_HOLDER_CITY": "string",
    "ACQUIRING_ACCOUNT_HOLDER_POSTAL_CODE": "string",
    "ACQUIRING_ACCOUNT_HOLDER_COMPANY_REGISTRATION_NUMBER": "string",

    "ACQUIRING_INSTALLATION_NAME": "string",
    "ACQUIRING_INSTALLATION_PERMIT_IDENTIFIER": "string",
    "ACQUIRING_INSTALLATION_CITY": "string",
    "ACQUIRING_INSTALLATION_POSTAL_CODE": "string",
    "ACQUIRING_INSTALLATION_ADDRESS1": "string",
}

## Set columns of interest and read

In [25]:
cols = ['TRANSACTION_ID', 'TRANSACTION_TYPE', 'TRANSACTION_DATE', 'TRANSFERRING_REGISTRY_NAME','TRANSFERRING_ACCOUNT_NAME',
       'TRANSFERRING_ACCOUNT_IDENTIFIER', 'TRANSFERRING_ACCOUNT_HOLDER', 'TRANSFERRING_ACCOUNT_HOLDER_COMPANY_REGISTRATION_NUMBER',
       'TRANSFERRING_ACCOUNT_HOLDER_LEI', 'TRANSFERRING_INSTALLATION_NAME',
       'TRANSFERRING_INSTALLATION_INSTALLATION_IDENTIFIER','TRANSFERRING_INSTALLATION_PARENT_COMPANY',
       'TRANSFERRING_INSTALLATION_SUBSIDIARY_COMPANY','TRANSFERRING_INSTALLATION_MAIN_ACTIVITY', 'ACQUIRING_REGISTRY_NAME','ACQUIRING_ACCOUNT_NAME',
       'ACQUIRING_ACCOUNT_IDENTIFIER', 'ACQUIRING_ACCOUNT_HOLDER','ACQUIRING_ACCOUNT_HOLDER_LEI', 'ACQUIRING_INSTALLATION_NAME',
       'ACQUIRING_INSTALLATION_INSTALLATION_IDENTIFIER','ACQUIRING_INSTALLATION_PARENT_COMPANY',
       'ACQUIRING_INSTALLATION_SUBSIDIARY_COMPANY']

In [26]:
df_eutl_2022 = pd.read_csv(
    "../data/raw/transactions_eutl_2025/transactions_EUTL_PUBLIC_NOTESD_20251031.csv",
    usecols=cols,
    dtype=dtypes,
    parse_dates=["TRANSACTION_DATE"],
    low_memory=False    # internal chunsize is not customizable in python, so in order to avoid mixed types we need to disable it entirely
)

In [29]:
df_eutl_2022.info(memory_usage="deep")

<class 'pandas.DataFrame'>
RangeIndex: 2142475 entries, 0 to 2142474
Data columns (total 23 columns):
 #   Column                                                   Dtype         
---  ------                                                   -----         
 0   TRANSACTION_ID                                           string        
 1   TRANSACTION_TYPE                                         category      
 2   TRANSACTION_DATE                                         datetime64[us]
 3   TRANSFERRING_REGISTRY_NAME                               category      
 4   TRANSFERRING_ACCOUNT_NAME                                string        
 5   TRANSFERRING_ACCOUNT_IDENTIFIER                          float64       
 6   TRANSFERRING_ACCOUNT_HOLDER                              string        
 7   TRANSFERRING_ACCOUNT_HOLDER_COMPANY_REGISTRATION_NUMBER  string        
 8   TRANSFERRING_ACCOUNT_HOLDER_LEI                          category      
 9   TRANSFERRING_INSTALLATION_NAME                

In [28]:
df_eutl_2022

Unnamed: 0,TRANSACTION_ID,TRANSACTION_TYPE,TRANSACTION_DATE,TRANSFERRING_REGISTRY_NAME,TRANSFERRING_ACCOUNT_NAME,TRANSFERRING_ACCOUNT_IDENTIFIER,TRANSFERRING_ACCOUNT_HOLDER,TRANSFERRING_ACCOUNT_HOLDER_COMPANY_REGISTRATION_NUMBER,TRANSFERRING_ACCOUNT_HOLDER_LEI,TRANSFERRING_INSTALLATION_NAME,...,TRANSFERRING_INSTALLATION_MAIN_ACTIVITY,ACQUIRING_REGISTRY_NAME,ACQUIRING_ACCOUNT_NAME,ACQUIRING_ACCOUNT_IDENTIFIER,ACQUIRING_ACCOUNT_HOLDER,ACQUIRING_ACCOUNT_HOLDER_LEI,ACQUIRING_INSTALLATION_NAME,ACQUIRING_INSTALLATION_INSTALLATION_IDENTIFIER,ACQUIRING_INSTALLATION_PARENT_COMPANY,ACQUIRING_INSTALLATION_SUBSIDIARY_COMPANY
0,DE122558,10-0,2015-06-04 17:38:44,Germany,1914 - RWE Power AG Personenkonto,1914.0,RWE Power Aktiengesellschaft,HRB 17420 Amtsgericht Essen,,,...,-,Germany,FutureCamp Climate GmbH Personenkonto,2963,FutureCamp Climate GmbH,,,,,
1,GB66629,10-0,2012-08-28 15:29:28,United Kingdom,MLI Emissions Registry Account,901.0,Merrill Lynch International,02312079,,,...,-,United Kingdom,MLCE,841,Merrill Lynch Commodities (Europe) Limited,,,,,
2,NL28978,10-0,2012-09-12 17:25:39,Netherlands,SIA Vidzeme Eko,286.0,"SIA ""Vidzeme Eko""",40003755312,,,...,-,Netherlands,ACT Carbon,778,ACT Financial Solutions B.V.,724500LY73GPE4GDX159\r,,,,
3,NL28981,10-0,2012-09-13 15:45:37,Netherlands,SIA Vidzeme Eko,286.0,"SIA ""Vidzeme Eko""",40003755312,,,...,-,Netherlands,ACT Carbon,778,ACT Financial Solutions B.V.,724500LY73GPE4GDX159\r,,,,
4,NL29004,10-0,2012-09-18 14:23:50,Netherlands,SIA Vidzeme Eko,286.0,"SIA ""Vidzeme Eko""",40003755312,,,...,-,Netherlands,ACT Carbon,778,ACT Financial Solutions B.V.,724500LY73GPE4GDX159\r,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2142470,EU542433,10-2,2020-01-09 18:14:20,Spain,"Tereos Starch & Sweeteners Iberia, S.A.U.",5008918.0,"Tereos Starch & Sweeteners Iberia, S.A.U.",A50012921,,"Tereos Starch & Sweeteners Iberia, S.A.U.",...,20-Combustion of fuels,European Commission,EU Allowance deletion,5016380,European Commission,,,,,
2142471,FR93298,10-0,2009-05-07 17:26:34,France,CONSUS FR,1287.0,CONSUS FRANCE SARL,491808804,,,...,-,France,CONSUS FR PWX,1291,KLAPUCKI,,,,,
2142472,FR10339,10-0,2006-06-01 16:34:39,France,Caisse des dépôts et consignations Détention,1163.0,Caisse des Dépôts et Consignations,180020026,,,...,-,France,ACCORD ENERGY LTD Détention,1185,WINKLEHNER,,,,,
2142473,EU403486,10-2,2017-04-27 13:28:12,Romania,Operator Account,5010063.0,COMPLEXUL ENERGETIC HUNEDOARA,J20/994/2012,,Electrocentrale Deva,...,20-Combustion of fuels,European Commission,EU Allowance deletion,5016380,European Commission,,,,,


In [36]:
df_eutl_2022['TRANSACTION_TYPE'].value_counts()

TRANSACTION_TYPE
10-0      1171587
3-0        420832
10-2       222826
10-36       95547
10-53       83122
10-71       30745
3-21        22232
10-72       19222
4-0         14428
3-2         12135
5-0         10075
10-34        8325
5-1          6716
7-38         4250
10-35        3569
10-1         3550
10-86        3383
10-37        2255
10-61        1952
4-3           905
2-0           768
10-90         672
10-4          521
7-39          491
10-33         355
10-52         273
10-63         263
10-41         236
1-51          227
10-92         179
4-48          165
10-82         107
6-0            77
10-136         60
10-93          52
1-0            46
1-30           40
3-75           38
10-76          37
4-2            31
10-24          28
10-26          28
10-171         22
4-26           20
10-55          18
8-0            17
4-91           11
10-190         10
10-135          9
3-82            6
1-24            4
1-22            2
1-31            2
4-22            2
10-104     

In [37]:
df_eutl_2022.duplicated().sum()

np.int64(834084)

In [40]:
pd.read_csv(r'../data/raw/transactions_eutl_2025/transactions_EUTL_PUBLIC_NOTESD_20251031.csv', usecols=['UNIT_TYPE_DESCRIPTION']).value_counts()

UNIT_TYPE_DESCRIPTION                                        
AAU - Assigned Amount Unit                                       1054991
Non-Kyoto Unit                                                    708724
CER - Certified Emission Reduction Unit converted from an AAU     324244
ERU - Emission Reduction Unit                                      54135
tCER - Temporary CER                                                 217
RMU - Removal Unit                                                   130
ERU - Converted from an RMU                                           34
Name: count, dtype: int64