### Notebook to count the number of tokens (words) in legal datasets

Citation: Sean Rehaag, "Notebook to count the number of tokens (words) in legal datasets" (2025), online: <https://github.com/a2aj-ca/canadian-legal-data>.

In [None]:
import tiktoken
from datasets import load_dataset
import pandas as pd
from tqdm import tqdm

# Register `progress_apply` on pandas objects so long applies display a progress bar
tqdm.pandas()

# Fetch the "train" split of the laws dataset and of the case-law dataset
laws = load_dataset("a2aj/canadian-laws", split="train")
cases = load_dataset("a2aj/canadian-case-law", split="train")

# Convert both datasets to pandas DataFrames
df_laws, df_cases = laws.to_pandas(), cases.to_pandas()

# count tokens
def count_tokens(text):
    """Return the number of tokens in ``text`` under the gpt-3.5-turbo encoding.

    Parameters
    ----------
    text : str or missing value
        Document text. ``None``, the empty string, and any non-string value
        (e.g. the float NaN pandas uses for missing cells) count as zero tokens.

    Returns
    -------
    int
        Number of tokens produced by the tiktoken encoding.
    """
    # A float NaN is truthy, so a plain `not text` check would let it through
    # and crash inside `encode`; treat every non-string as "no text".
    if not isinstance(text, str) or not text:
        return 0

    # Look the encoding up once and reuse it: this function is applied to
    # every row of every dataset.
    encoding = getattr(count_tokens, "_encoding", None)
    if encoding is None:
        encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
        count_tokens._encoding = encoding

    # disallowed_special=() makes special-token strings that happen to occur
    # in a document (e.g. "<|endoftext|>") be tokenized as ordinary text
    # instead of raising ValueError.
    tokens = encoding.encode(text, disallowed_special=())
    return len(tokens)

# Rough conversion factor used for the word estimate printed at the end
WORDS_PER_1000_TOKENS = 750

# Count tokens for every document, per dataset and per language, storing the
# result in `token_count_en` / `token_count_fr`. The order (laws en, laws fr,
# cases en, cases fr) and the progress messages match the original cell.
print()  # blank line before the first progress message
for label, frame in (("laws", df_laws), ("cases", df_cases)):
    for language, suffix in (("English", "en"), ("French", "fr")):
        print(f"Counting tokens for {label} ({language})...")
        frame[f"token_count_{suffix}"] = (
            frame[f"unofficial_text_{suffix}"].progress_apply(count_tokens)
        )

# Totals per collection: English plus French token counts
total_case_tokens = df_cases['token_count_en'].sum() + df_cases['token_count_fr'].sum()
total_law_tokens = df_laws['token_count_en'].sum() + df_laws['token_count_fr'].sum()
total_tokens = total_case_tokens + total_law_tokens

# The estimate is a float after the division; format it without decimals so
# it reads like the integer token totals above it.
estimated_words = total_tokens * WORDS_PER_1000_TOKENS / 1000

print("\nResults:")
print(f"Cases: {total_case_tokens:,} tokens")
print(f"Laws: {total_law_tokens:,} tokens")
print(f"Total: {total_tokens:,} tokens")
print(f"Assuming an average of {WORDS_PER_1000_TOKENS} words per 1000 tokens:")
print(f"Total: {estimated_words:,.0f} words")



Counting tokens for laws (English)...


100%|██████████| 5757/5757 [00:04<00:00, 1222.01it/s]


Counting tokens for laws (French)...


100%|██████████| 5757/5757 [00:06<00:00, 904.09it/s] 


Counting tokens for cases (English)...


100%|██████████| 116734/116734 [01:49<00:00, 1067.75it/s]


Counting tokens for cases (French)...


100%|██████████| 116734/116734 [02:11<00:00, 884.41it/s] 


Results:
Cases: 1,230,543,005 tokens
Laws: 55,577,239 tokens
Total: 1,286,120,244 tokens





In [10]:
# make cases table with token counts
import requests
import pandas as pd

# Coverage endpoint queried for dataset metadata
url = "https://api.a2aj.ca/coverage"

# Query parameters: restrict the coverage results to case-law datasets
doc_type = "cases"
parameters = {"doc_type": doc_type}

# Fetch the coverage data. `params` lets requests build and encode the query
# string; the timeout keeps the cell from hanging forever on a stalled
# connection; raise_for_status reports an HTTP error here instead of as a
# confusing JSON/KeyError failure further down.
request = requests.get(url, params=parameters, timeout=30)
request.raise_for_status()
data = request.json()

# Build a dataframe from the "results" entry of the response
df_cases_coverage = pd.DataFrame(data['results'])

# Per-document total: English plus French token counts
df_cases["token_count"] = df_cases["token_count_en"] + df_cases["token_count_fr"]

# Sum the token counts per dataset
df_cases_tokens = df_cases.groupby("dataset").agg({"token_count": "sum"}).reset_index()

# Attach the token totals to the coverage rows. merge defaults to an inner
# join, so a dataset present on only one side is dropped from the table.
df_cases_coverage = df_cases_coverage.merge(df_cases_tokens, on="dataset")

# Drop description_fr
df_cases_coverage = df_cases_coverage.drop(columns=["description_fr"])

df_cases_coverage

Unnamed: 0,dataset,description_en,earliest_document_date,latest_document_date,number_of_documents,token_count
0,CHRT,Canadian Human Rights Tribunal,2003-01-10,2025-07-16,1050,21875457
1,CMAC,Court Martial Appeal Court,2001-01-19,2025-06-17,147,2352315
2,FC,Federal Court,2001-02-01,2025-08-01,34256,409525860
3,FCA,Federal Court of Appeal,2001-02-01,2025-08-01,7580,74318193
4,ONCA,Ontario Court of Appeal,2007-01-02,2025-08-01,16951,59157061
5,RAD,Refugee Appeal Division (IRB),2013-02-19,2024-07-22,14022,172800873
6,RLLR,"Refugee Law Lab Reporter (RPD, IRB)",2019-01-07,2023-12-29,898,2459030
7,RPD,Refugee Protection Division (IRB),2002-07-16,2020-12-14,6729,79241147
8,SCC,Supreme Court of Canada,1877-01-15,2025-07-31,10845,179880823
9,SST,Social Security Tribunal,2013-03-08,2025-12-16,16338,119870547


In [11]:
# make laws table with token counts
import requests
import pandas as pd

# Coverage endpoint queried for dataset metadata
url = "https://api.a2aj.ca/coverage"

# Query parameters: restrict the coverage results to the laws datasets
doc_type = "laws"
parameters = {"doc_type": doc_type}

# Fetch the coverage data. `params` lets requests build and encode the query
# string; the timeout keeps the cell from hanging forever on a stalled
# connection; raise_for_status reports an HTTP error here instead of as a
# confusing JSON/KeyError failure further down.
request = requests.get(url, params=parameters, timeout=30)
request.raise_for_status()
data = request.json()

# Build a dataframe from the "results" entry of the response
df_laws_coverage = pd.DataFrame(data['results'])

# Per-document total: English plus French token counts
df_laws["token_count"] = df_laws["token_count_en"] + df_laws["token_count_fr"]

# Sum the token counts per dataset
df_laws_tokens = df_laws.groupby("dataset").agg({"token_count": "sum"}).reset_index()

# Attach the token totals to the coverage rows. merge defaults to an inner
# join, so a dataset present on only one side is dropped from the table.
df_laws_coverage = df_laws_coverage.merge(df_laws_tokens, on="dataset")

# Drop description_fr
df_laws_coverage = df_laws_coverage.drop(columns=["description_fr"])

df_laws_coverage

Unnamed: 0,dataset,description_en,earliest_document_date,latest_document_date,number_of_documents,token_count
0,LEGISLATION-FED,Federal Statutes,1870-05-12,2025-06-26,954,28692454
1,REGULATIONS-FED,Federal Regulations,1945-12-21,2025-07-16,4803,26884785
