In [None]:
# read data/ecfr_contents.csv into df_contents

import polars as pl

# Load the eCFR contents table. ignore_errors=True asks the reader to keep
# going when a value cannot be parsed rather than raising.
# The "contents" column is then forced to a string dtype so the text
# operations further down can rely on it.
df_contents = pl.read_csv('data/ecfr_contents.csv', ignore_errors=True).with_columns(
    pl.col('contents').cast(pl.String)
)

df_contents

In [None]:
# Build one cleaned text column per row ("head" + " " + "contents") and count
# the words in it.
#
# Output columns added to df_contents (with "head" and "contents" removed):
#   head_and_contents       - lowercase text containing only a-z and spaces
#   word_count              - number of words in head_and_contents
#   word_count_of_each_word - list of structs, one per distinct word, holding
#                             the word and how often it occurs in the row

# concat_str returns null as soon as one operand is null, which would throw
# away a row's heading whenever its contents are missing (and vice versa).
# Treat a missing side as empty text instead.
head_text = pl.col("head").cast(pl.String).fill_null("")
contents_text = pl.col("contents").cast(pl.String).fill_null("")

cleaned_text = (
    # heading and body joined by a single space
    pl.concat_str([head_text, contents_text], separator=" ")
    # remove all URLs before stripping punctuation, while they are still recognisable
    .str.replace_all(r"http\S+", " ")
    # keep letters only; every other character (digits, punctuation, newlines) becomes a space
    .str.replace_all(r"[^a-zA-Z]", " ")
    # lowercase all words
    .str.to_lowercase()
)

df_counted = (
    df_contents
    .with_columns([cleaned_text.alias("head_and_contents")])
    .drop(["head", "contents"])
)

# Tokenise by matching runs of letters. Splitting on a single " " would emit an
# empty-string "word" for every extra space left behind by the replacements
# above, inflating word_count and adding a bogus "" entry to the per-word counts.
words = pl.col("head_and_contents").str.extract_all(r"[a-z]+")

df_counted = df_counted.with_columns([
    words.list.len().alias("word_count"),
    words.list.eval(pl.element().value_counts()).alias("word_count_of_each_word"),
])

df_counted

In [None]:
# Unpack the per-row list in word_count_of_each_word so that every list entry
# gets a row of its own; the other columns are repeated for each entry.
df_exploded_counts = df_counted.explode(["word_count_of_each_word"])

df_exploded_counts

In [None]:
# word_count_of_each_word has type struct[2]: the first field is the word,
# the second field is its count.
word_struct = pl.col("word_count_of_each_word")

# Discard rows that carry no struct (null), then surface the struct's two
# fields as ordinary columns next to the existing ones.
df_tranformed_counts = (
    df_exploded_counts
    .filter(word_struct.is_not_null())
    .with_columns([word_struct.struct.unnest()])
)

df_tranformed_counts

In [None]:
# show which column names the unnest step produced
print(df_tranformed_counts.columns)

# Steps, in order:
#   1. copy the blank-named column ('') to word_value and "count" to word_count
#      (this overwrites any word_count column already present)
#   2. drop the source columns and the ones not needed for the aggregations
#   3. add word_length, the number of characters in word_value
#   4. sort so the longest words come first
df_named_counts = (
    df_tranformed_counts
    .with_columns([
        pl.col("").alias("word_value"),
        pl.col("count").alias("word_count"),
    ])
    .drop(["word_count_of_each_word", "", "count", "children_ids", "type", "n"])
    .with_columns([
        pl.col("word_value").str.len_chars().alias("word_length"),
    ])
    .sort("word_length", descending=True)
)
df_named_counts.head(20)

In [None]:
# Total occurrences of each word within each volume.
df_volume_grouped = (
    df_named_counts
    .group_by('volume', 'word_value')
    .agg(pl.col("word_count").sum())
)

df_volume_grouped

In [None]:
# Total occurrences of each word within each (volume, chapter).
df_chapter_grouped = (
    df_named_counts
    .group_by('volume', 'chapter', 'word_value')
    .agg(pl.col("word_count").sum())
)

df_chapter_grouped

In [None]:
# Total occurrences of each word within each (volume, chapter, subchapter).
df_subchapter_grouped = (
    df_named_counts
    .group_by('volume', 'chapter', 'subchapter', 'word_value')
    .agg(pl.col("word_count").sum())
)
df_subchapter_grouped

In [None]:
# Total occurrences of each word within each (volume, chapter, subchapter, part).
df_part_grouped = (
    df_named_counts
    .group_by('volume', 'chapter', 'subchapter', 'part', 'word_value')
    .agg(pl.col("word_count").sum())
)

df_part_grouped

In [None]:
# Total occurrences of each word within each
# (volume, chapter, subchapter, part, subpart).
df_subpart_grouped = (
    df_named_counts
    .group_by('volume', 'chapter', 'subchapter', 'part', 'subpart', 'word_value')
    .agg(pl.col("word_count").sum())
)

In [None]:
# Write each aggregation level to its own CSV file, coarsest level first.
csv_targets = {
    'data/word_count_by_volume.csv': df_volume_grouped,
    'data/word_count_by_chapter.csv': df_chapter_grouped,
    'data/word_count_by_subchapter.csv': df_subchapter_grouped,
    'data/word_count_by_part.csv': df_part_grouped,
    'data/word_count_by_subpart.csv': df_subpart_grouped,
}
for csv_path, grouped_frame in csv_targets.items():
    grouped_frame.write_csv(csv_path)