In [1]:
from pathlib import Path
import pandas as pd

# go over all parquet files in data directory
data_dir = Path('./data/')

def count_words(s):
    return sum(1 for w in s.split(' ') if len(w) > 0)

def stats_strlen(col: pd.Series):
    col_len = col.apply(len)
    return col_len.min(), col_len.max(), col_len.mean(), col_len.median()

def stats_wordcount(col: pd.Series):
    col_wc = col.apply(count_words) 
    return col_wc.min(), col_wc.max(), col_wc.mean(), col_wc.median()


file_name = []
rows = []
text_min_strlen, text_max_strlen, text_mean_strlen, text_median_strlen = [], [], [], []
summary_min_strlen, summary_max_strlen, summary_mean_strlen, summary_median_strlen = [], [], [], []

text_min_words, text_max_words, text_mean_words, text_median_words = [], [], [], []
summary_min_words, summary_max_words, summary_mean_words, summary_median_words = [], [], [], []
 
for fn in data_dir.glob('*/*.parquet'):
    print('processing:', fn)
    df = pd.read_parquet(path=fn, engine='pyarrow')
    file_name.append(fn)
    rows.append(len(df))
    mi, ma, mu, me = stats_strlen(df['text'])
    text_min_strlen.append(mi)
    text_max_strlen.append(ma)
    text_mean_strlen.append(mu)
    text_median_strlen.append(me)
    mi, ma, mu, me = stats_strlen(df['summary'])
    summary_min_strlen.append(mi)
    summary_max_strlen.append(ma)
    summary_mean_strlen.append(mu)
    summary_median_strlen.append(me)
    mi, ma, mu, me = stats_wordcount(df['text'])
    text_min_words.append(mi)
    text_max_words.append(ma)
    text_mean_words.append(mu)
    text_median_words.append(me)
    mi, ma, mu, me = stats_wordcount(df['summary'])
    summary_min_words.append(mi)
    summary_max_words.append(ma)
    summary_mean_words.append(mu)
    summary_median_words.append(me)

stats = pd.DataFrame({"file_name": file_name, "row_count": rows, "text_min_strlen": text_min_strlen, "text_max_strlen": text_max_strlen, "text_mean_strlen": text_mean_strlen, "text_median_strlen": text_median_strlen, 
    "summary_min_strlen": summary_min_strlen, "summary_max_strlen": summary_max_strlen, "summary_mean_strlen": summary_mean_strlen, "summary_median_strlen": summary_median_strlen, 
    "text_min_words": text_min_words, "text_max_words": text_max_words, "text_mean_words": text_mean_words, "text_median_words": text_median_words,
    "summary_min_words": summary_min_words, "summary_max_words": summary_max_words, "summary_mean_words": summary_mean_words, "summary_median_words": summary_median_words})

output_dir = Path('./data/')
stats.to_csv(output_dir / 'stats.csv')

stats
    

processing: data/cnn_dailymail/cnn_dailymail-3.0.0_test.snappy.parquet
processing: data/cnn_dailymail/cnn_dailymail-3.0.0_validation.snappy.parquet
processing: data/cnn_dailymail/cnn_dailymail-3.0.0_train.snappy.parquet
processing: data/tldr-challenge/tldr-challenge_00000.snappy.parquet
processing: data/tldr-challenge/tldr-challenge_00001.snappy.parquet
processing: data/wikihow/wikihow-all_validation.snappy.parquet
processing: data/wikihow/wikihow-all_test.snappy.parquet
processing: data/wikihow/wikihow-all_train.snappy.parquet
processing: data/billsum/billsum_validation.snappy.parquet
processing: data/billsum/billsum_test.snappy.parquet
processing: data/billsum/billsum_train.snappy.parquet
processing: data/newsroom/newsroom_train_00002.snappy.parquet
processing: data/newsroom/newsroom_train_00000.snappy.parquet
processing: data/newsroom/newsroom_dev.snappy.parquet
processing: data/newsroom/newsroom_test.snappy.parquet
processing: data/newsroom/newsroom_train_00001.snappy.parquet
proce

Unnamed: 0,file_name,row_count,text_min_strlen,text_max_strlen,text_mean_strlen,text_median_strlen,summary_min_strlen,summary_max_strlen,summary_mean_strlen,summary_median_strlen,text_min_words,text_max_words,text_mean_words,text_median_words,summary_min_words,summary_max_words,summary_mean_words,summary_median_words
0,data/cnn_dailymail/cnn_dailymail-3.0.0_test.sn...,11490,293,11991,3967.077807,3563.0,51,3410,311.927241,290.0,55,1954,682.348216,613.0,9,535,51.803916,48.0
1,data/cnn_dailymail/cnn_dailymail-3.0.0_validat...,13368,245,11412,3923.831912,3523.0,52,8541,328.033513,299.0,41,1914,675.020347,606.0,10,1407,54.435293,50.0
2,data/cnn_dailymail/cnn_dailymail-3.0.0_train.s...,287113,48,15925,4033.661722,3682.0,14,7388,294.77039,280.0,8,2347,691.624859,632.0,4,1246,48.688304,46.0
3,data/tldr-challenge/tldr-challenge_00000.snapp...,1542205,314,11546,1116.4086,1018.0,1,4286,132.310908,99.0,4,800,207.961671,190.0,1,560,24.170323,18.0
4,data/tldr-challenge/tldr-challenge_00001.snapp...,1542205,110,24819,1170.944259,1091.0,0,6451,151.757709,114.0,1,3874,219.325584,204.0,0,664,27.976438,21.0
5,data/wikihow/wikihow-all_validation.snappy.par...,5599,57,23943,2896.788355,1991.0,11,3245,305.339882,220.0,11,3918,501.348812,346.0,1,481,45.550455,33.0
6,data/wikihow/wikihow-all_test.snappy.parquet,5577,45,37744,2916.031379,1993.0,10,3417,304.391967,222.0,8,6398,504.595481,347.0,2,414,45.252107,33.0
7,data/wikihow/wikihow-all_train.snappy.parquet,157252,30,74165,2901.226585,1999.0,2,23536,305.955778,223.0,1,11928,502.60749,349.0,1,3757,45.621963,33.0
8,data/billsum/billsum_validation.snappy.parquet,631,5013,19967,10475.125198,9461.0,105,4077,1140.1458,996.0,368,2800,1398.2187,1265.0,14,611,171.413629,152.0
9,data/billsum/billsum_test.snappy.parquet,3269,5004,19998,10268.095442,9316.0,62,4986,1184.728357,1033.0,221,3036,1361.401652,1233.0,10,787,177.883451,155.0
