In [None]:
import os
import json
import nltk
from collections import Counter
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.tag import pos_tag

# Fetch the NLTK resources this notebook relies on: the Punkt models used by
# sent_tokenize / word_tokenize and the perceptron model used by pos_tag.
# NOTE(review): newer NLTK releases may look up differently named resources
# (e.g. 'punkt_tab', 'averaged_perceptron_tagger_eng') - confirm against the
# installed NLTK version.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

In [None]:
def analyse_train_data(file_path: str) -> dict:
    """Compute token, type, sentence and part-of-speech statistics for a text file.

    The file is read in full, split into sentences with ``sent_tokenize`` and
    every sentence is split into tokens with ``word_tokenize``. Part-of-speech
    tags are assigned sentence by sentence so that the tagger sees each token
    in its original context.

    Args:
        file_path: Path of the text file to analyse; it is decoded as UTF-8.

    Returns:
        A dict with the following keys:

        * ``total_tokens``: number of tokens summed over all sentences.
        * ``total_types``: number of distinct tokens.
        * ``total_words``: sum of all token frequencies (same value as
          ``total_tokens``; kept as a separate key for existing consumers).
        * ``average_words_per_sentence``: tokens per sentence, ``0.0`` when
          the file contains no sentences.
        * ``average_word_length``: mean token length in characters, ``0.0``
          when the file contains no tokens.
        * ``most_common_pos``: up to ten ``(tag, count)`` pairs, most
          frequent first.
        * ``word_frequencies``: ``Counter`` mapping each token to its count.

    Raises:
        OSError: If ``file_path`` cannot be opened for reading.
        UnicodeDecodeError: If the file content is not valid UTF-8.
    """
    counter = Counter()
    total_tokens = 0
    total_sentences = 0
    total_word_length = 0
    tokenized_sentences = []

    # Only the read needs the file handle; release it before the heavy work.
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()

    for sentence in sent_tokenize(text):
        tokens = word_tokenize(sentence)
        total_sentences += 1
        total_tokens += len(tokens)
        total_word_length += sum(len(token) for token in tokens)
        counter.update(tokens)
        tokenized_sentences.append(tokens)

    total_types = len(counter)
    total_words = sum(counter.values())

    # Guard the averages so an empty file yields 0.0 instead of raising
    # ZeroDivisionError.
    average_words_per_sentence = (
        total_words / total_sentences if total_sentences else 0.0
    )
    average_word_length = total_word_length / total_words if total_words else 0.0

    # Tag the real sentences rather than the frequency-grouped token list:
    # the tagger uses neighbouring tokens as context, so tagging
    # ``counter.elements()`` (all copies of one word, then the next word, ...)
    # would tag every token in an artificial context.
    pos_counter = Counter(
        tag
        for tagged_sentence in nltk.pos_tag_sents(tokenized_sentences)
        for _, tag in tagged_sentence
    )
    most_common_pos = pos_counter.most_common(10)

    return {
        'total_tokens': total_tokens,
        'total_types': total_types,
        'total_words': total_words,
        'average_words_per_sentence': average_words_per_sentence,
        'average_word_length': average_word_length,
        'most_common_pos': most_common_pos,
        'word_frequencies': counter,
    }

In [6]:
# Root directory of the input corpora and directory receiving the JSON reports.
DATASET_PATH: str = "babylm_data"
RESULTS_PATH: str = "results"
# exist_ok=True creates the directory only when it is missing, without a
# separate (race-prone) existence check.
os.makedirs(RESULTS_PATH, exist_ok=True)

# Training files expected inside every experiment directory.
CORPUS_FILES: list[str] = [
    "aochildes.train",
    "bnc_spoken.train",
    "cbt.train",
    "children_stories.train",
    "gutenberg.train",
    "open_subtitles.train",
    "qed.train",
    "simple_wikipedia.train",
    "switchboard.train",
    "wikipedia.train",
]

for experiment in ["babylm_10M", "babylm_100M"]:

    file_paths: list[str] = [
        f"{DATASET_PATH}/{experiment}/{corpus_file}" for corpus_file in CORPUS_FILES
    ]

    # One JSON file per experiment, holding one result dict per corpus file
    # in the order of ``file_paths``.
    experiment_results_target_path: str = f"{RESULTS_PATH}/{experiment}.json"
    results: list[dict] = []
    for file_path in file_paths:
        print(f"Processing [{file_path}] ...")
        result: dict = analyse_train_data(file_path)
        results.append(result)

        # Store the results computed so far: the file is rewritten in full
        # after every corpus with everything gathered up to this point.
        with open(experiment_results_target_path, "w", encoding="utf-8") as file:
            print(f"Saving last results to [{experiment_results_target_path}]")
            json.dump(results, file)