In [2]:
!git clone https://github.com/gucorpling/amalgum.git

Cloning into 'amalgum'...
remote: Enumerating objects: 148871, done.[K
remote: Counting objects: 100% (7860/7860), done.[K
remote: Compressing objects: 100% (1845/1845), done.[K
remote: Total 148871 (delta 6020), reused 7819 (delta 6014), pack-reused 141011 (from 1)[K
Receiving objects: 100% (148871/148871), 714.55 MiB | 21.57 MiB/s, done.
Resolving deltas: 100% (101358/101358), done.
Updating files: 100% (24540/24540), done.


In [3]:
import os
import re
import random
import pandas as pd
import nltk
from collections import Counter

In [13]:
def extract_tokens_from_conllu(filepath):
    tokens = []
    with open(filepath, 'r', encoding='utf-8') as file:
        for line in file:
            if not line.startswith('#') and line.strip():
                parts = line.strip().split('\t')
                if len(parts) >= 2 and '.' not in parts[0]:
                    tokens.append(parts[1])
    return tokens

base_dir = "amalgum/amalgum"
genres = {
    "fiction": 350,
    "bio": 350
}
max_total_files = sum(genres.values())
all_tokens = []

print("Selected files and sizes (in KB):")
total_kb = 0

random.seed(42)

for genre, file_limit in genres.items():
    genre_dir = os.path.join(base_dir, genre, "dep")
    conllu_files = [f for f in os.listdir(genre_dir) if f.endswith(".conllu")]
    selected_files = random.sample(conllu_files, file_limit)

    for file in selected_files:
        filepath = os.path.join(genre_dir, file)
        file_size_kb = os.path.getsize(filepath) / 1024
        total_kb += file_size_kb

        print(f"{file} — {file_size_kb:.2f} KB")

        tokens = extract_tokens_from_conllu(filepath)
        all_tokens.extend(tokens)

raw_text = ' '.join(all_tokens)
print(f"\nTotal data size: {total_kb:.2f} KB ({len(all_tokens)} tokens)")

Selected files and sizes (in KB):
AMALGUM_fiction_hans.conllu — 86.00 KB
AMALGUM_fiction_helium.conllu — 70.36 KB
AMALGUM_fiction_perry.conllu — 76.60 KB
AMALGUM_fiction_hilarius.conllu — 79.00 KB
AMALGUM_fiction_van.conllu — 77.69 KB
AMALGUM_fiction_mansoul.conllu — 78.14 KB
AMALGUM_fiction_mapuhi.conllu — 82.31 KB
AMALGUM_fiction_residencia.conllu — 76.87 KB
AMALGUM_fiction_flirt.conllu — 86.27 KB
AMALGUM_fiction_amulet.conllu — 81.68 KB
AMALGUM_fiction_huckleberry.conllu — 81.36 KB
AMALGUM_fiction_nugget.conllu — 80.15 KB
AMALGUM_fiction_lenape.conllu — 76.17 KB
AMALGUM_fiction_diana.conllu — 78.64 KB
AMALGUM_fiction_shadow.conllu — 74.25 KB
AMALGUM_fiction_dynamiter.conllu — 85.07 KB
AMALGUM_fiction_harney.conllu — 80.48 KB
AMALGUM_fiction_goriot.conllu — 78.92 KB
AMALGUM_fiction_desplein.conllu — 75.28 KB
AMALGUM_fiction_gentleman.conllu — 85.72 KB
AMALGUM_fiction_swift.conllu — 79.88 KB
AMALGUM_fiction_casterbridge.conllu — 78.78 KB
AMALGUM_fiction_limberlost.conllu — 82.60 KB
AM

In [14]:
def merge_apostrophe_tokens(tokens):
    merged_tokens = []
    i = 0
    while i < len(tokens):
        if i < len(tokens) - 1 and tokens[i+1].lower() in {
            "'s", "’s", "'re", "’re", "'ve", "’ve", "'ll", "’ll", "'d", "’d", "'m", "’m", "n't", "n’t"
        }:
            merged = tokens[i] + tokens[i+1]
            merged_tokens.append(merged)
            i += 2
        else:
            merged_tokens.append(tokens[i])
            i += 1
    return merged_tokens

def clean_token(token):
    token = token.lower()
    token = re.sub(r"^[^\w']+|[^\w']+$", "", token)
    if re.fullmatch(r"[a-zA-Z]+'[a-zA-Z]+", token) or token.isalpha():
        return token
    return ""

merged_tokens = merge_apostrophe_tokens(all_tokens)

cleaned_tokens = [clean_token(tok) for tok in merged_tokens if clean_token(tok)]
raw_clean_text = ' '.join(cleaned_tokens)

word_freq = Counter(cleaned_tokens)

In [15]:
total_words = sum(word_freq.values())
unique_words = len(word_freq)

freq_df = pd.DataFrame(word_freq.items(), columns=['word', 'count'])
freq_df['percent'] = (freq_df['count'] / total_words) * 100
freq_df = freq_df.sort_values(by='count', ascending=False).reset_index(drop=True)

df_size_kb = freq_df.memory_usage(deep=True).sum() / 1024
print(f"DataFrame size: {df_size_kb:.2f} KB")
print(f"Total words: {total_words}")
print(f"Unique words: {unique_words}")
print("\nTop 50 most common words:")
print(freq_df.head(50))

DataFrame size: 2919.45 KB
Total words: 541570
Unique words: 36878

Top 50 most common words:
     word  count   percent
0     the  34815  6.428532
1      of  20118  3.714755
2     and  18232  3.366508
3      to  13462  2.485736
4      in  12618  2.329893
5       a  12191  2.251048
6     was   8866  1.637092
7      he   7783  1.437118
8     his   6041  1.115461
9    that   5333  0.984730
10      i   4779  0.882434
11     as   4469  0.825193
12    for   4349  0.803036
13     it   4343  0.801928
14   with   4151  0.766475
15    had   3904  0.720867
16     on   3788  0.699448
17     at   3485  0.643499
18    her   3153  0.582196
19     by   2860  0.528094
20    she   2859  0.527910
21     is   2857  0.527540
22   from   2563  0.473254
23    but   2485  0.458851
24    you   2473  0.456635
25    not   2181  0.402718
26    him   2105  0.388685
27   were   1966  0.363019
28     be   1953  0.360618
29     an   1913  0.353232
30  which   1902  0.351201
31   have   1805  0.333290
32   this   176

In [16]:
from google.colab import drive
drive.mount('/content/drive')

output_path = "/content/drive/MyDrive/NWP/cleaned_text_larger.txt"

with open(output_path, "w", encoding="utf-8") as f:
    f.write(raw_clean_text)

print(f"Saved cleaned text to: {output_path}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Saved cleaned text to: /content/drive/MyDrive/NWP/cleaned_text_larger.txt
