In [1]:
# AMALGUM Data Source

!git clone https://github.com/gucorpling/amalgum.git

Cloning into 'amalgum'...
remote: Enumerating objects: 148871, done.[K
remote: Counting objects: 100% (7860/7860), done.[K
remote: Compressing objects: 100% (1845/1845), done.[K
remote: Total 148871 (delta 6020), reused 7819 (delta 6014), pack-reused 141011 (from 1)[K
Receiving objects: 100% (148871/148871), 714.55 MiB | 20.16 MiB/s, done.
Resolving deltas: 100% (101358/101358), done.
Updating files: 100% (24540/24540), done.


In [2]:
# Imports and Setup

import os
import re
import random
import pandas as pd
import nltk
from collections import Counter

random.seed(42)

In [19]:
# Data Collection

def extract_tokens(filepath):
    tokens = []
    with open(filepath, 'r', encoding='utf-8') as file:
        for line in file:
            if not line.startswith('#') and line.strip():
                parts = line.strip().split('\t')
                if len(parts) >= 2 and '.' not in parts[0]:
                    tokens.append(parts[1])
    return tokens

base_dir = "amalgum/amalgum"
genres = {
    "bio": 150,
    "fiction": 150,
}
max_total_files = sum(genres.values())
all_tokens = []

for genre, file_limit in genres.items():
    genre_dir = os.path.join(base_dir, genre, "dep")
    conllu_files = [f for f in os.listdir(genre_dir) if f.endswith(".conllu")]
    selected_files = random.sample(conllu_files, file_limit)

    for file in selected_files:
        filepath = os.path.join(genre_dir, file)
        print(file)

        tokens = extract_tokens(filepath)
        all_tokens.extend(tokens)

raw_text = ' '.join(all_tokens)

print(f"\nTotal words: {len(all_tokens)}")
print(f"Total characters: {len(raw_text)}")

AMALGUM_bio_komatsu.conllu
AMALGUM_bio_thompson.conllu
AMALGUM_bio_robert.conllu
AMALGUM_bio_lechay.conllu
AMALGUM_bio_herbert.conllu
AMALGUM_bio_korsrud.conllu
AMALGUM_bio_tisquesusa.conllu
AMALGUM_bio_brassington.conllu
AMALGUM_bio_legh.conllu
AMALGUM_bio_jovan.conllu
AMALGUM_bio_albernaz.conllu
AMALGUM_bio_bicchi.conllu
AMALGUM_bio_hakohen.conllu
AMALGUM_bio_cretaceous.conllu
AMALGUM_bio_chenhao.conllu
AMALGUM_bio_gunthorpe.conllu
AMALGUM_bio_kozinski.conllu
AMALGUM_bio_dyche.conllu
AMALGUM_bio_balandin.conllu
AMALGUM_bio_wadham.conllu
AMALGUM_bio_freemason.conllu
AMALGUM_bio_powell.conllu
AMALGUM_bio_lucombe.conllu
AMALGUM_bio_wemyss.conllu
AMALGUM_bio_grainger.conllu
AMALGUM_bio_rufim.conllu
AMALGUM_bio_tahir.conllu
AMALGUM_bio_wallace.conllu
AMALGUM_bio_malinowski.conllu
AMALGUM_bio_carco.conllu
AMALGUM_bio_naotora.conllu
AMALGUM_bio_blocke.conllu
AMALGUM_bio_falkener.conllu
AMALGUM_bio_driver.conllu
AMALGUM_bio_arca.conllu
AMALGUM_bio_juanita.conllu
AMALGUM_bio_lopes.conllu
AMAL

In [20]:
# Data Cleaning

def merge_apostrophe_tokens(tokens):
    merged_tokens = []
    i = 0
    while i < len(tokens):
        if i < len(tokens) - 1 and tokens[i+1].lower() in {
            "'s", "’s", "'re", "’re", "'ve", "’ve", "'ll", "’ll", "'d", "’d", "'m", "’m", "n't", "n’t"
        }:
            merged = tokens[i] + tokens[i+1]
            merged_tokens.append(merged)
            i += 2
        else:
            merged_tokens.append(tokens[i])
            i += 1
    return merged_tokens

def clean_token(token):
    token = token.lower()
    token = re.sub(r"^[^\w']+|[^\w']+$", "", token)
    if re.fullmatch(r"[a-zA-Z]+'[a-zA-Z]+", token) or token.isalpha():
        return token
    return ""

merged_tokens = merge_apostrophe_tokens(all_tokens)

cleaned_tokens = [clean_token(tok) for tok in merged_tokens if clean_token(tok)]
raw_clean_text = ' '.join(cleaned_tokens)

word_freq = Counter(cleaned_tokens)

In [21]:
# Data Analysis

total_words = sum(word_freq.values())
unique_words = len(word_freq)

freq_df = pd.DataFrame(word_freq.items(), columns=['word', 'count'])
freq_df['percent'] = (freq_df['count'] / total_words) * 100
freq_df = freq_df.sort_values(by='count', ascending=False).reset_index(drop=True)

df_size_kb = freq_df.memory_usage(deep=True).sum() / 1024
print(f"DataFrame size: {df_size_kb:.2f} KB")
print(f"Total words: {total_words}")
print(f"Unique words: {unique_words}")
print("\nTop 50 most common words:")
print(freq_df.head(50))

DataFrame size: 1818.61 KB
Total words: 233241
Unique words: 23057

Top 50 most common words:
     word  count   percent
0     the  14374  6.162724
1      of   8654  3.710325
2     and   8176  3.505387
3      to   5829  2.499132
4      in   5481  2.349930
5       a   5079  2.177576
6     was   3711  1.591058
7      he   3209  1.375830
8     his   2481  1.063707
9    that   2343  1.004540
10     as   1936  0.830043
11      i   1885  0.808177
12     it   1850  0.793171
13   with   1827  0.783310
14    for   1826  0.782881
15     on   1669  0.715569
16    had   1595  0.683842
17    she   1547  0.663262
18    her   1514  0.649114
19     at   1392  0.596808
20     is   1265  0.542357
21     by   1187  0.508916
22    you   1108  0.475045
23   from   1067  0.457467
24    but   1051  0.450607
25    not    934  0.400444
26     be    910  0.390154
27    him    864  0.370432
28   they    854  0.366145
29     an    841  0.360571
30  which    795  0.340849
31   were    788  0.337848
32   have    76

In [22]:
# Save Data to Drive

from google.colab import drive
drive.mount('/content/drive')

output_path = "/content/drive/MyDrive/NWP/cleaned_text.txt"

with open(output_path, "w", encoding="utf-8") as f:
    f.write(raw_clean_text)

print(f"Saved cleaned text to: {output_path}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Saved cleaned text to: /content/drive/MyDrive/NWP/cleaned_text.txt
