In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import re
from collections import Counter
from google.colab import drive

# --- 0. Configuration ---
path_to_adv_txt = '/content/drive/My Drive/Colab Notebooks/adv.txt'
path_to_code_txt = '/content/drive/My Drive/Colab Notebooks/code.txt'

# --- 2. Preprocessing Functions ---

def preprocess_and_tokenize(text, mode):
    """
    Applies the preprocessing rules based on the dataset type
    and returns a list of tokens.
    """
    # Convert the text to lowercase (common for both)
    text = text.lower()

    if mode == 'text':
        text = re.sub(r'[^a-z0-9 \.]', '', text)
        tokens = text.split()

    elif mode == 'code':
        tokens = re.findall(r'\w+|[^\w\s]', text)

    return tokens

def process_and_report(file_path, mode):
    """
    Loads, processes, and prints the full report for a given file.
    """
    print(f"--- Processing Report for: {file_path} (Mode: {mode}) ---")

    # 1. Load Data
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            corpus = f.read()
    except FileNotFoundError:
        print(f"Error: File not found at '{file_path}'")
        print("Please make sure the file exists and the path is correct.\n")
        return 

    # 2. Preprocess
    all_tokens = preprocess_and_tokenize(corpus, mode)

    if not all_tokens:
        print("No tokens were found after preprocessing. Is the file empty?\n")
        return

    vocabulary = set(all_tokens)
    word_counts = Counter(all_tokens)

    # 3. Report Frequencies
    print(f"\nVocabulary Size: {len(vocabulary)}")

    most_frequent = word_counts.most_common(10)
    print("\n10 Most Frequent Words:")
    for word, count in most_frequent:
        print(f"- {word}: {count}")

    least_frequent = word_counts.most_common()[:-11:-1]
    print("\n10 Least Frequent Words:")
    for word, count in least_frequent:
        print(f"- {word}: {count}")

    # 4. Report (X, y) Pairs
    print("\n--- (X, y) Pair Generation Example ---")
    CONTEXT_SIZE = 5
    padding = ['.'] * CONTEXT_SIZE
    padded_tokens = padding + all_tokens

    training_pairs = []
    for i in range(len(padded_tokens) - CONTEXT_SIZE):
        context = padded_tokens[i : i + CONTEXT_SIZE]
        target = padded_tokens[i + CONTEXT_SIZE]
        training_pairs.append((context, target))

    for context, target in training_pairs[:10]:
        print(f"{' '.join(context)} ---> {target}")
    print("\n") 


# --- Main execution ---

process_and_report(path_to_adv_txt, 'text')

print("="*60 + "\n") 

process_and_report(path_to_code_txt, 'code')

--- Processing Report for: /content/drive/My Drive/Colab Notebooks/adv.txt (Mode: text) ---

Vocabulary Size: 16821

10 Most Frequent Words:
- the: 4795
- and: 2572
- of: 2458
- to: 2435
- i: 2420
- a: 2344
- in: 1607
- that: 1494
- it: 1304
- was: 1216

10 Least Frequent Words:
- ebooks.: 1
- newsletter: 1
- tosubscribe: 1
- gutenbergincluding: 1
- includes: 1
- www.gutenberg.org.this: 1
- searchfacility: 1
- pg: 1
- paperedition.most: 1
- notnecessarily: 1

--- (X, y) Pair Generation Example ---
. . . . . ---> the
. . . . the ---> project
. . . the project ---> gutenberg
. . the project gutenberg ---> ebook
. the project gutenberg ebook ---> of
the project gutenberg ebook of ---> the
project gutenberg ebook of the ---> adventures
gutenberg ebook of the adventures ---> of
ebook of the adventures of ---> sherlock
of the adventures of sherlock ---> holmesby



--- Processing Report for: /content/drive/My Drive/Colab Notebooks/code.txt (Mode: code) ---

Vocabulary Size: 32821

10 Most Fr