<a href="https://colab.research.google.com/github/anagha1999/anlp-project/blob/main/kannada/2.Clean_Preprocess_Kannada_Dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Clone the project repository into ./anlp-project; later cells read the
# Kannada PDFs and pre-processed texts from paths under that directory.
!git clone https://github.com/anagha1999/anlp-project/

Cloning into 'anlp-project'...
remote: Enumerating objects: 170, done.[K
remote: Counting objects: 100% (147/147), done.[K
remote: Compressing objects: 100% (119/119), done.[K
remote: Total 170 (delta 56), reused 101 (delta 27), pack-reused 23 (from 1)[K
Receiving objects: 100% (170/170), 39.37 MiB | 19.24 MiB/s, done.
Resolving deltas: 100% (62/62), done.


In [2]:
import requests
import os

# Remote resources to fetch into the working directory. Each one is saved
# under the last path component of its URL.
urls = [
    "https://raw.githubusercontent.com/crvineeth97/kannada-stop-words/refs/heads/master/stop-words.txt"
]

for url in urls:
    filename = url.split('/')[-1]
    # Bound the wait so a stalled connection cannot hang the notebook.
    response = requests.get(url, timeout=30)
    # Fail loudly on 4xx/5xx instead of silently saving an error page
    # as if it were the stop-word list.
    response.raise_for_status()
    with open(filename, 'wb') as f:
        f.write(response.content)
    print(f"Downloaded: {filename}")

Downloaded: stop-words.txt


## Extract Text from PDFs
Use `pypdf` to extract text from specific pages of the following PDF files: pages 17-236 of `1-janapada-kathegalu.pdf` and pages 10-225 of `2-Satrii_Niiti_Kathegal'u_text.pdf`. Then load the stop words from `stop-words.txt`, remove them from the extracted text, and save the cleaned paragraphs into a directory named `kannada-pre-processed` as `1-janapada-kathegalu.csv` and `2-Satrii_Niiti_Kathegal'u_text.csv` (columns `paragraph_num` and `cleaned_text`). Finally, summarize the extraction and cleaning process and confirm the paths of the saved files.

In [34]:
!pip install pypdf



In [35]:
import os
import requests
from pypdf import PdfReader

# PDFs to process, each with the 1-based, inclusive page range to extract.
pdf_configs = [
    {'filename': '1-janapada-kathegalu.pdf', 'start': 17, 'end': 236},
    {'filename': "2-Satrii_Niiti_Kathegal'u_text.pdf", 'start': 10, 'end': 225}
]

# Directory where the repo was cloned
repo_dir = 'anlp-project/kannada/kannada-dataset'

# Fallback download location. The path mirrors repo_dir's layout inside the
# GitHub repository (note the leading "kannada/" segment).
raw_base_url = "https://github.com/anagha1999/anlp-project/raw/main/kannada/kannada-dataset"

# A file smaller than this many bytes is treated as missing/corrupt.
min_pdf_bytes = 1000

# Maps each PDF filename to the text extracted from its configured page range.
extracted_texts = {}

for config in pdf_configs:
    filename = config['filename']
    start_page = config['start']
    end_page = config['end']

    # 1. Prefer the copy inside the cloned repo.
    file_path = os.path.join(repo_dir, filename)

    # 2. Otherwise fall back to the current directory.
    if not os.path.exists(file_path):
        file_path = filename

    # 3. If still missing or suspiciously small, download it.
    if not os.path.exists(file_path) or os.path.getsize(file_path) < min_pdf_bytes:
        print(f"File {filename} missing or corrupt. Downloading...")
        url = f"{raw_base_url}/{filename}"
        try:
            response = requests.get(url, timeout=60)
            response.raise_for_status()
        except requests.RequestException as e:
            # Nothing to parse for this file; move on to the next one
            # instead of handing a missing path to PdfReader.
            print(f"Failed to download {filename}: {e}")
            continue
        with open(filename, 'wb') as f:
            f.write(response.content)
        file_path = filename

    print(f"Processing file: {file_path}")

    try:
        reader = PdfReader(file_path)
        total_pages = len(reader.pages)

        # Configured pages are 1-based and inclusive while reader.pages is
        # 0-based, so pages start..end map to indices [start - 1, end).
        # Clamp the upper bound so a shorter-than-expected PDF is reported
        # rather than silently truncated.
        last_page = min(end_page, total_pages)
        if last_page < end_page:
            print(f"Warning: {filename} has only {total_pages} pages; "
                  f"requested pages up to {end_page}")

        # `or ""` keeps the join below safe if a page yields no text.
        text_content = [
            reader.pages[i].extract_text() or ""
            for i in range(start_page - 1, last_page)
        ]

        extracted_texts[filename] = "\n".join(text_content)
        print(f"Successfully extracted text from {filename}")
    except Exception as e:
        # Best effort per file: report the failure and keep going.
        print(f"Error extracting text from {file_path}: {e}")

# Print the first 500 characters of each extracted text
for filename, text in extracted_texts.items():
    print(f"\n--- Start of {filename} ---")
    print(text[:500])
    print("--- End of Preview ---")

Processing file: anlp-project/kannada/kannada-dataset/1-janapada-kathegalu.pdf
Successfully extracted text from 1-janapada-kathegalu.pdf
Processing file: anlp-project/kannada/kannada-dataset/2-Satrii_Niiti_Kathegal'u_text.pdf
Successfully extracted text from 2-Satrii_Niiti_Kathegal'u_text.pdf

--- Start of 1-janapada-kathegalu.pdf ---
೧. ಮುತುಗದ ಎಲೆ ಗೋತಾಯಿ 
ಒಂದ್‌ ಪಟ್ಟಣ. ಒಬ್ಬ ದೊಡ್ಡ ದೊರೆ. ಆ ದೊಡ್ಡ ದೊರೆಗೆ ಏಳ್‌ ಜನ ಮಕ್ಕಳು. ಆ 
ಏಳ್‌ ಜನ ಮಕ್ಕಳಿಗೂ ಚೆನ್ನಾಗಿ ಓದ್ದಿ, ಓದ್‌ ಬಂದೋರ್‌ಗೆ ಹಾಲು ತುಪ್ಪ ಅನ್ನ ಹಾಕಿ 
ಮಂಚದ ಮೇಲೆ ಕೂಡ್ರಿಸಿ ಚೆನ್ನಾಗಿ ಸಾಕೋರು. ಏಳು ಜನ ಮಕ್ಕಳೂವೆ ಹೆಣ್‌ ಮಕ್ಕಳೇ 
ಹುಟ್ಟಿದ್ದಾರಲ್ಲ ಅಂತ, ಆ ದೊರೆ ಒಂದಿನ ಹೆಣ್‌ಮಕ್ಕಳನ್ನೆಲ್ಲ ಕೇಳಿದನು, ನನ್ನಾರವ್ವ 
ಸಾಕೋರು ಅಂತ. ಆರು ಜನ ಮಕ್ಕಳೂವೆ ನಾನ್‌ ಸಾಕ್ತೀನಿ, ನಾನ್‌ ಸಾಕ್ತೀನಿಅಂತ ಅಂದ್ರು. 
ಆದರೆ ಕೊನೆ ಕಿರಿಮಗಳು ಮಾತ್ರ ನಾನ್‌ ಸಾಕಾಕಿಲ್ಲ ಅಂದ್ಲು. ಅಪ್ಪ ನಾವ್‌ ಸಾಕಾಕೆ ಆಗ್ಕದ? 
ಭಗವಂತ ನಮ್ಮನ್ನೆಲ್ಲ ಸಾಕ್‌ಬೇಕು. ಈ ಮಾತನ್ನೆ ಹೇಳ್ಕೊಂಡು ಬಂದ್ಲು. ಆಗ ಆ
--- End of Preview ---

--- Start of 2-Satrii_Niiti_Kathegal'u_text.pdf ---
ಸ್ತ್ರೀ ನೀತಿ ಕಥೆಗಳು 
ಪುರಾತನ ಕಾಲದಲ್ಲಿ ಗೋಪಾಲಸೆ ಟ್ಟ ಎಂಬೊಬ್ಬ ವ್ಯಾಪಾರಿಯು 
ಕಾಶಿ ರ ನಗರದಲ್ಲಿ ವಾ

In [None]:
import os
import re
import csv

def _strip_stop_words(paragraph, stop_words):
    """Return `paragraph` with stop words removed, keeping its line breaks.

    Each line is split on whitespace and tokens found in `stop_words` are
    dropped; remaining tokens are re-joined with single spaces. Leading and
    trailing whitespace of the whole paragraph is stripped, so a paragraph
    made up only of stop words comes back as an empty string.

    Tokens are compared verbatim, so a stop word with punctuation attached
    (for example a trailing comma) is not removed.
    """
    cleaned_lines = [
        " ".join(word for word in line.split() if word not in stop_words)
        for line in paragraph.splitlines()
    ]
    return "\n".join(cleaned_lines).strip()


# Load stop words (whitespace-separated tokens in the downloaded file)
with open('stop-words.txt', 'r', encoding='utf-8') as f:
    stop_words = set(f.read().split())

# Create output directory
# NOTE(review): the cells further down read '.txt' files split on '---' from
# the cloned repo, not the CSVs written here - confirm which format is intended.
output_dir = 'kannada-pre-processed'
os.makedirs(output_dir, exist_ok=True)

cleaned_files = []

# Process each extracted text
for filename, text in extracted_texts.items():
    # A paragraph break is one or more blank (whitespace-only) lines. Split
    # on the break directly rather than via an inserted '---' marker, so a
    # literal '---' inside the extracted text cannot create bogus paragraphs.
    raw_paragraphs = re.split(r'\n\s*\n', text)

    # Remove stop words and drop paragraphs that end up empty.
    cleaned_paragraphs = [
        cleaned
        for cleaned in (_strip_stop_words(para, stop_words) for para in raw_paragraphs)
        if cleaned
    ]

    # Determine output filename (change extension to .csv)
    base_name = os.path.basename(filename)
    csv_filename = os.path.splitext(base_name)[0] + ".csv"
    output_path = os.path.join(output_dir, csv_filename)

    # Save as CSV with paragraph_num and cleaned_text columns.
    # newline='' lets the csv module control line endings itself.
    with open(output_path, 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['paragraph_num', 'cleaned_text'])
        writer.writerows(enumerate(cleaned_paragraphs, start=1))

    cleaned_files.append(output_path)

print("Extraction and cleaning process completed.")
print(f"Total files processed: {len(cleaned_files)}")
print("Saved files at:")
for path in cleaned_files:
    print(f"  {path}")

Extraction and cleaning process completed (structure preserved).
Saved files at:
kannada-pre-processed/1-janapada-kathegalu.txt
kannada-pre-processed/2-Satrii_Niiti_Kathegal'u_text.txt


## Generate Paragraph Embeddings for each Text [Kannada]


In [3]:
import os

# Sanity check: count the '---'-delimited paragraphs in one pre-processed
# text from the cloned repository and preview the first one.
file_path_to_check = os.path.join('anlp-project', 'kannada', 'kannada-pre-processed', '1-janapada-kathegalu.txt')

if not os.path.exists(file_path_to_check):
    # Nothing to inspect; report the missing path and stop here.
    print(f"Error: File not found at {file_path_to_check}")
else:
    # Read the whole file as UTF-8 text.
    with open(file_path_to_check, 'r', encoding='utf-8') as f:
        content = f.read()

    # Split on the "---" separator, trim each chunk, and discard chunks
    # that are empty after trimming.
    paragraphs = list(filter(None, map(str.strip, content.split('---'))))

    print(f"File: {file_path_to_check}")
    print(f"Total Paragraphs (separated by '---'): {len(paragraphs)}")

    # Preview only when at least one paragraph was found.
    if len(paragraphs) > 0:
        print(f"First paragraph start: {paragraphs[0][:50]}...")

File: anlp-project/kannada/kannada-pre-processed/1-janapada-kathegalu.txt
Total Paragraphs (separated by '---'): 220
First paragraph start: ೧. ಮುತುಗದ ಎಲೆ ಗೋತಾಯಿ
ಒಂದ್‌ ಪಟ್ಟಣ. ಒಬ್ಬ ದೊಡ್ಡ ದೊರೆ....


In [None]:
from sentence_transformers import SentenceTransformer
# Identifier of the sentence-embedding model handed to SentenceTransformer.
EMBEDDINGS_MODEL_NAME='l3cube-pune/indic-sentence-similarity-sbert'
# Instantiate the model once; the embedding cell below calls model.encode on it.
# NOTE(review): presumably fetches the model weights on first use - confirm.
model = SentenceTransformer(EMBEDDINGS_MODEL_NAME)

In [None]:
import os

# Directory of pre-processed texts inside the cloned repository.
target_dir = 'anlp-project/kannada/kannada-pre-processed'
# Maps each '.txt' file name to the result of model.encode on its paragraphs.
file_embeddings = {}

if not os.path.exists(target_dir):
    print(f"Target directory not found: {target_dir}")
else:
    # os.listdir returns entries in arbitrary order; sort them so the
    # processing order (and the key order of file_embeddings) is the same
    # on every run.
    for filename in sorted(os.listdir(target_dir)):
        if not filename.endswith('.txt'):
            continue

        filepath = os.path.join(target_dir, filename)
        with open(filepath, 'r', encoding='utf-8') as f:
            content = f.read()

        # Split content into paragraphs using '---' as separator and filter empty strings
        paragraphs = [p.strip() for p in content.split('---') if p.strip()]

        if not paragraphs:
            print(f"File: {filename} - No paragraphs found.")
            continue

        # Generate one embedding per paragraph.
        embeddings = model.encode(paragraphs)
        file_embeddings[filename] = embeddings

        print(f"File: {filename}")
        print(f"  Paragraphs processed: {len(paragraphs)}")
        print(f"  Embeddings shape: {embeddings.shape}")

In [None]:
import pickle

# Output path for the pickled embeddings, relative to the current working directory.
embeddings_file_path = 'kannada_texts_embeddings.pkl'

# Serialize the whole file_embeddings dictionary (file name -> embeddings)
# into a single pickle file, overwriting any existing file at that path.
# Only unpickle this file from a trusted source: pickle.load can execute
# arbitrary code.
with open(embeddings_file_path, 'wb') as f:
    pickle.dump(file_embeddings, f)

print(f"Successfully saved paragraph embeddings to {embeddings_file_path}")