In [47]:
import re
import unicodedata
import pandas as pd

In [48]:
# ---------------------------
# 1. Load Tamil Stopwords (Minimal List)
# ---------------------------
# Words that remove_stopwords() drops; tokens are compared verbatim after a
# whitespace split, so only exact surface forms listed here are removed.
# Each word is listed once (a repeated literal in a set is silently ignored).
tamil_stopwords = {
    "மற்றும்", "ஆகிய", "ஆனால்", "என", "என்று", "எனில்", "என்ன",
    "இது", "அது", "இவர்", "அவர்", "அவர்கள்", "நான்", "நீ",
    "என்", "உன்", "எங்களுக்கு", "அவற்றின்", "உள்ள", "ஒரு",
    "எது", "எவை", "யார்", "எப்படி",
    "தான்", "வேண்டும்", "மட்டும்", "இல்லை", "உண்டு",
    "என்னும்", "எனும்", "அல்லது"
}

In [49]:
# ---------------------------
# 2. Helper: Tamil Unicode Normalization
# ---------------------------
def normalize_tamil(text):
    """Trim surrounding whitespace and return the text in Unicode NFC form."""
    trimmed = text.strip()
    return unicodedata.normalize("NFC", trimmed)

In [50]:
# ---------------------------
# 3. Helper: Remove trailing numbers (e.g., ... 21)
# ---------------------------
def remove_trailing_number(line):
    """Drop a run of digits at the end of the line, then trim whitespace.

    Whitespace immediately before and after the trailing digits is removed
    together with them; a line with no trailing digits is only trimmed.
    """
    without_number = re.sub(r'\s*\d+\s*$', '', line)
    return without_number.strip()


In [51]:
# ---------------------------
# 4. Helper: Remove Tamil stopwords
# ---------------------------
def remove_stopwords(text):
    """Return the text with every token found in tamil_stopwords removed.

    Tokens are produced by a plain whitespace split and the survivors are
    re-joined with single spaces, so original spacing is not preserved.
    """
    return " ".join(
        token for token in text.split() if token not in tamil_stopwords
    )


In [52]:
# ---------------------------
# 5. Helper: Detect metadata/header lines
# ---------------------------
def is_metadata(line):
    """Return True when a line is a blank/heading line rather than verse text.

    Ignoring surrounding whitespace, a line counts as metadata when it
      - is empty,
      - starts with a digit (numeric headings such as "1. அறத்துப்பால்" or "1.1"), or
      - contains the title "திருக்குறள்" and is shorter than 20 characters.

    Args:
        line: A single line of the source text, stripped or not.

    Returns:
        bool: True if the line should be skipped, False otherwise.
    """
    # Strip up front so an indented heading is still recognised when the
    # caller passes the raw line; the blank check already ignored whitespace.
    stripped = line.strip()
    if not stripped:
        return True
    # re.match anchors at the start only, so any line that begins with a
    # digit is treated as a numeric heading.
    if re.match(r'^\d+(\.\d+)*', stripped):
        return True
    if "திருக்குறள்" in stripped and len(stripped) < 20:
        return True
    return False

In [53]:
# ---------------------------
# 6. Main Preprocessing Function
# ---------------------------
def preprocess_thirukkural(file_path):
    """Read the raw Thirukkural text file and return its couplets as a DataFrame.

    Steps:
      A. Skip metadata lines (see is_metadata), NFC-normalize the rest,
         remove trailing numbers, and discard lines shorter than 2 characters.
      B. Join consecutive cleaned lines pairwise into couplets.
      C. Print line/couplet counts as a sanity check.
      D. Build the result frame.

    Args:
        file_path: Path to the UTF-8 text file (a leading BOM is tolerated).

    Returns:
        pandas.DataFrame with columns "kural_number" (1-based position of the
        couplet in the output) and "couplet" (the paired lines joined by a
        single space).
    """
    # "utf-8-sig" decodes plain UTF-8 identically but drops a leading BOM.
    # str.strip() does not remove U+FEFF, so with plain "utf-8" a BOM would
    # stay glued to the first line and could hide a heading from is_metadata().
    with open(file_path, "r", encoding="utf-8-sig") as f:
        lines = f.readlines()

    cleaned_lines = []

    # Step A — Clean line by line
    for line in lines:
        line = line.strip()

        if is_metadata(line):
            continue  # skip titles/headers

        line = normalize_tamil(line)
        line = remove_trailing_number(line)

        if len(line) < 2:
            continue  # skip tiny lines

        cleaned_lines.append(line)

    # Step B — Group every 2 lines into 1 couplet.
    # Slicing two at a time also covers an odd tail: the last slice then
    # holds a single line, which is kept on its own.
    # Stopword removal is currently disabled; apply remove_stopwords() to
    # each couplet here if it is needed.
    couplets = [
        " ".join(cleaned_lines[i:i + 2])
        for i in range(0, len(cleaned_lines), 2)
    ]

    # Step C — Sanity check
    print("Total lines after cleaning:", len(cleaned_lines))
    print("Total couplets extracted:", len(couplets))
    if len(cleaned_lines) % 2:
        # Pairing is purely positional, so an odd count means at least one
        # stray line survived cleaning and couplets may be misaligned.
        print("Warning: odd number of cleaned lines; the last couplet has only one line")

    # Step D — Return as a dataframe
    df = pd.DataFrame({
        "kural_number": list(range(1, len(couplets) + 1)),
        "couplet": couplets
    })

    return df

In [54]:
# ---------------------------
# 7. Run Script
# ---------------------------
if __name__ == "__main__":
    from pathlib import Path

    input_file = "./dataTamil/Thiruvalluvar_Thirukkural.txt"
    output_file = Path("./processedDataTamil/thirukkural_cleaned.csv")

    df = preprocess_thirukkural(input_file)

    # DataFrame.to_csv does not create missing directories, so make sure the
    # output folder exists before writing.
    output_file.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(output_file, index=False, encoding="utf-8")
    print("Saved cleaned couplets to thirukkural_cleaned.csv")

Total lines after cleaning: 2668
Total couplets extracted: 1334
Saved cleaned couplets to thirukkural_cleaned.csv


In [55]:
import re
import unicodedata
import pandas as pd

# ---------------------------
# Minimal Tamil Stopwords
# (same as Thirukkural script)
# ---------------------------
# Kept word-for-word identical to the list in the Thirukkural cell: this
# assignment rebinds the same global, so a shorter list here would silently
# change what remove_stopwords() filters for every later call.
tamil_stopwords = {
    "மற்றும்", "ஆகிய", "ஆனால்", "என", "என்று", "எனில்", "என்ன",
    "இது", "அது", "இவர்", "அவர்", "அவர்கள்", "நான்", "நீ",
    "என்", "உன்", "எங்களுக்கு", "அவற்றின்", "உள்ள", "ஒரு",
    "எது", "எவை", "யார்", "எப்படி",
    "தான்", "வேண்டும்", "மட்டும்", "இல்லை", "உண்டு",
    "என்னும்", "எனும்", "அல்லது"
}

# ---------------------------
# Tamil Unicode Normalization
# ---------------------------
def normalize_tamil(text):
    """Return the whitespace-trimmed text converted to Unicode NFC form."""
    trimmed = text.strip()
    return unicodedata.normalize("NFC", trimmed)

# ---------------------------
# Remove leading numbering (e.g., "1. ", "32. ")
# ---------------------------
def remove_number_prefix(line):
    """Strip a leading "<digits>." marker from the line, then trim whitespace.

    Only a number followed by a dot at the very start (optionally preceded by
    whitespace) is removed; a leading number without a dot is left in place.
    """
    without_prefix = re.sub(r'^\s*\d+\.\s*', '', line)
    return without_prefix.strip()

# ---------------------------
# Remove Tamil stopwords
# ---------------------------
def remove_stopwords(text):
    """Drop every whitespace-separated token that is listed in tamil_stopwords.

    The remaining tokens are joined back with single spaces.
    """
    kept = filter(lambda token: token not in tamil_stopwords, text.split())
    return " ".join(kept)

# ---------------------------
# Detect metadata/header lines
# ---------------------------
# Heading lines that is_metadata() treats as non-verse text. Two standalone
# headings plus one "<group> வருக்கம்" heading per section group.
section_keywords = {"ஆத்திசூடி", "கடவுள் வாழ்த்து"} | {
    group + " வருக்கம்"
    for group in (
        "உயிர்", "உயிர்மெய்",
        "ககர", "சகர", "தகர", "நகர", "பகர", "மகர", "வகர",
    )
}

def is_metadata(line):
    """Return True for blank lines and for lines that exactly match a heading.

    The line is compared after trimming surrounding whitespace; a heading is
    any member of section_keywords.
    """
    stripped = line.strip()
    return stripped == "" or stripped in section_keywords

# ---------------------------
# MAIN PREPROCESSING FUNCTION
# ---------------------------
def preprocess_aathichoodi(file_path):
    """Read the raw Aathichoodi text file and return its lines as a DataFrame.

    Blank lines and section headings (see is_metadata) are skipped. For the
    remaining lines a leading "<digits>." marker is removed, the text is
    NFC-normalized, and anything shorter than 2 characters is discarded.

    Args:
        file_path: Path to the UTF-8 text file (a leading BOM is tolerated).

    Returns:
        pandas.DataFrame with columns "line_number" (1-based position in the
        cleaned output) and "text".
    """
    # "utf-8-sig" decodes plain UTF-8 identically but drops a leading BOM.
    # is_metadata() matches headings by exact equality, and str.strip() does
    # not remove U+FEFF, so a BOM would make a heading on the first line slip
    # through as if it were verse text.
    with open(file_path, "r", encoding="utf-8-sig") as f:
        lines = f.readlines()

    cleaned = []

    for line in lines:
        line = line.strip()
        if is_metadata(line):
            continue

        line = remove_number_prefix(line)
        line = normalize_tamil(line)

        if len(line) < 2:
            continue  # skip tiny leftovers

        # Stopword removal is currently disabled; apply remove_stopwords()
        # here if it is needed.
        cleaned.append(line)

    # Create DataFrame
    df = pd.DataFrame({
        "line_number": list(range(1, len(cleaned) + 1)),
        "text": cleaned
    })

    print("Total lines extracted:", len(cleaned))
    return df


In [56]:

# ---------------------------
# Run Script
# ---------------------------
if __name__ == "__main__":
    from pathlib import Path

    input_file = "./dataTamil/Avvaiyar_Aathichoodi.txt"
    output_file = Path("./processedDataTamil/aathichoodi_cleaned.csv")

    # Must match the name of the function defined in this notebook
    # (preprocess_aathichoodi); any other spelling raises NameError on a
    # freshly restarted kernel.
    df = preprocess_aathichoodi(input_file)

    # DataFrame.to_csv does not create missing directories, so make sure the
    # output folder exists before writing.
    output_file.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(output_file, index=False, encoding="utf-8")
    # Report the file name that was actually written.
    print("Saved as aathichoodi_cleaned.csv")


Total lines extracted: 111
Saved as aathichudi_cleaned.csv
