In [47]:
import re
import unicodedata
import pandas as pd

In [48]:
# ---------------------------
# 1. Load Tamil Stopwords (Minimal List)
# ---------------------------
# Minimal set of Tamil stop words looked up by remove_stopwords().
# Each word is listed exactly once (the original literal repeated "ஒரு").
tamil_stopwords = {
    "மற்றும்", "ஆகிய", "ஆனால்", "என", "என்று", "எனில்", "என்ன",
    "இது", "அது", "இவர்", "அவர்", "அவர்கள்", "நான்", "நீ",
    "என்", "உன்", "எங்களுக்கு", "அவற்றின்", "உள்ள", "ஒரு",
    "எது", "எவை", "யார்", "எப்படி",
    "தான்", "வேண்டும்", "மட்டும்", "இல்லை", "உண்டு",
    "என்னும்", "எனும்", "அல்லது"
}

In [49]:
# ---------------------------
# 2. Helper: Tamil Unicode Normalization
# ---------------------------
def normalize_tamil(text):
    """Return ``text`` without surrounding whitespace, in Unicode NFC form."""
    trimmed = text.strip()
    return unicodedata.normalize("NFC", trimmed)

In [50]:
# ---------------------------
# 3. Helper: Remove trailing numbers (e.g., ... 21)
# ---------------------------
def remove_trailing_number(line):
    """Drop a run of digits (and the whitespace around it) from the end of ``line``.

    The result is always stripped of surrounding whitespace, whether or not
    a trailing number was found.
    """
    trailing = re.search(r'\s*\d+\s*$', line)
    if trailing is None:
        return line.strip()
    # Keep only what precedes the trailing number.
    return line[:trailing.start()].strip()


In [51]:
# ---------------------------
# 4. Helper: Remove Tamil stopwords
# ---------------------------
def remove_stopwords(text, stopwords=None):
    """Remove stop words from whitespace-separated ``text``.

    Parameters
    ----------
    text : str
        Input text; it is split on runs of whitespace.
    stopwords : collection of str, optional
        Words to drop. When omitted, the notebook-level ``tamil_stopwords``
        set is used, which is the behaviour of the original one-argument form.

    Returns
    -------
    str
        The remaining words joined with single spaces. A word is dropped only
        when the whole token equals a stop word.
    """
    if stopwords is None:
        # Looked up at call time so that a redefined list is picked up.
        stopwords = tamil_stopwords
    return " ".join(word for word in text.split() if word not in stopwords)


In [52]:
# ---------------------------
# 5. Helper: Detect metadata/header lines
# ---------------------------
def is_metadata(line):
    """Tell whether ``line`` is a header/title line rather than verse text.

    A line counts as metadata when it is blank, when its first character is
    a digit (numeric headings such as "1. அறத்துப்பால்" or "1.1"), or when it
    is shorter than 20 characters and contains the title "திருக்குறள்".
    """
    if not line.strip():
        return True

    # Any leading digit is enough; whatever follows the number is irrelevant.
    starts_with_digit = re.match(r'\d', line) is not None
    is_short_title = len(line) < 20 and "திருக்குறள்" in line
    return starts_with_digit or is_short_title

In [53]:
# ---------------------------
# 6. Main Preprocessing Function
# ---------------------------
def preprocess_thirukkural(file_path):
    """Read a Thirukkural text file and return its couplets as a DataFrame.

    Every input line is stripped, dropped if ``is_metadata`` flags it, NFC
    normalised, and relieved of a trailing number. Lines shorter than two
    characters after cleaning are dropped as well. The surviving lines are
    then paired in order: lines 1+2 form the first couplet, 3+4 the second,
    and so on.

    Parameters
    ----------
    file_path : str
        Path of the UTF-8 text file to read.

    Returns
    -------
    pandas.DataFrame
        Columns ``kural_number`` (1-based running index) and ``couplet``
        (the two lines joined by a single space).

    Notes
    -----
    NOTE(review): the recorded run of this notebook reports 1334 couplets,
    while the Thirukkural is commonly cited as having 1330. Header lines may
    be slipping past ``is_metadata`` and shifting the pairing -- confirm
    against the input file.
    """
    # "utf-8-sig" reads plain UTF-8 as before but also swallows a leading
    # byte-order mark, which would otherwise stick to the first line.
    with open(file_path, "r", encoding="utf-8-sig") as f:
        raw_lines = f.readlines()

    # Step A — Clean line by line
    cleaned_lines = []
    for raw in raw_lines:
        line = raw.strip()

        if is_metadata(line):
            continue  # skip titles/headers

        line = remove_trailing_number(normalize_tamil(line))

        if len(line) < 2:
            continue  # skip tiny lines

        cleaned_lines.append(line)

    # Step B — Group every 2 lines into 1 couplet.
    # A two-element slice degrades to a single line at the very end when the
    # number of cleaned lines is odd. Stop-word removal is intentionally not
    # applied to the couplets.
    couplets = [
        " ".join(cleaned_lines[i:i + 2])
        for i in range(0, len(cleaned_lines), 2)
    ]

    # Step C — Sanity check
    print("Total lines after cleaning:", len(cleaned_lines))
    print("Total couplets extracted:", len(couplets))
    if len(cleaned_lines) % 2:
        # Make the otherwise silent fallback visible: pairing is off somewhere.
        print("Warning: odd number of cleaned lines; the last couplet has only one line")

    # Step D — Return as a dataframe
    df = pd.DataFrame({
        "kural_number": list(range(1, len(couplets) + 1)),
        "couplet": couplets
    })

    return df

In [54]:
# ---------------------------
# 7. Run Script
# ---------------------------
if __name__ == "__main__":
    import os

    input_file = "./dataTamil/Thiruvalluvar_Thirukkural.txt"
    output_file = "./processedDataTamil/thirukkural_cleaned.csv"

    df = preprocess_thirukkural(input_file)

    # The output folder is only created by a cell much further down the
    # notebook; create it here so this cell also works on a fresh checkout
    # (to_csv does not create missing parent folders).
    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    df.to_csv(output_file, index=False, encoding="utf-8")
    print("Saved cleaned couplets to thirukkural_cleaned.csv")

Total lines after cleaning: 2668
Total couplets extracted: 1334
Saved cleaned couplets to thirukkural_cleaned.csv


In [55]:
import re
import unicodedata
import pandas as pd

# ---------------------------
# Minimal Tamil Stopwords
# (same as Thirukkural script)
# ---------------------------
# Kept identical to the stop-word set defined for the Thirukkural cells:
# this assignment replaces that set in the kernel, so a shorter list here
# would silently change what remove_stopwords() drops.
tamil_stopwords = {
    "மற்றும்", "ஆகிய", "ஆனால்", "என", "என்று",
    "எனில்", "என்ன", "இது", "அது", "இவர்", "அவர்",
    "அவர்கள்", "நான்", "நீ", "என்", "உன்", "ஒரு",
    "எங்களுக்கு", "அவற்றின்", "எது", "எவை", "யார்", "எப்படி",
    "உள்ள", "தான்", "வேண்டும்", "மட்டும்", "இல்லை", "உண்டு",
    "என்னும்", "எனும்", "அல்லது"
}

# ---------------------------
# Tamil Unicode Normalization
# ---------------------------
def normalize_tamil(text):
    """Strip surrounding whitespace from ``text`` and compose it to NFC."""
    stripped_text = text.strip()
    composed = unicodedata.normalize("NFC", stripped_text)
    return composed

# ---------------------------
# Remove leading numbering (e.g., "1. ", "32. ")
# ---------------------------
def remove_number_prefix(line):
    """Remove a leading "<digits>." marker (e.g. "1. ", "32. ") from ``line``.

    The dot is required: a bare leading number is left in place. The result
    is stripped of surrounding whitespace in every case.
    """
    prefix = re.match(r'^\s*\d+\.\s*', line)
    remainder = line[prefix.end():] if prefix else line
    return remainder.strip()

# ---------------------------
# Remove Tamil stopwords
# ---------------------------
def remove_stopwords(text, stopwords=None):
    """Drop stop words from ``text`` and re-join the rest with single spaces.

    Parameters
    ----------
    text : str
        Input text, tokenised by splitting on whitespace.
    stopwords : collection of str, optional
        Words to remove. Defaults to the notebook-level ``tamil_stopwords``
        set, so existing one-argument calls behave as before.

    Returns
    -------
    str
        Text made of the tokens that are not equal to any stop word.
    """
    active_stopwords = tamil_stopwords if stopwords is None else stopwords
    kept = [token for token in text.split() if token not in active_stopwords]
    return " ".join(kept)

# ---------------------------
# Detect metadata/header lines
# ---------------------------
# Title and section headings that is_metadata() treats as non-content lines.
# NOTE(review): only Aathichoodi's own title is listed; the same routine is
# later run on Konraiventhan, whose title line is presumably not covered —
# verify against that file.
section_keywords = {
    "ஆத்திசூடி",
    "கடவுள் வாழ்த்து",
    "உயிர் வருக்கம்",
    "உயிர்மெய் வருக்கம்",
    "ககர வருக்கம்",
    "சகர வருக்கம்",
    "தகர வருக்கம்",
    "நகர வருக்கம்",
    "பகர வருக்கம்",
    "மகர வருக்கம்",
    "வகர வருக்கம்",
}


def is_metadata(line):
    """Return True for blank lines and for lines that are exactly a section heading.

    The comparison ignores surrounding whitespace but is otherwise an exact
    match against ``section_keywords``.
    """
    stripped = line.strip()
    return stripped == "" or stripped in section_keywords

# ---------------------------
# MAIN PREPROCESSING FUNCTION
# ---------------------------
def preprocess_aathichoodi(file_path):
    """Read a one-saying-per-line text file and return the sayings as a DataFrame.

    Each input line is stripped and skipped when ``is_metadata`` flags it.
    Otherwise a leading "<number>." marker is removed and the text is NFC
    normalised. Lines that are then shorter than two characters, or that
    turn out to be a header once the number is gone, are skipped too.

    Parameters
    ----------
    file_path : str
        Path of the UTF-8 text file to read.

    Returns
    -------
    pandas.DataFrame
        Columns ``line_number`` (1-based running index) and ``text``.
    """
    cleaned = []

    # "utf-8-sig" reads plain UTF-8 as before but also discards a leading
    # byte-order mark; left in place, the mark would keep a title on the
    # first line from matching the header list exactly.
    with open(file_path, "r", encoding="utf-8-sig") as f:
        for raw in f:
            line = raw.strip()
            if is_metadata(line):
                continue

            line = normalize_tamil(remove_number_prefix(line))

            # Second header check: a heading written with a number in front
            # only becomes recognisable after the prefix has been removed.
            if len(line) < 2 or is_metadata(line):
                continue

            # Stop-word removal is intentionally not applied here.
            cleaned.append(line)

    # Create DataFrame
    df = pd.DataFrame({
        "line_number": list(range(1, len(cleaned) + 1)),
        "text": cleaned
    })

    print("Total lines extracted:", len(cleaned))
    return df


# Preprocessing Avvaiyar - Konraiventhan
Konraiventhan has the same layout as Aathichoodi (one line of wisdom per entry), so it is processed with the same `preprocess_aathichoodi` function.

# Preprocessing Avvaiyar - Konraiventhan
Konraiventhan has the same layout as Aathichoodi (one line of wisdom per entry), so it is processed with the same `preprocess_aathichoodi` function.

In [None]:
# Process Konraiventhan using same function as Aathichoodi
if __name__ == "__main__":
    import os

    input_file = "./dataTamil/Avvaiyar_Konraiventhan.txt"
    output_file = "./processedDataTamil/konraiventhan_cleaned.csv"

    df = preprocess_aathichoodi(input_file)

    # The output folder is only created by a later cell; create it here so
    # this cell does not fail on a fresh checkout (to_csv does not create
    # missing parent folders).
    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    df.to_csv(output_file, index=False, encoding="utf-8")
    print("Saved as konraiventhan_cleaned.csv")

# Preprocessing 4-Line Verses (Moothurai, Nalvazhi, VivekaCinthamani)
Each verse in these texts spans four lines, and the verse number is appended to the end of the fourth line.

In [None]:
# ---------------------------
# Preprocessing function for 4-line verses
# ---------------------------
def preprocess_4line_verses(file_path):
    """
    Preprocess texts whose verses are closed by a verse number.
    Applicable to: Moothurai, Nalvazhi, VivekaCinthamani

    Lines are accumulated until one ends in a number (optionally followed
    by "."); that line closes the verse, the number is removed, and the
    accumulated lines are joined with single spaces. The number of lines
    per verse is not enforced. A final group of lines with no closing
    number is still emitted as a verse.

    Parameters
    ----------
    file_path : str
        Path of the UTF-8 text file to read.

    Returns
    -------
    pandas.DataFrame
        Columns ``verse_number`` (1-based running index, not the number
        found in the file) and ``verse``.

    Notes
    -----
    NOTE(review): header filtering relies on whichever ``is_metadata`` is
    defined in the kernel. The definition preceding this cell only knows
    blank lines and the Aathichoodi section titles, so title lines of these
    texts are presumably not filtered and may end up in the first verse --
    verify against the output.
    """
    # A verse terminator is a number at the end of the line, either after
    # some text and whitespace or standing alone on the line. The "alone"
    # case matters: without it a number-only line of two or more characters
    # would be kept as verse text and the verse would never be closed.
    verse_end = re.compile(r'(?:^|\s+)\d+\.?\s*$')

    verses = []
    current_verse = []

    # "utf-8-sig" reads plain UTF-8 as before but also discards a leading
    # byte-order mark instead of leaving it glued to the first line.
    with open(file_path, "r", encoding="utf-8-sig") as f:
        for raw in f:
            line = raw.strip()

            # Skip empty lines and anything is_metadata() recognises.
            if not line or is_metadata(line):
                continue

            end_match = verse_end.search(line)

            if end_match:
                # Last line of a verse: keep the text before the number.
                closing_text = line[:end_match.start()].strip()
                if closing_text:
                    current_verse.append(normalize_tamil(closing_text))

                # Combine the collected lines into one verse. Stop-word
                # removal is intentionally not applied.
                if current_verse:
                    verses.append(" ".join(current_verse))

                # Reset for next verse
                current_verse = []
            else:
                # Regular line (part of verse)
                line = normalize_tamil(line)
                if len(line) >= 2:
                    current_verse.append(line)

    # Handle last verse if it didn't have a number
    if current_verse:
        verses.append(" ".join(current_verse))

    print("Total verses extracted:", len(verses))

    # Create DataFrame
    df = pd.DataFrame({
        "verse_number": list(range(1, len(verses) + 1)),
        "verse": verses
    })

    return df

# Preprocessing 8-Line Verses (Ulakanaathar - Ulakaneethi)
Each verse in this text spans eight lines; the verse number, prefixed with `#` (e.g. `#1`), is appended to the end of the eighth line.

In [None]:
# ---------------------------
# Preprocessing function for 8-line verses (Ulakaneethi)
# ---------------------------
def preprocess_8line_verses(file_path):
    """
    Preprocess texts whose verses are closed by a "#<number>" marker.
    Applicable to: Ulakanaathar_Ulakaneethi

    Lines are accumulated until one ends in "#" followed by digits; that
    line closes the verse, the marker is removed, and the accumulated lines
    are joined with single spaces. The number of lines per verse is not
    enforced. A final group of lines with no closing marker is still
    emitted as a verse.

    Parameters
    ----------
    file_path : str
        Path of the UTF-8 text file to read.

    Returns
    -------
    pandas.DataFrame
        Columns ``verse_number`` (1-based running index, not the number
        found in the file) and ``verse``.

    Notes
    -----
    NOTE(review): every line containing "உலக நீதி" or "ஆசிரியர்" is dropped,
    not only the title line. A verse line that happens to contain either
    phrase would be lost -- verify against the input file.
    """
    # A verse terminator is "#<digits>" at the end of the line, either after
    # some text and whitespace or standing alone on the line. The "alone"
    # case matters: without it a marker-only line would be kept as verse
    # text and the verse would never be closed.
    verse_end = re.compile(r'(?:^|\s+)#\d+\s*$')

    verses = []
    current_verse = []

    # "utf-8-sig" reads plain UTF-8 as before but also discards a leading
    # byte-order mark instead of leaving it glued to the first line.
    with open(file_path, "r", encoding="utf-8-sig") as f:
        for raw in f:
            line = raw.strip()

            # Skip empty lines
            if not line:
                continue

            # Skip title/author lines
            if "உலக நீதி" in line or "ஆசிரியர்" in line:
                continue

            end_match = verse_end.search(line)

            if end_match:
                # Last line of a verse: keep the text before the marker.
                closing_text = line[:end_match.start()].strip()
                if closing_text:
                    current_verse.append(normalize_tamil(closing_text))

                # Combine the collected lines into one verse. Stop-word
                # removal is intentionally not applied.
                if current_verse:
                    verses.append(" ".join(current_verse))

                # Reset for next verse
                current_verse = []
            else:
                # Regular line (part of verse)
                line = normalize_tamil(line)
                if len(line) >= 2:
                    current_verse.append(line)

    # Handle last verse if it didn't have a number
    if current_verse:
        verses.append(" ".join(current_verse))

    print("Total verses extracted:", len(verses))

    # Create DataFrame
    df = pd.DataFrame({
        "verse_number": list(range(1, len(verses) + 1)),
        "verse": verses
    })

    return df

# Process All Files and Save to processedDataTamil

In [None]:
# Create output directory if it doesn't exist
# exist_ok=True makes this cell safe to run repeatedly.
# NOTE(review): cells earlier in the notebook already write CSV files into
# this folder, so on a fresh checkout this cell would need to run before them.
import os
os.makedirs('./processedDataTamil', exist_ok=True)
print("Output directory ready: ./processedDataTamil")

In [None]:
# 1. Avvaiyar - Konraiventhan (single-line format):
#    clean the text, write the CSV, then preview the first rows.
print("\n" + "="*60, "Processing: Avvaiyar - Konraiventhan", "="*60, sep="\n")

input_file = "./dataTamil/Avvaiyar_Konraiventhan.txt"
df_konrai = preprocess_aathichoodi(input_file)
df_konrai.to_csv("./processedDataTamil/konraiventhan_cleaned.csv", index=False, encoding="utf-8")

print(
    "✓ Saved as konraiventhan_cleaned.csv",
    f"  Total lines: {len(df_konrai)}",
    f"\nFirst 3 lines:",
    sep="\n",
)
print(df_konrai.head(3))

In [None]:
# 2. Avvaiyar - Moothurai (4-line verses):
#    clean the text, write the CSV, then preview the first rows.
print("\n" + "="*60, "Processing: Avvaiyar - Moothurai", "="*60, sep="\n")

input_file = "./dataTamil/Avvaiyar_Moothurai.txt"
df_mooth = preprocess_4line_verses(input_file)
df_mooth.to_csv("./processedDataTamil/moothurai_cleaned.csv", index=False, encoding="utf-8")

print(
    "✓ Saved as moothurai_cleaned.csv",
    f"  Total verses: {len(df_mooth)}",
    f"\nFirst 3 verses:",
    sep="\n",
)
print(df_mooth.head(3))

In [None]:
# 3. Avvaiyar - Nalvazhi (4-line verses):
#    clean the text, write the CSV, then preview the first rows.
print("\n" + "="*60, "Processing: Avvaiyar - Nalvazhi", "="*60, sep="\n")

input_file = "./dataTamil/Avvaiyar_Nalvazhi.txt"
df_nalvazhi = preprocess_4line_verses(input_file)
df_nalvazhi.to_csv("./processedDataTamil/nalvazhi_cleaned.csv", index=False, encoding="utf-8")

print(
    "✓ Saved as nalvazhi_cleaned.csv",
    f"  Total verses: {len(df_nalvazhi)}",
    f"\nFirst 3 verses:",
    sep="\n",
)
print(df_nalvazhi.head(3))

In [None]:
# 4. Ulakanaathar - Ulakaneethi (8-line verses):
#    clean the text, write the CSV, then preview the first rows.
print("\n" + "="*60, "Processing: Ulakanaathar - Ulakaneethi", "="*60, sep="\n")

input_file = "./dataTamil/Ulakanaathar_Ulakaneethi.txt"
df_ulaka = preprocess_8line_verses(input_file)
df_ulaka.to_csv("./processedDataTamil/ulakaneethi_cleaned.csv", index=False, encoding="utf-8")

print(
    "✓ Saved as ulakaneethi_cleaned.csv",
    f"  Total verses: {len(df_ulaka)}",
    f"\nFirst 3 verses:",
    sep="\n",
)
print(df_ulaka.head(3))

In [None]:
# 5. VivekaCinthamani (4-line verses):
#    clean the text, write the CSV, then preview the first rows.
print("\n" + "="*60, "Processing: VivekaCinthamani", "="*60, sep="\n")

input_file = "./dataTamil/VivekaCinthamani.txt"
df_viveka = preprocess_4line_verses(input_file)
df_viveka.to_csv("./processedDataTamil/vivekacinthamani_cleaned.csv", index=False, encoding="utf-8")

print(
    "✓ Saved as vivekacinthamani_cleaned.csv",
    f"  Total verses: {len(df_viveka)}",
    f"\nFirst 3 verses:",
    sep="\n",
)
print(df_viveka.head(3))

In [None]:
# Summary: one print call, one argument per output line.
# The list is static text; it does not check that the files were written.
print(
    "\n" + "="*60,
    "PREPROCESSING COMPLETE",
    "="*60,
    "\nProcessed Files:",
    "1. ✓ Avvaiyar - Konraiventhan    → konraiventhan_cleaned.csv",
    "2. ✓ Avvaiyar - Moothurai        → moothurai_cleaned.csv",
    "3. ✓ Avvaiyar - Nalvazhi         → nalvazhi_cleaned.csv",
    "4. ✓ Ulakanaathar - Ulakaneethi  → ulakaneethi_cleaned.csv",
    "5. ✓ VivekaCinthamani            → vivekacinthamani_cleaned.csv",
    f"\nAll files saved to: ./processedDataTamil/",
    "\nReady for moral scoring!",
    sep="\n",
)