In [13]:
import re
import unicodedata
import pandas as pd

In [14]:
# Tamil Unicode Normalization
def normalize_tamil(text):
    return unicodedata.normalize("NFC", text.strip())

In [15]:

# Remove trailing numbers (e.g., ... 21)
def remove_trailing_number(line):
    return re.sub(r'\s*\d+\s*$', '', line).strip()


In [16]:
# Detect metadata/header lines
def is_metadata(line):
    # Remove section titles like "1. அறத்துப்பால்", "1.1", "திருக்குறள்"
    # Contains non-Tamil or Tamil + digits in ways that do not form poetry lines
    if re.match(r'^\s*$', line):   # empty
        return True
    if re.match(r'^\d+(\.\d+)*', line):  # numeric headings
        return True
    if "திருக்குறள்" in line and len(line) < 20:
        return True
    return False

In [17]:
# Main Preprocessing Function

def preprocess_thirukkural(file_path):
    with open(file_path, "r", encoding="utf-8") as f:
        lines = f.readlines()

    cleaned_lines = []

    # Step A — Clean line by line
    for line in lines:
        line = line.strip()
        
        if is_metadata(line):
            continue  # skip titles/headers
        
        line = normalize_tamil(line)
        line = remove_trailing_number(line)

        if len(line) < 2:
            continue  # skip tiny lines
        
        cleaned_lines.append(line)

    # Step B — Group every 2 lines into 1 couplet
    couplets = []
    for i in range(0, len(cleaned_lines), 2):
        if i + 1 < len(cleaned_lines):
            couplet = cleaned_lines[i] + " " + cleaned_lines[i + 1]
        else:
            couplet = cleaned_lines[i]  # fallback if odd number
        couplets.append(couplet)

    # Step C — Sanity check
    print("Total lines after cleaning:", len(cleaned_lines))
    print("Total couplets extracted:", len(couplets))

    # Step D — Return as a dataframe
    df = pd.DataFrame({
        "kural_number": list(range(1, len(couplets) + 1)),
        "couplet": couplets
    })

    return df

In [18]:
# Running the Script for Thirukkural Dataset

if __name__ == "__main__":
    input_file = "./tamil-dataset/Thiruvalluvar_Thirukkural.txt"
    df = preprocess_thirukkural(input_file)
    df.to_csv("./tamil-csv/thirukkural_cleaned.csv", index=False, encoding="utf-8")
    print("Saved cleaned couplets to thirukkural_cleaned.csv")

Total lines after cleaning: 2668
Total couplets extracted: 1334
Saved cleaned couplets to thirukkural_cleaned.csv


In [19]:
import re
import unicodedata
import pandas as pd

# Tamil Unicode Normalization
def normalize_tamil(text):
    return unicodedata.normalize("NFC", text.strip())

# Remove leading numbering (e.g., "1. ", "32. ")
def remove_number_prefix(line):
    return re.sub(r'^\s*\d+\.\s*', '', line).strip()

# Detect metadata/header lines
section_keywords = {
    "ஆத்திசூடி",
    "கடவுள் வாழ்த்து",
    "உயிர் வருக்கம்",
    "உயிர்மெய் வருக்கம்",
    "ககர வருக்கம்",
    "சகர வருக்கம்",
    "தகர வருக்கம்",
    "நகர வருக்கம்",
    "பகர வருக்கம்",
    "மகர வருக்கம்",
    "வகர வருக்கம்"
}

def is_metadata(line):
    if line.strip() == "":
        return True
    if line.strip() in section_keywords:
        return True
    return False

# MAIN PREPROCESSING FUNCTION - Aathichoodhi

def preprocess_aathichoodi(file_path):
    with open(file_path, "r", encoding="utf-8") as f:
        lines = f.readlines()

    cleaned = []
    
    for line in lines:
        line = line.strip()
        if is_metadata(line):
            continue

        line = remove_number_prefix(line)
        line = normalize_tamil(line)

        if len(line) < 2:
            continue

        cleaned.append(line)

    # Create DataFrame
    df = pd.DataFrame({
        "line_number": list(range(1, len(cleaned) + 1)),
        "text": cleaned
    })

    print("Total lines extracted:", len(cleaned))
    return df


# Preprocessing Avvaiyar - Konraiventhan
Similar format to Aathichoodi - single-line wisdom

In [20]:
# Processing Konraiventhan using same function as Aathichoodi
if __name__ == "__main__":
    input_file = "./tamil-dataset/Avvaiyar_Konraiventhan.txt"
    df = preprocess_aathichoodi(input_file)
    df.to_csv("./tamil-csv/konraiventhan_cleaned.csv", index=False, encoding="utf-8")
    print("Saved as konraiventhan_cleaned.csv")

Total lines extracted: 94
Saved as konraiventhan_cleaned.csv


# Preprocessing 4-Line Verses (Moothurai, Nalvazhi, VivekaCinthamani)
These texts have verses with 4 lines each, numbered at the end

In [21]:
# Preprocessing function for 4-line verses
def preprocess_4line_verses(file_path):
    """
    Preprocess texts with 4-line verses numbered at the end.
    Applicable to: Moothurai, Nalvazhi, VivekaCinthamani
    """
    with open(file_path, "r", encoding="utf-8") as f:
        lines = f.readlines()

    verses = []
    current_verse = []
    
    for line in lines:
        line = line.strip()
        
        # Skip empty lines
        if not line:
            continue
        
        # Skip title and section headers (usually short or all caps or specific keywords)
        if is_metadata(line):
            continue
        
        # Check if line ends with a verse number (e.g., "1", "2.", "1.")
        # Verse numbers appear at the end of the 4th line
        verse_num_match = re.search(r'\s+(\d+)\.?\s*$', line)
        
        if verse_num_match:
            # This is the last line of a verse
            # Remove the verse number from the line
            line_without_num = re.sub(r'\s+\d+\.?\s*$', '', line).strip()
            if line_without_num:
                current_verse.append(normalize_tamil(line_without_num))
            
            # Combine the 4 lines into one verse
            if current_verse:
                verse_text = " ".join(current_verse)
                verses.append(verse_text)
            
            # Reset for next verse
            current_verse = []
        else:
            # Regular line (part of verse)
            line = normalize_tamil(line)
            if len(line) >= 2:
                current_verse.append(line)
    
    # Handle last verse if it didn't have a number
    if current_verse:
        verse_text = " ".join(current_verse)
        verses.append(verse_text)
    
    print("Total verses extracted:", len(verses))
    
    # Create DataFrame
    df = pd.DataFrame({
        "verse_number": list(range(1, len(verses) + 1)),
        "verse": verses
    })
    
    return df

# Preprocessing 8-Line Verses (Ulakanaathar - Ulakaneethi)
This text has verses with 8 lines each, numbered at the end with #

In [22]:
# Preprocessing function for 8-line verses (Ulakaneethi)
def preprocess_8line_verses(file_path):
    """
    Preprocess texts with 8-line verses numbered at the end.
    Applicable to: Ulakanaathar_Ulakaneethi
    """
    with open(file_path, "r", encoding="utf-8") as f:
        lines = f.readlines()

    verses = []
    current_verse = []
    
    for line in lines:
        line = line.strip()
        
        # Skip empty lines
        if not line:
            continue
        
        # Skip title line
        if "உலக நீதி" in line or "ஆசிரியர்" in line:
            continue
        
        # Check if line ends with verse number (e.g., "#1", "#2")
        # Verse numbers appear at the end of the 8th line
        verse_num_match = re.search(r'\s+#(\d+)\s*$', line)
        
        if verse_num_match:
            # This is the last line of a verse
            # Remove the verse number from the line
            line_without_num = re.sub(r'\s+#\d+\s*$', '', line).strip()
            if line_without_num:
                current_verse.append(normalize_tamil(line_without_num))
            
            # Combine the 8 lines into one verse
            if current_verse:
                verse_text = " ".join(current_verse)
                verses.append(verse_text)
            
            # Reset for next verse
            current_verse = []
        else:
            # Regular line (part of verse)
            line = normalize_tamil(line)
            if len(line) >= 2:
                current_verse.append(line)
    
    # Handle last verse if it didn't have a number
    if current_verse:
        verse_text = " ".join(current_verse)
        verses.append(verse_text)
    
    print("Total verses extracted:", len(verses))
    
    # Create DataFrame
    df = pd.DataFrame({
        "verse_number": list(range(1, len(verses) + 1)),
        "verse": verses
    })
    
    return df

# Process All Files and Save to tamil-csv

In [23]:
# Create output directory if it doesn't exist
import os
os.makedirs('./tamil-csv', exist_ok=True)
print("Output directory ready: ./tamil-csv")

Output directory ready: ./tamil-csv


In [24]:
# 1. Process Avvaiyar - Konraiventhan (single-line format)
print("\n" + "="*60)
print("Processing: Avvaiyar - Konraiventhan")
print("="*60)
input_file = "./tamil-dataset/Avvaiyar_Konraiventhan.txt"
df_konrai = preprocess_aathichoodi(input_file)
df_konrai.to_csv("./tamil-csv/konraiventhan_cleaned.csv", index=False, encoding="utf-8")
print("✓ Saved as konraiventhan_cleaned.csv")
print(f"  Total lines: {len(df_konrai)}")
print(f"\nFirst 3 lines:")
print(df_konrai.head(3))


Processing: Avvaiyar - Konraiventhan
Total lines extracted: 94
✓ Saved as konraiventhan_cleaned.csv
  Total lines: 94

First 3 lines:
   line_number                            text
0            1                  கொன்றை வேந்தன்
1            2  கொன்றை வேந்தன் செல்வன் அடியினை
2            3  என்றும் ஏத்தித் தொழுவோம் யாமே.


In [25]:
# 2. Process Avvaiyar - Moothurai (4-line verses)
print("\n" + "="*60)
print("Processing: Avvaiyar - Moothurai")
print("="*60)
input_file = "./tamil-dataset/Avvaiyar_Moothurai.txt"
df_mooth = preprocess_4line_verses(input_file)
df_mooth.to_csv("./tamil-csv/moothurai_cleaned.csv", index=False, encoding="utf-8")
print("✓ Saved as moothurai_cleaned.csv")
print(f"  Total verses: {len(df_mooth)}")
print(f"\nFirst 3 verses:")
print(df_mooth.head(3))


Processing: Avvaiyar - Moothurai
Total verses extracted: 30
✓ Saved as moothurai_cleaned.csv
  Total verses: 30

First 3 verses:
   verse_number                                              verse
0             1  மூதுரை வாக்குண்டாம் நல்ல மனமுண்டாம் மாமலராள் ந...
1             2  நல்லார் ஒருவர்க்குச் செய்த உபகாரம் கல்மேல் எழு...
2             3  இன்னா இளமை வறுமைவந் தெய்தியக்கால் இன்னா அளவில்...


In [26]:
# 3. Process Avvaiyar - Nalvazhi (4-line verses)
print("\n" + "="*60)
print("Processing: Avvaiyar - Nalvazhi")
print("="*60)
input_file = "./tamil-dataset/Avvaiyar_Nalvazhi.txt"
df_nalvazhi = preprocess_4line_verses(input_file)
df_nalvazhi.to_csv("./tamil-csv/nalvazhi_cleaned.csv", index=False, encoding="utf-8")
print("✓ Saved as nalvazhi_cleaned.csv")
print(f"  Total verses: {len(df_nalvazhi)}")
print(f"\nFirst 3 verses:")
print(df_nalvazhi.head(3))


Processing: Avvaiyar - Nalvazhi
Total verses extracted: 40
✓ Saved as nalvazhi_cleaned.csv
  Total verses: 40

First 3 verses:
   verse_number                                              verse
0             1  நல்வழி பாலும் தெளிதேனும் பாகும் பருப்புமிவை நா...
1             2  சாதி இரண்டொழிய வேறில்லை சாற்றுங்கால் நீதி வழுவ...
2             3  இடும்பைக்(கு) இடும்பை இயலுடம்(பு) இதன்றே இடும்...


In [27]:
# 4. Process Ulakanaathar - Ulakaneethi (8-line verses)
print("\n" + "="*60)
print("Processing: Ulakanaathar - Ulakaneethi")
print("="*60)
input_file = "./tamil-dataset/Ulakanaathar_Ulakaneethi.txt"
df_ulaka = preprocess_8line_verses(input_file)
df_ulaka.to_csv("./tamil-csv/ulakaneethi_cleaned.csv", index=False, encoding="utf-8")
print("✓ Saved as ulakaneethi_cleaned.csv")
print(f"  Total verses: {len(df_ulaka)}")
print(f"\nFirst 3 verses:")
print(df_ulaka.head(3))


Processing: Ulakanaathar - Ulakaneethi
Total verses extracted: 13
✓ Saved as ulakaneethi_cleaned.csv
  Total verses: 13

First 3 verses:
   verse_number                                              verse
0             1  ஓதாமல் ஒருநாளும் இருக்க வேண்டாம் ஒருவரையும் பொ...
1             2  நெஞ்சாரப் பொய் தன்னைச் சொல்ல வேண்டாம் நிலையில்...
2             3  மனம்போன போக்கு எல்லாம் போக வேண்டாம் மாற்றானை உ...


In [28]:
# 5. Process VivekaCinthamani (4-line verses)
print("\n" + "="*60)
print("Processing: VivekaCinthamani")
print("="*60)
input_file = "./tamil-dataset/VivekaCinthamani.txt"
df_viveka = preprocess_4line_verses(input_file)
df_viveka.to_csv("./tamil-csv/vivekacinthamani_cleaned.csv", index=False, encoding="utf-8")
print("✓ Saved as vivekacinthamani_cleaned.csv")
print(f"  Total verses: {len(df_viveka)}")
print(f"\nFirst 3 verses:")
print(df_viveka.head(3))


Processing: VivekaCinthamani
Total verses extracted: 135
✓ Saved as vivekacinthamani_cleaned.csv
  Total verses: 135

First 3 verses:
   verse_number                                              verse
0             1  விவேக சிந்தாமணி அல்லல்போம்; வல்வினைபோம்; அன்னை...
1             2  பிள்ளைதான் வயதில் மூத்தால் பிதாவின் சொல் புத்த...
2             3  குக்கலைப் பிடித்து நாவிக் கூண்டினில் அடைத்து வ...


In [30]:
# Summary
print("\n" + "="*60)
print("PREPROCESSING COMPLETE")
print("="*60)
print("\nProcessed Files:")
print("1. ✓ Avvaiyar - Konraiventhan    → konraiventhan_cleaned.csv")
print("2. ✓ Avvaiyar - Moothurai        → moothurai_cleaned.csv")
print("3. ✓ Avvaiyar - Nalvazhi         → nalvazhi_cleaned.csv")
print("4. ✓ Ulakanaathar - Ulakaneethi  → ulakaneethi_cleaned.csv")
print("5. ✓ VivekaCinthamani            → vivekacinthamani_cleaned.csv")
print(f"\nAll files saved to: ./tamil-csv/")



PREPROCESSING COMPLETE

Processed Files:
1. ✓ Avvaiyar - Konraiventhan    → konraiventhan_cleaned.csv
2. ✓ Avvaiyar - Moothurai        → moothurai_cleaned.csv
3. ✓ Avvaiyar - Nalvazhi         → nalvazhi_cleaned.csv
4. ✓ Ulakanaathar - Ulakaneethi  → ulakaneethi_cleaned.csv
5. ✓ VivekaCinthamani            → vivekacinthamani_cleaned.csv

All files saved to: ./tamil-csv/
