In [1]:
import numpy as np
import pandas as pd

In [2]:
import re
from unicodedata import normalize

def punjabi_normalize(text, remove_nuktas=True, normalize_nasals=True, remove_virama=True):
    """
    Custom normalization for Punjabi (Gurmukhi) text.

    Steps, in order:
      1. Unicode NFC normalization.
      2. Optionally rewrite a nasal mark (bindi or tippi) that directly
         follows a consonant as a tippi.
      3. Optionally delete the virama and/or the nukta sign.
      4. Map curly quotes to ASCII quotes, the double danda to a single
         danda, and the ellipsis character to three dots.
      5. Collapse every run of whitespace to one space and trim both ends.

    Args:
        text: Input Punjabi text (str).
        remove_nuktas: If True, delete every nukta sign (U+0A3C). If False,
            nuktas are preserved. (Previously this flag was ignored and
            nuktas were always removed.)
        normalize_nasals: If True, a bindi (U+0A02) or tippi (U+0A70) placed
            immediately after a consonant in U+0A15..U+0A39 becomes a tippi.
        remove_virama: If True (default, matching the earlier behaviour),
            delete every virama (U+0A4D). Pass False to keep conjuncts
            intact.

    Returns:
        Normalized Punjabi text.
    """
    # Combining marks are spelled as escapes because they are hard to see
    # (and easy to corrupt) when written literally between quotes.
    virama = '\u0A4D'
    nukta = '\u0A3C'
    tippi = '\u0A70'
    bindi = '\u0A02'

    # Normalize Unicode compositions. The precomposed Gurmukhi nukta letters
    # are composition exclusions, so NFC always yields base letter + U+0A3C;
    # deleting U+0A3C below is therefore enough to cover every nukta letter
    # and no per-letter map is needed.
    text = normalize('NFC', text)

    # Nasal normalization runs BEFORE marks are deleted (same order as
    # before), so a nasal that follows consonant+nukta is left untouched.
    if normalize_nasals:
        text = re.sub('([\u0A15-\u0A39])[' + tippi + bindi + ']', r'\1' + tippi, text)

    # Single-character substitutions. No replacement value contains a key,
    # so one translate() pass is equivalent to sequential str.replace calls.
    table = {
        0x201C: '"',        # left double quotation mark
        0x201D: '"',        # right double quotation mark
        0x2018: "'",        # left single quotation mark
        0x2019: "'",        # right single quotation mark
        0x0965: '\u0964',   # double danda -> single danda
        0x2026: '...',      # horizontal ellipsis
    }
    if remove_virama:
        table[ord(virama)] = None
    if remove_nuktas:
        table[ord(nukta)] = None
    text = text.translate(table)

    # Normalize whitespace
    return re.sub(r'\s+', ' ', text).strip()

In [3]:
def tokenize_punjabi_text(input_text, enable_compound_split=True):
    """
    Split Punjabi text into word and punctuation tokens.

    Whitespace separates tokens and is discarded. Each punctuation mark in
    the set below ends the current word and is emitted as its own
    one-character token.

    Args:
        input_text (str): Normalized Punjabi sentence.
        enable_compound_split (bool): If True, a virama that follows at least
            one character of the current word closes that word and becomes
            the first character of the next token.

    Returns:
        List[str]: Tokens in their original order.
    """
    delimiters = frozenset('।.,!?;:"\'()[]{}॥॰')
    virama = '੍'

    pieces = []
    pending = ''

    for ch in input_text:
        is_delimiter = ch in delimiters

        if is_delimiter or ch.isspace():
            # Word boundary: emit whatever has accumulated so far.
            if pending:
                pieces.append(pending)
                pending = ''
            if is_delimiter:
                pieces.append(ch)
            continue

        if enable_compound_split and pending and ch == virama:
            # Break at the virama; it starts the next token.
            pieces.append(pending)
            pending = ch
            continue

        pending += ch

    if pending:
        pieces.append(pending)

    # Drop anything that is empty once stripped of whitespace.
    return list(filter(str.strip, pieces))


In [4]:
def preprocess_punjabi_text(raw_text, apply_normalization=True, apply_tokenization=True):
    """
    Run the Punjabi preprocessing pipeline: normalize, then tokenize.

    Args:
        raw_text (str): Original Punjabi text.
        apply_normalization (bool): Pass the text through punjabi_normalize
            (with its default options) first.
        apply_tokenization (bool): Pass the (possibly normalized) text
            through tokenize_punjabi_text.

    Returns:
        Union[str, List[str]]: The token list when apply_tokenization is
        True, otherwise the (possibly normalized) string.
    """
    text = punjabi_normalize(raw_text) if apply_normalization else raw_text
    return tokenize_punjabi_text(text) if apply_tokenization else text


In [None]:
# from google.colab import drive
# drive.mount('/content/drive')


In [15]:
# Load the sentence/sentiment dataset from Google Drive.
# NOTE(review): this path only resolves once Drive is mounted at /content/drive,
# and the mount cell in this notebook is commented out — confirm the runtime is
# mounted before running on a fresh kernel.
df = pd.read_csv("/content/drive/MyDrive/punjabiData.csv")


In [None]:
# Preview the first rows of the loaded data.
df.head()

Unnamed: 0,sentence,sentiment
0,ਅਧਿਆਪਕ ਦੇਸ਼ ਤੇ ਕੌਮ ਦਾ ਨਿਰਮਾਤਾ ਹੁੰਦਾ ਹੈ।,positive
1,ਉਹ ਵਿਦਿਆਰਥੀਆਂ ਨੂੰ ਅਗਿਆਨਤਾ ਦੇ ਹਨੇਰੇ ਤੋਂ ਕੱਢ ਕੇ ...,positive
2,ਇਕ ਚੰਗਾ ਅਧਿਆਪਕ ਵਿਦਿਆਰਥੀ ਦਾ ਤੀਜਾ ਨੇਤਰ ਖੋਲ੍ਹ ਸਕਦ...,positive
3,ਅਜਿਹੇ ਅਧਿਆਪਕ ਸਮਾਜ ਵਿੱਚ ਮਾਣ ਅਤੇ ਸਤਿਕਾਰ ਪਾਉਂਦੇ ਹਨ।,positive
4,ਸਾਰੇ ਅਧਿਆਪਕ ਪਿਆਰ ਅਤੇ ਸਤਿਕਾਰ ਦੇ ਪਾਤਰ ਹਨ।,positive


In [16]:
# Count how many rows carry each sentiment label.
df["sentiment"].value_counts()

Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
positive,2642
negative,2463
neutral,2193


In [17]:
from collections import Counter
import re

def build_punjabi_vocabulary(text_list, min_frequency=5, max_frequency=50, max_vocab_size=None,
                              exclude_punctuation=True, exclude_digits=True):
    """
    Extracts a vocabulary dictionary from a list of Punjabi text samples.

    Each sample is split on whitespace. The selected character classes are
    deleted from every word, and the remaining non-empty words are counted.

    Args:
        text_list (List[str]): List of Punjabi sentences or documents.
        min_frequency (int): Minimum frequency (inclusive) to include a word.
        max_frequency (int or None): Maximum frequency (inclusive) to include
            a word. None disables the upper bound.
        max_vocab_size (int or None): Limit the number of words returned
            (None for all). Applied after sorting, so the most frequent
            words are kept.
        exclude_punctuation (bool): Whether to delete punctuation characters
            from words.
        exclude_digits (bool): Whether to delete digits (Gurmukhi and
            Western) from words. This flag is independent of
            exclude_punctuation; previously Gurmukhi digits were also removed
            whenever exclude_punctuation was True.

    Returns:
        Dict[str, int]: Words and their frequencies, sorted by descending
        frequency. Words with equal counts keep first-seen order.
    """
    # Punctuation and digits are kept apart so each flag controls only its
    # own character class.
    punctuation_chars = '।.,!?;:"\'()[]{}॥॰'
    digit_chars = '੦੧੨੩੪੫੬੭੮੯' + '0123456789'

    chars_to_drop = ''
    if exclude_punctuation:
        chars_to_drop += punctuation_chars
    if exclude_digits:
        chars_to_drop += digit_chars
    # Mapping a code point to None makes str.translate delete it.
    drop_table = dict.fromkeys(map(ord, chars_to_drop))

    vocabulary_counter = Counter()
    for sentence in text_list:
        # Basic whitespace tokenization
        for word in sentence.split():
            cleaned_word = word.translate(drop_table)
            if cleaned_word:
                vocabulary_counter[cleaned_word] += 1

    def within_thresholds(freq):
        if freq < min_frequency:
            return False
        return max_frequency is None or freq <= max_frequency

    # most_common() sorts by descending count and keeps first-seen order for
    # ties; filtering afterwards preserves that order.
    sorted_vocab = {
        word: freq
        for word, freq in vocabulary_counter.most_common()
        if within_thresholds(freq)
    }

    if max_vocab_size is not None:
        sorted_vocab = dict(list(sorted_vocab.items())[:max_vocab_size])

    return sorted_vocab


In [18]:
# Build the word-frequency vocabulary from the raw (not yet normalized) sentences,
# using the default thresholds (frequency between 5 and 50 inclusive).
# NOTE(review): the 'sentence' column is normalized later in this notebook, which
# by default strips nukta and virama, so vocabulary entries containing those marks
# may not match the processed tokens — confirm this is intended.
vocab = build_punjabi_vocabulary(df["sentence"].to_list())

In [None]:
# Display the whole vocabulary dict (word -> frequency).
# NOTE(review): this prints every entry; consider previewing a slice instead.
vocab

{'ਵਾਰ': 50,
 'ਮਹਿਸੂਸ': 50,
 'ਪੜ੍ਹਾਈ': 49,
 'ਵਰਤੋਂ': 49,
 'ਜਾਂਦੀ': 48,
 'ਮੈਚ': 48,
 'ਹੋਣ': 47,
 'ਗਏ': 47,
 'ਦੌਰਾਨ': 47,
 'ਵਧਾ': 47,
 'ਵਿਕਾਸ': 47,
 'ਮਨੋਰੰਜਨ': 47,
 'ਘੱਟ': 46,
 'ਸੁੰਦਰਤਾ': 46,
 'ਡਾਕਟਰ': 46,
 'ਹੋਏ': 45,
 'ਮਾਨਸਿਕ': 45,
 'ਰਹੀਆਂ': 44,
 'ਜ਼ਿਆਦਾ': 44,
 'ਸਮਾਂ': 44,
 'ਖੁਸ਼ੀ': 44,
 'ਕਮੀ': 44,
 'ਸਿੱਖਿਆ': 44,
 'ਬਾਰੇ': 44,
 'ਕੋਈ': 44,
 'ਵਿਦਿਆਰਥੀ': 42,
 'ਸ਼ੁਰੂ': 42,
 'ਕਰਦੀ': 42,
 'ਧਿਆਨ': 41,
 'ਲੱਗਦਾ': 40,
 'ਸਮੱਸਿਆ': 40,
 'ਕਹਾਣੀ': 40,
 'ਵਾਤਾਵਰਣ': 40,
 'ਹਿੱਸਾ': 40,
 'ਉਨ੍ਹਾਂ': 40,
 'ਆਈ': 39,
 'ਕਾਲਜ': 39,
 'ਅਸੀਂ': 39,
 'ਗੱਸਿਪ': 39,
 'ਗਾਣਿਆਂ': 39,
 'ਪੂਰੀ': 39,
 'ਸੰਗੀਤ': 38,
 'ਸਵੇਰੇ': 37,
 'ਗੰਭੀਰ': 37,
 'ਬਣਾਉਣ': 37,
 'ਧਰਤੀ': 37,
 'ਦਰਸ਼ਕਾਂ': 37,
 'ਖਾਣਾ': 36,
 'ਮਜ਼ਾ': 36,
 'ਗੱਲਾਂ': 36,
 'ਗੱਲਬਾਤ': 36,
 'ਗੁਣਵੱਤਾ': 36,
 'ਪੁਰਾਣੇ': 36,
 'ਪ੍ਰੇਰਿਤ': 36,
 'ਹੁੰਦੀਆਂ': 36,
 'ਵਿਗਿਆਨ': 36,
 'ਉਹ': 35,
 'ਵਿਚ': 35,
 'ਲੈਂਦੇ': 35,
 'ਟੈਸਟ': 34,
 'ਪ੍ਰਭਾਵਿਤ': 34,
 'ਇਕੱਠੇ': 34,
 'ਦਿੰਦੀ': 34,
 'ਦਰਦ': 33,
 'ਗਰਮੀ': 33,
 'ਅਦਾਕਾਰੀ': 33,
 'ਦ੍ਰਿਸ਼': 33,
 'ਮਿਲ': 32,
 'ਵਾਧਾ': 32,
 'ਲੈ': 32,
 'ਪ੍ਰਦੂਸ਼ਿਤ': 32,
 'ਹੋਰ': 32,
 'ਹ

In [19]:
# Number of vocabulary entries that passed the frequency filter.
len(vocab)

1561

In [None]:
# Inspect the container type currently bound to `vocab`.
type(vocab)

dict

In [23]:
# Keep only the words (the dict keys, in their existing order); the counts are discarded.
# NOTE(review): this rebinds `vocab` from a dict to a list, so cells that expect
# the dict form give different results if run after this one.
vocab = list(vocab)

In [24]:
# The processed vocabulary has already been saved to Drive and downloaded,
# so this cell does not need to be run again.

# v = pd.DataFrame({'vocab': vocab})
# csv_path = '/content/drive/My Drive/new_punjabi_vocab_list.csv'
# v.to_csv(csv_path, index=False, encoding='utf-8')


In [25]:
# Normalize and tokenize every sentence; the column then holds token lists instead of strings.
# NOTE(review): this overwrites the raw text in place, so running this cell a second
# time passes lists to punjabi_normalize and fails — reload `df` before re-running.
df['sentence'] = df['sentence'].apply(preprocess_punjabi_text)

In [26]:
# Inspect the first tokenized sentences.
df['sentence'].head(20)

Unnamed: 0,sentence
0,"[ਅਧਿਆਪਕ, ਦੇਸ, ਤੇ, ਕੌਮ, ਦਾ, ਨਿਰਮਾਤਾ, ਹੁੰਦਾ, ਹੈ, ।]"
1,"[ਉਹ, ਵਿਦਿਆਰਥੀਆਂ, ਨੂੰ, ਅਗਿਆਨਤਾ, ਦੇ, ਹਨੇਰੇ, ਤੋਂ,..."
2,"[ਇਕ, ਚੰਗਾ, ਅਧਿਆਪਕ, ਵਿਦਿਆਰਥੀ, ਦਾ, ਤੀਜਾ, ਨੇਤਰ, ਖ..."
3,"[ਅਜਿਹੇ, ਅਧਿਆਪਕ, ਸਮਾਜ, ਵਿੱਚ, ਮਾਣ, ਅਤੇ, ਸਤਿਕਾਰ, ..."
4,"[ਸਾਰੇ, ਅਧਿਆਪਕ, ਪਿਆਰ, ਅਤੇ, ਸਤਿਕਾਰ, ਦੇ, ਪਾਤਰ, ਹਨ..."
5,"[ਸ, ., ਹਰਭਜਨ, ਸਿੰਘ, ਜੀ, ਮੇਰੇ, ਮਨਪਸੰਦ, ਅਧਿਆਪਕ, ..."
6,"[ਉਹ, ਐਮ, ., ਏ, ., ਅਤੇ, ਬੀ, ., ਐਡ, ਹਨ, ।]"
7,"[ਉਹਨਾਂ, ਦੀ, ਉਮਰ, ਲਗਭਗ, ਸਾਲ, ਹੈ, ਪਰ, ਉਹ, ਨੌਜਵਾਨ..."
8,"[ਉਹ, ਸਾਫ-ਸੁੱਥਰੇ, ਅਤੇ, ਸਾਦੇ, ਕੱਪੜੇ, ਪਹਿਨਦੇ, ਹਨ, ।]"
9,"[ਉਹ, ਵਿਦਿਆਰਥੀਆਂ, ਨਾਲ, ਪਿਆਰ, ਭਰੀ, ਵਰਤੋਂ, ਕਰਦੇ, ..."


In [27]:
# Inspect the sentiment labels for the same leading rows.
df['sentiment'].head(20)

Unnamed: 0,sentiment
0,positive
1,positive
2,positive
3,positive
4,positive
5,positive
6,positive
7,positive
8,positive
9,positive


In [28]:
# The processed data has already been saved to Drive and downloaded,
# so this cell does not need to be run again.

# output_path = '/content/drive/My Drive/new_punjabi_processed_data.csv'

# # Save as UTF-8 CSV (great for Punjabi characters)
# df.to_csv(output_path, index=False, encoding='utf-8')
