In [None]:
import numpy as np
import pandas as pd

In [None]:
import re
from unicodedata import normalize

def punjabi_normalize(text, remove_nuktas=True, normalize_nasals=True):
    """
    Custom normalization for Punjabi (Gurmukhi) text.

    Steps, in order: Unicode NFC normalization, optional nasal-mark
    unification, single-character replacements (virama removal, optional
    nukta removal, quote / danda / ellipsis folding) and whitespace
    collapsing.

    Args:
        text: Input Punjabi text
        remove_nuktas: Whether to strip the nukta sign (U+0A3C). When False
            the nukta is preserved.
        normalize_nasals: Whether to rewrite a bindi (U+0A02) that directly
            follows a consonant as a tippi (U+0A70)

    Returns:
        Normalized Punjabi text
    """
    # NFC leaves the Gurmukhi nukta letters (U+0A33, U+0A36, U+0A59-U+0A5B,
    # U+0A5E) in decomposed form (base letter + U+0A3C), so from here on a
    # nukta only ever appears as the standalone combining sign.
    text = normalize('NFC', text)

    replacements = {
        '\u0a4d': '',        # virama (halant) is always dropped
        '\u201c': '"',       # left double quotation mark
        '\u201d': '"',       # right double quotation mark
        '\u2018': "'",       # left single quotation mark
        '\u2019': "'",       # right single quotation mark
        '\u0965': '\u0964',  # double danda -> single danda
        '\u2026': '...',     # horizontal ellipsis
    }

    if remove_nuktas:
        # Only strip the nukta when asked to; previously it was removed
        # unconditionally and the flag had no effect.
        replacements['\u0a3c'] = ''

    if normalize_nasals:
        # Consonant (U+0A15-U+0A39) immediately followed by tippi or bindi:
        # always emit the tippi. Runs before the replacements so a nukta
        # standing between consonant and nasal still blocks the match.
        text = re.sub('([\u0a15-\u0a39])[\u0a70\u0a02]', '\\1\u0a70', text)

    # Every key is a single code point, so one translate pass applies them all.
    text = text.translate(str.maketrans(replacements))

    # Collapse runs of whitespace and trim the ends.
    return re.sub(r'\s+', ' ', text).strip()

In [None]:
def punjabi_tokenize(text, split_compound_words=True):
    """
    Split Punjabi text into word and punctuation tokens.

    Whitespace separates tokens and is discarded; every punctuation mark
    becomes a token of its own.

    Args:
        text: Normalized Punjabi text
        split_compound_words: When True, a virama that appears inside a word
            closes the current token and starts the next one (the virama is
            the first character of the new token)

    Returns:
        List of tokens
    """
    # Characters that are emitted as standalone tokens
    delimiters = set('।.,!?;:"\'()[]{}॥॰')

    tokens = []
    buffer = ''

    for ch in text:
        is_punct = ch in delimiters
        if is_punct or ch.isspace():
            # A separator closes whatever word is pending
            if buffer:
                tokens.append(buffer)
                buffer = ''
            if is_punct:
                tokens.append(ch)
            continue

        if split_compound_words and buffer and ch == '੍':
            # Break the word at the virama; a leading virama does not split
            tokens.append(buffer)
            buffer = ch
        else:
            buffer += ch

    # Flush the trailing word, if any
    if buffer:
        tokens.append(buffer)

    return [tok for tok in tokens if tok and not tok.isspace()]

In [None]:
def preprocess_punjabi(text, normalize=True, tokenize=True):
    """
    Run the Punjabi preprocessing pipeline on one piece of text.

    Args:
        text: Raw Punjabi text
        normalize: Apply punjabi_normalize (with its defaults) first
        tokenize: Split the result with punjabi_tokenize (with its defaults)

    Returns:
        A list of tokens when tokenize is True, otherwise the (optionally
        normalized) string
    """
    cleaned = punjabi_normalize(text) if normalize else text
    return punjabi_tokenize(cleaned) if tokenize else cleaned

In [None]:
# Load the labelled sentences (columns: sentence, sentiment) and preview the first rows.
# NOTE(review): the path is an absolute Colab location; the file must exist there before this cell runs.
df = pd.read_csv("/content/punjabiData (1).csv")
df.head()

Unnamed: 0,sentence,sentiment
0,ਮੈਨੂੰ ਇਹ ਫਿਲਮ ਬਹੁਤ ਪਸੰਦ ਆਈ,positive
1,ਅੱਜ ਦਾ ਦਿਨ ਬਹੁਤ ਮਾੜਾ ਸੀ,negative
2,ਮੈਂ ਬੱਸ ਵਿੱਚ ਸਫ਼ਰ ਕੀਤਾ,neutral
3,ਇਹ ਖਾਣਾ ਸੁਆਦ ਹੈ,positive
4,ਮੇਰਾ ਸਿਰ ਬਹੁਤ ਦੁਖਦਾ ਹੈ,negative


In [None]:
# Mount Google Drive at /content/drive (Colab only).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Count the rows per sentiment label to check class balance.
df["sentiment"].value_counts()

Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
positive,2166
negative,2166
neutral,2166


In [None]:
from collections import Counter
import re

def extract_vocabulary(texts, min_freq=5, max_freq = 50, max_words=None,
                      remove_punct=True, remove_numbers=True):
    """
    Extract vocabulary from a list of Punjabi texts

    Texts are split on whitespace; each word is optionally stripped of
    punctuation and digits, then counted.

    Args:
        texts: Iterable of Punjabi text strings (non-string entries such as
            NaN are skipped)
        min_freq: Minimum frequency to include a word
        max_freq: Maximum frequency to include a word (None for no upper
            bound)
        max_words: Maximum number of words to return (None for all)
        remove_punct: Whether to remove punctuation
        remove_numbers: Whether to remove Gurmukhi and ASCII digits

    Returns:
        Dictionary of {word: frequency} sorted by descending frequency
    """
    # Punctuation only. Digits are handled by remove_numbers so that the two
    # flags are independent (Gurmukhi digits used to be listed here as well
    # and were stripped even when remove_numbers=False).
    punjabi_punct = set('।.,!?;:"\'()[]{}॥॰')
    # Gurmukhi digits U+0A66-U+0A6F plus ASCII digits
    digit_pattern = re.compile('[\u0a66-\u0a6f0-9]+')

    word_counts = Counter()

    for text in texts:
        if not isinstance(text, str):
            # e.g. NaN from an empty CSV cell
            continue

        # Tokenize (simple whitespace tokenizer)
        for word in text.split():
            if remove_punct:
                word = ''.join(c for c in word if c not in punjabi_punct)
            if remove_numbers:
                word = digit_pattern.sub('', word)

            # Words that were nothing but punctuation/digits are dropped
            if word:
                word_counts[word] += 1

    # Keep words whose frequency lies in [min_freq, max_freq]
    vocab = {
        w: c for w, c in word_counts.items()
        if c >= min_freq and (max_freq is None or c <= max_freq)
    }

    # Stable sort: ties keep first-seen order
    ranked = sorted(vocab.items(), key=lambda item: -item[1])

    # Limit vocabulary size if requested
    if max_words is not None:
        ranked = ranked[:max_words]

    return dict(ranked)

In [None]:
# Build the frequency-filtered vocabulary (default thresholds) from the sentence column.
# NOTE(review): at this point the column still holds the raw strings; normalization is applied in a later cell.
vocab = extract_vocabulary(df["sentence"].to_list())

In [None]:
# Keep only the words (the dict keys), as a list.
vocab = list(vocab)

In [None]:
# Number of words kept in the vocabulary.
len(vocab)

1353

In [None]:
# Replace every sentence string with its normalized token list.
# NOTE(review): this overwrites the column in place, so re-running the cell would pass token lists
# (not strings) to preprocess_punjabi.
df['sentence'] = df['sentence'].apply(preprocess_punjabi)

In [None]:
# Preview the first ten tokenized sentences.
df['sentence'].head(10)

Unnamed: 0,sentence
0,"[ਮੈਨੂੰ, ਇਹ, ਫਿਲਮ, ਬਹੁਤ, ਪਸੰਦ, ਆਈ]"
1,"[ਅੱਜ, ਦਾ, ਦਿਨ, ਬਹੁਤ, ਮਾੜਾ, ਸੀ]"
2,"[ਮੈਂ, ਬੱਸ, ਵਿੱਚ, ਸਫਰ, ਕੀਤਾ]"
3,"[ਇਹ, ਖਾਣਾ, ਸੁਆਦ, ਹੈ]"
4,"[ਮੇਰਾ, ਸਿਰ, ਬਹੁਤ, ਦੁਖਦਾ, ਹੈ]"
5,"[ਉਹ, ਕੱਲਹ, ਸਹਿਰ, ਜਾਵੇਗਾ]"
6,"[ਮੈਂ, ਆਪਣੇ, ਨਵੇਂ, ਘਰ, ਤੋਂ, ਬਹੁਤ, ਖੁਸ, ਹਾਂ]"
7,"[ਟਰੈਫਿਕ, ਜਾਮ, ਕਾਰਨ, ਮੈਂ, ਲੇਟ, ਹੋ, ਗਿਆ]"
8,"[ਅੱਜ, ਬਾਜਾਰ, ਵਿੱਚ, ਬਹੁਤ, ਭੀੜ, ਸੀ]"
9,"[ਮੈਨੂੰ, ਨੌਕਰੀ, ਮਿਲ, ਗਈ, ਹੈ]"


In [None]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

In [None]:
# Model inputs (token lists) and targets (sentiment strings) as plain Python lists.
sentences, labels = (df[column].to_list() for column in ("sentence", "sentiment"))

In [None]:
# `vocab` was extracted from the raw sentences, whereas `sentences` holds tokens that went
# through punjabi_normalize. Normalize the vocabulary the same way before fitting; otherwise
# every word that contained a nukta or virama can never match and is encoded as <OOV>.
tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts([punjabi_normalize(word) for word in vocab])
# Each sentence is already a list of tokens; tokens outside the vocabulary map to <OOV>.
sequences = tokenizer.texts_to_sequences(sentences)

In [None]:
# Pad every sequence at the end so all of them are as long as the longest one.
max_len = len(max(sequences, key=len))
padded_sequences = pad_sequences(sequences, padding='post', maxlen=max_len)

In [None]:
# Map the sentiment strings to integer ids, then to one-hot rows for the softmax output.
label_encoder = LabelEncoder().fit(labels)
labels_encoded = label_encoder.transform(labels)
num_classes = len(label_encoder.classes_)
labels_categorical = to_categorical(labels_encoded, num_classes=num_classes)

In [None]:
# Number of distinct sentiment classes found by the label encoder.
num_classes

3

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the padded sequences; the fixed seed makes the split repeatable.
(X_train, X_test,
 y_train, y_test) = train_test_split(
    padded_sequences,
    labels_categorical,
    test_size=0.2,
    random_state=42,
)

# Convenience (features, targets) pairs.
train_data, test_data = (X_train, y_train), (X_test, y_test)

In [None]:
# Model: embedding -> LSTM -> softmax over the sentiment classes
model = Sequential()
# mask_zero=True makes the LSTM skip the zero padding appended by pad_sequences instead of
# reading it as real input. `input_length` is dropped: it is deprecated in Keras 3 and the
# sequence length is inferred from the data.
model.add(Embedding(input_dim=len(tokenizer.word_index) + 2, output_dim=128, mask_zero=True))
model.add(LSTM(64))
model.add(Dense(num_classes, activation='softmax'))



In [None]:
# Categorical cross-entropy matches the one-hot targets produced by to_categorical.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Train for 20 epochs; validation metrics are computed on the held-out (X_test, y_test) pair.
model.fit(X_train, y_train, epochs=20, validation_data = test_data)

Epoch 1/20
[1m136/136[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 31ms/step - accuracy: 0.3783 - loss: 1.0930 - val_accuracy: 0.4453 - val_loss: 1.0652
Epoch 2/20
[1m136/136[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 35ms/step - accuracy: 0.4992 - loss: 1.0034 - val_accuracy: 0.5124 - val_loss: 0.9745
Epoch 3/20
[1m136/136[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 40ms/step - accuracy: 0.6091 - loss: 0.8376 - val_accuracy: 0.5998 - val_loss: 0.8472
Epoch 4/20
[1m136/136[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 30ms/step - accuracy: 0.6622 - loss: 0.7695 - val_accuracy: 0.6127 - val_loss: 0.8659
Epoch 5/20
[1m136/136[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 30ms/step - accuracy: 0.6564 - loss: 0.7630 - val_accuracy: 0.6053 - val_loss: 0.8213
Epoch 6/20
[1m136/136[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 36ms/step - accuracy: 0.6964 - loss: 0.6924 - val_accuracy: 0.6734 - val_loss: 0.7869
Epoch 7/20
[1m136/136

<keras.src.callbacks.history.History at 0x7eeaa4693690>

In [None]:
# Testing on positive sentences
test_df = pd.read_csv("punjabi_positive_sentences.csv")
test_sentence = test_df["sentence"].to_list()
# Apply the same normalization + tokenization that the training sentences went through;
# feeding raw strings would let Keras split them with its own rules and skip normalization.
test_tokens = [preprocess_punjabi(sentence) for sentence in test_sentence]
test_seq = tokenizer.texts_to_sequences(test_tokens)
test_pad = pad_sequences(test_seq, maxlen=max_len, padding='post')
pred = model.predict(test_pad)
# Index of the most probable class for each sentence
predictions = np.argmax(pred, axis=1)
print(len(test_sentence))

[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 73ms/step
97


In [None]:
# Translate the predicted class ids back to sentiment names.
res = np.array(label_encoder.inverse_transform(predictions))

from collections import Counter

# Tally how often each sentiment was predicted.
ctr = Counter(res)

ctr

Counter({np.str_('positive'): 83,
         np.str_('neutral'): 8,
         np.str_('negative'): 6})

### Trying BERT

In [None]:
!pip install transformers



In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [None]:
# Load the pretrained checkpoint with a fresh 3-way classification head.
# NOTE: `tokenizer` and `model` are rebound here; the Keras tokenizer and LSTM model created
# earlier in the notebook are no longer reachable under these names.
model_name = "google/muril-base-cased"  # or "ai4bharat/indic-bert"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google/muril-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
import pandas as pd

# Load the dataset for the transformer experiment (rebinds `df`).
# NOTE(review): the cells below read the columns "sentence" and "sentiment" — confirm the file has them.
df = pd.read_csv("combined.csv")

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation. The fixed seed keeps the split identical across
# runs (and matches the seed used for the Keras split), so results are comparable.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df["sentence"].tolist(),
    df["sentiment"].tolist(),
    test_size=0.2,
    random_state=42,
)

In [None]:
# Tokenize both splits, truncating to at most 128 tokens and padding each split separately.
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=128)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=128)

In [None]:
import torch

class PunjabiSentimentDataset(torch.utils.data.Dataset):
    """
    Torch dataset pairing tokenizer encodings with sentiment labels.

    Each item is a single dict holding the encoded fields plus a "labels"
    tensor. The Trainer's default collator needs this mapping form; the
    previous (item, label) tuple made it fail with
    "vars() argument must have __dict__ attribute".

    Args:
        encodings: Mapping of field name -> per-example sequences (e.g. the
            tokenizer output with input_ids / attention_mask)
        labels: One label per example, either integer class ids or label
            strings
        label2id: Optional mapping from label value to integer id. When
            omitted and the labels are strings, the mapping is built from
            the sorted distinct labels.
    """

    def __init__(self, encodings, labels, label2id=None):
        self.encodings = encodings
        labels = list(labels)

        if label2id is None and any(isinstance(label, str) for label in labels):
            # Sorted order keeps the ids deterministic and identical for any
            # split that contains the same set of classes.
            label2id = {name: idx for idx, name in enumerate(sorted(set(labels)))}

        self.label2id = label2id
        if label2id is not None:
            self.labels = [label2id[label] for label in labels]
        else:
            self.labels = [int(label) for label in labels]

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Integer class id under the "labels" key
        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        return len(self.labels)

# Wrap the encoded train / validation splits so they can be handed to the Trainer.
train_dataset = PunjabiSentimentDataset(train_encodings, train_labels)
val_dataset = PunjabiSentimentDataset(val_encodings, val_labels)

In [None]:
from transformers import Trainer, TrainingArguments

# Fine-tuning configuration: 3 epochs, batch size 16 for training and 64 for evaluation,
# 500 warm-up steps, weight decay 0.01, a log entry every 10 steps.
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
)

# The Trainer takes the model together with the train and validation datasets.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

# Run the fine-tuning loop.
trainer.train()

TypeError: vars() argument must have __dict__ attribute

In [None]:
# Evaluate on the dataset passed as eval_dataset and show the returned metrics.
results = trainer.evaluate()
print(results)

In [None]:
# Save the model and its tokenizer side by side in one directory
model.save_pretrained("./punjabi_sentiment_model")
tokenizer.save_pretrained("./punjabi_sentiment_model")

# Load later: rebuild a text-classification pipeline from the saved directory
from transformers import pipeline

classifier = pipeline(
    "text-classification",
    model="./punjabi_sentiment_model",
    tokenizer="./punjabi_sentiment_model"
)

# Predict the sentiment of a single example sentence
# NOTE(review): no id2label mapping is set anywhere in the visible cells, so the reported
# label is presumably a generic LABEL_<n> — confirm.
result = classifier("ਮੈਨੂੰ ਇਹ ਫਿਲਮ ਬਹੁਤ ਪਸੰਦ ਆਈ")
print(result)