In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Setting up torch
import torch

# Choose the compute backend in priority order: CUDA, then Apple MPS, then CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(f"Using {device} device")

Using cpu device


In [3]:
# Defining Network
import torch
import torch.nn as nn
import torch.optim as optim

# Simple feedforward neural network
class DNN_Classifier(nn.Module):
    """Small fully connected classifier with two hidden layers.

    The network is Linear -> BatchNorm1d -> ReLU -> Dropout, applied twice,
    followed by a final Linear layer. The output is the raw result of that
    last layer; no softmax is applied inside the module.

    Args:
        input_dim: number of features in each input row.
        output_dim: number of values produced per input row.
        input_neurons: width of the first hidden layer.
        hidden1_neurons: width of the second hidden layer.
        dropout1_rate: dropout probability after the first hidden layer.
        dropout2_rate: dropout probability after the second hidden layer.
    """

    def __init__(self, input_dim, output_dim, input_neurons=64, hidden1_neurons=32, dropout1_rate=0.4, dropout2_rate=0.3):
        super().__init__()

        # First hidden stage
        self.fc1 = nn.Linear(input_dim, input_neurons)
        self.bn1 = nn.BatchNorm1d(input_neurons)
        self.dropout1 = nn.Dropout(dropout1_rate)

        # A single ReLU module is shared by both hidden stages
        self.relu = nn.ReLU()

        # Second hidden stage
        self.fc2 = nn.Linear(input_neurons, hidden1_neurons)
        self.bn2 = nn.BatchNorm1d(hidden1_neurons)
        self.dropout2 = nn.Dropout(dropout2_rate)

        # Output layer
        self.fc3 = nn.Linear(hidden1_neurons, output_dim)

    def forward(self, x):
        """Run ``x`` through both hidden stages and return the output layer's result."""
        stage1 = self.dropout1(self.relu(self.bn1(self.fc1(x))))
        stage2 = self.dropout2(self.relu(self.bn2(self.fc2(stage1))))
        return self.fc3(stage2)

# Initialization
# Sizes handed to the classifier
input_dim = 300  # length of each embedding vector fed to the network
output_dim = 4   # how many distinct classes the network scores

# Build the network with the default hidden-layer settings and show its layout
model = DNN_Classifier(input_dim=input_dim, output_dim=output_dim)
print(model)

DNN_Classifier(
  (fc1): Linear(in_features=300, out_features=64, bias=True)
  (bn1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (dropout1): Dropout(p=0.4, inplace=False)
  (relu): ReLU()
  (fc2): Linear(in_features=64, out_features=32, bias=True)
  (bn2): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (dropout2): Dropout(p=0.3, inplace=False)
  (fc3): Linear(in_features=32, out_features=4, bias=True)
)


In [4]:
# Load the saved weights.
# map_location="cpu" lets the checkpoint load on a machine that lacks the
# accelerator it was saved from; load_state_dict then copies the values into
# the model's existing parameters and buffers.
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a
# trusted source.
DNN_WEIGHTS_PATH = "/home/angel-tamang/Nepali Hate Sentiment Detection/Project/Post-Defense/Word Embeddings/dnn_classifier.pth"
model.load_state_dict(torch.load(DNN_WEIGHTS_PATH, map_location="cpu"))

# Set the model to evaluation mode
model.eval()

DNN_Classifier(
  (fc1): Linear(in_features=300, out_features=64, bias=True)
  (bn1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (dropout1): Dropout(p=0.4, inplace=False)
  (relu): ReLU()
  (fc2): Linear(in_features=64, out_features=32, bias=True)
  (bn2): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (dropout2): Dropout(p=0.3, inplace=False)
  (fc3): Linear(in_features=32, out_features=4, bias=True)
)

In [5]:
from gensim.models import FastText

# Absolute, machine-specific location of the saved gensim FastText model.
path_to_fastext_model = "/home/angel-tamang/Nepali Hate Sentiment Detection/Project/Post-Defense/Data/fasttext_model"

# Load the model; text_to_embedding reads `fasttext.wv` and `fasttext.vector_size` from it.
fasttext = FastText.load(path_to_fastext_model)



In [6]:
import re
import string
import nltk
from nepali_stemmer.stemmer import NepStemmer
from nltk.corpus import stopwords

# Fetch the NLTK packages used by this notebook
for _resource in ('punkt', 'stopwords'):
    nltk.download(_resource)

# Nepali stopwords, kept as a set
nepali_stopwords = set(stopwords.words('nepali'))

# Stemmer instance used by preprocess_text
nepstem = NepStemmer()

# **Preprocessing Function**
def preprocess_text(text):
    """Clean one raw text before it is converted to an embedding.

    Steps, in order: stem with ``nepstem``, lowercase, strip punctuation
    (the danda '।', ASCII punctuation, and runs of dots/dashes/ellipses),
    then drop every token found in ``nepali_stopwords``.

    Args:
        text: raw input. Anything that is not a ``str`` (for example NaN)
            is treated as empty.

    Returns:
        The remaining tokens joined by single spaces, or "" for non-string
        input.
    """
    if not isinstance(text, str):
        return ""

    cleaned = nepstem.stem(text).lower()

    # (pattern, replacement) pairs, applied from top to bottom.
    substitutions = (
        # The danda is rewritten as ' |'; the pipe is ASCII punctuation, so the
        # next pass turns it into a space as well.
        (r'।', ' |'),
        # Every ASCII punctuation character becomes a space.
        (r'[' + re.escape(string.punctuation) + ']', ' '),
        # Runs of dots, dashes or ellipsis characters collapse to one space.
        (r'[\.\-…]+', ' '),
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    kept_tokens = (token for token in cleaned.split() if token not in nepali_stopwords)
    return ' '.join(kept_tokens)

[nltk_data] Downloading package punkt to /home/angel-
[nltk_data]     tamang/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/angel-
[nltk_data]     tamang/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
def text_to_embedding(text):
    """Average the FastText vectors of the whitespace-separated words in ``text``.

    Args:
        text: a string; it is split on whitespace to obtain the words.

    Returns:
        The element-wise mean of the vectors of all words found in
        ``fasttext.wv``, or a zero vector of length ``fasttext.vector_size``
        when no word is found (including for empty text).
    """
    keyed_vectors = fasttext.wv
    # NOTE(review): with gensim FastText vectors, `in` may report True even for
    # unseen words when subword n-grams are enabled. Confirm whether the
    # zero-vector fallback is ever reached for non-empty text.
    found_vectors = [keyed_vectors[token] for token in text.split() if token in keyed_vectors]

    if not found_vectors:
        return np.zeros(fasttext.vector_size)
    return np.mean(found_vectors, axis=0)

# **Inference Function**
def preprocess_and_predict(text_or_texts, model, device='cpu'):
    """Classify one text or a collection of texts with ``model``.

    Each text is cleaned with ``preprocess_text`` and converted to a vector
    with ``text_to_embedding``. The stacked vectors are passed through
    ``model`` and a softmax is taken over dimension 1 of its output.

    Side effects: ``model`` is moved to ``device`` and switched to evaluation
    mode, and is left in that state.

    Args:
        text_or_texts: a single string, or an iterable of strings.
        model: the network to run; called with a float32 tensor that has one
            row per input text.
        device: device on which the tensor and the model are placed.

    Returns:
        A tuple ``(predicted_classes, probabilities)``.
        ``predicted_classes`` is a list with one class index per input text
        (a one-element list for a single string). ``probabilities`` is a
        NumPy array with one row per input text holding the softmax output.
        For an empty collection the result is ``([], empty float32 array of
        shape (0, 0))``; the model is not run, so the class count is unknown.
    """
    if isinstance(text_or_texts, str):  # Single text: treat as a batch of one
        text_or_texts = [text_or_texts]

    # Step 1: clean every text
    preprocessed_texts = [preprocess_text(text) for text in text_or_texts]

    # Nothing to classify: return early instead of sending a malformed
    # (zero-row, one-dimensional) tensor through the model.
    if not preprocessed_texts:
        return [], np.empty((0, 0), dtype=np.float32)

    # Step 2: one embedding vector per text, stacked row-wise
    embeddings = np.array([text_to_embedding(text) for text in preprocessed_texts])

    # Step 3: convert to a float32 tensor on the requested device
    input_tensor = torch.tensor(embeddings, dtype=torch.float32).to(device)

    # Step 4: run inference without tracking gradients
    model.to(device)
    model.eval()
    with torch.no_grad():
        output = model(input_tensor)
        probs = torch.softmax(output, dim=1)
        predicted_classes = torch.argmax(probs, dim=1).cpu().numpy()

    return predicted_classes.tolist(), probs.cpu().numpy()


In [8]:
# Quick check on a single Nepali sentence ("My name is Diwakar").
text = "मेरो नाम दिवाकर हो"
pred_class, probs = preprocess_and_predict(text, model, device)
print(f"Predicted Class: {pred_class}")
# `probs` holds the softmax output for every class, not only the winning one.
print(f"Confidence: {probs}")

Predicted Class: [1]
Confidencce: [[4.7372659e-03 9.8829752e-01 3.5398079e-05 6.9298875e-03]]
