In [30]:
import pandas as pd
import numpy as np
import re
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Load dataset
df = pd.read_csv("cleaned_dataset.csv")

# Function to clean text
def clean_text(text):
    text = str(text).lower()  # Convert to lowercase
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    return text

# Apply text cleaning
df["clean_text"] = df["Text"].apply(clean_text)

# Check dataset
df.head()


Unnamed: 0,Text,Language,clean_text
0,ഭൗതികപ്രപഞ്ചത്തെ മൊത്തത്തിൽ സൂചിപ്പിക്കുന്ന പദ...,Malayalam,ഭതകപരപഞചതത മതതതതൽ സചപപകകനന പദമണ പരകത ജർമൻ natu...
1,ഭൗതികപ്രതിഭാസങ്ങളും ജീവനും പ്രകൃതിയുടെ ഘടകങ്ങള...,Malayalam,ഭതകപരതഭസങങള ജവന പരകതയട ഘടകങങളണ
2,മനുഷ്യനിർമിതമായ വസ്തുക്കളെ പ്രകൃതിയുടെ ഭാഗമായി...,Malayalam,മനഷയനർമതമയ വസതകകള പരകതയട ഭഗമയ കണകകകകറലല
3,അവയെ കൃത്രിമം എന്ന് വിശേഷിപ്പിക്കുന്നുഅഭിപ്രായ...,Malayalam,അവയ കതരമ എനന വശഷപപകകനനഅഭപരയസവതനതരയ ഇഗലഷ പദമയ n...
4,പ്രകൃതി എന്ന പദം പ്രപഞ്ചത്തെയും അതിലെ സമസ്ത പ്...,Malayalam,പരകത എനന പദ പരപഞചതതയ അതല സമസത പരതഭസങങളയ ഉൾകകളളനന


In [31]:
# Tokenization
vocab_size = 20000  # Size of the vocabulary
tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(df["clean_text"])

# Convert text to sequences
X_seq = tokenizer.texts_to_sequences(df["clean_text"])
X_padded = pad_sequences(X_seq, maxlen=150, padding="post", truncating="post")

print("Example sequence:", X_padded[0])

Example sequence: [2900 2901 1700 1184 1701 2902 2903 1185  195 1186 2904 2905 2906 2907
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0]


In [32]:
# Encode labels
label_encoder = LabelEncoder()
df["label"] = label_encoder.fit_transform(df["Language"])
num_classes = len(label_encoder.classes_)

# Convert labels to categorical
y_categorical = tf.keras.utils.to_categorical(df["label"], num_classes)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_padded, y_categorical, test_size=0.2, random_state=42)

print("Label mapping:", dict(zip(label_encoder.classes_, range(num_classes))))


Label mapping: {'Hindi': 0, 'Kannada': 1, 'Malayalam': 2, 'Tamil': 3}


In [33]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Bidirectional, Dense, Dropout

# Define the LSTM model
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=256, input_length=150),  # Larger embedding size
    Bidirectional(LSTM(128, return_sequences=True)),  # BiLSTM for bidirectional context
    Dropout(0.3),
    Bidirectional(LSTM(64)),  # Another BiLSTM layer
    Dropout(0.3),
    Dense(64, activation="relu"),
    Dropout(0.3),
    Dense(num_classes, activation="softmax")  # Multi-class classification
])

# Compile model
model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

# Model summary
model.summary()


Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, 150, 256)          5120000   
                                                                 
 bidirectional_2 (Bidirectio  (None, 150, 256)         394240    
 nal)                                                            
                                                                 
 dropout_7 (Dropout)         (None, 150, 256)          0         
                                                                 
 bidirectional_3 (Bidirectio  (None, 128)              164352    
 nal)                                                            
                                                                 
 dropout_8 (Dropout)         (None, 128)               0         
                                                                 
 dense_9 (Dense)             (None, 64)               

In [35]:
from tensorflow.keras.callbacks import EarlyStopping
# Define early stopping callback
early_stopping = EarlyStopping(
    monitor="val_loss",   # Track validation loss
    patience=3,           # Stop training if val_loss doesn't improve for 3 epochs
    restore_best_weights=True,  # Restore best model weights
    verbose=1
)


In [36]:
# Train the model
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test),callbacks=[early_stopping])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 4: early stopping


In [37]:
# Evaluate on test data
test_loss, test_acc = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {test_acc:.2f}")

Test Accuracy: 0.96
