In [None]:
import pandas as pd
import numpy as np
import random
import nltk
from nltk.corpus import wordnet
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Embedding, LSTM, Dense, SpatialDropout1D
from tensorflow.keras.utils import to_categorical

# Load dataset: read the CSV directly from its GitHub raw URL and show the
# first rows as a quick sanity check.
url = "https://raw.githubusercontent.com/YashiGarg016/Language-Detection/refs/heads/main/Language%20Detection.csv"
df = pd.read_csv(filepath_or_buffer=url)
print(df.head(n=5))

# Data Preprocessing
def augment_text(text):
    """Return ``text`` with each word swapped for a WordNet lemma where one exists.

    The text is split on whitespace. For every word WordNet is queried; when
    at least one synset is found, the word is replaced by the name of the
    first lemma of the first synset, otherwise the word is kept unchanged.
    The words are re-joined with single spaces, so runs of whitespace in the
    input are collapsed.

    Args:
        text: The string to augment.

    Returns:
        The augmented string.
    """
    # NOTE(review): the lookup uses WordNet's default language and no
    # part-of-speech filter, so short function words and non-English words
    # may be replaced by unrelated senses - confirm this is acceptable for
    # language-detection training data.

    def first_lemma_or_word(word):
        synsets = wordnet.synsets(word)
        if not synsets:
            return word
        # WordNet writes multiword lemmas with underscores ("United_States");
        # turn them back into spaces so no artificial tokens enter the text.
        return synsets[0].lemmas()[0].name().replace('_', ' ')

    return ' '.join(first_lemma_or_word(word) for word in text.split())

# Make sure the WordNet corpus needed by augment_text is available; download
# it only when the local lookup fails.
try:
    wordnet.ensure_loaded()
except LookupError:
    nltk.download('wordnet', quiet=True)

# Convert text labels to numerical labels
labels = {label: i for i, label in enumerate(df['Language'].unique())}
reverse_labels = {i: label for label, i in labels.items()}
df['Label'] = df['Language'].map(labels)

# Train-test split, done on row positions *before* augmentation and tokenizer
# fitting so that the held-out rows influence neither. The shuffle depends
# only on the number of rows and the seed.
train_idx, test_idx = train_test_split(
    np.arange(len(df)), test_size=0.2, random_state=42
)

# Apply augmentation to roughly 30% of the training rows only; held-out rows
# keep their original text. A dedicated seeded generator keeps the selection
# reproducible without touching the global `random` state.
augment_rng = random.Random(42)
is_train_row = np.zeros(len(df), dtype=bool)
is_train_row[train_idx] = True
df['Augmented_Text'] = [
    augment_text(text) if in_train and augment_rng.random() < 0.3 else text
    for text, in_train in zip(df['Text'], is_train_row)
]

# Tokenization: the vocabulary is learned from training rows only, then every
# row is encoded with it and padded/truncated to max_len tokens.
max_words = 5000
max_len = 100
tokenizer = Tokenizer(num_words=max_words, lower=True)
tokenizer.fit_on_texts(df['Augmented_Text'].iloc[train_idx])
X = tokenizer.texts_to_sequences(df['Augmented_Text'])
X = pad_sequences(X, maxlen=max_len)
y = to_categorical(df['Label'])

X_train, X_test = X[train_idx], X[test_idx]
y_train, y_test = y[train_idx], y[test_idx]

# Model Building
# `input_length` is intentionally not passed to Embedding: it is deprecated in
# recent Keras releases and the sequence length is inferred from the input.
model = Sequential([
    Embedding(max_words, 100),
    SpatialDropout1D(0.2),
    LSTM(100, dropout=0.2, recurrent_dropout=0.2),
    Dense(len(labels), activation='softmax')
])

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train Model
# NOTE(review): the held-out split doubles as validation data here, so the
# per-epoch val_* numbers and the final "Test Accuracy" come from the same
# rows. Carve out a separate validation split if validation metrics are ever
# used for model selection or early stopping.
model.fit(X_train, y_train, epochs=5, batch_size=32, validation_data=(X_test, y_test))

# Evaluate Model
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy:.2f}")

# Prediction Function
def predict_language(text):
    """Return the language name the model predicts for ``text``.

    The text is encoded with the module-level ``tokenizer``, padded to
    ``max_len``, passed through ``model``, and the index of the highest
    score is mapped back to a name through ``reverse_labels``.
    """
    encoded = pad_sequences(tokenizer.texts_to_sequences([text]), maxlen=max_len)
    scores = model.predict(encoded)
    best_index = np.argmax(scores)
    return reverse_labels[best_index]


                                                Text Language
0   Nature, in the broadest sense, is the natural...  English
1  "Nature" can refer to the phenomena of the phy...  English
2  The study of nature is a large, if not the onl...  English
3  Although humans are part of nature, human acti...  English
4  [1] The word nature is borrowed from the Old F...  English
Epoch 1/5




[1m259/259[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 68ms/step - accuracy: 0.3037 - loss: 2.2947 - val_accuracy: 0.9009 - val_loss: 0.5821
Epoch 2/5
[1m259/259[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 107ms/step - accuracy: 0.8951 - loss: 0.4710

In [None]:
# Example Usage: run the prediction helper on a short Spanish greeting and
# print the language name it returns.
sample_text = "Hola, ¿cómo estás?"
predicted_language = predict_language(text=sample_text)
print(f"Predicted Language: {predicted_language}")