In [1]:
!pip install tensorflow scikit-learn pandas



In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from tensorflow.keras import layers, models
import numpy as np

In [3]:
data = pd.read_csv("/content/drive/MyDrive/Dataset/Language Detection.csv")

In [4]:
data.columns = ["Text", "Language"]

In [6]:
X = data["Text"]
y = data["Language"]

In [7]:
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

In [8]:
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_tfidf = tfidf_vectorizer.fit_transform(X).toarray()

In [9]:
X_train, X_test, y_train, y_test = train_test_split(
    X_tfidf, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)

In [10]:
model = models.Sequential([
    layers.Input(shape=(X_train.shape[1],)),
    layers.Dense(128, activation='relu'),
    layers.Dropout(0.3),
    layers.Dense(64, activation='relu'),
    layers.Dropout(0.3),
    layers.Dense(len(label_encoder.classes_), activation='softmax')
])

In [11]:
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [12]:
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2, verbose=1)

Epoch 1/10
[1m207/207[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 11ms/step - accuracy: 0.3287 - loss: 2.3999 - val_accuracy: 0.9468 - val_loss: 0.4990
Epoch 2/10
[1m207/207[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - accuracy: 0.9262 - loss: 0.4300 - val_accuracy: 0.9625 - val_loss: 0.1780
Epoch 3/10
[1m207/207[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 12ms/step - accuracy: 0.9652 - loss: 0.1762 - val_accuracy: 0.9655 - val_loss: 0.1372
Epoch 4/10
[1m207/207[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 9ms/step - accuracy: 0.9723 - loss: 0.1348 - val_accuracy: 0.9643 - val_loss: 0.1254
Epoch 5/10
[1m207/207[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.9681 - loss: 0.1269 - val_accuracy: 0.9661 - val_loss: 0.1210
Epoch 6/10
[1m207/207[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 10ms/step - accuracy: 0.9711 - loss: 0.1135 - val_accuracy: 0.9680 - val_loss: 0.1199
Epoch 7/10
[1m207/207[

In [13]:
test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=0)
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")

Test Accuracy: 96.03%


In [14]:
def detect(text_sample):

    text_features = tfidf_vectorizer.transform([text_sample]).toarray()
    prediction = model.predict(text_features)
    predicted_label = np.argmax(prediction, axis=1)
    return label_encoder.inverse_transform(predicted_label)[0]

In [15]:
sample_text = "Nature is beautiful and full of surprises."
print(f"Predicted Language: {detect(sample_text)}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step
Predicted Language: English


In [16]:
sample_text = "പോർട്ടൽ അനലിറ്റിക്സ് വിദ്യ നൽകുന്നു"
print(f"Predicted Language: {detect(sample_text)}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
Predicted Language: Malayalam


In [17]:
model.summary()
