<a href="https://colab.research.google.com/github/abhijadhav14/Extended-Language-Detection/blob/main/ELD_DL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install tensorflow scikit-learn pandas



In [3]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from tensorflow.keras import layers, models
import numpy as np

In [4]:
data = pd.read_csv("/content/drive/MyDrive/Dataset/expanded_language_detection_dataset_30000.csv")

In [8]:
print(data.head())
print("\nColumn Names:")
print(data.columns)

   ID               Text Sample  Length  Word Count  Average Word Length  \
0   1  Bonjour, comment ça va ?      25           4                 4.75   
1   2        Hola, ¿cómo estás?      19           3                 3.67   
2   3                   你好，你好吗？       9           3                 1.67   
3   4         Привет, как дела?      21           3                 3.67   
4   5   Hallo, wie geht es dir?      25           5                 3.20   

   Vowel Count  Consonant Count  Unique Words  Stop Words Count  \
0            9               11             4                 1   
1            7                7             3                 1   
2            0                0             3                 0   
3            5               12             3                 1   
4            7               13             5                 1   

   Special Characters  Digit Count  Punctuation Count  Uppercase Count  \
0                   2            0                  3             

In [9]:
X = data["Text Sample"]
y = data["Language"]

In [10]:
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

In [11]:
tfidf_vectorizer = TfidfVectorizer(max_features=5000)  # Limit to top 5000 features for efficiency
X_tfidf = tfidf_vectorizer.fit_transform(X).toarray()

In [12]:
X_train, X_test, y_train, y_test = train_test_split(
    X_tfidf, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)

In [13]:
model = models.Sequential([
    layers.Input(shape=(X_train.shape[1],)),
    layers.Dense(128, activation='relu'),
    layers.Dropout(0.3),
    layers.Dense(64, activation='relu'),
    layers.Dropout(0.3),
    layers.Dense(len(label_encoder.classes_), activation='softmax')
])

In [14]:
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [15]:
print("\nTraining the model...")
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2, verbose=1)



Training the model...
Epoch 1/10
[1m600/600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 12ms/step - accuracy: 0.6916 - loss: 1.4756 - val_accuracy: 0.8952 - val_loss: 0.4390
Epoch 2/10
[1m600/600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 10ms/step - accuracy: 0.9169 - loss: 0.3493 - val_accuracy: 0.9065 - val_loss: 0.3373
Epoch 3/10
[1m600/600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 13ms/step - accuracy: 0.9541 - loss: 0.1863 - val_accuracy: 0.9038 - val_loss: 0.3117
Epoch 4/10
[1m600/600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 10ms/step - accuracy: 0.9597 - loss: 0.1420 - val_accuracy: 0.8998 - val_loss: 0.3223
Epoch 5/10
[1m600/600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 13ms/step - accuracy: 0.9588 - loss: 0.1341 - val_accuracy: 0.9019 - val_loss: 0.3291
Epoch 6/10
[1m600/600[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 11ms/step - accuracy: 0.9603 - loss: 0.1247 - val_accuracy: 0.9021 - val_loss: 0.3273

In [16]:
test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=0)
print(f"\nTest Accuracy: {test_accuracy * 100:.2f}%")


Test Accuracy: 90.87%


In [17]:
def detect(text_sample):
    """
    Predict the language of a given text sample.

    Args:
    - text_sample (str): Text sample to predict.

    Returns:
    - str: Predicted language.
    """
    text_features = tfidf_vectorizer.transform([text_sample]).toarray()
    prediction = model.predict(text_features)
    predicted_label = np.argmax(prediction, axis=1)
    return label_encoder.inverse_transform(predicted_label)[0]

In [19]:
sample_text = "Hello,how are you?"
print(f"\nSample Prediction: {sample_text}")
print(f"Predicted Language: {detect(sample_text)}")


Sample Prediction: Hello,how are you?
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
Predicted Language: English
