In [2]:
import joblib
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report

In [3]:
def _read_lines(path):
    """Return the content of the UTF-8 text file at ``path`` as a list of lines.

    Lines are split only at real line endings. ``str.splitlines()`` would
    additionally split at Unicode separators such as U+2028 or U+0085, which
    may appear inside a single sample and would shift every following sample
    against its label.

    Args:
        path: Path of the text file to read.

    Returns:
        list[str]: One entry per line, without the trailing newline.

    Raises:
        OSError: If the file cannot be opened (e.g. ``FileNotFoundError``).
    """
    with open(path, 'r', encoding='utf-8') as f:
        return [line.rstrip('\n') for line in f]


print("Loading training data...")
X_train_text = _read_lines('archive/x_train.txt')
y_train_labels = _read_lines('archive/y_train.txt')

print("Loading testing data...")
X_test_text = _read_lines('archive/x_test.txt')
y_test_labels = _read_lines('archive/y_test.txt')

# Fail early with a clear message if texts and labels are not aligned,
# instead of a less obvious error later during fit/predict.
for _split, _texts, _labels in (
    ('training', X_train_text, y_train_labels),
    ('testing', X_test_text, y_test_labels),
):
    if len(_texts) != len(_labels):
        raise ValueError(
            f"{_split} data mismatch: {len(_texts)} texts vs {len(_labels)} labels"
        )

print(f"Loaded {len(X_train_text)} training samples and {len(X_test_text)} testing samples.")

Loading training data...
Loading testing data...
Loaded 117500 training samples and 117500 testing samples.


In [4]:
# Two-stage model: count character n-grams of length 1 to 4 in each text,
# then classify those counts with multinomial Naive Bayes.
char_ngram_counter = CountVectorizer(analyzer='char', ngram_range=(1, 4))
naive_bayes = MultinomialNB()

model_pipeline = Pipeline(steps=[
    ('vectorizer', char_ngram_counter),
    ('classifier', naive_bayes),
])

In [5]:
# Fit both pipeline steps (vectorizer, then classifier) on the training split.
print("\nTraining the model...")
model_pipeline.fit(X=X_train_text, y=y_train_labels)
print("Training complete!")


Training the model...
Training complete!


In [6]:
print("\nEvaluating model")

# Predict a label for every test sample; `predictions` is reused by the
# classification report further down.
predictions = model_pipeline.predict(X_test_text)

# Fraction of test samples whose predicted label equals the true label.
accuracy = accuracy_score(y_true=y_test_labels, y_pred=predictions)
print(f"\nModel Accuracy: {accuracy:.2%}")


Evaluating model

Model Accuracy: 91.93%


In [7]:
print("\nClassification Report:")

# Distinct labels present in the test set, in sorted order.
unique_labels = sorted(set(y_test_labels))

# Only the first ten labels are reported (presumably to keep the table short).
report_text = classification_report(
    y_test_labels,
    predictions,
    labels=unique_labels[:10],
)
print(report_text)


Classification Report:
              precision    recall  f1-score   support

         ace       1.00      0.98      0.99       500
         afr       0.93      1.00      0.96       500
         als       0.98      0.89      0.93       500
         amh       1.00      0.99      0.99       500
         ang       1.00      0.89      0.94       500
         ara       0.86      0.99      0.92       500
         arg       1.00      0.81      0.90       500
         arz       0.99      0.86      0.92       500
         asm       1.00      0.97      0.98       500
         ast       0.80      0.96      0.88       500

   micro avg       0.95      0.93      0.94      5000
   macro avg       0.96      0.93      0.94      5000
weighted avg       0.96      0.93      0.94      5000



In [8]:
print("\n--- Testing with custom input ---")

# predict() takes a collection of documents, hence the one-element list.
custom_text = ["This is a test of the language detection system."]
prediction = model_pipeline.predict(custom_text)

# Both sequences hold exactly one item: the sentence and its predicted label.
for sentence, label in zip(custom_text, prediction):
    print(f"The text '{sentence}' was identified as: {label}")


--- Testing with custom input ---
The text 'This is a test of the language detection system.' was identified as: eng


In [9]:
# Persist the fitted pipeline (vectorizer + classifier) to disk so it can be
# reloaded later without retraining.
model_filename = 'language_detector.joblib'
print(f"\nSaving model to {model_filename}...")
joblib.dump(model_pipeline, model_filename)
print("Model saved successfully!")


Saving model to language_detector.joblib...
Model saved successfully!


In [10]:
print("Loading pre-trained model...")
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code. Only load model files that come from a trusted source.
try:
    loaded_model = joblib.load('language_detector.joblib')
    print("Model loaded successfully!")
except FileNotFoundError:
    print("Error: Model file 'language_detector.joblib' not found.")
    print("Please run the training script first to create the model file.")
    # Re-raise instead of calling exit(): exit() ends the whole
    # interpreter/kernel session and discards its state, whereas re-raising
    # only stops this cell (and a "Run All") and keeps the traceback.
    raise

Loading pre-trained model...
Model loaded successfully!


In [12]:
def predict_language(text, model=None):
    """Predict the language label of a single piece of text.

    Args:
        text: The text to classify. It is wrapped in a one-element list
            because the model's ``predict`` is called on a batch.
        model: Optional object exposing ``predict(batch)``. When omitted, the
            notebook-level ``loaded_model`` is used, which keeps existing
            ``predict_language(text)`` calls working unchanged.

    Returns:
        The first (and only) element of the model's prediction for ``text``.

    Raises:
        NameError: If ``model`` is not given and ``loaded_model`` has not been
            defined yet (i.e. the model-loading cell was not run).
    """
    # Resolve the global lazily so the function can be used and tested with
    # an explicitly supplied model.
    classifier = loaded_model if model is None else model
    prediction = classifier.predict([text])
    return prediction[0]

# Sample sentences; the printed predictions for them are shown in the cell output.
text1 = "This is a sentence written in English."
text2 = "Wie viel kostet das?"
text3 = "Esta es una frase en español."
text4 = "یہ اردو میں لکھا ہوا ایک جملہ ہے۔"

# Print each sample followed by the label returned by predict_language().
for sample in (text1, text2, text3, text4):
    print(f"Text: '{sample}' \n--> Predicted Language: {predict_language(sample)}")

Text: 'This is a sentence written in English.' 
--> Predicted Language: eng
Text: 'Wie viel kostet das?' 
--> Predicted Language: deu
Text: 'Esta es una frase en español.' 
--> Predicted Language: spa
Text: 'یہ اردو میں لکھا ہوا ایک جملہ ہے۔' 
--> Predicted Language: urd
