In [1]:
import os
from tqdm import tqdm

# Root folder of the corpus: one sub-directory per language, whose name is
# used verbatim as the class label for every file inside it.
data_dir = '/home/abhisek/Project/Language_detector_200/Ultimate_100_data'

# Parallel lists: texts[i] is the full content of one file, labels[i] its language.
texts, labels = [], []
languages = sorted(os.listdir(data_dir))

for lang in tqdm(languages, desc="Loading language files"):
    lang_dir = os.path.join(data_dir, lang)
    if not os.path.isdir(lang_dir):
        # Stray files next to the language folders are not part of the corpus.
        continue
    # os.listdir() returns entries in arbitrary order; sorting keeps the sample
    # order (and therefore any downstream seeded shuffle/split) reproducible.
    for filename in sorted(os.listdir(lang_dir)):
        file_path = os.path.join(lang_dir, filename)
        if not os.path.isfile(file_path):
            # Nested directories (e.g. editor/checkpoint folders) cannot be
            # opened as text; skip them instead of crashing the whole load.
            continue
        with open(file_path, 'r', encoding='utf-8') as f:
            texts.append(f.read())
        labels.append(lang)

Loading language files: 100%|██████████| 200/200 [00:09<00:00, 20.02it/s]


In [2]:
from sklearn.model_selection import train_test_split
import random

# Shuffle texts and labels together so the pairs stay aligned.
# A dedicated, seeded generator is used instead of the global `random` state:
# with an unseeded shuffle the input order differs on every run, so the
# `random_state=42` split below would not actually be reproducible.
combined = list(zip(texts, labels))
random.Random(42).shuffle(combined)
texts, labels = zip(*combined)

# 80/20 split, stratified so every language keeps the same share of samples
# in the train and test partitions.
X_train, X_test, y_train, y_test = train_test_split(
    texts, labels, test_size=0.2, stratify=labels, random_state=42
)


In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# TF-IDF over character n-grams of length 2 through 5, capped at 150k features
# and stored as float32 to keep the sparse matrices small.
vectorizer = TfidfVectorizer(
    lowercase=True,
    analyzer='char_wb',
    ngram_range=(2, 5),
    max_features=150_000,
    dtype=np.float32,
)

# Fit on the training split only; the tqdm wrapper just reports progress while
# the vectorizer consumes the documents. The test split is transformed with the
# already-fitted vectorizer.
train_docs = tqdm(X_train, desc="Fitting TF-IDF")
X_train_vec = vectorizer.fit_transform(train_docs)
X_test_vec = vectorizer.transform(X_test)


Fitting TF-IDF: 100%|██████████| 16000/16000 [15:35<00:00, 17.11it/s]


In [4]:
from sklearn.linear_model import SGDClassifier
from sklearn.utils.class_weight import compute_class_weight
import joblib

# Linear classifier trained with SGD on the TF-IDF features.
# - loss='log_loss' selects the logistic-regression objective.
# - penalty='elasticnet' mixes L1 and L2 regularisation (l1_ratio left at its default).
# - n_jobs=-1 lets scikit-learn use all cores for the per-class computations.
# - random_state pins the per-epoch data shuffling so that the fitted weights and
#   the reported accuracy are reproducible; the same seed is used for the split.
clf = SGDClassifier(
    loss='log_loss',
    penalty='elasticnet',
    alpha=1e-5,
    max_iter=1000,
    tol=1e-3,
    n_jobs=-1,
    random_state=42
)

# Last expression of the cell: fits the model and displays the estimator repr.
clf.fit(X_train_vec, y_train)


  ys_types = set(type_of_target(x) for x in ys)


0,1,2
,loss,'log_loss'
,penalty,'elasticnet'
,alpha,1e-05
,l1_ratio,0.15
,fit_intercept,True
,max_iter,1000
,tol,0.001
,shuffle,True
,verbose,0
,epsilon,0.1


In [5]:
from sklearn.metrics import accuracy_score, classification_report

# Score the held-out split: overall accuracy first, then the per-language
# precision/recall/F1 table (zero_division=0 reports 0 instead of warning when
# a class has no predicted samples).
y_pred = clf.predict(X_test_vec)
test_accuracy = accuracy_score(y_test, y_pred)
per_class_report = classification_report(y_test, y_pred, zero_division=0)
print("Accuracy:", test_accuracy)
print(per_class_report)


Accuracy: 0.9945
                                              precision    recall  f1-score   support

                                    Acehnese       1.00      1.00      1.00        20
                                   Afrikaans       1.00      1.00      1.00        20
                                    Albanian       1.00      1.00      1.00        20
                            Alemannic German       1.00      1.00      1.00        20
                                     Amharic       1.00      1.00      1.00        20
                                      Arabic       1.00      1.00      1.00        20
                                   Aragonese       1.00      1.00      1.00        20
                                    Armenian       1.00      1.00      1.00        20
                                    Assamese       1.00      1.00      1.00        20
                                    Asturian       1.00      1.00      1.00        20
                                 Aze

In [6]:
import joblib

# Persist both fitted artifacts side by side: the classifier is only usable
# together with the exact vectorizer it was trained on.
save_dir = "/home/abhisek/Project/Language_detector_200/Code/ultimate-n-gram-model"
os.makedirs(save_dir, exist_ok=True)

clf_path = os.path.join(save_dir, "clf.joblib")
vectorizer_path = os.path.join(save_dir, "vectorizer.joblib")

joblib.dump(clf, clf_path)
# Kept as the final expression so the cell still displays the written path.
joblib.dump(vectorizer, vectorizer_path)


['/home/abhisek/Project/Language_detector_200/Code/ultimate-n-gram-model/vectorizer.joblib']

In [9]:
# Reload the saved artifacts to confirm they round-trip from disk.
# NOTE: joblib.load unpickles its input, so only load files from a trusted source.
clf_path = os.path.join(save_dir, "clf.joblib")
vectorizer_path = os.path.join(save_dir, "vectorizer.joblib")
clf = joblib.load(clf_path)
vectorizer = joblib.load(vectorizer_path)

# The same sentence in seven languages; the trailing comment on each entry is
# the language the sentence is written in.
test_lines = [
    "The weather is absolutely delightful today, with a gentle, refreshing breeze rustling through the trees and the sun shining brightly, making it perfect for an outdoor stroll.", # English
    "আজকের আবহাওয়া একেবারেই আনন্দদায়ক, গাছের পাতার মধ্যে দিয়ে একটি মৃদু, সতেজ বাতাস বইছে এবং সূর্য উজ্জ্বলভাবে আলোকিত হচ্ছে, যা বাইরের হাঁটার জন্য এটিকে নিখুঁত করে তুলেছে।", # Bengali
    "El clima es absolutamente delicioso hoy, con una brisa suave y refrescante susurrando entre los árboles y el sol brillando intensamente, lo que lo hace perfecto para un paseo al aire libre.", # Spanish
    "Das Wetter ist heute absolut herrlich, mit einer sanften, erfrischenden Brise, die durch die Bäume rauscht, und der Sonne, die hell scheint, was es perfekt für einen Spaziergang im Freien macht.", # German
    "今日は本当に素晴らしい天気で、やさしく爽やかな風が木々をそよがせ、太陽が明るく輝いています。屋外を散歩するのに最適です。", # Japanese
    "Le temps est absolument magnifique aujourd'hui, avec une brise douce et rafraîchissante qui frôle les arbres et le soleil qui brille de mille feux, ce qui en fait une journée parfaite pour une promenade en plein air.", # French
    "O tempo está absolutamente delicioso hoje, com uma brisa suave e refrescante sussurrando entre as árvores e o sol brilhando intensamente, tornando-o perfeito para um passeio ao ar livre." # Portuguese
]

# Vectorize with the reloaded vectorizer, then classify every sentence at once.
test_vec = vectorizer.transform(test_lines)
preds = clf.predict(test_vec)

# Echo each sentence followed by the label the model assigned to it.
for sentence, predicted_lang in zip(test_lines, preds):
    print(f"{sentence}\nPredicted Language: {predicted_lang}\n")


The weather is absolutely delightful today, with a gentle, refreshing breeze rustling through the trees and the sun shining brightly, making it perfect for an outdoor stroll.
Predicted Language: English

আজকের আবহাওয়া একেবারেই আনন্দদায়ক, গাছের পাতার মধ্যে দিয়ে একটি মৃদু, সতেজ বাতাস বইছে এবং সূর্য উজ্জ্বলভাবে আলোকিত হচ্ছে, যা বাইরের হাঁটার জন্য এটিকে নিখুঁত করে তুলেছে।
Predicted Language: Bengali

El clima es absolutamente delicioso hoy, con una brisa suave y refrescante susurrando entre los árboles y el sol brillando intensamente, lo que lo hace perfecto para un paseo al aire libre.
Predicted Language: Spanish

Das Wetter ist heute absolut herrlich, mit einer sanften, erfrischenden Brise, die durch die Bäume rauscht, und der Sonne, die hell scheint, was es perfekt für einen Spaziergang im Freien macht.
Predicted Language: German

今日は本当に素晴らしい天気で、やさしく爽やかな風が木々をそよがせ、太陽が明るく輝いています。屋外を散歩するのに最適です。
Predicted Language: Japanese

Le temps est absolument magnifique aujourd'hui, avec une brise d