In [1]:
import os
from tqdm import tqdm

data_dir = "/home/abhisek/Project/Language_detector_200/mini-Ultimate_100_data_30lang"
train_file = "/home/abhisek/Project/Language_detector_200/FastText_30/train.txt"

os.makedirs(os.path.dirname(train_file), exist_ok=True)

with open(train_file, "w", encoding="utf-8") as out_file:
    for lang in tqdm(sorted(os.listdir(data_dir)), desc="Processing languages"):
        lang_dir = os.path.join(data_dir, lang)
        if os.path.isdir(lang_dir):
            for fname in os.listdir(lang_dir):
                fpath = os.path.join(lang_dir, fname)
                with open(fpath, "r", encoding="utf-8") as f:
                    text = f.read().strip().replace("\n", " ")
                    if text:
                        out_file.write(f"__label__{lang} {text}\n")


Processing languages: 100%|██████████| 30/30 [00:02<00:00, 12.48it/s]


In [2]:
import fasttext

model_path = "/home/abhisek/Project/Language_detector_200/FastText_30/lang_detect_model.bin"

model = fasttext.train_supervised(
    input=train_file,
    lr=1.0,
    epoch=25,
    wordNgrams=2,
    verbose=2,
    minCount=1,
    loss='softmax'
)

model.save_model(model_path)


Read 15M words
Number of words:  2884035
Number of labels: 30
Progress: 100.0% words/sec/thread:  350773 lr:  0.000000 avg.loss:  0.234705 ETA:   0h 0m 0s 218815 lr:  0.995544 avg.loss:  3.405111 ETA:   0h 4m 9s  0h 2m58s avg.loss:  2.282122 ETA:   0h 2m42s  7.2% words/sec/thread:  333072 lr:  0.927904 avg.loss:  1.836301 ETA:   0h 2m32s avg.loss:  1.381515 ETA:   0h 2m28s% words/sec/thread:  328949 lr:  0.886998 avg.loss:  1.294023 ETA:   0h 2m27s 0.885787 avg.loss:  1.284713 ETA:   0h 2m27sm10s% words/sec/thread:  347239 lr:  0.776112 avg.loss:  0.884336 ETA:   0h 2m 2s avg.loss:  0.792479 ETA:   0h 1m56s% words/sec/thread:  351734 lr:  0.737448 avg.loss:  0.768552 ETA:   0h 1m55s  0h 1m50s ETA:   0h 1m31s lr:  0.568625 avg.loss:  0.516963 ETA:   0h 1m28s ETA:   0h 1m25s 0.465717 ETA:   0h 1m20s avg.loss:  0.395722 ETA:   0h 1m 8sh 1m 2s 0.394752 avg.loss:  0.369983 ETA:   0h 1m 1s ETA:   0h 0m53s words/sec/thread:  353661 lr:  0.326346 avg.loss:  0.337477 ETA:   0h 0m50s 68.8% words

In [3]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split

# Load dataset into memory
texts, labels = [], []
with open(train_file, "r", encoding="utf-8") as f:
    for line in f:
        label, text = line.strip().split(" ", 1)
        labels.append(label.replace("__label__", ""))
        texts.append(text)

# Split
X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.2, random_state=42)

# Predict using FastText
y_pred = []
for line in tqdm(X_test, desc="Predicting"):
    pred_label = model.predict(line)[0][0].replace("__label__", "")
    y_pred.append(pred_label)

# Metrics
print("Accuracy:", accuracy_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred, average='weighted'))
print("Precision:", precision_score(y_test, y_pred, average='weighted'))
print("Recall:", recall_score(y_test, y_pred, average='weighted'))


Predicting: 100%|██████████| 600/600 [00:02<00:00, 222.89it/s]

Accuracy: 0.9983333333333333
F1 Score: 0.9983373373373373
Precision: 0.998421052631579
Recall: 0.9983333333333333





In [1]:
import fasttext

model_path = "/home/abhisek/Project/Language_detector_200/FastText_30/lang_detect_model.bin"
model = fasttext.load_model(model_path)

In [6]:
test_lines = [
    "The weather is absolutely delightful today, with a gentle, refreshing breeze rustling through the trees and the sun shining brightly, making it perfect for an outdoor stroll.",  # English
    "আজকের আবহাওয়া একেবারেই আনন্দদায়ক, গাছের পাতার মধ্যে দিয়ে একটি মৃদু, সতেজ বাতাস বইছে এবং সূর্য উজ্জ্বলভাবে আলোকিত হচ্ছে, যা বাইরের হাঁটার জন্য এটিকে নিখুঁত করে তুলেছে।",  # Bengali
    "El clima es absolutamente delicioso hoy, con una brisa suave y refrescante susurrando entre los árboles y el sol brillando intensamente, lo que lo hace perfecto para un paseo al aire libre.",  # Spanish
    "Das Wetter ist heute absolut herrlich, mit einer sanften, erfrischenden Brise, die durch die Bäume rauscht, und der Sonne, die hell scheint, was es perfekt für einen Spaziergang im Freien macht.",  # German
    "Le temps est absolument magnifique aujourd'hui, avec une brise douce et rafraîchissante qui frôle les arbres et le soleil qui brille de mille feux, ce qui en fait une journée parfaite pour une promenade en plein air.",  # French
    "O tempo está absolutamente delicioso hoje, com uma brisa suave e refrescante sussurrando entre as árvores e o sol brilhando intensamente, tornando-o perfeito para um passeio ao ar livre.",  # Portuguese
    "भारतीय वास्तुकला अपनी विविधता और ऐतिहासिक गहराई के लिए जानी जाती है।" # Hindi
]

for line in test_lines:
    label = model.predict(line)[0][0].replace("__label__", "")
    print(f"Predicted Language: {label}\nSentence: {line}\n")


Predicted Language: English
Sentence: The weather is absolutely delightful today, with a gentle, refreshing breeze rustling through the trees and the sun shining brightly, making it perfect for an outdoor stroll.

Predicted Language: Bengali
Sentence: আজকের আবহাওয়া একেবারেই আনন্দদায়ক, গাছের পাতার মধ্যে দিয়ে একটি মৃদু, সতেজ বাতাস বইছে এবং সূর্য উজ্জ্বলভাবে আলোকিত হচ্ছে, যা বাইরের হাঁটার জন্য এটিকে নিখুঁত করে তুলেছে।

Predicted Language: Spanish
Sentence: El clima es absolutamente delicioso hoy, con una brisa suave y refrescante susurrando entre los árboles y el sol brillando intensamente, lo que lo hace perfecto para un paseo al aire libre.

Predicted Language: German
Sentence: Das Wetter ist heute absolut herrlich, mit einer sanften, erfrischenden Brise, die durch die Bäume rauscht, und der Sonne, die hell scheint, was es perfekt für einen Spaziergang im Freien macht.

Predicted Language: French
Sentence: Le temps est absolument magnifique aujourd'hui, avec une brise douce et rafraî