In [18]:
import pandas as pd
import time
import pickle
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import json

def progress_bar(progress, total, elapsed_time):
    percent = 100 * (progress / total)
    bar = '█' * int(percent) + '-' * (100 - int(percent))
    eta_minutes, eta_seconds = divmod(elapsed_time, 60)
    print(f"\rTesting:\t|{bar}| {percent:.2f}%\tETA: {eta_minutes:.0f} mins {eta_seconds:.0f} seconds    ", end='\r')

In [20]:
model = pickle.load(open("model.pkl", 'rb'))
df = pd.read_csv('dataset/test-data-test-case.csv', encoding='utf-8', dtype={'column1': 'string', 'column2': 'string'})
text_data = df['sentence'].tolist()
true_labels = df['lan_code'].tolist()
length_data = len(text_data)

with open('dataset/lan_to_language.json', 'r', encoding='utf-8') as file:
    lan_to_language = json.load(file)

start_time = time.time()
# Initial prediction for time estimation
predicted_language = model.predict(text_data[0])
elapsed_time = time.time() - start_time

total_time = elapsed_time * length_data * 2  # Estimated total time (200% buffer)

predicted_labels = [lan_to_language[predicted_language]]

for i in range(1, length_data):
    start_time = time.time()
    predicted_language = model.predict(text_data[i])
    elapsed_time = time.time() - start_time

    predicted_labels.append(lan_to_language[predicted_language])

    total_time -= elapsed_time
    progress_bar(i + 1, length_data, total_time)

eta_minutes, eta_seconds = divmod((elapsed_time * length_data * 3 - total_time), 60)
print(f"\nTesting Complete. Total time: {eta_minutes:.0f} mins {eta_seconds:.0f} seconds\n")

Testing:	|████████████████████████████████████████████████████████████████████████████████████████████████████| 100.00%	ETA: 13 mins 13 seconds    
Testing Complete. Total time: 22 mins 36 seconds



In [24]:
def language(lan_code):
    file_path = 'dataset/lan_to_language.json'

    with open(file_path, 'r', encoding='utf-8') as file:
        lan_to_language = json.load(file)
        
    return lan_to_language[lan_code]

In [25]:
# Preprocess labels
true_labels = [language(label).lower() for label in true_labels]
predicted_labels = [language(label).lower() for label in predicted_labels]

In [27]:
print(true_labels)

['german', 'italian', 'lithuanian', 'berber languages', 'esperanto', 'italian', 'modern greek (1453-)', 'kabyle', 'russian', 'german', 'french', 'german', 'spanish', 'macedonian', 'german', 'kabyle', 'french', 'kabyle', 'spanish', 'french', 'german', 'esperanto', 'lojban', 'czech', 'iranian persian', 'danish', 'english', 'french', 'berber languages', 'russian', 'french', 'northern kurdish', 'english', 'marathi', 'turkish', 'hebrew', 'mandarin chinese', 'afrikaans', 'english', 'kabyle', 'english', 'czech', 'french', 'german', 'french', 'macedonian', 'turkish', 'tagalog', 'turkish', 'esperanto', 'italian', 'turkish', 'italian', 'english', 'finnish', 'russian', 'english', 'russian', 'portuguese', 'russian', 'lithuanian', 'hungarian', 'esperanto', 'turkish', 'italian', 'serbian', 'esperanto', 'finnish', 'hungarian', 'kabyle', 'french', 'italian', 'thai', 'hebrew', 'brithenig', 'english', 'portuguese', 'english', 'turkish', 'berber languages', 'portuguese', 'kabyle', 'german', 'russian', 'r

In [28]:
print(predicted_labels)

['german', 'italian', 'lithuanian', 'kabyle', 'esperanto', 'italian', 'modern greek (1453-)', 'berber languages', 'russian', 'german', 'french', 'german', 'spanish', 'macedonian', 'german', 'kabyle', 'french', 'berber languages', 'spanish', 'french', 'german', 'esperanto', 'lojban', 'spanish', 'iranian persian', 'danish', 'english', 'french', 'berber languages', 'russian', 'french', 'hungarian', 'english', 'marathi', 'turkish', 'hebrew', 'mandarin chinese', 'dutch', 'english', 'kabyle', 'english', 'czech', 'french', 'german', 'french', 'macedonian', 'turkish', 'tagalog', 'turkish', 'esperanto', 'italian', 'turkish', 'italian', 'english', 'finnish', 'russian', 'english', 'russian', 'portuguese', 'russian', 'lithuanian', 'hungarian', 'esperanto', 'turkish', 'italian', 'serbian', 'esperanto', 'finnish', 'hungarian', 'kabyle', 'french', 'portuguese', 'thai', 'hebrew', 'latin', 'english', 'portuguese', 'english', 'turkish', 'berber languages', 'portuguese', 'berber languages', 'german', 'ru

In [29]:
# Compute metrics
accuracy = accuracy_score(true_labels, predicted_labels)
precision = precision_score(true_labels, predicted_labels, average='weighted', zero_division=0)
recall = recall_score(true_labels, predicted_labels, average='weighted', zero_division=0)
f1 = f1_score(true_labels, predicted_labels, average='weighted', zero_division=0)

# Print the metrics
print("Performance Metrics:")
print(f"Accuracy: {accuracy*100:.2f}")
print(f"Precision: {precision*100:.2f}")
print(f"Recall: {recall*100:.2f}")
print(f"F1-Score: {f1*100:.2f}")

Performance Metrics:
Accuracy: 94.49
Precision: 93.33
Recall: 94.49
F1-Score: 93.75
