In [1]:
import numpy as np
import unicodedata
import pandas as pd
from collections import defaultdict
import random_data_selection
import time

class NaiveBayesLanguageIdentifier:
    def __init__(self, n=20):
        self.n = n
        self.vocab = set()
        self.class_word_counts = defaultdict(lambda: defaultdict(int))
        self.class_counts = defaultdict(int)
        self.stop_words = defaultdict(set)
    
    def _normalize_text(self, text):
        normalized_text = unicodedata.normalize('NFKD', text).encode('utf-8','ignore').decode('utf-8')
        return normalized_text
    
    def _clean_text(self, text):
        cleaned_text = text.lower()
        cleaned_text = ''.join([char for char in cleaned_text if char.isalpha() or char == ' '])
        return cleaned_text
    
    def _extract_ngrams(self, text):
        ngrams = []
        text = ' ' + text + ' '
        for i in range(len(text) - self.n + 1):
            ngrams.append(text[i:i+self.n])
        return ngrams
    
    def _load_stop_words(self, languages):
        for language in languages:
            try:
                with open(f'stopwords/{language}.txt', 'r', encoding='utf8') as f:
                    self.stop_words[language] = set(f.read().splitlines())
            except FileNotFoundError:
                continue
    
    def load_data(self, file_path):
        df = pd.read_csv(file_path, encoding='utf-8')
        text_data = df['text'].tolist()
        labels = df['language'].tolist()
        return text_data, labels
    
    def train(self, text_data, labels):
        self._load_stop_words(set(labels))
        for doc, label in zip(text_data, labels):
            normalized_doc = self._normalize_text(doc)
            cleaned_doc = self._clean_text(normalized_doc)
            ngrams = self._extract_ngrams(cleaned_doc)
            self.class_counts[label] += 1
            for ngram in ngrams:
                if ngram not in self.stop_words[label]:
                    self.class_word_counts[label][ngram] += 1
                    self.vocab.add(ngram)
    
    def _calculate_log_likelihood(self, text, label):
        log_likelihood = 0.0
        ngrams = self._extract_ngrams(text)
        for ngram in ngrams:
            if ngram not in self.stop_words[label]:
                word_count = self.class_word_counts[label].get(ngram, 0) + 1 
                total_words = sum(self.class_word_counts[label].values()) + len(self.vocab)
                log_likelihood += np.log(word_count / total_words)
        return log_likelihood
    
    def predict(self, text):
        normalized_text = self._normalize_text(text)
        cleaned_text = self._clean_text(normalized_text)
        best_label = None
        max_log_prob = float('-inf')
        for label in self.class_counts.keys():
            log_prior = np.log(self.class_counts[label] / sum(self.class_counts.values()))
            log_likelihood = self._calculate_log_likelihood(cleaned_text, label)
            log_posterior = log_prior + log_likelihood
            if log_posterior > max_log_prob:
                max_log_prob = log_posterior
                best_label = label
        return best_label

In [2]:
# Load test data
def progress_bar(progress, total, elapsed_time):
    percent = 100 * (progress / total)
    bar = '█' * int(percent) + '-' * (100 - int(percent))
    if progress > 0:
        eta_minutes, eta_seconds = divmod(elapsed_time, 60)
        eta_seconds = int(eta_seconds)
    if progress == total:
        eta_seconds = 0
    print(f"\rTesting:\t|{bar}| {percent:.2f}%\tETA: {eta_minutes} mins {eta_seconds} seconds", end='\r')

def test(model):
    score = 0
    df = pd.read_csv('dataset/train-subDataSet.csv', encoding='utf-8')
    text_data = df['text'].tolist()
    labels = df['language'].tolist()
    length_data = len(text_data)
    
    start_time = time.time()
    predicted_language = model.predict(text_data[0])
    elapsed_time = time.time() - start_time
    total_time = elapsed_time * length_data * 1.6       # 60% extra time for the total time
    
    for i in range(length_data):
        start_time = time.time()
        predicted_language = model.predict(text_data[i])
        elapsed_time = time.time() - start_time
        
        if predicted_language.lower() == labels[i].lower():
            score += 1
        
        if(total_time < 0):
            total_time = elapsed_time * (length_data - i + 1)
        total_time -= elapsed_time
        progress_bar(i + 1, length_data, total_time)
        
    print("\n")
    print("\nTesting Completed.\n")
    print(f"Score: {score} out of {len(text_data)}")
    print("Accuracy: ", (score/len(text_data))*100, "%")
    return score

In [3]:
# Usage:
# Initialize NaiveBayesLanguageIdentifier
model = NaiveBayesLanguageIdentifier(n=5)
epochs = 10

for i in range(epochs):
    print("Epoch: ",i+1)
    random_data_selection.random_data_selection(size=8000)
    text_data, labels = model.load_data('dataset/train-subDataSet.csv')
    model.train(text_data, labels)
    test(model)

print("\nTraining completed.\n")

Epoch:  1


FileNotFoundError: [Errno 2] No such file or directory: 'dataset/test-subDataSet.csv'

In [None]:
def load_text_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()
    return text

# Load a test document from file
test_document = load_text_file('input/input.txt')

# Predict the language of the test document
predicted_language = identifier.predict(test_document)
print("Predicted language:", predicted_language)

Predicted language: Greek
