<b>Rule-based language identifier for EN,DE and FR. The identifier splits text into bigrams and trigrams of letters and compares the results to a stored list of bigrams and trigrams of each language from a training data</b>

In [43]:
from collections import Counter
import re

<b>Helper function to extract n-gram from a given text</b>

In [44]:
def extract_ngrams(text, n):
    """
    Extract n-grams from a given text.

    Parameters:
    - text (str): The input text from which n-grams are to be extracted.
    - n (int): The size of the n-grams.

    Returns:
    - list of str: A list containing the n-grams.
    """
    return [text[i:i+n] for i in range(len(text) - n + 1)]

<b>Function to get the top ngrams from a file. It get the top 100 by default. Try to adjust the number of n-grams (top 200 for example) and see if the accuray improves</b>

In [45]:
def get_top_ngrams(filename, n, top=200):
    """
    Extract the top n-grams from a text file.

    Parameters:
    - filename (str): The path to the input text file.
    - n (int): The size of the n-grams.
    - top (int, optional): The number of top n-grams to return. Defaults to 100.

    Returns:
    - list of str: A list containing the top n-grams.
    """
    with open(filename, 'r', encoding='utf-8') as f:
        text = f.read().lower()
        text = re.sub(r'[^a-z]', '', text)  # remove non-alphabetic characters
        ngrams = extract_ngrams(text, n)
        return [item[0] for item in Counter(ngrams).most_common(top)]


<b>Function to take user input and a list of ngrams and calculate a score of matched ngrams in compared to the list of ngrams from a training corpus</b>

In [46]:
def calculate_score(sentence, ngrams_list, n):
    sentence_ngrams = extract_ngrams(sentence, n)
    return sum([1 for ng in sentence_ngrams if ng in ngrams_list])

<b>Function to check the accuracy of the approach</b>

In [47]:
def test_accuracy(filename):
    en_bigram = get_top_ngrams('english.txt', 2)
    en_trigram = get_top_ngrams('english.txt', 3)
    de_bigram = get_top_ngrams('german.txt', 2)
    de_trigram = get_top_ngrams('german.txt', 3)
    fr_bigram = get_top_ngrams('french.txt', 2)
    fr_trigram = get_top_ngrams('french.txt', 3)
    rm_bigram = get_top_ngrams('romanian.txt', 2)
    rm_trigram = get_top_ngrams('romanian.txt', 3)
    it_bigram = get_top_ngrams('italian.txt', 2)
    it_trigram = get_top_ngrams('italian.txt', 3)

    correct_predictions = 0
    total_sentences = 0

    with open(filename, 'r', encoding='latin-1') as f:  # Changed this line
        for line in f:
            sentence, actual_language = line.strip().rsplit(',', 1)
            sentence = sentence.lower()
            sentence = re.sub(r'[^a-z]', '', sentence)  # remove non-alphabetic characters

            en_score = calculate_score(sentence, en_bigram, 2) + calculate_score(sentence, en_trigram, 3)
            de_score = calculate_score(sentence, de_bigram, 2) + calculate_score(sentence, de_trigram, 3)
            fr_score = calculate_score(sentence, fr_bigram, 2) + calculate_score(sentence, fr_trigram, 3)
            it_score = calculate_score(sentence, it_bigram, 2) + calculate_score(sentence, it_trigram, 3)
            rm_score = calculate_score(sentence, rm_bigram, 2) + calculate_score(sentence, rm_trigram, 3)
            scores = {'en': en_score, 'de': de_score, 'fr': fr_score, 'it': it_score, 'rm': rm_score}
            detected_language = max(scores, key=scores.get)

            if detected_language == actual_language:
                correct_predictions += 1
            total_sentences += 1

    accuracy = correct_predictions / total_sentences
    print(f"Accuracy: {accuracy:.2f}")

In [48]:
def main():
    test_accuracy('sentences.txt')
    en_bigram = get_top_ngrams('english.txt', 2)
    en_trigram = get_top_ngrams('english.txt', 3)
    de_bigram = get_top_ngrams('german.txt', 2)
    de_trigram = get_top_ngrams('german.txt', 3)
    fr_bigram = get_top_ngrams('french.txt', 2)
    fr_trigram = get_top_ngrams('french.txt', 3)
    rm_bigram = get_top_ngrams('romanian.txt', 2)
    rm_trigram = get_top_ngrams('romanian.txt', 3)
    it_bigram = get_top_ngrams('italian.txt', 2)
    it_trigram = get_top_ngrams('italian.txt', 3)

    while True:
        sentence = input("Enter a sentence (or 'exit' to quit): ").lower()
        if sentence == 'exit':
            break

        sentence = re.sub(r'[^a-z]', '', sentence)  # remove non-alphabetic characters

        en_score = calculate_score(sentence, en_bigram, 2) + calculate_score(sentence, en_trigram, 3)
        de_score = calculate_score(sentence, de_bigram, 2) + calculate_score(sentence, de_trigram, 3)
        fr_score = calculate_score(sentence, fr_bigram, 2) + calculate_score(sentence, fr_trigram, 3)
        it_score = calculate_score(sentence, it_bigram, 2) + calculate_score(sentence, it_trigram, 3)
        rm_score = calculate_score(sentence, rm_bigram, 2) + calculate_score(sentence, rm_trigram, 3)

        scores = {'English': en_score, 'German': de_score, 'French': fr_score, 'italian': it_score, 'Romanian': rm_score}
        detected_language = max(scores, key=scores.get)

        print(f"The detected language is: {detected_language}")
        print(scores)

In [49]:
if __name__ == "__main__":
    main()

Accuracy: 0.98
The detected language is: Romanian
{'English': 52, 'German': 52, 'French': 61, 'italian': 62, 'Romanian': 89}
The detected language is: italian
{'English': 53, 'German': 52, 'French': 57, 'italian': 80, 'Romanian': 54}
The detected language is: English
{'English': 58, 'German': 48, 'French': 42, 'italian': 43, 'Romanian': 38}
The detected language is: German
{'English': 55, 'German': 77, 'French': 54, 'italian': 55, 'Romanian': 46}
The detected language is: French
{'English': 50, 'German': 42, 'French': 62, 'italian': 58, 'Romanian': 56}


KeyboardInterrupt: Interrupted by user