In [None]:
import pandas as pd
import string
import collections
from sortedcontainers import SortedDict
import operator
import re
import numpy as np
from scipy.spatial import distance
import editdistance
import random
from tqdm.auto import tqdm

# Load the language-identification corpus. The first line of the file
# (presumably its own header row) is skipped and the two columns are named
# explicitly instead.
DATASET_URL = 'https://raw.githubusercontent.com/amankharwal/Website-data/master/dataset.csv'
df = pd.read_csv(DATASET_URL, names=['Text', 'Language'], skiprows=1)
df.head()

Unnamed: 0,Text,Language
0,klement gottwaldi surnukeha palsameeriti ning ...,Estonian
1,sebes joseph pereira thomas på eng the jesuit...,Swedish
2,ถนนเจริญกรุง อักษรโรมัน thanon charoen krung เ...,Thai
3,விசாகப்பட்டினம் தமிழ்ச்சங்கத்தை இந்துப் பத்திர...,Tamil
4,de spons behoort tot het geslacht haliclona en...,Dutch


In [None]:
# Per-language train/test split. The global RNG is seeded so that the shuffle
# (and therefore the split) is the same on every run.
random.seed(1337)
split = .7
train, test = {}, {}
for language in df.Language.unique():
    # All texts of this language, shuffled in place before cutting.
    texts = list(df.loc[df.Language == language, 'Text'])
    random.shuffle(texts)
    # The first `split` fraction (rounded down) trains; the remainder tests.
    cut = int(len(texts) * split)
    train[language], test[language] = texts[:cut], texts[cut:]

In [None]:
class LanguageDetector():
    """Character n-gram language identifier.

    For every language a frequency profile of its most common character
    n-grams is built from training sentences. A sentence is classified by
    building the same kind of profile for it and picking the language whose
    profile is closest under a selectable distance measure.

    Attributes:
        n: Length of the character n-grams.
        max_ngrams: Number of most frequent n-grams kept per profile.
        language_models: Mapping ``language -> {ngram: relative frequency}``.
        distance_algorithms: Mapping from algorithm name (``'jaccard'``,
            ``'euclidian'``, ``'cos'``, ``'levenshtein'``) to the function
            implementing it; smaller return values mean "more similar".
    """

    # Class-level default so the attribute exists even on instances that were
    # not created through __init__.
    max_ngrams = 300

    # Deletes ASCII punctuation and digits in a single translate() pass.
    # Built once here instead of once per sentence.
    _STRIP_TABLE = str.maketrans('', '', string.punctuation + string.digits)

    def __init__(self, data, n=2, max_ngrams=300):
        """Build one n-gram profile per language.

        Args:
            data: Mapping ``language -> list of sentences`` used for training.
            n: Length of the character n-grams.
            max_ngrams: How many of the most frequent n-grams each profile
                keeps (the original hard-coded value was 300).
        """
        self.n = n
        self.max_ngrams = max_ngrams
        self.language_models = {}
        for language, text in data.items():
            self.language_models[language] = self.__create_ngram_model(text)

    # create ngram model from list of sentences
    def __create_ngram_model(self, sentences):
        """Return ``{ngram: relative frequency}`` for the given sentences.

        Sentences are lower-cased, stripped of ASCII punctuation and digits,
        and split on runs of spaces. Each word is padded with ``n-1`` leading
        ``'~'`` characters and one trailing ``'!'`` before its n-grams are
        counted. Only the ``max_ngrams`` most frequent n-grams are kept, but
        frequencies are relative to the count of *all* n-grams seen. The
        result is ordered from most to least frequent.
        """
        counts = collections.Counter()
        for sentence in sentences:
            sentence = sentence.translate(self._STRIP_TABLE).lower()
            sentence = sentence.replace('\n', ' ').replace('\r', '')
            for word in re.split(r' +', sentence):
                padded = '~' * (self.n - 1) + word + '!'
                counts.update(
                    padded[i:i + self.n]
                    for i in range(len(padded) - self.n + 1)
                )

        # Normalise by the total over every n-gram, not only the kept ones.
        total_ngrams = sum(counts.values())
        # most_common() already yields descending counts with ties in
        # first-seen order, so no further sorting is needed.
        return {
            ngram: count / total_ngrams
            for ngram, count in counts.most_common(self.max_ngrams)
        }

    def __aligned_vectors(self, model1, model2):
        """Return both profiles as arrays over the same n-gram ordering.

        The union of both key sets is sorted in descending order; n-grams
        missing from a profile contribute a frequency of 0.
        """
        ngrams = sorted(set(model1) | set(model2), reverse=True)
        v1 = np.array([model1.get(ngram, 0) for ngram in ngrams])
        v2 = np.array([model2.get(ngram, 0) for ngram in ngrams])
        return v1, v2

    # calculate inverse jaccard similarity of two ngram models
    def __jaccard_similarity(self, model1, model2):
        """Return ``|union| / |intersection|`` of the two n-gram key sets.

        This is the inverse of the Jaccard index, so smaller is more similar.
        Returns ``inf`` when the profiles share no n-gram.
        """
        set1 = set(model1)
        set2 = set(model2)
        shared = len(set1 & set2)
        if shared == 0:
            return float('inf')
        return len(set1 | set2) / shared

    def __euclidian_distance(self, model1, model2):
        """Return the Euclidean distance between the two frequency vectors."""
        v1, v2 = self.__aligned_vectors(model1, model2)
        return np.linalg.norm(v1 - v2)

    def __cos_distance(self, model1, model2):
        """Return the cosine distance between the two frequency vectors.

        An empty profile has no direction, so it is reported as infinitely
        far away instead of dividing by a zero norm.
        """
        if not model1 or not model2:
            return float('inf')
        v1, v2 = self.__aligned_vectors(model1, model2)
        return distance.cosine(v1, v2)

    # Levenshtein (edit) distance between the two alphabetically sorted
    # n-gram lists, computed by the `editdistance` package.
    def __levenshtein_distance(self, model1, model2):
        """Return the edit distance between the sorted n-gram key lists."""
        return editdistance.eval(sorted(model1), sorted(model2))

    distance_algorithms = {
        'jaccard'     : __jaccard_similarity,
        'euclidian'   : __euclidian_distance,
        'cos'         : __cos_distance,
        'levenshtein': __levenshtein_distance,
    }

    def test(self, data, algorithm='cos'):
        """Evaluate detection accuracy on labelled sentences.

        Args:
            data: Mapping ``language -> list of sentences`` to classify.
            algorithm: Key of ``distance_algorithms`` to use.

        Returns:
            A DataFrame with one row per language plus a final ``'Total'``
            row, with columns ``language, n, algorithm, accuracy, correct,
            total``. Accuracy is NaN where there are no sentences to score.
        """
        columns = ['language', 'n', 'algorithm', 'accuracy', 'correct', 'total']

        def row(label, correct, count):
            # Guard the division: a language may have no test sentences.
            accuracy = correct / count if count else float('nan')
            return {'language': label, 'n': self.n, 'algorithm': algorithm,
                    'accuracy': accuracy, 'correct': correct, 'total': count}

        # Rows are collected in a list and turned into a frame once:
        # DataFrame.append was removed in pandas 2.0.
        records = []
        total_correct = 0
        total = 0
        for language, text in tqdm(data.items()):
            correct = sum(
                self.detect_language(sentence, algorithm=algorithm) == language
                for sentence in text
            )
            total_correct += correct
            total += len(text)
            records.append(row(language, correct, len(text)))
        records.append(row('Total', total_correct, total))
        return pd.DataFrame(records, columns=columns)

    # detect language of given sentence
    def detect_language(self, sentence, algorithm='cos', verbose=False):
        """Classify a single sentence.

        Args:
            sentence: Text to classify.
            algorithm: Key of ``distance_algorithms`` to use.
            verbose: If true, return the distance to every language instead
                of only the best match.

        Returns:
            The language with the smallest distance, or, when ``verbose`` is
            true, the full ``{language: distance}`` dict.

        Raises:
            KeyError: If ``algorithm`` is not a key of ``distance_algorithms``.
        """
        measure = self.distance_algorithms[algorithm]
        sentence_model = self.__create_ngram_model([sentence,])
        # The dict holds plain functions, so the instance is passed explicitly.
        distances = {
            language: measure(self, sentence_model, language_model)
            for language, language_model in self.language_models.items()
        }
        if verbose:
            return distances
        return min(distances, key=distances.get)

In [None]:
#@title Test language detection model

ngram_size = 3  #@param {type:'integer'}
distance_alorithm = "euclidian" #@param ['cos', 'euclidian', 'jaccard', 'levenshtein']

# Build one n-gram profile per language from the training split, then score
# the held-out split; the resulting accuracy table is the cell's output.
ld = LanguageDetector(data=train, n=ngram_size)
ld.test(data=test, algorithm=distance_alorithm)

  0%|          | 0/22 [00:00<?, ?it/s]

Unnamed: 0,language,n,algorithm,accuracy,correct,total
0,Estonian,2,levenshtein,0.943333,283,300
1,Swedish,2,levenshtein,0.993333,298,300
2,Thai,2,levenshtein,0.98,294,300
3,Tamil,2,levenshtein,0.99,297,300
4,Dutch,2,levenshtein,0.97,291,300
5,Japanese,2,levenshtein,0.903333,271,300
6,Turkish,2,levenshtein,0.98,294,300
7,Latin,2,levenshtein,0.876667,263,300
8,Urdu,2,levenshtein,0.973333,292,300
9,Indonesian,2,levenshtein,0.946667,284,300
