# fastText Language Identification Model

In [1]:
import fasttext
import pandas as pd
import os
import string
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support

## Download Tatoeba dataset

In [2]:
! wget http://downloads.tatoeba.org/exports/sentences.tar.bz2
! bunzip2 sentences.tar.bz2
! tar xvf sentences.tar
! mv sentences.csv sentences.tar data_raw

URL transformed to HTTPS due to an HSTS policy
--2021-08-11 19:59:01--  https://downloads.tatoeba.org/exports/sentences.tar.bz2
Resolving downloads.tatoeba.org (downloads.tatoeba.org)... 94.130.77.194
Connecting to downloads.tatoeba.org (downloads.tatoeba.org)|94.130.77.194|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 152301202 (145M) [application/octet-stream]
Saving to: ‘sentences.tar.bz2’


2021-08-11 20:09:26 (239 KB/s) - ‘sentences.tar.bz2’ saved [152301202/152301202]

x sentences.csv


Create other required directories

In [3]:
os.makedirs('data_processed', exist_ok=True)
os.makedirs('models', exist_ok=True)

## Open dataset

There are 398 languages represented, some with very few examples.

In [4]:
sents = pd.read_csv('data_raw/sentences.csv', sep='\t', header=None)
sents.columns = ['index', 'lang', 'text']
len(sents['lang'].value_counts())

398

## Get mapping of Tatoeba three-letter ISO 639-3 codes to two-letter 639-1 codes

The Tatoeba dataset has three-letter ISO 639-3 language codes. We would like to map them to two-letter ISO 639-1 codes where available to correspond with the fastText language codes. This will require some of the codes to be mapped to their macrolanguage codes (e.g. `cmn` for Mandarin Chinese and `yue` for Yue Chinese would be mapped to `zh` for Chinese). This will cause the distinction between certain languages to be lost.

### Open language to macrolanguage mapping

In [5]:
with open('data_raw/iso-639-3_Code_Tables_20210218/iso-639-3-macrolanguages.tab', 'r', encoding='utf-8-sig') as f:
    macro_mapping = f.readlines()

In [6]:
macro_mapping_dict = {}
for mapping in macro_mapping:
    mapping_split = mapping.split('\t')
    macro_mapping_dict[mapping_split[1]] = mapping_split[0]

### Open three-letter to two-letter mapping

In [7]:
with open('data_raw/iso-639-3_Code_Tables_20210218/iso-639-3.tab', 'r', encoding='utf-8-sig') as f:
    three_to_two_mapping = f.readlines()

In [8]:
three_to_two_mapping_dict = {}
for mapping in three_to_two_mapping:
    mapping_split = mapping.split('\t')
    three_to_two_mapping_dict[mapping_split[0]] = mapping_split[3]

## Function to map language codes in Tatoeba dataset to two-letter codes

Map language code to a macro code if available. Then map this code or the original to a two-letter code, if available.

In [9]:
def map_code(lang):
    if lang in macro_mapping_dict:
        macro_code = macro_mapping_dict[lang]
    else:
        macro_code = lang
    if macro_code in three_to_two_mapping_dict:
        return three_to_two_mapping_dict[macro_code]
    else:
        return None

In [10]:
sents['lang_code'] = sents['lang'].apply(lambda x: map_code(x))

## Filter Tatoeba data to languages with at least 100 samples

We end up with 105 languages with at least 100 examples each.

In [11]:
sorted_value_counts = sents['lang_code'].value_counts().sort_values(ascending=False)
lang_list = sorted_value_counts[sorted_value_counts >= 100].index.tolist()
lang_list.remove('')
len(lang_list)

105

In [12]:
original_data = sents[sents['lang_code'].isin(lang_list)]

## Get romanised South Asian language data

In [13]:
samples_ind = pd.DataFrame()

In [14]:
langs_list = []
texts_list = []
for subdir, dirs, files in os.walk('data_raw/dakshina_dataset_v1.0_reduced'):
    for file in files:
        if 'roman.' in file:
            with open(os.path.join(subdir, file), 'r') as f:
                texts = f.readlines()
            texts_list.extend([t.strip() for t in texts])
            langs_list.extend([file.split('.')[0] + '-rom'] * len(texts))

In [15]:
samples_ind['lang_code'] = langs_list
samples_ind['text'] = texts_list

In [16]:
samples_ind.iloc[:10]

Unnamed: 0,lang_code,text
0,mr-rom,"jase katyamadhye nukilapan, usamadhe godva, ne..."
1,mr-rom,Gavat kuthehe ughde gatarvevasta uplabdh nahit.
2,mr-rom,udyogache ghari devata
3,mr-rom,Agryahun Sutaka
4,mr-rom,Dalit Premkavita
5,mr-rom,tyanni vividh shasan padhatincha tailnik abhya...
6,mr-rom,mukhy tara 3.4 drushyapraticha piwala tara asu...
7,mr-rom,Sant Nagi hee Namdavanche putani hote
8,mr-rom,aavhiyon
9,mr-rom,1973 ya kalkhandat pantpradhan aslelya kitteek...


In [17]:
samples_ind['lang_code'].value_counts()

te-rom    10000
ml-rom    10000
bn-rom    10000
gu-rom    10000
ta-rom    10000
hi-rom    10000
mr-rom    10000
pa-rom    10000
kn-rom    10000
si-rom    10000
sd-rom     9999
ur-rom     9759
Name: lang_code, dtype: int64

## Get romanised Arabic language data

Combining Egyptian Arabic, Lebanese Arabic and Tunisian Arabic (a subset with 9000 responses) datasets to get roughly 10000 responses like the South Asian data per language.

In [18]:
egy = pd.read_csv('data_raw/Arabizi Identification/arabizi-twitter-egy.csv')
leb = pd.read_csv('data_raw/Arabizi Identification/arabizi-twitter-leb.csv')
tun = pd.read_csv('data_raw/tunizi_train')

In [19]:
samples_ar = pd.DataFrame()

In [20]:
samples_ar['text'] = pd.concat([tun['text'].iloc[:9000], egy[egy['arabizi'] == '1']['tweet_filter'], leb[leb['arabizi'] == '1']['tweet_filter']])
samples_ar['lang_code'] = 'ar-rom'

In [21]:
samples_ar

Unnamed: 0,text,lang_code
0,alah yara7me,ar-rom
1,brabi atini najah wahed amalta fi akaber korat...,ar-rom
2,bravo slouma walah rajel,ar-rom
3,elboutoula ma nefhem chay,ar-rom
4,ma7laa zinkk,ar-rom
...,...,...
4973,d el zabet,ar-rom
4982,tab law omt w gbthalk..,ar-rom
4983,kont badawar fe el laptop la2eet awel soura la...,ar-rom
4984,bgd ya enn fe nass amhathum msh mwguda m3ahum,ar-rom


## Combine all data to create augmented data

In [22]:
samples_rom = pd.concat([samples_ind, samples_ar])
augmented_data = pd.concat([original_data[['lang_code', 'text']], samples_rom])
augmented_data.reset_index(inplace=True)
augmented_data['index'] = augmented_data.index
set(augmented_data['lang_code'])

{'af',
 'am',
 'an',
 'ar',
 'ar-rom',
 'as',
 'az',
 'ba',
 'be',
 'bg',
 'bn',
 'bn-rom',
 'br',
 'ca',
 'ch',
 'cs',
 'cv',
 'cy',
 'da',
 'de',
 'el',
 'en',
 'eo',
 'es',
 'et',
 'eu',
 'fa',
 'fi',
 'fo',
 'fr',
 'fy',
 'ga',
 'gd',
 'gl',
 'gn',
 'gu',
 'gu-rom',
 'he',
 'hi',
 'hi-rom',
 'hu',
 'hy',
 'ia',
 'ie',
 'io',
 'is',
 'it',
 'ja',
 'jv',
 'ka',
 'kk',
 'km',
 'kn',
 'kn-rom',
 'ko',
 'ku',
 'kw',
 'ky',
 'la',
 'lb',
 'lo',
 'lt',
 'lv',
 'mi',
 'mk',
 'ml',
 'ml-rom',
 'mn',
 'mr',
 'mr-rom',
 'ms',
 'mt',
 'my',
 'ne',
 'nl',
 'no',
 'oc',
 'or',
 'os',
 'pa',
 'pa-rom',
 'pl',
 'pt',
 'qu',
 'rn',
 'ro',
 'ru',
 'sa',
 'sd-rom',
 'se',
 'sh',
 'si-rom',
 'sk',
 'sl',
 'sq',
 'sv',
 'sw',
 'ta',
 'ta-rom',
 'te',
 'te-rom',
 'th',
 'ti',
 'tk',
 'tl',
 'tr',
 'tt',
 'ug',
 'uk',
 'ur',
 'ur-rom',
 'uz',
 'vi',
 'vo',
 'wo',
 'xh',
 'yi',
 'zh'}

# Define lookup table to strip out punctuation

In [23]:
punct_table = str.maketrans(dict.fromkeys(string.punctuation))

## Format in fastText format and split original data into train and test and save

In [24]:
original_data = original_data[['index', 'lang_code', 'text']]
original_data_list = original_data.values.tolist()
original_data_fasttext_format = ['__label__' + data[1] + ' ' + data[2].translate(punct_table) + '\n' for data in original_data_list]

Language-specific punctuation is kept, e.g. in Chinese.

In [25]:
original_data_fasttext_format[0:10] + original_data_fasttext_format[-10:]

['__label__zh 我們試試看！\n',
 '__label__zh 我该去睡觉了。\n',
 '__label__zh 你在干什麼啊？\n',
 '__label__zh 這是什麼啊？\n',
 '__label__zh 今天是６月１８号，也是Muiriel的生日！\n',
 '__label__zh 生日快乐，Muiriel！\n',
 '__label__zh Muiriel现在20岁了。\n',
 '__label__zh 密码是Muiriel。\n',
 '__label__zh 我很快就會回來。\n',
 '__label__zh 我不知道。\n',
 '__label__fr Merci de le transmettre aux autres amis \n',
 '__label__ms Aku tidak muda seperti kamu\n',
 '__label__fr Jai pris une douche et je suis allé au lit\n',
 '__label__hu A fiunk nehezen illeszkedik be az új iskolába\n',
 '__label__fr Où voulezvous aller les enfants \n',
 '__label__de Wo wollt ihr hin Kinder\n',
 '__label__hu A lányunk antiszociális – így hamar beilleszkedett az új iskola antiszociális lányai közé\n',
 '__label__fr On est passé à côté de beaucoup de choses\n',
 '__label__fr Jai raté tellement de choses\n',
 '__label__fr Je suis passé à côté de tellement de choses\n']

In [26]:
original_data_train, original_data_test = train_test_split(original_data_fasttext_format, test_size=0.2, random_state=42)
print(len(original_data_train))
print(len(original_data_test))

6900418
1725105


In [27]:
with open('data_processed/train_original.txt', 'w') as f:    
    f.writelines(original_data_train)

In [28]:
with open('data_processed/test_original.txt', 'w') as f:    
    f.writelines(original_data_test)

## Format in fastText format and split augmented data into train and test and save

In [29]:
augmented_data = augmented_data[['index', 'lang_code', 'text']]
augmented_data_list = augmented_data.values.tolist()
augmented_data_fasttext_format = ['__label__' + data[1] + ' ' + data[2].translate(punct_table) + '\n' for data in augmented_data_list]

In [30]:
augmented_data_fasttext_format[0:10] + augmented_data_fasttext_format[-10:]

['__label__zh 我們試試看！\n',
 '__label__zh 我该去睡觉了。\n',
 '__label__zh 你在干什麼啊？\n',
 '__label__zh 這是什麼啊？\n',
 '__label__zh 今天是６月１８号，也是Muiriel的生日！\n',
 '__label__zh 生日快乐，Muiriel！\n',
 '__label__zh Muiriel现在20岁了。\n',
 '__label__zh 密码是Muiriel。\n',
 '__label__zh 我很快就會回來。\n',
 '__label__zh 我不知道。\n',
 '__label__ar-rom selena 3mla tatto bel3rby ya gama3a\n',
 '__label__ar-rom danty ray2a neek \n',
 '__label__ar-rom la2 he will mat2oleesh kda\n',
 '__label__ar-rom ybne l sa3a a5oya hyege w hyfsh5ne lw ml2hash\n',
 '__label__ar-rom kol sa3a fe toul el seneen elly fatet 7ezent awy eny 7esertek 7afdal andam 3aleeky toul 3omry toul 3omry\n',
 '__label__ar-rom d el zabet\n',
 '__label__ar-rom tab law omt w gbthalk\n',
 '__label__ar-rom kont badawar fe el laptop la2eet awel soura la2etha leeky kanet men 3and he fere7t far7et el donia el youm da konty wa7shany ad el\n',
 '__label__ar-rom bgd ya enn fe nass amhathum msh mwguda m3ahum\n',
 '__label__ar-rom mashy \n']

In [31]:
augmented_data_train, augmented_data_test = train_test_split(augmented_data_fasttext_format, test_size=0.2, random_state=42)
print(len(augmented_data_train))
print(len(augmented_data_test))

7004188
1751048


In [32]:
with open('data_processed/train_augmented.txt', 'w') as f:    
    f.writelines(augmented_data_train)

In [33]:
with open('data_processed/test_augmented.txt', 'w') as f:    
    f.writelines(augmented_data_test)

## Train models on filtered original Tatoeba data and augmented data

In [80]:
model_original = fasttext.train_supervised("data_processed/train_original.txt", dim=100, minn=2, maxn=6, epoch=50, loss='hs')
model_original.quantize(input='data_processed/train_original.txt', retrain=True)

original_data_test_split = [sample.split() for sample in original_data_test]
preds_original = [model_original.predict(s[1]) for s in original_data_test_split]
original_stats = precision_recall_fscore_support([s[0] for s in original_data_test_split], [p[0][0] for p in preds_original], average='weighted')
print(f'Model trained on original data — Precision: {round(original_stats[0], 2)}, Recall: {round(original_stats[1], 2)}, F1 score: {round(original_stats[1], 2)}')

model_original.save_model("models/langdetect_original.ftz")

Model trained on original data — Precision: 0.79, Recall: 0.65, F1 score: 0.65


In [84]:
model_augmented = fasttext.train_supervised("data_processed/train_augmented.txt", dim=100, minn=2, maxn=6, epoch=50, loss='hs')
model_augmented.quantize(input='data_processed/train_augmented.txt', retrain=True)

augmented_data_test_split = [sample.split() for sample in augmented_data_test]
preds_augmented = [model_augmented.predict(s[1]) for s in augmented_data_test_split]
augmented_stats = precision_recall_fscore_support([s[0] for s in augmented_data_test_split], [p[0][0] for p in preds_augmented], average='weighted')
print(f'Model trained on augmented data — Precision: {round(augmented_stats[0], 2)}, Recall: {round(augmented_stats[1], 2)}, F1 score: {round(augmented_stats[1], 2)}')

model_augmented.save_model("models/langdetect_augmented.ftz")

Model trained on augmented data — Precision: 0.75, Recall: 0.66, F1 score: 0.66


In [89]:
print(model_original.predict('naan irukken seriya kavalaippadaathe'))
print(model_original.predict('veettukku sendru varugiren'))
print(model_original.predict('das habe ich nicht gesehen'))
print(model_original.predict('ga3d tsawe al7een'))
print(model_original.predict('konta dayir amshi le al ma7al dak fog al nil'))
print(model_original.predict('main jaa rahi hoon'))
print(model_original.predict('njan parayunna polathanne cheytha mathi'))
print(model_original.predict('Yaarige kok, yaarige lak? Illide sambhavya sacivara patti'))
print(model_original.predict('Bhalo achi re. Bohukaal por. Tui kemon achish?'))
print(model_original.predict('main apni bhasha mein baat kar rahi hoon'))
print(model_original.predict('yahan mat aaya karo'))
print(model_original.predict('kaunsi bhaasha mein baat kar rahe ho'))
print(model_original.predict('je ne veux pas y aller'))

(('__label__wo',), array([0.366339]))
(('__label__fi',), array([0.81258559]))
(('__label__de',), array([1.00002229]))
(('__label__af',), array([0.53687853]))
(('__label__vo',), array([0.19855516]))
(('__label__mt',), array([0.38710099]))
(('__label__gd',), array([0.35546938]))
(('__label__br',), array([0.25188208]))
(('__label__gd',), array([0.20130548]))
(('__label__wo',), array([0.36079821]))
(('__label__tl',), array([0.71375734]))
(('__label__wo',), array([0.55288911]))
(('__label__fr',), array([1.00003934]))


In [90]:
print(model_augmented.predict('naan irukken kavalaippadaathe'))
print(model_augmented.predict('veettukku sendru varugiren'))
print(model_augmented.predict('das habe ich nicht gesehen'))
print(model_augmented.predict('ga3d tsawe al7een'))
print(model_augmented.predict('amshi le al ma7al dak fog al nil'))
print(model_augmented.predict('main jaa rahi hoon'))
print(model_augmented.predict('njan parayunna polathanne cheytha mathi'))
print(model_augmented.predict('Yaarige kok, yaarige lak? Illide sambhavya sacivara patti'))
print(model_augmented.predict('Bhalo achi re. Bohukaal por. Tui kemon achish?'))
print(model_augmented.predict('main apni bhasha mein baat kar rahi hoon'))
print(model_augmented.predict('yahan mat aaya karo'))
print(model_augmented.predict('kaunsi bhaasha mein baat kar rahe ho'))
print(model_augmented.predict('je ne veux pas y aller'))

(('__label__ta-rom',), array([0.97140265]))
(('__label__ta-rom',), array([0.93819612]))
(('__label__de',), array([0.99997598]))
(('__label__ar-rom',), array([0.99352843]))
(('__label__ar-rom',), array([0.94152361]))
(('__label__ur-rom',), array([0.7084133]))
(('__label__ml-rom',), array([0.98326945]))
(('__label__kn-rom',), array([0.96642965]))
(('__label__bn-rom',), array([0.68598193]))
(('__label__ur-rom',), array([0.95979065]))
(('__label__ur-rom',), array([0.94612777]))
(('__label__hi-rom',), array([0.70675683]))
(('__label__fr',), array([0.99993479]))


All the examples here are coded correctly.