# fastText Language Identification Model

In [1]:
import fasttext
import pandas as pd
import os
from sklearn.model_selection import train_test_split

## Download Tatoeba dataset

In [2]:
! wget http://downloads.tatoeba.org/exports/sentences.tar.bz2
! bunzip2 sentences.tar.bz2
! tar xvf sentences.tar
! mv sentences.csv sentences.tar data_raw

URL transformed to HTTPS due to an HSTS policy
--2021-08-01 19:14:50--  https://downloads.tatoeba.org/exports/sentences.tar.bz2
Resolving downloads.tatoeba.org (downloads.tatoeba.org)... 94.130.77.194
Connecting to downloads.tatoeba.org (downloads.tatoeba.org)|94.130.77.194|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 152006974 (145M) [application/octet-stream]
Saving to: ‘sentences.tar.bz2’


2021-08-01 19:27:34 (195 KB/s) - ‘sentences.tar.bz2’ saved [152006974/152006974]

x sentences.csv


## Open dataset

There are 398 languages represented, some with very few examples.

In [3]:
sents = pd.read_csv('data_raw/sentences.csv', sep='\t', header=None)
sents.columns = ['index', 'lang', 'text']
len(sents['lang'].value_counts())

398

## Get mapping of Tatoeba three-letter ISO 639-3 codes to two-letter 639-1 codes

The Tatoeba dataset has three-letter ISO 639-3 language codes. We would like to map them to two-letter ISO 639-1 codes where available to correspond with the fastText language codes. This will require some of the codes to be mapped to their macrolanguage codes (e.g. `cmn` for Mandarin Chinese and `yue` for Yue Chinese would be mapped to `zh` for Chinese). This will cause the distinction between certain languages to be lost.

### Open language to macrolanguage mapping

In [4]:
with open('data_raw/iso-639-3_Code_Tables_20210218/iso-639-3-macrolanguages.tab', 'r', encoding='utf-8-sig') as f:
    macro_mapping = f.readlines()

In [5]:
macro_mapping_dict = {}
for mapping in macro_mapping:
    mapping_split = mapping.split('\t')
    macro_mapping_dict[mapping_split[1]] = mapping_split[0]

### Open three-letter to two-letter mapping

In [6]:
with open('data_raw/iso-639-3_Code_Tables_20210218/iso-639-3.tab', 'r', encoding='utf-8-sig') as f:
    three_to_two_mapping = f.readlines()

In [7]:
three_to_two_mapping_dict = {}
for mapping in three_to_two_mapping:
    mapping_split = mapping.split('\t')
    three_to_two_mapping_dict[mapping_split[0]] = mapping_split[3]

## Function to map language codes in Tatoeba dataset to two-letter codes

Map language code to a macro code if available. Then map this code or the original to a two-letter code, if available.

In [8]:
def map_code(lang):
    if lang in macro_mapping_dict:
        macro_code = macro_mapping_dict[lang]
    else:
        macro_code = lang
    if macro_code in three_to_two_mapping_dict:
        return three_to_two_mapping_dict[macro_code]
    else:
        return None

In [9]:
sents['lang_code'] = sents['lang'].apply(lambda x: map_code(x))

## Filter Tatoeba data to languages with at least 100 samples

We end up with 105 languages with at least 100 examples each.

In [10]:
sorted_value_counts = sents['lang_code'].value_counts().sort_values(ascending=False)
lang_list = sorted_value_counts[sorted_value_counts >= 100].index.tolist()
lang_list.remove('')
len(lang_list)

105

In [11]:
original_data = sents[sents['lang_code'].isin(lang_list)]

## Get romanised South Asian language data

In [12]:
samples_ind = pd.DataFrame()

In [13]:
langs_list = []
texts_list = []
for subdir, dirs, files in os.walk('data_raw/dakshina_dataset_v1.0_reduced'):
    for file in files:
        if 'roman.' in file:
            with open(os.path.join(subdir, file), 'r') as f:
                texts = f.readlines()
            texts_list.extend([t.strip() for t in texts])
            langs_list.extend([file.split('.')[0] + '-rom'] * len(texts))

In [14]:
samples_ind['lang_code'] = langs_list
samples_ind['text'] = texts_list

In [15]:
samples_ind.iloc[:10]

Unnamed: 0,lang_code,text
0,mr-rom,"jase katyamadhye nukilapan, usamadhe godva, ne..."
1,mr-rom,Gavat kuthehe ughde gatarvevasta uplabdh nahit.
2,mr-rom,udyogache ghari devata
3,mr-rom,Agryahun Sutaka
4,mr-rom,Dalit Premkavita
5,mr-rom,tyanni vividh shasan padhatincha tailnik abhya...
6,mr-rom,mukhy tara 3.4 drushyapraticha piwala tara asu...
7,mr-rom,Sant Nagi hee Namdavanche putani hote
8,mr-rom,aavhiyon
9,mr-rom,1973 ya kalkhandat pantpradhan aslelya kitteek...


In [16]:
samples_ind['lang_code'].value_counts()

ml-rom    10000
gu-rom    10000
te-rom    10000
mr-rom    10000
ta-rom    10000
hi-rom    10000
si-rom    10000
bn-rom    10000
kn-rom    10000
pa-rom    10000
sd-rom     9999
ur-rom     9759
Name: lang_code, dtype: int64

## Get romanised Arabic language data

Combining Egyptian Arabic, Lebanese Arabic and Tunisian Arabic (a subset with 9000 responses) datasets to get roughly 10000 responses like the South Asian data per language.

In [58]:
egy = pd.read_csv('data_raw/Arabizi Identification/arabizi-twitter-egy.csv')
leb = pd.read_csv('data_raw/Arabizi Identification/arabizi-twitter-leb.csv')
tun = pd.read_csv('data_raw/tunizi_train')

In [59]:
samples_ar = pd.DataFrame()

In [64]:
samples_ar['text'] = pd.concat([tun['text'].iloc[:9000], egy[egy['arabizi'] == '1']['tweet_filter'], leb[leb['arabizi'] == '1']['tweet_filter']])
samples_ar['lang_code'] = 'ar-rom'

In [65]:
samples_ar

Unnamed: 0,text,lang_code
0,alah yara7me,ar-rom
1,brabi atini najah wahed amalta fi akaber korat...,ar-rom
2,bravo slouma walah rajel,ar-rom
3,elboutoula ma nefhem chay,ar-rom
4,ma7laa zinkk,ar-rom
...,...,...
4973,d el zabet,ar-rom
4982,tab law omt w gbthalk..,ar-rom
4983,kont badawar fe el laptop la2eet awel soura la...,ar-rom
4984,bgd ya enn fe nass amhathum msh mwguda m3ahum,ar-rom


## Combine all data to create augmented data

In [66]:
samples_rom = pd.concat([samples_ind, samples_ar])
augmented_data = pd.concat([original_data[['lang_code', 'text']], samples_rom])
augmented_data.reset_index(inplace=True)
augmented_data['index'] = augmented_data.index
set(augmented_data['lang_code'])

{'af',
 'am',
 'an',
 'ar',
 'ar-rom',
 'as',
 'az',
 'ba',
 'be',
 'bg',
 'bn',
 'bn-rom',
 'br',
 'ca',
 'ch',
 'cs',
 'cv',
 'cy',
 'da',
 'de',
 'el',
 'en',
 'eo',
 'es',
 'et',
 'eu',
 'fa',
 'fi',
 'fo',
 'fr',
 'fy',
 'ga',
 'gd',
 'gl',
 'gn',
 'gu',
 'gu-rom',
 'he',
 'hi',
 'hi-rom',
 'hu',
 'hy',
 'ia',
 'ie',
 'io',
 'is',
 'it',
 'ja',
 'jv',
 'ka',
 'kk',
 'km',
 'kn',
 'kn-rom',
 'ko',
 'ku',
 'kw',
 'ky',
 'la',
 'lb',
 'lo',
 'lt',
 'lv',
 'mi',
 'mk',
 'ml',
 'ml-rom',
 'mn',
 'mr',
 'mr-rom',
 'ms',
 'mt',
 'my',
 'ne',
 'nl',
 'no',
 'oc',
 'or',
 'os',
 'pa',
 'pa-rom',
 'pl',
 'pt',
 'qu',
 'rn',
 'ro',
 'ru',
 'sa',
 'sd-rom',
 'se',
 'sh',
 'si-rom',
 'sk',
 'sl',
 'sq',
 'sv',
 'sw',
 'ta',
 'ta-rom',
 'te',
 'te-rom',
 'th',
 'ti',
 'tk',
 'tl',
 'tr',
 'tt',
 'ug',
 'uk',
 'ur',
 'ur-rom',
 'uz',
 'vi',
 'vo',
 'wo',
 'xh',
 'yi',
 'zh'}

## Format in fastText format and split original data into train and test and save

In [22]:
original_data = original_data[['index', 'lang_code', 'text']]
original_data_list = original_data.values.tolist()
original_data_fasttext_format = ['__label__' + data[1] + ' ' + data[2] + '\n' for data in original_data_list]

In [23]:
original_data_fasttext_format[0:10]

['__label__zh 我們試試看！\n',
 '__label__zh 我该去睡觉了。\n',
 '__label__zh 你在干什麼啊？\n',
 '__label__zh 這是什麼啊？\n',
 '__label__zh 今天是６月１８号，也是Muiriel的生日！\n',
 '__label__zh 生日快乐，Muiriel！\n',
 '__label__zh Muiriel现在20岁了。\n',
 '__label__zh 密码是"Muiriel"。\n',
 '__label__zh 我很快就會回來。\n',
 '__label__zh 我不知道。\n']

In [24]:
original_data_train, original_data_test = train_test_split(original_data_fasttext_format, test_size=0.2, random_state=42)
print(len(original_data_train))
print(len(original_data_test))

6888029
1722008


In [25]:
with open('data_processed/train_original.txt', 'w') as f:    
    f.writelines(original_data_train)

In [26]:
with open('data_processed/test_original.txt', 'w') as f:    
    f.writelines(original_data_test)

## Format in fastText format and split augmented data into train and test and save

In [67]:
augmented_data = augmented_data[['index', 'lang_code', 'text']]
augmented_data_list = augmented_data.values.tolist()
augmented_data_fasttext_format = ['__label__' + data[1] + ' ' + data[2] + '\n' for data in augmented_data_list]

In [68]:
augmented_data_fasttext_format[0:10]

['__label__zh 我們試試看！\n',
 '__label__zh 我该去睡觉了。\n',
 '__label__zh 你在干什麼啊？\n',
 '__label__zh 這是什麼啊？\n',
 '__label__zh 今天是６月１８号，也是Muiriel的生日！\n',
 '__label__zh 生日快乐，Muiriel！\n',
 '__label__zh Muiriel现在20岁了。\n',
 '__label__zh 密码是"Muiriel"。\n',
 '__label__zh 我很快就會回來。\n',
 '__label__zh 我不知道。\n']

In [69]:
augmented_data_train, augmented_data_test = train_test_split(augmented_data_fasttext_format, test_size=0.2, random_state=42)
print(len(augmented_data_train))
print(len(augmented_data_test))

6991800
1747950


In [70]:
with open('data_processed/train_augmented.txt', 'w') as f:    
    f.writelines(augmented_data_train)

In [71]:
with open('data_processed/test_augmented.txt', 'w') as f:    
    f.writelines(augmented_data_test)

## Train models on filtered original Tatoeba data and augmented data

In [32]:
def print_results(N, p, r):
    print("N\t" + str(N))
    print("P@{}\t{:.3f}".format(1, p))
    print("R@{}\t{:.3f}".format(1, r))

In [33]:
model_original = fasttext.train_supervised("data_processed/train_original.txt", dim=50, minn=2, maxn=4, epoch=25, loss='hs')
print_results(*model_original.test('data_processed/test_original.txt'))
model_original.quantize(input='data_processed/train_original.txt', retrain=True)
print_results(*model_original.test('data_processed/test_original.txt'))
model_original.save_model("models/langdetect_original.ftz")

N	1722008
P@1	0.993
R@1	0.993
N	1722008
P@1	0.988
R@1	0.988


In [72]:
model_augmented = fasttext.train_supervised("data_processed/train_augmented.txt", dim=100, minn=2, maxn=6, epoch=50, loss='hs')
print_results(*model_augmented.test('data_processed/test_augmented.txt'))
model_augmented.quantize(input='data_processed/train_augmented.txt', retrain=True)
print_results(*model_augmented.test('data_processed/test_augmented.txt'))
model_augmented.save_model("models/langdetect_augmented.ftz")

N	1747950
P@1	0.992
R@1	0.992
N	1747950
P@1	0.988
R@1	0.988


In [104]:
print(model_original.predict('appo naan sendru varugiren seriya kavalappadaathe'))
print(model_original.predict('kavalappadaathe ellam seriyaidum'))
print(model_original.predict('sendru varugiren veettukku'))
print(model_original.predict('das habe ich nicht gesehen'))
print(model_original.predict('Shnu ga3d tsawe al7een'))
print(model_original.predict('konta dayir amshi le al ma7al dak fog al nil'))
print(model_original.predict('main nahi jaa raha'))
print(model_original.predict('njan parayunna polathanne cheytha mathi'))
print(model_original.predict('Yaarige kok, yaarige lak? Illide sambhavya sacivara patti'))
print(model_original.predict('Bhalo achi re. Bohukaal por. Tui kemon achish?'))
print(model_original.predict('meri bhasha mein baat kar rahi hoon'))
print(model_original.predict('yahan mat aaya karo'))
print(model_original.predict('kaunsi zubaan hai yeh'))
print(model_original.predict('je ne veux pas y aller'))

(('__label__ms',), array([0.5621587]))
(('__label__ms',), array([0.85463762]))
(('__label__et',), array([0.28398207]))
(('__label__de',), array([1.00003994]))
(('__label__sv',), array([0.55302435]))
(('__label__ms',), array([0.38912854]))
(('__label__fi',), array([0.33342338]))
(('__label__az',), array([0.35316071]))
(('__label__et',), array([0.4496921]))
(('__label__la',), array([0.68537027]))
(('__label__gd',), array([0.28740054]))
(('__label__ms',), array([0.34166247]))
(('__label__tr',), array([0.47891927]))
(('__label__fr',), array([0.99980271]))


In [105]:
print(model_augmented.predict('appo naan sendru varugiren seriya kavalappadaathe'))
print(model_augmented.predict('kavalappadaathe ellam seriyaidum'))
print(model_augmented.predict('sendru varugiren veettukku'))
print(model_augmented.predict('das habe ich nicht gesehen'))
print(model_augmented.predict('Shnu ga3d tsawe al7een'))
print(model_augmented.predict('konta dayir amshi le al ma7al dak fog al nil'))
print(model_augmented.predict('main nahi jaa raha'))
print(model_augmented.predict('njan parayunna polathanne cheytha mathi'))
print(model_augmented.predict('Yaarige kok, yaarige lak? Illide sambhavya sacivara patti'))
print(model_augmented.predict('Bhalo achi re. Bohukaal por. Tui kemon achish?'))
print(model_augmented.predict('meri bhasha mein baat kar rahi hoon'))
print(model_augmented.predict('yahan mat aaya karo'))
print(model_augmented.predict('kaunsi zubaan hai yeh'))
print(model_augmented.predict('je ne veux pas y aller'))

(('__label__ta-rom',), array([0.89736366]))
(('__label__ta-rom',), array([0.81094557]))
(('__label__ta-rom',), array([0.84528291]))
(('__label__de',), array([1.00003994]))
(('__label__ar-rom',), array([0.91425329]))
(('__label__ar-rom',), array([0.9144628]))
(('__label__ur-rom',), array([0.78859597]))
(('__label__ml-rom',), array([0.965101]))
(('__label__kn-rom',), array([0.86080682]))
(('__label__bn-rom',), array([0.23797363]))
(('__label__hi-rom',), array([0.86997712]))
(('__label__ur-rom',), array([0.37752041]))
(('__label__ur-rom',), array([0.83652896]))
(('__label__fr',), array([0.868182]))
