# fastText Language Identification Model

In [1]:
import csv
import os
import string

import fasttext
import pandas as pd
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import train_test_split

## Download Tatoeba dataset

In [None]:
! wget http://downloads.tatoeba.org/exports/sentences.tar.bz2
! bunzip2 sentences.tar.bz2
! tar xvf sentences.tar
! mv sentences.csv sentences.tar data_raw

URL transformed to HTTPS due to an HSTS policy
--2021-08-11 17:17:19--  https://downloads.tatoeba.org/exports/sentences.tar.bz2
Resolving downloads.tatoeba.org (downloads.tatoeba.org)... 94.130.77.194
Connecting to downloads.tatoeba.org (downloads.tatoeba.org)|94.130.77.194|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 152301202 (145M) [application/octet-stream]
Saving to: ‘sentences.tar.bz2’

sentences.tar.bz2    15%[==>                 ]  22.52M   238KB/s    eta 4m 55s 

Create other required directories

In [None]:
# Output folders for the processed train/test files and the trained models;
# existing folders are left untouched.
for directory in ('data_processed', 'models'):
    os.makedirs(directory, exist_ok=True)

## Open dataset

There are 398 languages represented, some with very few examples.

In [None]:
# Load the tab-separated Tatoeba export (no header row: id, language, text).
# - keep_default_na=False: the ISO 639-3 code "nan" (Min Nan Chinese) and
#   sentences such as "NA" or "null" must stay strings instead of becoming
#   missing values.
# - quoting=csv.QUOTE_NONE: the export is not quoted, so a sentence starting
#   with a double quote must not be read as a quoted, possibly multi-line field.
# Counts may therefore differ slightly from runs that used pandas' defaults.
sents = pd.read_csv(
    'data_raw/sentences.csv',
    sep='\t',
    header=None,
    names=['index', 'lang', 'text'],
    keep_default_na=False,
    quoting=csv.QUOTE_NONE,
)
# Number of distinct language codes in the export.
sents['lang'].nunique()

## Get mapping of Tatoeba three-letter ISO 639-3 codes to two-letter 639-1 codes

The Tatoeba dataset has three-letter ISO 639-3 language codes. We would like to map them to two-letter ISO 639-1 codes where available to correspond with the fastText language codes. This will require some of the codes to be mapped to their macrolanguage codes (e.g. `cmn` for Mandarin Chinese and `yue` for Yue Chinese would be mapped to `zh` for Chinese). This will cause the distinction between certain languages to be lost.

### Open language to macrolanguage mapping

In [None]:
# Read the macrolanguage table as raw lines (line endings kept); the
# 'utf-8-sig' codec drops a leading byte-order mark if the file has one.
with open('data_raw/iso-639-3_Code_Tables_20210218/iso-639-3-macrolanguages.tab', 'r', encoding='utf-8-sig') as f:
    macro_mapping = list(f)

In [None]:
# Key each row's second tab-separated field to its first field, i.e.
# individual language code -> macrolanguage code (the file's header row, if
# any, is mapped like every other row).
macro_rows = (row.split('\t') for row in macro_mapping)
macro_mapping_dict = {fields[1]: fields[0] for fields in macro_rows}

### Open three-letter to two-letter mapping

In [None]:
# Read the main ISO 639-3 code table as raw lines (line endings kept); the
# 'utf-8-sig' codec drops a leading byte-order mark if the file has one.
with open('data_raw/iso-639-3_Code_Tables_20210218/iso-639-3.tab', 'r', encoding='utf-8-sig') as f:
    three_to_two_mapping = list(f)

In [None]:
# Key each row's first tab-separated field (three-letter code) to its fourth
# field (two-letter code; presumably an empty string when the language has
# none - verify against the table).
code_rows = (row.split('\t') for row in three_to_two_mapping)
three_to_two_mapping_dict = {fields[0]: fields[3] for fields in code_rows}

## Function to map language codes in Tatoeba dataset to two-letter codes

Map language code to a macro code if available. Then map this code or the original to a two-letter code, if available.

In [None]:
def map_code(lang):
    """Translate a Tatoeba language code into its two-letter code.

    The code is first swapped for its macrolanguage code when
    ``macro_mapping_dict`` has an entry for it; the resulting code is then
    looked up in ``three_to_two_mapping_dict``.

    Returns the value stored in ``three_to_two_mapping_dict`` for the
    (macro) code, or ``None`` when that table has no entry for it.
    """
    macro_code = macro_mapping_dict.get(lang, lang)
    return three_to_two_mapping_dict.get(macro_code)

In [None]:
# Add the mapped language code for every sentence.
sents['lang_code'] = sents['lang'].apply(map_code)

## Filter Tatoeba data to languages with at least 100 samples

We end up with 105 languages with at least 100 examples each.

In [None]:
# Minimum number of sentences a language code needs in order to be kept.
MIN_SAMPLES_PER_LANG = 100

sorted_value_counts = sents['lang_code'].value_counts().sort_values(ascending=False)
frequent_codes = sorted_value_counts[sorted_value_counts >= MIN_SAMPLES_PER_LANG].index.tolist()
# Drop the empty code instead of calling list.remove(''), which raises
# ValueError when '' is not among the frequent codes.
lang_list = [code for code in frequent_codes if code != '']
len(lang_list)

In [None]:
# Keep only the sentences whose mapped code is in the retained language list.
is_retained_lang = sents['lang_code'].isin(lang_list)
original_data = sents[is_retained_lang]

## Get romanised South Asian language data

In [None]:
# Empty frame that later cells fill with the romanised South Asian samples.
samples_ind = pd.DataFrame()

In [None]:
# Collect every line of every file whose name contains 'roman.' below the
# Dakshina folder. The label is the part of the file name before the first
# dot plus '-rom'.
langs_list = []
texts_list = []
for subdir, dirs, files in os.walk('data_raw/dakshina_dataset_v1.0_reduced'):
    for file in files:
        if 'roman.' not in file:
            continue
        # Explicit encoding so the result does not depend on the platform's
        # default codec (assumes the files are UTF-8 - TODO confirm).
        with open(os.path.join(subdir, file), 'r', encoding='utf-8') as f:
            texts = [line.strip() for line in f]
        texts_list.extend(texts)
        langs_list.extend([file.split('.')[0] + '-rom'] * len(texts))

In [None]:
# One row per collected line: label first, then the text.
samples_ind = pd.DataFrame({'lang_code': langs_list, 'text': texts_list})

In [None]:
# Preview the first ten romanised samples.
samples_ind.head(10)

In [None]:
# Number of samples per romanised language label.
samples_ind['lang_code'].value_counts()

## Get romanised Arabic language data

Combine the Egyptian Arabic, Lebanese Arabic and Tunisian Arabic datasets (using only the first 9,000 responses of the Tunisian data) to get roughly 10,000 responses, comparable to the amount available per language in the South Asian data.

In [None]:
# Load the three romanised Arabic sources with pandas' default CSV settings:
# Egyptian tweets, Lebanese tweets and the Tunisian training file.
egy = pd.read_csv('data_raw/Arabizi Identification/arabizi-twitter-egy.csv')
leb = pd.read_csv('data_raw/Arabizi Identification/arabizi-twitter-leb.csv')
tun = pd.read_csv('data_raw/tunizi_train')

In [None]:
# Empty frame that the next cell fills with the romanised Arabic samples.
samples_ar = pd.DataFrame()

In [None]:
# First 9000 Tunisian texts plus the Egyptian and Lebanese tweets flagged as Arabizi.
# NOTE(review): the flag is compared with the string '1'; if pandas parsed the
# 'arabizi' column as integers this selects no rows - confirm the column dtype.
tun_texts = tun['text'].iloc[:9000]
egy_texts = egy[egy['arabizi'] == '1']['tweet_filter']
leb_texts = leb[leb['arabizi'] == '1']['tweet_filter']

samples_ar['text'] = pd.concat([tun_texts, egy_texts, leb_texts])
# All three sources share a single romanised-Arabic label.
samples_ar['lang_code'] = 'ar-rom'

In [None]:
# Display the combined romanised Arabic samples.
samples_ar

## Combine all data to create augmented data

In [None]:
# Stack the romanised samples under the filtered Tatoeba sentences and give
# the result a fresh 0..n-1 row index.
samples_rom = pd.concat([samples_ind, samples_ar])
augmented_data = pd.concat(
    [original_data[['lang_code', 'text']], samples_rom],
    ignore_index=True,
)
# Expose the new row number as the leading 'index' column.
augmented_data.insert(0, 'index', augmented_data.index)
# Distinct labels present in the augmented data.
set(augmented_data['lang_code'])

## Define lookup table to strip out punctuation

In [None]:
# Translation table for str.translate that deletes every character in
# string.punctuation (ASCII punctuation only).
punct_table = str.maketrans('', '', string.punctuation)

## Format in fastText format and split original data into train and test and save

In [None]:
original_data = original_data[['index', 'lang_code', 'text']]
original_data_list = original_data.values.tolist()

# Build one "__label__<code> <text>\n" line per sample with ASCII punctuation
# removed. fastText expects exactly one sample per line, so:
# - rows whose text is not a string (e.g. a missing value) are skipped instead
#   of crashing on .translate,
# - line breaks inside a text are replaced by spaces,
# - texts that are empty after cleaning are skipped, since a label without
#   text carries no signal and breaks label/text splitting later on.
original_data_fasttext_format = []
for _, lang_code, text in original_data_list:
    if not isinstance(text, str):
        continue
    cleaned_text = ' '.join(text.translate(punct_table).splitlines())
    if not cleaned_text.strip():
        continue
    original_data_fasttext_format.append('__label__' + lang_code + ' ' + cleaned_text + '\n')

Only ASCII punctuation (the characters in `string.punctuation`) is stripped; language-specific punctuation, e.g. in Chinese, is kept.

In [None]:
# Show the first ten and the last ten formatted lines.
original_data_fasttext_format[:10] + original_data_fasttext_format[-10:]

In [None]:
# 80/20 train/test split with a fixed seed so the split is repeatable.
original_data_train, original_data_test = train_test_split(
    original_data_fasttext_format,
    test_size=0.2,
    random_state=42,
)
# Report the size of each part, one number per line.
print(len(original_data_train), len(original_data_test), sep='\n')

In [None]:
# Write the training lines as UTF-8 explicitly: the data is multilingual, so
# relying on the platform's default encoding can fail or produce a file
# fastText cannot read correctly.
with open('data_processed/train_original.txt', 'w', encoding='utf-8') as f:
    f.writelines(original_data_train)

In [None]:
# Write the test lines as UTF-8 explicitly: the data is multilingual, so
# relying on the platform's default encoding can fail or produce a file
# fastText cannot read correctly.
with open('data_processed/test_original.txt', 'w', encoding='utf-8') as f:
    f.writelines(original_data_test)

## Format in fastText format and split augmented data into train and test and save

In [None]:
augmented_data = augmented_data[['index', 'lang_code', 'text']]
augmented_data_list = augmented_data.values.tolist()

# Build one "__label__<code> <text>\n" line per sample with ASCII punctuation
# removed. fastText expects exactly one sample per line, so:
# - rows whose text is not a string (e.g. a missing value) are skipped instead
#   of crashing on .translate,
# - line breaks inside a text (possible in tweets) are replaced by spaces,
# - texts that are empty after cleaning are skipped, since a label without
#   text carries no signal and breaks label/text splitting later on.
augmented_data_fasttext_format = []
for _, lang_code, text in augmented_data_list:
    if not isinstance(text, str):
        continue
    cleaned_text = ' '.join(text.translate(punct_table).splitlines())
    if not cleaned_text.strip():
        continue
    augmented_data_fasttext_format.append('__label__' + lang_code + ' ' + cleaned_text + '\n')

In [None]:
# Show the first ten and the last ten formatted lines.
augmented_data_fasttext_format[:10] + augmented_data_fasttext_format[-10:]

In [None]:
# 80/20 train/test split with a fixed seed so the split is repeatable.
augmented_data_train, augmented_data_test = train_test_split(
    augmented_data_fasttext_format,
    test_size=0.2,
    random_state=42,
)
# Report the size of each part, one number per line.
print(len(augmented_data_train), len(augmented_data_test), sep='\n')

In [None]:
# Write the training lines as UTF-8 explicitly: the data is multilingual, so
# relying on the platform's default encoding can fail or produce a file
# fastText cannot read correctly.
with open('data_processed/train_augmented.txt', 'w', encoding='utf-8') as f:
    f.writelines(augmented_data_train)

In [None]:
# Write the test lines as UTF-8 explicitly: the data is multilingual, so
# relying on the platform's default encoding can fail or produce a file
# fastText cannot read correctly.
with open('data_processed/test_augmented.txt', 'w', encoding='utf-8') as f:
    f.writelines(augmented_data_test)

## Train models on filtered original Tatoeba data and augmented data

In [None]:
# Train on the original data, then quantize (with retraining) to shrink the model.
model_original = fasttext.train_supervised("data_processed/train_original.txt", dim=50, minn=2, maxn=4, epoch=25, loss='hs')
model_original.quantize(input='data_processed/train_original.txt', retrain=True)

# Each test line is "__label__<code> <text>\n". Split off only the label so the
# model is scored on the whole sentence (a plain split() followed by [1] would
# score just the first word and fail on an empty text). Line breaks are
# replaced by spaces because predict() handles one line at a time.
original_data_test_split = []
for sample in original_data_test:
    label, _, text = sample.partition(' ')
    original_data_test_split.append((label, ' '.join(text.splitlines())))

preds_original = [model_original.predict(text) for _, text in original_data_test_split]
# predict() returns (labels, probabilities); keep the top label, or '' if none came back.
predicted_labels_original = [labels[0] if len(labels) else '' for labels, _ in preds_original]

# Weighted-average scores; the result is (precision, recall, F1, support).
original_stats = precision_recall_fscore_support(
    [label for label, _ in original_data_test_split],
    predicted_labels_original,
    average='weighted',
)
print(f'Model trained on original data — Precision: {round(original_stats[0], 2)}, Recall: {round(original_stats[1], 2)}, F1 score: {round(original_stats[2], 2)}')

model_original.save_model("models/langdetect_original.ftz")

In [None]:
# Train on the augmented data, then quantize (with retraining) to shrink the model.
model_augmented = fasttext.train_supervised("data_processed/train_augmented.txt", dim=50, minn=2, maxn=4, epoch=25, loss='hs')
model_augmented.quantize(input='data_processed/train_augmented.txt', retrain=True)

# Each test line is "__label__<code> <text>\n". Split off only the label so the
# model is scored on the whole sentence (a plain split() followed by [1] would
# score just the first word and fail on an empty text). Line breaks are
# replaced by spaces because predict() handles one line at a time.
augmented_data_test_split = []
for sample in augmented_data_test:
    label, _, text = sample.partition(' ')
    augmented_data_test_split.append((label, ' '.join(text.splitlines())))

preds_augmented = [model_augmented.predict(text) for _, text in augmented_data_test_split]
# predict() returns (labels, probabilities); keep the top label, or '' if none came back.
predicted_labels_augmented = [labels[0] if len(labels) else '' for labels, _ in preds_augmented]

# Weighted-average scores; the result is (precision, recall, F1, support).
augmented_stats = precision_recall_fscore_support(
    [label for label, _ in augmented_data_test_split],
    predicted_labels_augmented,
    average='weighted',
)
print(f'Model trained on augmented data — Precision: {round(augmented_stats[0], 2)}, Recall: {round(augmented_stats[1], 2)}, F1 score: {round(augmented_stats[2], 2)}')

model_augmented.save_model("models/langdetect_augmented.ftz")

In [None]:
# Spot-check sentences for the model trained on the original data; each
# prediction is printed on its own line, in this order.
examples_original = [
    'naan irukken seriya kavalaippadaathe',
    'veettukku sendru varugiren',
    'das habe ich nicht gesehen',
    'ga3d tsawe al7een',
    'konta dayir amshi le al ma7al dak fog al nil',
    'main nahi jaa raha hoon',
    'njan parayunna polathanne cheytha mathi',
    'Yaarige kok, yaarige lak? Illide sambhavya sacivara patti',
    'Bhalo achi re. Bohukaal por. Tui kemon achish?',
    'main apni bhasha mein baat kar rahi hoon',
    'yahan mat aaya karo',
    'kaunsi bhaasha mein baat kar rahe ho',
    'je ne veux pas y aller',
]
for sentence in examples_original:
    print(model_original.predict(sentence))

In [None]:
# Spot-check sentences for the model trained on the augmented data; each
# prediction is printed on its own line, in this order.
examples_augmented = [
    'naan irukken kavalaippadaathe',
    'veettukku sendru varugiren',
    'das habe ich nicht gesehen',
    'ga3d tsawe al7een',
    'amshi le al ma7al dak fog al nil',
    'main nahi jaa raha hoon',
    'njan parayunna polathanne cheytha mathi',
    'Yaarige kok, yaarige lak? Illide sambhavya sacivara patti',
    'Bhalo achi re. Bohukaal por. Tui kemon achish?',
    'main apni bhasha mein baat kar rahi hoon',
    'yahan mat aaya karo',
    'kaunsi bhaasha mein baat kar rahe ho',
    'je ne veux pas y aller',
]
for sentence in examples_augmented:
    print(model_augmented.predict(sentence))

All the examples here are coded correctly.