# Data Processing & Preparation

## Language Identification dataset

In [7]:
import pandas as pd
from pyarrow.parquet import ParquetFile
import pyarrow as pa 

In [3]:
li_data = pd.read_csv("data/li_test.csv")
li_data

Unnamed: 0,labels,text
0,nl,Een man zingt en speelt gitaar.
1,nl,De technologisch geplaatste Nasdaq Composite I...
2,es,Es muy resistente la parte trasera rígida y lo...
3,it,"""In tanti modi diversi, l'abilità artistica de..."
4,ar,منحدر يواجه العديد من النقاشات المتجهه إزاء ال...
...,...,...
9995,zh,史料很充分，对岸的很多观点与大陆迥异啊。
9996,tr,"Örneğin, teşhis Yunanca bir kelimeden alındı (..."
9997,vi,Nếu lite/light chỉ đơn giản là mô tả một đặc t...
9998,bg,"Например, една щатска столица, която посетихме..."


#### *For our purpose the only thing we need to do is to convert the labels to their full forms*

In [4]:
# Converting the labels to their full forms
label_mapping = {
    'ar': 'Arabic',
    'bg': 'Bulgarian',
    'de': 'German',
    'el': 'Greek',
    'en': 'English',
    'es': 'Spanish',
    'fr': 'French',
    'hi': 'Hindi',
    'it': 'Italian',
    'ja': 'Japanese',
    'nl': 'Dutch',
    'pl': 'Polish',
    'pt': 'Portuguese',
    'ru': 'Russian',
    'sw': 'Swahili',
    'th': 'Thai',
    'tr': 'Turkish',
    'ur': 'Urdu',
    'vi': 'Vietnamese',
    'zh': 'Chinese'
}
li_data['labels'] = li_data['labels'].map(label_mapping)
li_data.to_csv("data/li_full_labels.csv", index=False)

## OpenLID dataset

In [35]:
# The parquet file is quite large: 358 MB
# Read a batch of 50000 rows
# Source: https://huggingface.co/datasets/laurievb/open-lid-dataset/blob/main/data/train-00000-of-00044-49ff88767b70bfb2.parquet
pf = ParquetFile('data/train-00000-of-00044-49ff88767b70bfb2.parquet')
open_lid_data = pa.Table.from_batches([next(pf.iter_batches(batch_size=50000))]).to_pandas()
open_lid_data

Unnamed: 0,text,language,dataset_source
0,Dia hoy ny mpanapaka tamin'i Baroka: Mandehana...,0,mt560
1,"Lamun urang soméah bari ngucapkeun salam, mané...",1,mt560
2,"Натомість ті райони, які мають найнижчі рівні ...",2,mt560
3,¿Serás exaltada hasta el cielo?,3,lti
4,O tom da pergunta do senhor deputado surpreend...,4,mt560
...,...,...,...
49995,Neuvosto voi sallia mainitun liiton jäsenvalti...,63,mt560
49996,Em i putim gut yau long tok bilong Baibel mipe...,37,mt560
49997,Panon kasin moriaen nen Abraham so totoo a man...,16,mt560
49998,ധാരണാപത്രത്തിന്റെ നടത്തിപ്പ് നിരീക്ഷിക്കുന്നതി...,11,mt560


#### *For our purpose (Querying/testing and using it for RAG) 10 samples per langauge will suffice*

In [36]:
# Group the dataframe by the specified column
grouped = open_lid_data.groupby("language")
# Sample from each group
samples = grouped.apply(lambda x: x.sample(n=min(10, len(x)), random_state=42))
open_lid_sampled_df = samples.reset_index(drop=True)

# Check length of text (number of words)
open_lid_sampled_df['length'] = open_lid_sampled_df['text'].apply(lambda x: len(x.split()))
# Remove rows with length < 10
open_lid_sampled_df.drop(open_lid_sampled_df[open_lid_sampled_df['length'] < 10].index, inplace=True)
# Drop unneeded columns
open_lid_sampled_df.drop(columns=["dataset_source", "length"], axis=0, inplace=True)
open_lid_sampled_df

  samples = grouped.apply(lambda x: x.sample(n=min(10, len(x)), random_state=42))


Unnamed: 0,text,language
1,ary soraty eo amin'ny tolam-baravaran'ny trano...,0
2,Rehefa namatotra azy tamin'ny kofehy hoditra i...,0
3,Io menaka manitra io dia afaka namidy mihoatry...,0
4,"Ireo anjely, ireo manam-pahefana, ary ny hery ...",0
5,Nefa Estera tsy mbola nilaza ny fireneny na ny...,0
...,...,...
2004,{{Infobox writer | name = فرانسوا ویون | embed...,200
2006,آدری مدوز (اینگیلیسجه: Audrey Meadows) آمریکال...,200
2007,قارغیدالی (بیتکی) (اینگیلیسجه: Maize ، (فارسجا...,200
2008,339 ایلینه قده¬ر بالچیک Balchic توْرپاقلارینا ...,200


In [37]:
# Checking number of unique languages
open_lid_sampled_df['language'].nunique()

196

Now, we need to convert the numeric language code to the actual language name.

This is available in a markdown file in the dataset github repo (https://github.com/laurieburchell/open-lid-dataset/blob/main/languages.md)

Downloading and parsing to create a mapping that we can use: **Numerical Language code -> Textual Language code -> Language full form text**


In [38]:
languages = pd.read_csv("data/languages.md", sep="|")
languages

Unnamed: 0.1,Unnamed: 0,Language code,Language,Lines of training data,F1 score,False positive %,Unnamed: 6
0,,------------------------,-------------------------,------------------------,--------------------------------,-----------------------------,
1,,ace_Arab,Acehnese,6191,0.9679,0.0079,
2,,ace_Latn,Acehnese,18032,0.9980,0.0005,
3,,acm_Arab,Mesopotamian Arabic,4862,0.0328,0.0040,
4,,acq_Arab,Ta'izzi-Adeni Arabic,1598,0.0020,0.0000,
...,...,...,...,...,...,...,...
197,,yue_Hant,Yue Chinese,63254,0.0059,0.0025,
198,,zho_Hans,Chinese (Simplified),1046823,0.9891,0.0054,
199,,zho_Hant,Chinese (Traditional),2018541,0.6605,0.5020,
200,,zsm_Latn,Standard Malay,404380,0.9495,0.0346,


In [39]:
# Create a mapping language code -> language full form
language_mapping = languages[[" Language code          ", " Language                "]][1:].map(lambda x: x.strip())
language_mapping.reset_index(drop=True, inplace=True)
language_mapping = dict(zip(language_mapping[' Language code          '], language_mapping[' Language                ']))
language_mapping

{'ace_Arab': 'Acehnese',
 'ace_Latn': 'Acehnese',
 'acm_Arab': 'Mesopotamian Arabic',
 'acq_Arab': "Ta'izzi-Adeni Arabic",
 'aeb_Arab': 'Tunisian Arabic',
 'afr_Latn': 'Afrikaans',
 'ajp_Arab': 'South Levantine Arabic',
 'als_Latn': 'Tosk Albanian',
 'amh_Ethi': 'Amharic',
 'apc_Arab': 'North Levantine Arabic',
 'arb_Arab': 'Modern Standard Arabic',
 'ars_Arab': 'Najdi Arabic',
 'ary_Arab': 'Moroccan Arabic',
 'arz_Arab': 'Egyptian Arabic',
 'asm_Beng': 'Assamese',
 'ast_Latn': 'Asturian',
 'awa_Deva': 'Awadhi',
 'ayr_Latn': 'Central Aymara',
 'azb_Arab': 'South Azerbaijani',
 'azj_Latn': 'North Azerbaijani',
 'bak_Cyrl': 'Bashkir',
 'bam_Latn': 'Bambara',
 'ban_Latn': 'Balinese',
 'bel_Cyrl': 'Belarusian',
 'bem_Latn': 'Bemba',
 'ben_Beng': 'Bengali',
 'bho_Deva': 'Bhojpuri',
 'bjn_Arab': 'Banjar',
 'bjn_Latn': 'Banjar',
 'bod_Tibt': 'Standard Tibetan',
 'bos_Latn': 'Bosnian',
 'bug_Latn': 'Buginese',
 'bul_Cyrl': 'Bulgarian',
 'cat_Latn': 'Catalan',
 'ceb_Latn': 'Cebuano',
 'ces_Latn

In [40]:
# Language codes in order (in accordance with their numerical code)
language_codes = ["plt_Latn","sun_Latn","ukr_Cyrl","spa_Latn","por_Latn","mya_Mymr","mkd_Cyrl","war_Latn","nso_Latn","wol_Latn","kam_Latn","mal_Mlym","gle_Latn","ayr_Latn","rus_Cyrl","pbt_Arab","pag_Latn","twi_Latn","als_Latn","lit_Latn","amh_Ethi","tur_Latn","tel_Telu","vec_Latn","zsm_Latn","ckb_Arab","tgk_Cyrl","tha_Thai","hye_Armn","deu_Latn","tat_Cyrl","swh_Latn","kac_Latn","tuk_Latn","lvs_Latn","tso_Latn","fao_Latn","tpi_Latn","umb_Latn","mlt_Latn","cym_Latn","ben_Beng","hat_Latn","ron_Latn","tir_Ethi","ewe_Latn","ind_Latn","snd_Arab","nld_Latn","urd_Arab","vie_Latn","mar_Deva","fra_Latn","lug_Latn","pol_Latn","ban_Latn","est_Latn","srp_Cyrl","kin_Latn","nno_Latn","fur_Latn","kmr_Latn","bho_Deva","fin_Latn","mri_Latn","ilo_Latn","fij_Latn","slk_Latn","knc_Arab","guj_Gujr","kor_Hang","tum_Latn","kab_Latn","afr_Latn","eng_Latn","acq_Arab","som_Latn","tgl_Latn","epo_Latn","bjn_Arab","mni_Beng","sot_Latn","nob_Latn","kat_Geor","ory_Orya","arb_Arab","heb_Hebr","ibo_Latn","asm_Beng","uzn_Latn","sna_Latn","mos_Latn","fuv_Latn","hne_Deva","apc_Arab","hun_Latn","ita_Latn","bem_Latn","slv_Latn","ssw_Latn","szl_Latn","nya_Latn","kir_Cyrl","hrv_Latn","pap_Latn","kik_Latn","knc_Latn","lmo_Latn","hau_Latn","eus_Latn","ltz_Latn","grn_Latn","lus_Latn","taq_Latn","scn_Latn","kmb_Latn","azj_Latn","isl_Latn","swe_Latn","uig_Arab","jpn_Jpan","sag_Latn","xho_Latn","ast_Latn","kan_Knda","sin_Sinh","acm_Arab","tzm_Tfng","dan_Latn","zho_Hant","zho_Hans","pes_Arab","fon_Latn","tam_Taml","yor_Latn","run_Latn","arz_Arab","awa_Deva","pan_Guru","gaz_Latn","lao_Laoo","bos_Latn","ces_Latn","bam_Latn","crh_Latn","ltg_Latn","bul_Cyrl","gla_Latn","ell_Grek","prs_Arab","smo_Latn","ajp_Arab","tsn_Latn","bak_Cyrl","srd_Latn","ace_Arab","kas_Arab","lua_Latn","taq_Tfng","jav_Latn","cat_Latn","kon_Latn","hin_Deva","lin_Latn","khk_Cyrl","cjk_Latn","mag_Deva","dik_Latn","bug_Latn","bjn_Latn","yue_Hant","zul_Latn","npi_Deva","kas_Deva","dzo_Tibt","ary_Arab","bel_Cyrl","kbp_Latn","khm_Khmr","ace_Latn","nus_Latn","ceb_Latn","mai_Deva","san_Deva","dyu_Latn","quy_Latn","lim_Latn","min_Latn","oci_Latn","kaz_Cyrl","luo_Latn","sat_Olck","ydd_Hebr","shn_Mymr","ars_Arab","lij_Latn","aeb_Arab","bod_Tibt","glg_Latn","kea_Latn","azb_Arab"]
# Numerical Language code -> Textual Language code -> Language full form text
language_code_mapping = dict(zip(range(len(language_codes)), [language_mapping[lang] for lang in language_codes]))
language_code_mapping

{0: 'Plateau Malgasy',
 1: 'Sundanese',
 2: 'Ukrainian',
 3: 'Spanish',
 4: 'Portuguese',
 5: 'Burmese',
 6: 'Macedonian',
 7: 'Waray',
 8: 'Northern Sotho',
 9: 'Wolof',
 10: 'Kamba',
 11: 'Malayalam',
 12: 'Irish',
 13: 'Central Aymara',
 14: 'Russian',
 15: 'Southern Pasto',
 16: 'Pangasinan',
 17: 'Twi',
 18: 'Tosk Albanian',
 19: 'Lithuanian',
 20: 'Amharic',
 21: 'Turkish',
 22: 'Telugu',
 23: 'Venetian',
 24: 'Standard Malay',
 25: 'Central Kurdish',
 26: 'Tajik',
 27: 'Thai',
 28: 'Armenian',
 29: 'German',
 30: 'Tatar',
 31: 'Swahili',
 32: 'Jingpho',
 33: 'Turkmen',
 34: 'Standard Latvian',
 35: 'Tsonga',
 36: 'Faroese',
 37: 'Tok Pisin',
 38: 'Umbundu',
 39: 'Maltese',
 40: 'Welsh',
 41: 'Bengali',
 42: 'Haitian Creole',
 43: 'Romanian',
 44: 'Tigrinya',
 45: 'Ewe',
 46: 'Indonesian',
 47: 'Sindhi',
 48: 'Dutch',
 49: 'Urdu',
 50: 'Vietnamese',
 51: 'Marathi',
 52: 'French',
 53: 'Ganda',
 54: 'Polish',
 55: 'Balinese',
 56: 'Estonian',
 57: 'Serbian',
 58: 'Kinyarwanda',
 5

In [41]:
# Apply the mapping
open_lid_sampled_df['language'] = open_lid_sampled_df['language'].apply(lambda x: language_code_mapping[x])
print(open_lid_sampled_df)
# Save
open_lid_sampled_df.to_csv("data/open_lid_10_samples.csv", index=False)

                                                   text           language
1     ary soraty eo amin'ny tolam-baravaran'ny trano...    Plateau Malgasy
2     Rehefa namatotra azy tamin'ny kofehy hoditra i...    Plateau Malgasy
3     Io menaka manitra io dia afaka namidy mihoatry...    Plateau Malgasy
4     Ireo anjely, ireo manam-pahefana, ary ny hery ...    Plateau Malgasy
5     Nefa Estera tsy mbola nilaza ny fireneny na ny...    Plateau Malgasy
...                                                 ...                ...
2004  {{Infobox writer | name = فرانسوا ویون | embed...  South Azerbaijani
2006  آدری مدوز (اینگیلیسجه: Audrey Meadows) آمریکال...  South Azerbaijani
2007  قارغیدالی (بیتکی) (اینگیلیسجه: Maize ، (فارسجا...  South Azerbaijani
2008  339 ایلینه قده¬ر بالچیک Balchic توْرپاقلارینا ...  South Azerbaijani
2009  استرابون و هرودوت ساکالار حاققیندا ایلک معلوما...  South Azerbaijani

[1356 rows x 2 columns]
