In [29]:
#Packages
import pandas as pd #dataframes
import numpy as np #for arrays 

#NLP libraries
import nltk 
from nltk.corpus import udhr #corpora with texts 
import re #Regular expressions
import spacy 
import es_core_news_sm, ko_core_news_sm, fi_core_news_sm, zh_core_web_sm, ja_core_news_sm, pl_core_news_sm, de_core_news_sm #spacy models

#NLP objects for the languages that have a spaCy model package (imported above).
#Each one is loaded with spacy.load using the full package name, as shortcut names can't be used.
nlp_es= spacy.load("es_core_news_sm") #Spanish
nlp_ko= spacy.load("ko_core_news_sm") #Korean
nlp_fi= spacy.load("fi_core_news_sm") #Finnish
nlp_zh= spacy.load("zh_core_web_sm") #Chinese
nlp_ja= spacy.load("ja_core_news_sm") #Japanese
nlp_pl= spacy.load("pl_core_news_sm") #Polish
nlp_de= spacy.load("de_core_news_sm") #German

#NLP objects for less explored languages: instead of calling spacy.load, each language class
#is imported from spacy.lang and instantiated directly.
#Every object is named nlp_<code>, using the same language code as its spacy.lang.<code> module.
from spacy.lang.tr import Turkish
nlp_tr= Turkish()
from spacy.lang.id import Indonesian
nlp_id= Indonesian()
from spacy.lang.ar import Arabic
nlp_ar= Arabic()
from spacy.lang.tl import Tagalog
nlp_tl= Tagalog()
from spacy.lang.eu import Basque
nlp_eu= Basque()
from spacy.lang.et import Estonian
nlp_et= Estonian()
from spacy.lang.kn import Kannada
nlp_kn= Kannada()
from spacy.lang.yo import Yoruba 
nlp_yo= Yoruba()
#Vietnamese is currently disabled, so no nlp_vi object exists.
#NOTE(review): the reason is not recorded here - confirm before re-enabling.
#from spacy.lang.vi import Vietnamese
#nlp_vi= Vietnamese()
from spacy.lang.ms import Malay
nlp_ms= Malay()
from spacy.lang.ga import Irish
nlp_ga= Irish()
from spacy.lang.tn import Setswana
nlp_tn= Setswana()
from spacy.lang.bg import Bulgarian
nlp_bg= Bulgarian()



In [16]:
#Language metadata table, read from a semicolon-separated file.
#NOTE(review): the displayed table has an "Unnamed: 0" column, which suggests the file carries a
#saved index column - confirm whether it should be read with index_col=0.
#NOTE(review): the displayed table lists Arabic as Austronesian and Tagalog as Afro-Asiatic;
#the Family values of these two rows look swapped in language_data.csv - verify.
df= pd.read_csv("language_data.csv", sep=";")
#Array with the language names; it is passed to extract_raw_texts() and tokens() in later cells.
languages= df['Language'].values
#Display the table.
df


Unnamed: 0,Language,Family,Tokenizer,Code,Spacy object
0,Spanish,Indo-European,es_core_news_sm,es,1
1,Korean,Koreanic,ko_core_news_sm,ko,1
2,Finnish,Uralic,fi_core_news_sm,fi,0
3,Turkish,Turkic,Turkish,tr,0
4,Indonesian,Austronesian,Indonesian,id,0
5,Chinese,Sino-Tibetan,zh_core_web_sm,zh,1
6,Japanese,Japonic,ja_core_news_sm,ja,1
7,Arabic,Austronesian,Arabic,ar,0
8,Tagalog,Afro-Asiatic,Tagalog,tl,0
9,Basque,N/D,Basque,eu,0


In [46]:
def extract_raw_texts(list_of_languages):
    """Return the raw UDHR text of every requested language.

    For each name in ``list_of_languages``, the first file id of NLTK's
    ``udhr`` corpus that contains the name is selected and its raw text is read.

    Parameters
    ----------
    list_of_languages : iterable of str
        Language names as they appear inside the corpus file ids. The match
        is a literal, case-sensitive substring test (the name is treated as
        data, not as a regular expression).

    Returns
    -------
    dict
        KEY is the language name, VALUE is the raw text of the selected file.

    Raises
    ------
    IndexError
        If no file id of the corpus contains one of the requested names.
    """
    corpus = nltk.corpus.udhr
    all_files = corpus.fileids()  # loop-invariant: list the corpus only once
    raw_files_names = {}
    for language in list_of_languages:
        # When several file ids contain the name, the first one wins.
        file = next((f for f in all_files if language in f), None)
        if file is None:
            # Same exception type as an empty-list lookup, but with a message
            # that says which language could not be found.
            raise IndexError(
                "no UDHR file id contains the language name %r" % (language,)
            )
        raw_files_names[language] = corpus.raw(file)
    return raw_files_names

def real_tokenizer(text, model_lang):
    """Tokenize ``text`` with a spaCy object and return the token strings.

    text: the string to process.
    model_lang: the spaCy object to use; it is called on ``text`` and the
        result is iterated token by token.
    Returns a list with the ``.text`` of every token, in document order.
    """
    processed = model_lang(text) #process the text with spacy
    return [piece.text for piece in processed] #keep only the text of each token

def tokens(dict_raw_texts, languages):
    """Tokenize the raw text of every supported language.

    Parameters
    ----------
    dict_raw_texts : dict
        KEY is the language name, VALUE is the raw text to tokenize
        (the output of extract_raw_texts).
    languages : iterable of str
        Names of the languages to process.

    Returns
    -------
    dict
        KEY is the language name, VALUE is the list of token strings produced
        by real_tokenizer. Languages without an entry in the table below are
        skipped silently, so the result may be shorter than ``languages``.

    Raises
    ------
    KeyError
        If a supported language has no entry in ``dict_raw_texts``.
    """
    # Language name -> spacy object defined at the top of the notebook.
    # Vietnamese is absent on purpose: its spacy object is not created.
    models_by_language = {
        'Spanish': nlp_es,
        'Korean': nlp_ko,
        'Finnish': nlp_fi,
        'Chinese': nlp_zh,
        'Japanese': nlp_ja,
        'Polish': nlp_pl,
        'German': nlp_de,
        'Turkish': nlp_tr,
        'Indonesian': nlp_id,
        'Arabic': nlp_ar,
        'Tagalog': nlp_tl,
        'Basque': nlp_eu,
        'Estonian': nlp_et,
        'Kannada': nlp_kn,
        'Yoruba': nlp_yo,
        'Malay': nlp_ms,
        'Setswana': nlp_tn,
        'Bulgarian': nlp_bg,
        'Irish': nlp_ga,
    }

    tokens_langs = {} #dictionary to store output
    for lang in languages:
        if lang not in models_by_language:
            continue #no model for this language: skip it
        text = dict_raw_texts[lang] #raw text of this language
        model_lang = models_by_language[lang] #corresponding spacy object
        # The helper defined in this notebook is real_tokenizer; calling the
        # undefined name `tokenizer` fails with NameError on a fresh kernel.
        tokens_langs[lang] = real_tokenizer(text, model_lang)
    return tokens_langs
    


In [42]:
#Raw UDHR text per language: a dictionary where KEY is the language name and VALUE is a string with the raw text.
raw_texts= extract_raw_texts(languages)

In [47]:
#Tokenized text per language: a dictionary where KEY is the language name and VALUE is a list with its tokens.
all_languages_tokens= tokens(raw_texts, languages)
print(len(all_languages_tokens)) #sanity check: number of languages that were tokenized

19

In [51]:
#To inspect the tokens of a single language, index the dictionary with the language name.
#Note: this displays the complete token list; slice it (e.g. [:20]) for a shorter preview.
all_languages_tokens['Japanese']

['『',
 '世界',
 '人権',
 '宣言',
 '』',
 '\n \n\n',
 '（',
 '1948',
 '.',
 '12',
 '.',
 '10',
 '第',
 '３',
 '回',
 '国連',
 '総会',
 '採択',
 '）',
 '\n\n \n\n',
 '〈',
 '前文',
 '〉',
 '\n\u3000\n',
 '人類',
 '社会',
 'の',
 'すべて',
 'の',
 '構成',
 '員',
 'の',
 '固有',
 'の',
 '尊厳',
 'と',
 '平等',
 'で',
 '譲る',
 'こと',
 'の',
 'でき',
 'ない',
 '権利',
 'と',
 'を',
 '承認',
 'する',
 'こと',
 'は',
 '、',
 '世界',
 'に',
 'おけ',
 'る',
 '自由',
 '、',
 '正義',
 '及び',
 '平和',
 'の',
 '基礎',
 'で',
 'ある',
 'の',
 'で',
 '、',
 '\n\n',
 '人権',
 'の',
 '無視',
 '及び',
 '軽侮',
 'が',
 '、',
 '人類',
 'の',
 '良心',
 'を',
 '踏み',
 'にじっ',
 'た',
 '野蛮',
 '行為',
 'を',
 'もたらし',
 '、',
 '言論',
 '及び',
 '信仰',
 'の',
 '自由',
 'が',
 '受け',
 'られ',
 '、',
 '恐怖',
 '及び',
 '欠乏',
 'の',
 'ない',
 '世界',
 'の',
 '到来',
 'が',
 '、',
 '一般',
 'の',
 '人々',
 'の',
 '最高',
 'の',
 '願望',
 'と',
 'し',
 'て',
 '宣言',
 'さ',
 'れ',
 'た',
 'の',
 'で',
 '、',
 '\u3000\n\n',
 '人間',
 'が',
 '専制',
 'と',
 '圧迫',
 'と',
 'に',
 '対する',
 '最後',
 'の',
 '手段',
 'と',
 'し',
 'て',
 '反逆',
 'に',
 '訴える',
 'こと',
 'が',
 'ない',
 'よう',
 'に',
 'する',
 '