In [1]:
#Packages
import pandas as pd #dataframes
import numpy as np #for arrays 

#NLP libraries
import nltk 
from nltk.corpus import udhr #corpora with texts 
import re #Regular expressions
import spacy 
import es_core_news_sm, ko_core_news_sm, fi_core_news_sm, zh_core_web_sm, ja_core_news_sm, pl_core_news_sm, de_core_news_sm #spacy models

#Fetch the UDHR corpus that the text-extraction step reads through nltk.corpus.udhr
nltk.download('udhr')

#NLP objects for the languages with a pretrained spaCy package (loaded by full package name, as shortcut names can't be used)
nlp_es= spacy.load("es_core_news_sm") #Spanish
nlp_ko= spacy.load("ko_core_news_sm") #Korean
nlp_fi= spacy.load("fi_core_news_sm") #Finnish
nlp_zh= spacy.load("zh_core_web_sm") #Chinese
nlp_ja= spacy.load("ja_core_news_sm") #Japanese
nlp_pl= spacy.load("pl_core_news_sm") #Polish
nlp_de= spacy.load("de_core_news_sm") #German

#Less explored languages: these pipelines are instantiated directly from the
#spacy.lang language classes instead of being loaded with spacy.load
from spacy.lang.tr import Turkish
nlp_tr= Turkish()
from spacy.lang.id import Indonesian
nlp_id= Indonesian()
from spacy.lang.ar import Arabic
nlp_ar= Arabic()
from spacy.lang.tl import Tagalog
nlp_tl= Tagalog()
from spacy.lang.eu import Basque
nlp_eu= Basque()
from spacy.lang.et import Estonian
nlp_et= Estonian()
from spacy.lang.kn import Kannada
nlp_kn= Kannada()
from spacy.lang.yo import Yoruba 
nlp_yo= Yoruba()
from spacy.lang.sk import Slovak
nlp_sk= Slovak()
from spacy.lang.ms import Malay
nlp_ms= Malay()
from spacy.lang.ga import Irish
nlp_ga= Irish()
from spacy.lang.tn import Setswana
nlp_tn= Setswana()
from spacy.lang.bg import Bulgarian
nlp_bg= Bulgarian()



Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd #dataframes
[nltk_data] Downloading package udhr to
[nltk_data]     C:\Users\aleja\AppData\Roaming\nltk_data...
[nltk_data]   Package udhr is already up-to-date!


In [16]:
# Project folder; kept as a plain string because later cells build paths by concatenation
root = "D:/CCiL/Quantitative Linguistics/lab_22"

# Language table: one row per language with its family and ISO code
df = pd.read_csv(f"{root}/language_data_2.csv")
languages = df['Language'].values
codes = df['ISO code'].values
df


Unnamed: 0.1,Unnamed: 0,Language,Family,ISO code,tokens,types
0,7,Arabic,Austronesian,ar,1318,725
1,9,Basque,N/D,eu,1236,652
2,15,Bulgarian,Indo-European,bg,2273,653
3,5,Chinese,Sino-Tibetan,zh,2693,532
4,10,Estonian,Uralic,et,1250,654
5,2,Finnish,Uralic,fi,1113,672
6,14,German,Indo-European,de,1330,545
7,4,Indonesian,Austronesian,id,1302,488
8,19,Irish,Indo-European,ga,1640,598
9,6,Japanese,Japonic,ja,2325,517


In [26]:
def extract_raw_texts(list_of_languages, list_of_codes):
    """Collect the raw UDHR text for each requested language.

    Args:
        list_of_languages: iterable of language names. Each name is used as a
            regular-expression pattern and searched for anywhere in the UDHR
            file ids, so one name may select several files.
        list_of_codes: not used by the lookup; accepted only so that existing
            callers that pass the ISO codes keep working.

    Returns:
        dict mapping each language name to whatever
        ``nltk.corpus.udhr.raw`` returns for the file ids matching that name.
    """
    # The file-id listing is the same for every language: fetch it once.
    all_files = nltk.corpus.udhr.fileids()

    raw_files_names = {}
    for language in list_of_languages:
        pattern = re.compile(language)
        matching_files = [f for f in all_files if pattern.search(f)]
        # One read per language (the ISO codes play no part in the lookup,
        # so there is nothing to repeat per code).
        raw_files_names[language] = nltk.corpus.udhr.raw(matching_files)
    return raw_files_names

def tokenizer(text, model_lang):
    """Tokenize ``text`` with the spaCy pipeline ``model_lang``.

    Returns the ``.text`` of every token, in document order, except tokens
    flagged as whitespace, punctuation or digits.
    """
    kept = []
    for tok in model_lang(text):  # calling the pipeline yields the processed doc
        if tok.is_space or tok.is_punct or tok.is_digit:
            continue  # not a word-like token
        kept.append(tok.text)
    return kept

def tokens(dict_raw_texts, languages):
    """Tokenize the raw text of every supported language.

    Args:
        dict_raw_texts: dict mapping language name -> raw text.
        languages: iterable of language names to process.

    Returns:
        dict mapping language name -> list of tokens produced by
        ``tokenizer`` with that language's spaCy pipeline. Names that have no
        entry in the model table below are skipped silently.

    Raises:
        KeyError: if a supported language has no entry in ``dict_raw_texts``.
    """
    # Language name -> spaCy pipeline created in the setup cell. The table is
    # built at call time, so all of these pipelines must already be loaded.
    models_by_language = {
        'Spanish': nlp_es,
        'Korean': nlp_ko,
        'Finnish': nlp_fi,
        'Chinese': nlp_zh,
        'Japanese': nlp_ja,
        'Polish': nlp_pl,
        'German': nlp_de,
        'Turkish': nlp_tr,
        'Indonesian': nlp_id,
        'Arabic': nlp_ar,
        'Tagalog': nlp_tl,
        'Basque': nlp_eu,
        'Estonian': nlp_et,
        'Kannada': nlp_kn,
        'Yoruba': nlp_yo,
        'Malay': nlp_ms,
        'Slovak': nlp_sk,
        'Setswana': nlp_tn,
        'Bulgarian': nlp_bg,
        'Irish': nlp_ga,
    }

    tokens_langs = {}  # output: language name -> token list
    for lang in languages:
        if lang not in models_by_language:
            continue  # no pipeline for this language: leave it out of the result
        tokens_langs[lang] = tokenizer(dict_raw_texts[lang], models_by_language[lang])
    return tokens_langs
    


In [27]:
raw_texts= extract_raw_texts(languages, codes) #dictionary: KEY is the language name, VALUE is a string with its raw text

In [35]:
# Save each language's raw text as <root>/data/<ISO code>.txt
for language, code in zip(languages, codes):
    out_path = f"{root}/data/{code}.txt"
    with open(out_path, "w", encoding="utf-8") as handle:
        handle.write(str(raw_texts[language]))  # text stored under the language name
    print(f'{code} Done!')

ar Done!
eu Done!
bg Done!
zh Done!
et Done!
fi Done!
de Done!
id Done!
ga Done!
ja Done!
kn Done!
ko Done!
ms Done!
pl Done!
tn Done!
sk Done!
es Done!
tl Done!
tr Done!
yo Done!


In [9]:
all_languages_tokens= tokens(raw_texts, languages) #dictionary: KEY is the language name, VALUE is the list of its tokens
print(len(all_languages_tokens)) #sanity check: number of languages that were tokenized

20


In [34]:
# Sort the table by language name; the two lists below follow that same row order
df = df.sort_values('Language')
languages = df['Language'].values

# Tokens = all occurrences, types = distinct tokens, per language
tokens_languages = [len(all_languages_tokens[name]) for name in languages]
types_languages = [len(set(all_languages_tokens[name])) for name in languages]

In [38]:
# Attach the per-language counts (the lists were built in the row order of df)
df['tokens']=tokens_languages
df['types']=types_languages

# index=False keeps the row index out of the file. When the index is written,
# pd.read_csv brings it back as an extra "Unnamed: 0" column, and every
# save/load round trip adds one more.
# NOTE(review): this writes relative to the working directory, while the table
# is read from `root` -- confirm both refer to the same file.
df.to_csv('language_data_2.csv', index=False)

In [6]:
from collections import Counter
import csv

def process(tokens):
    """Build a frequency table for a list of tokens.

    Args:
        tokens: iterable of hashable tokens (strings in this notebook).

    Returns:
        list of ``[token, len(token), frequency]`` rows, one per distinct
        token, sorted by frequency in descending order. Tokens with the same
        frequency keep the order in which they first appear in ``tokens``, so
        the result is identical from run to run.
    """
    # most_common() sorts by count (descending) and, for equal counts, keeps
    # first-seen order. Iterating a set of strings instead would make the
    # order of ties depend on string hashing, which varies between sessions.
    return [[token, len(token), freq] for token, freq in Counter(tokens).most_common()]

# Frequency table (rows of token, length, frequency) for every tokenized language
tokens_langs = {
    lang: process(lang_tokens)
    for lang, lang_tokens in all_languages_tokens.items()
}

def matrix_to_csv(matrix, filename):
    """Write ``matrix`` to ``filename`` as UTF-8 CSV.

    The file starts with the header row ``Token,Length,Frequency``; each row
    of ``matrix`` follows as one CSV record. An existing file is overwritten.
    """
    header = ['Token', 'Length', 'Frequency']
    with open(filename, 'w', newline='', encoding="utf-8") as out:
        rows = csv.writer(out)
        rows.writerow(header)
        rows.writerows(matrix)  # one record per matrix row

# One CSV per language, named after the lower-cased language name
for language, matrix in tokens_langs.items():
    matrix_to_csv(matrix, f"data/output_{language.lower()}.csv")