# Data preprocessing

Parsing and preprocessing the raw corpora.

In [36]:
import html
import re
from estnltk import Text
from os import listdir
from os.path import isfile, join
from tqdm import tqdm

In [37]:
DATA_DIR = "./corpus"

In [38]:
def parse_corpus(filename):
    """
    Reads in one corpus file, dealing with input file irregularities.
    Returns words and lemmas of the input file as a 2D list of sentences of words. 
    """
    all_lemmas = []
    all_words = []

    with open(f'{DATA_DIR}/{filename}', encoding="UTF8") as f:
        for line in f:
            if line[:7] in ("ILU1900", "AJA1900", "ILU1910", "AJA1910"):
                line = line.split(" ", 1) #split only from the first space
            else:
                line = line.split("    ")
            #print(line)
            assert len(line) == 2
            sentence = html.unescape(line[1]) #&ouml -> ö
            
            # Writing style normalization
            sentence = sentence.replace("w", "v") #normalizing the old style of writing - w can be replaced with v
            sentence = sentence.replace("ß", "s") #normalizing the old style of writing - ß is a German letter
            sentence = sentence.replace("á", "a")
            sentence = sentence.replace("ú", "u")
            sentence = sentence.replace("ñ", "n")
            # á, ú, ñ
            #estnltk analysis - must not be preprocessed to work as good as possible
            text = Text(sentence)
            text.tag_layer(["words", "morph_analysis"])
            
            #finding tokens and lemmas and filtering out punctuation, numbers
            pattern = re.compile("[a-zöüõäA-Zöüõä]") #leave only tokens/lemmas that contain letters and only letters
            words = [word.text.lower() for word in text.words if pattern.match(word.text)]
            
            lemmas = [word[0].lower() for word in text.morph_analysis.lemma if pattern.match(word[0])] #if multiple lemmas available, take the first
            
            all_words.append(words)
            all_lemmas.append(lemmas)
            
    return all_words, all_lemmas
#parse_corpus("1900_ilu")

In [39]:
# Finding all filenames in the data folder
all_files = [f for f in listdir(DATA_DIR) if isfile(join(DATA_DIR, f))]
all_files = [filename for filename in all_files if filename[-4:] not in [".zip", "r.gz"]]

In [40]:
all_files

['1890_aja',
 '1890_ilu',
 '1900_aja',
 '1900_ilu',
 '1910_aja',
 '1910_ilu',
 '1930_aja',
 '1930_ilu',
 '1950_aja',
 '1950_ilu',
 '1960_aja',
 '1960_ilu',
 '1970_aja',
 '1970_ilu',
 '1980_aja',
 '1980_ilu',
 '1980_muu',
 '1980_tea',
 '1990e_aja',
 '1990_ilu']

In [41]:
include_from = 1890 #from which year to include the chosen files
files = [filename for filename in all_files if int(filename[:4]) >= include_from]
len(files)

20

In [42]:
#Saving the previously found words and lemmas
for filename in tqdm(files):
    words, lemmas = parse_corpus(filename)
    with open(f"{DATA_DIR}/words/{filename}_words.txt", mode="w+", encoding="utf8") as f:
        for sentence in words:
            f.write(" ".join(sentence))
            f.write("\n")
    with open(f"{DATA_DIR}/lemmas/{filename}_lemmas.txt", mode="w+", encoding="utf8") as f:
        for sentence in lemmas:
            f.write(" ".join(sentence))
            f.write("\n")

100%|█████████████████████████████████████████████████████████████████████████████████| 20/20 [54:46<00:00, 164.31s/it]


## Corpus analytics

Finding basic analytics about the corpora.

### Words

In [47]:
#Words statistical analysis
DATA_DIR = "./corpus/words" #select either words or lemmas
decades = [1890, 1900, 1910, 1930, 1950, 1960, 1970, 1980, 1990] #1920 is a different data domain, 1940 was not included
all_files = [f for f in listdir(DATA_DIR) if isfile(join(DATA_DIR, f))]
all_files = [filename for filename in all_files if filename[-4:] not in [".zip", "r.gz"]]

for decade in decades:
    print(decade, end="\t")

1890	1900	1910	1930	1950	1960	1970	1980	1990	

In [53]:
for decade in decades:
    chosen_files = [filename for filename in all_files if int(filename[:4]) == decade]
    #print(chosen_files)
    
    sentences_combined = []
    unique_tokens = set()
    for filename in chosen_files:
        #print(filename)
        with open(f"{DATA_DIR}/{filename}", mode="r", encoding="utf8") as f:
            sentences = []
            for row in f:
                row = row.strip().split(" ")
                #row = [word.replace("w", "v") for word in row] #normalizing the old style of writing
                #row = [word.replace("ß", "s") for word in row] #normalizing the old style of writing
                sentences.append(row)
                sentences_combined.append(row)
                for token in row:
                    unique_tokens.add(token)
            # sentences analysis
            #print("Count of sentences", len(sentences), end=" ")
            #print("Count of words", sum([len(sentence) for sentence in sentences]), end=" ")
            #print("Average sentence length", round(sum([len(sentence) for sentence in sentences]) / len(sentences)), end=" ")
    #sentences_combined analysis
    #print("Total count of sentences", len(sentences_combined), end=" ")
    #print("Total count of words", sum([len(sentence) for sentence in sentences_combined]), end=" ")
    print("Total count of unique words", len(unique_tokens), end=" ")
    #print("", round(sum([len(sentence) for sentence in sentences_combined]) / len(sentences_combined), 2), end=" ")

 48835  47743  54398  68032  56561  66196  80872  159813  153227 

### Lemmas

In [59]:
#Lemmas statistical analysis
DATA_DIR = "./corpus/lemmas" #select either words or lemmas
decades = [1890, 1900, 1910, 1930, 1950, 1960, 1970, 1980, 1990] #1920 is a different data domain, 1940 was not included
all_files = [f for f in listdir(DATA_DIR) if isfile(join(DATA_DIR, f))]
all_files = [filename for filename in all_files if filename[-4:] not in [".zip", "r.gz"]]

In [61]:
for decade in decades:
    chosen_files = [filename for filename in all_files if int(filename[:4]) == decade]
    #print(chosen_files)
    
    sentences_combined = []
    unique_tokens = set()
    for filename in chosen_files:
        #print(filename)
        with open(DATA_DIR + "/" + filename, mode="r", encoding="utf8") as f:
            sentences = []
            for row in f:
                row = row.strip().split(" ")
                row = [word.replace("w", "v") for word in row] #normalizing the old style of writing
                row = [word.replace("ß", "s") for word in row] #normalizing the old style of writing
                sentences.append(row)
                sentences_combined.append(row)
                for token in row:
                    unique_tokens.add(token)
            # sentences analysis
            #print("Count of sentences", len(sentences), end=" ")
            #print("Count of lemmas", sum([len(sentence) for sentence in sentences]), end=" ")
            #print("Average sentence length", round(sum([len(sentence) for sentence in sentences]) / len(sentences)), end=" ")
    #sentences_combined analysis
    #print("Total count of sentences", len(sentences_combined), end=" ")
    #print("", sum([len(sentence) for sentence in sentences_combined]), end=" ")
    print("", len(unique_tokens), end=" ")
    #print("Average sentence length", round(sum([len(sentence) for sentence in sentences_combined]) / len(sentences_combined)), end=" ")

 26113  24884  28214  33508  27357  33271  41547  76858  74132 