# Preprocessing

## Loading necessary libraries

In [30]:
import os
import re
from tqdm import tqdm
from turkishnlp import detector
import pickle

## Create a Dataset Folder

In [31]:
def create_dataset_folder():
    folder_name = 'dataset'
    # Check if the folder already exists
    if not os.path.exists(folder_name):
        os.makedirs('dataset')  # Create the folder

        os.makedirs('./dataset/syllable')
        os.makedirs('./dataset/character')

        os.makedirs('./dataset/character/train')
        os.makedirs('./dataset/character/test')

        os.makedirs('./dataset/syllable/train')
        os.makedirs('./dataset/syllable/test')


        print(f'Created folder: {folder_name}')
    else:
        print(f'Recreated folder: {folder_name}')
        os.system('rm -rf ./dataset')
        os.makedirs(folder_name)

        os.makedirs('./dataset/syllable')
        os.makedirs('./dataset/character')

        os.makedirs('./dataset/character/train')
        os.makedirs('./dataset/character/test')

        os.makedirs('./dataset/syllable/train')
        os.makedirs('./dataset/syllable/test')

create_dataset_folder()

Recreated folder: dataset


In [32]:
input_file = './wikipedia_data.txt'

In [33]:
punctuation_to_token = {
    ' ': '<space>',
    '.': '<period>', 
    '\n': '<new_line>', 
    '\t': '<space>',
    '!': '<exclamation_mark>',
    '-': '<dash>',
    '?': '<question_mark>',
    '/': '<forward_slash>',
    '\\': '<back_slash>',
    '/': '<forward_slash>',
    '"': '<double_quotes>',
    "'": '<single_quotes>',
    ";": '<semicolon>',
    ":": '<column>',
    "(": '<open_paranthesis>',
    ")": '<close_paranthesis>',
    "[": '<open_square_bracket>',
    "]": '<close_square_bracket>',
    "{": '<open_curly_bracket>',
    "}": '<close_curly_bracket>',
    "%": '<percentage_symbol>'
}



sentence_end_tokens = [
    '<period>',
    '<exclamation_mark>',
    '<question_mark>'
]

## process_line function process given line and returns a list of tokens

In [34]:
def process_line(line):
    # Define the regex patterns for words, digits, and punctuation
    turkish_word_pattern = r'[a-zA-ZşŞıİğĞçÇöÖüÜ]+'
    digit_pattern = r'0|[1-9][0-9]*'
    
    # Initialize the output list
    token_list = []
    
    # Replace punctuation with tokens
    line_with_tokens = line
    for punc, token in punctuation_to_token.items():
        line_with_tokens = line_with_tokens.replace(punc, f' {token} ')
    
    # Use regex to find all words, digits, and tokens
    matches = re.findall(r'(' + turkish_word_pattern + r'|' + digit_pattern + r'|<[^>]+>)', line_with_tokens)


    # each line starts with <start> symbol end ends with a <end> symbol
    for match in matches:

        if re.match(turkish_word_pattern, match):
            token_list.append(match)
        elif re.match(digit_pattern, match):
            token_list.extend(match)  # Extend with individual digits
        elif match in list(punctuation_to_token.values()):
            token_list.append(match)


    if '<new_line>' in token_list:
        token_list.remove('<new_line>') # get rid of new line

    output = []
    output.append("<start>")
    output.extend(token_list)
    output.append("<end>")
    
    return output



## Creating Dataset For Syllable-Based Model

In [35]:
def initialize_turkish_nlp(data_folder=None):
    """
    Loads Turkish NLP data from a local directory and initializes the TurkishNLP detector.
    
    Args:
        data_folder (str): Path to the folder containing the TurkishNLP data files. 
                           If None, it defaults to '~/TRnlpdata/'.

    Returns:
        obj (TurkishNLP): Initialized TurkishNLP object with data loaded from local files.
    """
    if data_folder is None:
        data_folder = os.path.expanduser('~/TRnlpdata/')
    
    words_alt_path = os.path.join(data_folder, 'words_alt.pkl')
    words_counted_path = os.path.join(data_folder, 'words_counted.pkl')
    words_path = os.path.join(data_folder, 'words.pkl')

    # Load the data from pickle files
    def load_data(file_path):
        with open(file_path, 'rb') as f:
            return pickle.load(f)
    
    try:
        words_alt = load_data(words_alt_path)
        words_counted = load_data(words_counted_path)
        words = load_data(words_path)
    except FileNotFoundError as e:
        print(f"Error: {e}. Please check if the file exists in the specified directory.")
        return None

    # Initialize TurkishNLP detector and set the loaded data
    obj = detector.TurkishNLP()
    obj.words_alt = words_alt
    obj.words_counted = words_counted
    obj.words = words

    return obj

obj = initialize_turkish_nlp('./TRnlpdata')

In [36]:
def syllabify_words(input_list):
    output = []
    
    for item in input_list:
        # Check if the item is a digit or a punctuation token
        if item in punctuation_to_token.values() or item.isdigit():
            output.append(item)  # Append the item as is
        elif item in ['<start>', '<end>']:
            output.append(item)  # Append the item as is
        else:
            # Syllabify the word
            syllables = obj.syllabicate_sentence(item)
            # Flatten the list of syllables and add to output
            output.extend([syllable for sublist in syllables for syllable in sublist])
    
    return output



In [37]:
def process_file_for_syllable_data(input_file_path, output_file_path):
    print(f'Processing the file `{input_file_path}` and writing output data to file `{output_file_path}`')
    with open(input_file_path, 'r', encoding='utf-8') as infile, open(output_file_path, 'w', encoding='utf-8') as outfile:
        for line in tqdm(infile):

            line = line.lower()

            if len(line.split()) == 0:
                continue

            # Check if the line consists of HTML tags
            if re.match(r'^\s*<.*?>\s*$', line):
                continue  # Skip this line
            
            # Process the line to get words, punctuation, and digits
            processed_line = process_line(line)  # Using your process_line function
            # Syllabify the processed line
            syllabified_line = syllabify_words(processed_line)  # Using your syllabify_words function

            # Write the result to the output file, joined by spaces, followed by a newline
            outfile.write(' '.join(syllabified_line) + '\n')  # Adding a newline at the end of each line
    print('The preprocessing has been completed!')

In [38]:
output_file = os.path.join('./dataset', 'syllable_data.txt')
process_file_for_syllable_data(input_file, output_file)

Processing the file `./wikipedia_data.txt` and writing output data to file `./dataset/syllable_data.txt`


4547965it [03:23, 22335.87it/s]

The preprocessing has been completed!





## Creating Dataset for Character-Based Model

In [39]:
def process_characters(processed_line):
    output_list = []

    for token in processed_line:
        if token in list(punctuation_to_token.values()):
            output_list.append(token)
        elif token in ['<start>', '<end>']:
            output_list.append(token)
        elif token.isdigit():
            output_list.extend(token)  # Add each digit as a separate character
        else:
            output_list.extend(list(token))  # Add each character of the word

    return output_list

In [40]:
def process_file_for_character_data(input_file_path, output_file_path):
    print(f'Processing the file `{input_file_path}` for character data and writing output data to file `{output_file_path}`')
    
    with open(input_file_path, 'r', encoding='utf-8') as infile, open(output_file_path, 'w', encoding='utf-8') as outfile:
        for line in tqdm(infile):

            line = line.lower()

            if len(line.split()) == 0:
                continue

            # Check if the line consists of HTML tags
            if re.match(r'^\s*<.*?>\s*$', line):
                continue  # Skip this line
            
            # Process the line to get words, punctuation, and digits
            processed_line = process_line(line)  # Using your process_line function
            # Convert processed line to characters
            character_data = process_characters(processed_line)
            

            # Write the result to the output file, joined by spaces, followed by a newline
            outfile.write(' '.join(character_data) + '\n')  # Adding a newline at the end of each line
    
    print('The character data processing has been completed!')


In [41]:
output_file = os.path.join('./dataset', 'character_data.txt')
process_file_for_character_data(input_file, output_file)

Processing the file `./wikipedia_data.txt` for character data and writing output data to file `./dataset/character_data.txt`


4547965it [02:18, 32928.61it/s]

The character data processing has been completed!





## From Dataset Files Create Unigram, Bigram, and Trigram Dataset Files

### Unigram Dataset File

In [42]:
def unigram_generator(infile: str, outfile: str):
    print(f'Creating the unigram from file `{infile}` and saving it in file `{outfile}`')
    with open(infile, 'r') as infile, open(outfile, 'w') as outfile:
        for line in tqdm(infile):
            line_splitted = line.split()
            for token in line_splitted:
                s = token + '\n'
                outfile.write(s)

    print('Creating the unigram has been completed!')

In [43]:
unigram_generator('./dataset/character_data.txt', './dataset/character_unigram.txt')

Creating the unigram from file `./dataset/character_data.txt` and saving it in file `./dataset/character_unigram.txt`


3922529it [00:31, 124038.31it/s]

Creating the unigram has been completed!





In [44]:
unigram_generator('./dataset/syllable_data.txt', './dataset/syllable_unigram.txt')

Creating the unigram from file `./dataset/syllable_data.txt` and saving it in file `./dataset/syllable_unigram.txt`


1736199it [00:22, 75514.17it/s]

Creating the unigram has been completed!





In [45]:
def bigram_generator(infile: str, outfile: str):
    print(f'Creating the bigram from file `{infile}` and saving it in file `{outfile}`')
    with open(infile, 'r') as infile, open(outfile, 'w') as outfile:
        for line in tqdm(infile):
            line_splitted = line.split()
            for i in range(len(line_splitted) - 1):
                w0 = line_splitted[i]
                w1 = line_splitted[i+1]
                s = w0 + ' ' + w1 + '\n'
                outfile.write(s)

    print('Creating the bigram has been completed!')

In [46]:
bigram_generator('./dataset/character_data.txt', './dataset/character_bigram.txt')

Creating the bigram from file `./dataset/character_data.txt` and saving it in file `./dataset/character_bigram.txt`


3922529it [00:53, 73197.25it/s] 

Creating the bigram has been completed!





In [47]:
bigram_generator('./dataset/syllable_data.txt', './dataset/syllable_bigram.txt')

Creating the bigram from file `./dataset/syllable_data.txt` and saving it in file `./dataset/syllable_bigram.txt`


1736199it [00:36, 48222.82it/s]

Creating the bigram has been completed!





In [48]:
def trigram_generator(infile: str, outfile: str):
    print(f'Creating the trigram from file `{infile}` and saving it in file `{outfile}`')
    with open(infile, 'r') as infile, open(outfile, 'w') as outfile:
        for line in tqdm(infile):
            line_splitted = line.split()
            for i in range(len(line_splitted) - 2):
                w0 = line_splitted[i]
                w1 = line_splitted[i+1]
                w2 = line_splitted[i+2]
                s = w0 + ' ' + w1 + ' ' + w2 + '\n'
                outfile.write(s)

    print('Creating the trigram has been completed!')

In [49]:
trigram_generator('./dataset/character_data.txt', './dataset/character_trigram.txt')

Creating the trigram from file `./dataset/character_data.txt` and saving it in file `./dataset/character_trigram.txt`


3922529it [01:13, 53288.61it/s] 

Creating the trigram has been completed!





In [50]:
trigram_generator('./dataset/syllable_data.txt', './dataset/syllable_trigram.txt')

Creating the trigram from file `./dataset/syllable_data.txt` and saving it in file `./dataset/syllable_trigram.txt`


1736199it [00:51, 33685.65it/s]

Creating the trigram has been completed!





### Split N-Gram Dataset to Train & Test

In [51]:
def count_number_of_samples(infile):
    count = 0
    with open(infile, 'r') as file:
        for line in file:
            count += 1
    return count

def train_test_split_n_gram(infile, train_outfile, test_outfile, train_ratio=0.95):
    total_number_of_samples = count_number_of_samples(infile)
    train_size = int(total_number_of_samples * train_ratio)
    
    with open(infile, 'r') as infile, open(train_outfile, 'w') as train_outfile, open(test_outfile, 'w') as test_outfile:
        for i, line in tqdm(enumerate(infile)):
            if i <= train_size:
                train_outfile.write(line)
            else:
                test_outfile.write(line)

In [52]:
train_test_split_n_gram('./dataset/character_unigram.txt', 
                        './dataset/character/train/unigram.txt', 
                        './dataset/character/test/unigram.txt')

395631701it [00:55, 7111028.66it/s]


In [53]:
train_test_split_n_gram('./dataset/character_bigram.txt', 
                        './dataset/character/train/bigram.txt', 
                        './dataset/character/test/bigram.txt')

391709172it [00:59, 6634821.49it/s]


In [54]:
train_test_split_n_gram('./dataset/character_trigram.txt', 
                        './dataset/character/train/trigram.txt', 
                        './dataset/character/test/trigram.txt')

387786643it [01:03, 6094700.70it/s]


In [55]:
train_test_split_n_gram('./dataset/syllable_unigram.txt', 
                        './dataset/syllable/train/unigram.txt', 
                        './dataset/syllable/test/unigram.txt')

209175035it [00:33, 6197672.69it/s]


In [56]:
train_test_split_n_gram('./dataset/syllable_bigram.txt', 
                        './dataset/syllable/train/bigram.txt', 
                        './dataset/syllable/test/bigram.txt')

207438836it [00:36, 5659802.36it/s]


In [57]:
train_test_split_n_gram('./dataset/syllable_trigram.txt', 
                        './dataset/syllable/train/trigram.txt', 
                        './dataset/syllable/test/trigram.txt')

205702637it [00:40, 5077890.59it/s]


## Remove Uncessary Files from Dataset Folder

In [58]:
os.system("rm -rf ./dataset/character_data.txt")
os.system("rm -rf ./dataset/character_unigram.txt")
os.system("rm -rf ./dataset/character_bigram.txt")
os.system("rm -rf ./dataset/character_trigram.txt")

os.system("rm -rf ./dataset/syllable_data.txt")
os.system("rm -rf ./dataset/syllable_unigram.txt")
os.system("rm -rf ./dataset/syllable_bigram.txt")
os.system("rm -rf ./dataset/syllable_trigram.txt")


0