In [1]:
import os
import glob

directory = '/kaggle/input/lepetitprince100langues'
input_files = glob.glob(os.path.join(directory, '*.txt'))
results_file = 'results.csv'
trained_files_log = 'trained_files.log'

In [2]:
import tensorflow as tf

# Check available GPUs
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    try:
        # Enable memory growth for all GPUs
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        print(f"GPUs available: {len(gpus)}")
    except RuntimeError as e:
        print(e)
else:
    print("No GPUs found")

# Verify GPU usage
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

GPUs available: 1
Num GPUs Available:  1


In [3]:
from tensorflow.keras import mixed_precision
policy = mixed_precision.Policy('mixed_float16')
mixed_precision.set_global_policy(policy)

In [4]:
from tokenizers import SentencePieceBPETokenizer

tokenizer = SentencePieceBPETokenizer()
tokenizer.train(
    files=input_files,
    vocab_size=8000,
    min_frequency=2,
    special_tokens=["<unk>", "<pad>"]
)
tokenizer.save("spm_tokenizer.json")

In [5]:
from tensorflow.keras.layers import Input, LSTM, RepeatVector, TimeDistributed, Dense, BatchNormalization
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam

def load_text(file_path):
    with open(file_path, "r", encoding="utf-8") as f:
        return f.read()

def create_sequences(text, sequence_length, tokenizer):
    tokens = tokenizer.encode(text, add_special_tokens=True)
    input_sequences = []
    for i in range(sequence_length, len(tokens)):
        seq = tokens[i-sequence_length:i]
        input_sequences.append(seq)
    return np.array(input_sequences), tokenizer.vocab_size

def data_generator(input_sequences, total_words, batch_size):
    num_samples = len(input_sequences)
    while True:
        for i in range(0, num_samples, batch_size):
            batch = input_sequences[i:i+batch_size]
            batch_one_hot = to_categorical_3d(batch, num_classes=total_words)
            yield batch_one_hot, batch_one_hot

def create_autoencoder(sequence_length, total_words, learning_rate=0.001):
    model = Sequential([
        Input(shape=(sequence_length, total_words)),
        LSTM(100, return_sequences=False),
        BatchNormalization(),
        RepeatVector(sequence_length),
        LSTM(100, return_sequences=True),
        BatchNormalization(),
        TimeDistributed(Dense(total_words, activation='softmax'))
    ])
    model.compile(
        optimizer=Adam(learning_rate=learning_rate),
        loss='categorical_crossentropy',
        metrics=['accuracy']
    )
    return model

In [6]:
from transformers import PreTrainedTokenizerFast
from sklearn.model_selection import train_test_split
import tensorflow as tf
import numpy as np

sequence_length = 50
epochs = 25

def to_categorical_3d(y, num_classes):
    batch_size, sequence_length = y.shape
    one_hot = np.zeros((batch_size, sequence_length, num_classes), dtype=np.float32)
    for i in range(batch_size):
        for j in range(sequence_length):
            one_hot[i, j, y[i, j]] = 1
    return one_hot

def data_generator(input_sequences, total_words, batch_size):
    num_samples = len(input_sequences)
    while True:
        for i in range(0, num_samples, batch_size):
            batch = input_sequences[i:i+batch_size]
            batch_one_hot = to_categorical_3d(batch, num_classes=total_words)
            yield batch_one_hot, batch_one_hot

processed_files = {}
if os.path.exists(trained_files_log):
    with open(trained_files_log, 'r') as f:
        for line in f:
            filename, beginning_loss, final_loss = line.strip().split(': ')
            processed_files[filename] = (float(beginning_loss), float(final_loss))

In [7]:
for filename in os.listdir(directory):
    if filename in processed_files:
        print(f"Skipping {filename}: already trained.")
        continue

    file_path = os.path.join(directory, filename)
    text = load_text(file_path)
    tokenizer = PreTrainedTokenizerFast(tokenizer_file="spm_tokenizer.json")
    input_sequences, total_words = create_sequences(text, sequence_length, tokenizer)
    
    X_train, X_test = train_test_split(input_sequences, test_size=0.2, random_state=42)
    print("Dataset Created")

    with tf.device('/GPU:0'):
        model = create_autoencoder(sequence_length, total_words)
    print("Autoencoder Created")

    batch_size = 32
    train_generator = data_generator(X_train, total_words, batch_size)
    test_generator = data_generator(X_test, total_words, batch_size)

    train_steps = len(X_train) // batch_size
    test_steps = len(X_test) // batch_size

    history = model.fit(
        train_generator,
        steps_per_epoch=train_steps,
        epochs=epochs,
        verbose=1,
        validation_data=test_generator,
        validation_steps=test_steps,
        callbacks=[
            tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
        ]
    )

    beginning_val_loss = history.history['val_loss'][0]
    final_val_loss = history.history['val_loss'][-1]
    normalized_beginning_val_loss = beginning_val_loss / np.log(total_words)
    normalized_final_val_loss = final_val_loss / np.log(total_words)
    processed_files[filename] = (normalized_beginning_val_loss, normalized_final_val_loss)

    with open(trained_files_log, 'a') as f:
        f.write(f"{filename}: {normalized_beginning_val_loss}: {normalized_final_val_loss}\n")
    
    print(f"Processed {filename}: normalized beginning validation loss = {normalized_beginning_val_loss}, normalized final validation loss = {normalized_final_val_loss}")



Dataset Created
Autoencoder Created
Epoch 1/25
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m51s[0m 152ms/step - accuracy: 0.0248 - loss: 7.6988 - val_accuracy: 0.0262 - val_loss: 5.4624
Epoch 2/25
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 146ms/step - accuracy: 0.0407 - loss: 4.9579 - val_accuracy: 0.0323 - val_loss: 5.0106
Epoch 3/25
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 147ms/step - accuracy: 0.0467 - loss: 4.5497 - val_accuracy: 0.0412 - val_loss: 4.5498
Epoch 4/25
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 146ms/step - accuracy: 0.0494 - loss: 4.2698 - val_accuracy: 0.0410 - val_loss: 4.4190
Epoch 5/25
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 147ms/step - accuracy: 0.0507 - loss: 4.1246 - val_accuracy: 0.0322 - val_loss: 4.6973
Epoch 6/25
[1m280/280[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 147ms/step - accuracy: 0.0519 - loss: 4.0223 - val_accuracy: 