# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Show the full path of every file found anywhere under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for full_path in (os.path.join(dirname, name) for name in filenames):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Install required libraries
!pip install transformers datasets torch

In [None]:
# Destination path and the text that will be stored there
file_path = "./sample_file.txt"
content = "This is a sample content written to a text file."

# Write the content, truncating any existing file. The encoding is given
# explicitly so the bytes on disk do not depend on the platform's locale.
with open(file_path, "w", encoding="utf-8") as fh:
    fh.write(content)
!pip install wandb
import wandb

from kaggle_secrets import UserSecretsClient

# Read the Weights & Biases API key stored under the Kaggle secret
# "wandb_api_key" and use it to log in.
user_secrets = UserSecretsClient()
wandb.login(key=user_secrets.get_secret("wandb_api_key"))

In [None]:
from datasets import load_dataset
from transformers import BertTokenizerFast, BertForMaskedLM, Trainer, TrainingArguments, pipeline
import torch

# Step 2: Load the dataset (one example per line of the text file)
dataset = load_dataset('text', data_files='/kaggle/input/assamesetxt/as.txt')
print("Sample Data:", dataset['train'][0])

# Split the dataset into train (90%) and validation (10%) sets.
# A fixed seed keeps the split identical across runs, so evaluation losses
# from different runs/checkpoints are measured on the same held-out lines.
dataset = dataset['train'].train_test_split(test_size=0.1, seed=42)
train_dataset = dataset['train']
eval_dataset = dataset['test']

# Step 3: Initialize a tokenizer
tokenizer = BertTokenizerFast.from_pretrained('bert-base-multilingual-cased')

# Step 4: Define tokenization and masking function
def tokenize_function(examples, mask_probability=0.15):
    """Tokenize a batch of text and apply random [MASK] replacement for MLM.

    Uses the module-level ``tokenizer``. Each text is truncated/padded to
    128 tokens. Masking is applied once, when this function runs, so the
    masked positions are fixed for whatever dataset this is mapped over.

    Args:
        examples: Batch mapping with a ``"text"`` entry holding the strings.
        mask_probability: Chance for each non-special token to be replaced
            by the mask token.

    Returns:
        The tokenizer output with ``input_ids`` masked in place and a
        ``labels`` tensor that holds the original token id at masked
        positions and -100 everywhere else.
    """
    inputs = tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=128,
        return_tensors="pt",
        return_special_tokens_mask=True,
    )
    input_ids = inputs["input_ids"]
    labels = input_ids.clone()

    # Special tokens ([CLS], [SEP], ...) and padding are never candidates for masking.
    # The mask is popped so the returned columns stay the same as before.
    special_tokens = inputs.pop("special_tokens_mask").bool()
    not_maskable = special_tokens | (input_ids == tokenizer.pad_token_id)

    # Apply Masked Language Modeling (MLM)
    rand = torch.rand(input_ids.shape)
    mask_arr = (rand < mask_probability) & ~not_maskable

    # -100 is the ignore index of the loss, so only masked positions are
    # scored; otherwise the model is also trained to copy visible tokens
    # and to predict padding.
    labels[~mask_arr] = -100
    input_ids[mask_arr] = tokenizer.mask_token_id

    inputs["labels"] = labels
    return inputs

# Run the tokenization/masking step over both splits in batched mode
# (train split first, then the evaluation split).
tokenized_train_dataset, tokenized_eval_dataset = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, eval_dataset)
)

# Step 5: Start from the pre-trained multilingual BERT masked-LM weights
model = BertForMaskedLM.from_pretrained('bert-base-multilingual-cased')

# Step 6: Collect the training hyperparameters, then build the arguments object.
# Evaluation and checkpointing both happen once per epoch; at most two
# checkpoints are kept and the best one is reloaded when training ends.
# NOTE(review): newer transformers releases appear to rename
# `evaluation_strategy` to `eval_strategy` - confirm against the installed version.
training_kwargs = dict(
    output_dir='./results',
    evaluation_strategy='epoch',
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,
    load_best_model_at_end=True,
)
training_args = TrainingArguments(**training_kwargs)

# Step 7: Wire the model, arguments and both tokenized splits into a Trainer
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=tokenized_eval_dataset,
    train_dataset=tokenized_train_dataset,
)

# Step 8: Train the model
trainer.train()

# Step 9: Save the model and tokenizer side by side so they can be reloaded together
model.save_pretrained('./assamese_model')
tokenizer.save_pretrained('./assamese_model')

# Step 10: Evaluate the model.
# The metrics are captured and printed explicitly: this call is not the last
# expression of the cell, so its return value would otherwise be discarded.
eval_metrics = trainer.evaluate()
print("Evaluation metrics:", eval_metrics)

# Step 11: Reload the saved model and tokenizer from disk for inference
saved_model_dir = './assamese_model'
model = BertForMaskedLM.from_pretrained(saved_model_dir)
tokenizer = BertTokenizerFast.from_pretrained(saved_model_dir)

# Step 12: Build a fill-mask pipeline around the reloaded model
nlp = pipeline("fill-mask", model=model, tokenizer=tokenizer)

# Example usage: ask the pipeline to fill the [MASK] slot and print its candidates
predictions = nlp("অসমীয়া ভাষা [MASK]।")
print(predictions)
