In [1]:
import numpy as np
import pandas as pd
import random
from sklearn.model_selection import train_test_split

from datasets import load_dataset
from transformers import AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
df = pd.read_fwf('words_250000_train.txt',names=['word'])

In [None]:
def generate_masked_words(correct_word, totalMask, num_masks):
    word_len = len(correct_word)
    masked_words = []

    for _ in range(num_masks):
        # Randomly decide how many characters to mask (1 to totalMask)
        num_to_mask = random.randint(1, totalMask)
        # num_to_mask = 1
        
        # Randomly select positions to mask
        mask_positions = random.sample(range(word_len), num_to_mask)
        masked_word = list(correct_word)

        for pos in mask_positions:
            masked_word[pos] = '_'

        masked_words.append(''.join(masked_word))

    return masked_words

In [None]:
import pandas as pd

correct_masked = {}
perMask = 0.5
dt = pd.DataFrame(columns=['masked', 'correct'])
words = df['word'].unique()
words = [word for word in words if pd.notna(word)]

def process_word(word):
    word_len = len(word)

    if word_len == 1:
        return None  # Return None for words of length 1 to filter them out later

    totalMask = int(word_len * perMask)
    masked_word_list = generate_masked_words(word, totalMask, word_len * 2)
    masked_word_list = list(set(masked_word_list))

    correctWordList = [word] * len(masked_word_list)
    return list(zip(masked_word_list, correctWordList))

# Use map to process each word
result = list(map(process_word, words))

# Filter out None values
result = [item for sublist in result if sublist is not None for item in sublist]



In [None]:
# Convert to DataFrame
dt = pd.DataFrame(result, columns=['masked', 'label'])

In [None]:
dt.head()

In [None]:
# Function to transform masked and correct words
def transform_words(row):
    masked = ' '.join(list(row['masked']))
    correct = ' '.join(list(row['label']))
    return masked, correct

In [None]:
# Apply transformation
dt[['masked', 'label']] = dt.apply(transform_words, axis=1, result_type='expand')

In [None]:
# Split the data into train, validation, and test sets
train_val, test = train_test_split(dt, test_size=0.2, random_state=42)
train, val = train_test_split(train_val, test_size=0.25, random_state=42)  # 0.25 x 0.8 = 0.2

# Save the splits to CSV files
train.to_csv('./data/train.csv', index=False)
val.to_csv('./data/validation.csv', index=False)
test.to_csv('./data/test.csv', index=False)

In [2]:
# Loading the training, validation and testing dataset
#"train"     : ["data/train.csv"],
dataset = load_dataset('csv', data_files={ 
                                          "validation": ["data/validation.csv"], 
                                          "test"      : ["data/test.csv"]})
dataset

DatasetDict({
    validation: Dataset({
        features: ['masked', 'label'],
        num_rows: 738890
    })
    test: Dataset({
        features: ['masked', 'label'],
        num_rows: 738890
    })
})

In [None]:
#dataset['train'][0]

In [5]:
from transformers import DistilBertTokenizer
from transformers import AutoTokenizer

In [6]:
# Tokenizer
checkpoint = "distilbert-base-cased"
#tokenizer = DistilBertTokenizer.from_pretrained(checkpoint,vocab_file="vocab.txt")#,vocab_file="vocab.txt")
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [7]:
t = tokenizer("a b c d e f g h i j k l m n o p q r s t u v w x y z _")
t

{'input_ids': [101, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 168, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [8]:
tokens = "a b c d e f g h i j k l m n o p q r s t u v w x y z _".split(" ")
token_ids = list(t.input_ids)[1:-1]

In [9]:
label2id = {label:id for label, id in zip(tokens, token_ids)}
id2label = {value:key for key, value in label2id.items()}

In [10]:
label2id

{'a': 170,
 'b': 171,
 'c': 172,
 'd': 173,
 'e': 174,
 'f': 175,
 'g': 176,
 'h': 177,
 'i': 178,
 'j': 179,
 'k': 180,
 'l': 181,
 'm': 182,
 'n': 183,
 'o': 184,
 'p': 185,
 'q': 186,
 'r': 187,
 's': 188,
 't': 189,
 'u': 190,
 'v': 191,
 'w': 192,
 'x': 193,
 'y': 194,
 'z': 195,
 '_': 168}

In [11]:
tokenizer.special_tokens_map

{'unk_token': '[UNK]',
 'sep_token': '[SEP]',
 'pad_token': '[PAD]',
 'cls_token': '[CLS]',
 'mask_token': '[MASK]'}

In [None]:
t = tokenizer(dataset['train'][0]['masked'])
t

In [None]:
t.word_ids

In [None]:
tokenizer.decode(t['input_ids'])

In [None]:
dataset['test'][0]

In [12]:
def getTargetEncode(batches):
  words_encoded=[]
  for word in batches:
    word_encoded = [-100]
    for let in word.split(" "):
      let_encode =  label2id[let]
      word_encoded.append(let_encode)
    word_encoded = word_encoded + [-100]
    words_encoded.append(word_encoded)
  return words_encoded


In [13]:
MAX_TEXT_LENGTH = 20
MAX_correct_LENGTH = 20

# Tokenize for both input and target(label)
def tokenize_fn(batch):
    # Tokenize the input seq first
    # It will populate inputs_ids, attention_mask etc
    tokenized_inputs = tokenizer(batch['masked'], truncation=True)
    #tokenized_outputs = tokenizer(batch['label'], truncation=True, padding=True, max_length=MAX_correct_LENGTH)
    tokenized_outputs_str = batch['label']
    tokenized_inputs['labels'] = getTargetEncode(tokenized_outputs_str)

    #tokenized_inputs['labels'] = tokenized_outputs.input_ids
    return tokenized_inputs

In [14]:
tokenized_datasets = dataset.map(
    tokenize_fn, 
    batched=True,
    remove_columns=dataset['test'].column_names)


Map: 100%|██████████| 738890/738890 [00:26<00:00, 28333.57 examples/s]


In [None]:
tokenized_datasets['test'][0]

In [None]:
id2label

In [None]:
# label2id = {chr(i): i-92 for i in range(97, 123)}
# label2id['_'] = 31

# id2label = {value:key for key, value in label2id.items()}

In [15]:
from transformers import DistilBertForTokenClassification
from transformers import AutoModelForTokenClassification

model = AutoModelForTokenClassification.from_pretrained(
    checkpoint,
    id2label = id2label,
    label2id = label2id,
    
)

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
from transformers import TrainingArguments

train_args = TrainingArguments(
    "distilbert-finetuned-hangman",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.1,
)



In [17]:
from transformers import DataCollatorForTokenClassification
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [18]:
from transformers import Trainer
trainer = Trainer(
    model = model,
    args = train_args,
    train_dataset=tokenized_datasets['validation'],
    eval_dataset=tokenized_datasets['test'],
    data_collator = data_collator,
    tokenizer=tokenizer
)

In [19]:
trainer.train()

  0%|          | 0/277086 [00:00<?, ?it/s]

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [20]:
tokenized_datasets

DatasetDict({
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 738890
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 738890
    })
})