In [None]:
!pip install datasets transformers

# Setting up custom dataset

In [11]:
import pandas as pd
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split

df = pd.read_csv('/content/reskill_dataset_v1.csv')
df.head()

Unnamed: 0,resume_text,skills
0,david garcia lopez daviddgl@gmail.com https //...,"ionic, flutter, aws, bdd, ionic, flutter, node..."
1,manjunath email manjunathjava261@gmail.com mob...,"java, j2ee, spring, hibernate, jdbc, servlets,..."
2,avinash kumar itpl main road 6th cross kundalh...,"html, html5, css, javascript, jquery, bootstra..."
3,contact ibropamela@gmail.com +44 7383 151 935 ...,"microsoft office, hris, adp, opentable, seven ..."
4,vandana . salesforce consultant profile deadli...,"salesforce, lightning, apex, javascript, visua..."


In [12]:
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

train_dataset = train_dataset.remove_columns(['__index_level_0__'])
test_dataset = test_dataset.remove_columns(['__index_level_0__'])


dataset = DatasetDict({
    'train': train_dataset,
    'test': test_dataset
})

dataset

DatasetDict({
    train: Dataset({
        features: ['resume_text', 'skills'],
        num_rows: 9629
    })
    test: Dataset({
        features: ['resume_text', 'skills'],
        num_rows: 2408
    })
})

In [28]:
dataset['train'][7]

{'resume_text': 'venkatesh gunuru sterling integrator edi developer email id venkatesh11gunuru@gmail.com phone# +919949918860 professional summary having work experience of 8+ years as an it professional in edi sterling integrator which includes design development implementation and documentation as a b2b developer on si with good exposure in areas of map developments enhancements testing end to end process deployment and support with knowledge in edi ansi x12 standards along with good knowledge on retail and logistics business flow. as part of this assignment i have been working extensively in trading partner setup migration for edi projects. experienced in design mapping business processes and production support activities in sterling integrator version 5.2.x worked extensively in retail logistics domain. experience in troubleshooting issues on daily basis and reporting to respective teams or fixing at our end and moving the artifact to production and reprocessing the failed files. h

# Training a custom tokenizer

In [33]:
def get_training_corpus():
  custom_dataset = dataset["train"]
  for start_idx in range(0, len(custom_dataset), 1000): # this loop iterates over the dataset in chunks of 1000 samples at a time
    samples = custom_dataset[start_idx : start_idx + 1000] # slices the dataset to get a chunk of 1000 samples
    yield samples["skills"] # yield is used instead of return to create a generator, which is more memory efficient for large datasets

In [34]:
from transformers import AutoTokenizer

training_corpus = get_training_corpus()

old_tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")

new_tokenizer = old_tokenizer.train_new_from_iterator(training_corpus, vocab_size=52000)

In [35]:
new_tokenizer.save_pretrained('reskill-bert-uncased-tokenizer-v2')

('reskill-bert-uncased-tokenizer-v2/tokenizer_config.json',
 'reskill-bert-uncased-tokenizer-v2/special_tokens_map.json',
 'reskill-bert-uncased-tokenizer-v2/vocab.txt',
 'reskill-bert-uncased-tokenizer-v2/added_tokens.json',
 'reskill-bert-uncased-tokenizer-v2/tokenizer.json')

# Custom tokenizer -> yield samples["resume_text"]

In [41]:
from transformers import PreTrainedTokenizerFast

tokenizer = PreTrainedTokenizerFast(tokenizer_file='/content/reskill-bert-uncased-tokenizer/tokenizer.json')

text = 'skills: python, html, javascript, sql server, c#, .net core, mongodb, azuredevops, hp application lifecycle management (alm), quality center, quix, jira, postman, soap ui, putty, winscp, hpsim, vsphere client''sql server, c#, .net core, html, javascript, mongodb, azuredevops, sterling integrator, edi, ansi x12, edifact, idoc, xml, csv, ftp, sftp, as2, http, oracle, db2, windows, linux'

tokenizer.tokenize(text)

['skills',
 '[UNK]',
 'python',
 '[UNK]',
 'html',
 '[UNK]',
 'javascript',
 '[UNK]',
 'sql',
 'server',
 '[UNK]',
 'c',
 '#',
 '[UNK]',
 '.',
 'net',
 'core',
 '[UNK]',
 'mongodb',
 '[UNK]',
 'azuredevops',
 '[UNK]',
 'hp',
 'application',
 'lifecycle',
 'management',
 '[UNK]',
 'alm',
 '[UNK]',
 '[UNK]',
 'quality',
 'center',
 '[UNK]',
 'quix',
 '[UNK]',
 'jira',
 '[UNK]',
 'postman',
 '[UNK]',
 'soap',
 'ui',
 '[UNK]',
 'putty',
 '[UNK]',
 'winscp',
 '[UNK]',
 'hps',
 '##im',
 '[UNK]',
 'vsphere',
 'clients',
 '##ql',
 'server',
 '[UNK]',
 'c',
 '#',
 '[UNK]',
 '.',
 'net',
 'core',
 '[UNK]',
 'html',
 '[UNK]',
 'javascript',
 '[UNK]',
 'mongodb',
 '[UNK]',
 'azuredevops',
 '[UNK]',
 'sterling',
 'integrator',
 '[UNK]',
 'edi',
 '[UNK]',
 'ansi',
 'x12',
 '[UNK]',
 'edifact',
 '[UNK]',
 'idoc',
 '[UNK]',
 'xml',
 '[UNK]',
 'csv',
 '[UNK]',
 'ftp',
 '[UNK]',
 'sftp',
 '[UNK]',
 'as2',
 '[UNK]',
 'http',
 '[UNK]',
 'oracle',
 '[UNK]',
 'db2',
 '[UNK]',
 'windows',
 '[UNK]',
 'linux']

# Custom tokenizer -> yield samples["skills"]


In [42]:
from transformers import PreTrainedTokenizerFast

tokenizer = PreTrainedTokenizerFast(tokenizer_file='/content/reskill-bert-uncased-tokenizer-v2/tokenizer.json')

text = 'skills: python, html, javascript, sql server, c#, .net core, mongodb, azuredevops, hp application lifecycle management (alm), quality center, quix, jira, postman, soap ui, putty, winscp, hpsim, vsphere client''sql server, c#, .net core, html, javascript, mongodb, azuredevops, sterling integrator, edi, ansi x12, edifact, idoc, xml, csv, ftp, sftp, as2, http, oracle, db2, windows, linux'

tokenizer.tokenize(text)

['skills',
 ':',
 'python',
 ',',
 'html',
 ',',
 'javascript',
 ',',
 'sql',
 'server',
 ',',
 'c',
 '#',
 ',',
 '.',
 'net',
 'core',
 ',',
 'mongodb',
 ',',
 'azuredevops',
 ',',
 'hp',
 'application',
 'lifecycle',
 'management',
 '(',
 'alm',
 ')',
 ',',
 'quality',
 'center',
 ',',
 'quix',
 ',',
 'jira',
 ',',
 'postman',
 ',',
 'soap',
 'ui',
 ',',
 'putty',
 ',',
 'winscp',
 ',',
 'hpsim',
 ',',
 'vsphere',
 'clients',
 '##ql',
 'server',
 ',',
 'c',
 '#',
 ',',
 '.',
 'net',
 'core',
 ',',
 'html',
 ',',
 'javascript',
 ',',
 'mongodb',
 ',',
 'azuredevops',
 ',',
 'sterling',
 'integrator',
 ',',
 'edi',
 ',',
 'ansi',
 'x12',
 ',',
 'edifact',
 ',',
 'idoc',
 ',',
 'xml',
 ',',
 'csv',
 ',',
 'ftp',
 ',',
 'sftp',
 ',',
 'as2',
 ',',
 'http',
 ',',
 'oracle',
 ',',
 'db2',
 ',',
 'windows',
 ',',
 'linux']

# Bert Tokenizer results with comparison

In [40]:
from transformers import BertTokenizerFast

tokenizer = BertTokenizerFast.from_pretrained('google-bert/bert-base-uncased')

text = 'skills: python, html, javascript, sql server, c#, .net core, mongodb, azuredevops, hp application lifecycle management (alm), quality center, quix, jira, postman, soap ui, putty, winscp, hpsim, vsphere client''sql server, c#, .net core, html, javascript, mongodb, azuredevops, sterling integrator, edi, ansi x12, edifact, idoc, xml, csv, ftp, sftp, as2, http, oracle, db2, windows, linux'

tokenizer.tokenize(text)

['skills',
 ':',
 'python',
 ',',
 'html',
 ',',
 'java',
 '##script',
 ',',
 'sql',
 'server',
 ',',
 'c',
 '#',
 ',',
 '.',
 'net',
 'core',
 ',',
 'mon',
 '##go',
 '##db',
 ',',
 'azure',
 '##dev',
 '##ops',
 ',',
 'hp',
 'application',
 'life',
 '##cycle',
 'management',
 '(',
 'al',
 '##m',
 ')',
 ',',
 'quality',
 'center',
 ',',
 'qui',
 '##x',
 ',',
 'ji',
 '##ra',
 ',',
 'post',
 '##man',
 ',',
 'soap',
 'ui',
 ',',
 'put',
 '##ty',
 ',',
 'wins',
 '##cp',
 ',',
 'hp',
 '##si',
 '##m',
 ',',
 'vs',
 '##pher',
 '##e',
 'clients',
 '##q',
 '##l',
 'server',
 ',',
 'c',
 '#',
 ',',
 '.',
 'net',
 'core',
 ',',
 'html',
 ',',
 'java',
 '##script',
 ',',
 'mon',
 '##go',
 '##db',
 ',',
 'azure',
 '##dev',
 '##ops',
 ',',
 'sterling',
 'int',
 '##eg',
 '##rator',
 ',',
 'ed',
 '##i',
 ',',
 'an',
 '##si',
 'x',
 '##12',
 ',',
 'ed',
 '##if',
 '##act',
 ',',
 'id',
 '##oc',
 ',',
 'xml',
 ',',
 'cs',
 '##v',
 ',',
 'ft',
 '##p',
 ',',
 'sf',
 '##tp',
 ',',
 'as',
 '##2',
 ',',
 'http

# Training GPT2 Tokenizer on my dataset

In [43]:
from transformers import AutoTokenizer

training_corpus = get_training_corpus()

old_tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")

new_tokenizer = old_tokenizer.train_new_from_iterator(training_corpus, vocab_size=52000)

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [44]:
new_tokenizer.save_pretrained('reskill-gpt2-tokenizer-v1')

('reskill-gpt2-tokenizer-v1/tokenizer_config.json',
 'reskill-gpt2-tokenizer-v1/special_tokens_map.json',
 'reskill-gpt2-tokenizer-v1/vocab.json',
 'reskill-gpt2-tokenizer-v1/merges.txt',
 'reskill-gpt2-tokenizer-v1/added_tokens.json',
 'reskill-gpt2-tokenizer-v1/tokenizer.json')

In [45]:
from transformers import PreTrainedTokenizerFast

tokenizer = PreTrainedTokenizerFast(tokenizer_file='/content/reskill-gpt2-tokenizer-v1/tokenizer.json')

text = 'skills: python, html, javascript, sql server, c#, .net core, mongodb, azuredevops, hp application lifecycle management (alm), quality center, quix, jira, postman, soap ui, putty, winscp, hpsim, vsphere client''sql server, c#, .net core, html, javascript, mongodb, azuredevops, sterling integrator, edi, ansi x12, edifact, idoc, xml, csv, ftp, sftp, as2, http, oracle, db2, windows, linux'

tokenizer.tokenize(text)

['sk',
 'ill',
 's',
 ':',
 'Ġpython',
 ',',
 'Ġhtml',
 ',',
 'Ġjavascript',
 ',',
 'Ġsql',
 'Ġserver',
 ',',
 'Ġc',
 '#,',
 'Ġ.',
 'net',
 'Ġcore',
 ',',
 'Ġmongodb',
 ',',
 'Ġazuredevops',
 ',',
 'Ġhp',
 'Ġapplication',
 'Ġlifecycle',
 'Ġmanagement',
 'Ġ(',
 'alm',
 '),',
 'Ġquality',
 'Ġcenter',
 ',',
 'Ġquix',
 ',',
 'Ġjira',
 ',',
 'Ġpostman',
 ',',
 'Ġsoap',
 'Ġui',
 ',',
 'Ġputty',
 ',',
 'Ġwinscp',
 ',',
 'Ġhpsim',
 ',',
 'Ġvsphere',
 'Ġclient',
 'sql',
 'Ġserver',
 ',',
 'Ġc',
 '#,',
 'Ġ.',
 'net',
 'Ġcore',
 ',',
 'Ġhtml',
 ',',
 'Ġjavascript',
 ',',
 'Ġmongodb',
 ',',
 'Ġazuredevops',
 ',',
 'Ġsterling',
 'Ġintegrator',
 ',',
 'Ġedi',
 ',',
 'Ġansi',
 'Ġx',
 '12',
 ',',
 'Ġedifact',
 ',',
 'Ġidoc',
 ',',
 'Ġxml',
 ',',
 'Ġcsv',
 ',',
 'Ġftp',
 ',',
 'Ġsftp',
 ',',
 'Ġas',
 '2',
 ',',
 'Ġhttp',
 ',',
 'Ġoracle',
 ',',
 'Ġdb',
 '2',
 ',',
 'Ġwindows',
 ',',
 'Ġlinux']

# Training sentence-transformers/all-MiniLM-L6-v2 tokenizer

In [46]:
from transformers import AutoTokenizer

training_corpus = get_training_corpus()

old_tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")

new_tokenizer = old_tokenizer.train_new_from_iterator(training_corpus, vocab_size=52000)

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [47]:
new_tokenizer.save_pretrained('reskill-all-miniLM-tokenizer-v1')

('reskill-all-miniLM-tokenizer-v1/tokenizer_config.json',
 'reskill-all-miniLM-tokenizer-v1/special_tokens_map.json',
 'reskill-all-miniLM-tokenizer-v1/vocab.txt',
 'reskill-all-miniLM-tokenizer-v1/added_tokens.json',
 'reskill-all-miniLM-tokenizer-v1/tokenizer.json')

In [48]:
from transformers import PreTrainedTokenizerFast

tokenizer = PreTrainedTokenizerFast(tokenizer_file='/content/reskill-all-miniLM-tokenizer-v1/tokenizer.json')

text = 'skills: python, html, javascript, sql server, c#, .net core, mongodb, azuredevops, hp application lifecycle management (alm), quality center, quix, jira, postman, soap ui, putty, winscp, hpsim, vsphere client''sql server, c#, .net core, html, javascript, mongodb, azuredevops, sterling integrator, edi, ansi x12, edifact, idoc, xml, csv, ftp, sftp, as2, http, oracle, db2, windows, linux'

tokenizer.tokenize(text)

['skills',
 ':',
 'python',
 ',',
 'html',
 ',',
 'javascript',
 ',',
 'sql',
 'server',
 ',',
 'c',
 '#',
 ',',
 '.',
 'net',
 'core',
 ',',
 'mongodb',
 ',',
 'azuredevops',
 ',',
 'hp',
 'application',
 'lifecycle',
 'management',
 '(',
 'alm',
 ')',
 ',',
 'quality',
 'center',
 ',',
 'quix',
 ',',
 'jira',
 ',',
 'postman',
 ',',
 'soap',
 'ui',
 ',',
 'putty',
 ',',
 'winscp',
 ',',
 'hpsim',
 ',',
 'vsphere',
 'clients',
 '##ql',
 'server',
 ',',
 'c',
 '#',
 ',',
 '.',
 'net',
 'core',
 ',',
 'html',
 ',',
 'javascript',
 ',',
 'mongodb',
 ',',
 'azuredevops',
 ',',
 'sterling',
 'integrator',
 ',',
 'edi',
 ',',
 'ansi',
 'x12',
 ',',
 'edifact',
 ',',
 'idoc',
 ',',
 'xml',
 ',',
 'csv',
 ',',
 'ftp',
 ',',
 'sftp',
 ',',
 'as2',
 ',',
 'http',
 ',',
 'oracle',
 ',',
 'db2',
 ',',
 'windows',
 ',',
 'linux']

# Training a Custom Tokenizer from Scratch

In [56]:
def get_training_corpus_v2(dataset, feature_key='skills', batch_size=1000):
    for start_idx in range(0, len(dataset), batch_size):
        samples = dataset[start_idx:start_idx + batch_size]
        yield samples[feature_key]

In [57]:
from tokenizers import ByteLevelBPETokenizer

# Initialize a new tokenizer
tokenizer = ByteLevelBPETokenizer()

In [58]:
training_corpus = get_training_corpus_v2(dataset['train'])

# Training the tokenizer
tokenizer.train_from_iterator(training_corpus, vocab_size=52000, min_frequency=2, special_tokens=[
    "<s>",
    "<pad>",
    "</s>",
    "<unk>",
    "<mask>",
])

tokenizer.save_model("reskill_tokenizer")

['reskill_tokenizer/vocab.json', 'reskill_tokenizer/merges.txt']

In [60]:
from transformers import PreTrainedTokenizerFast

tokenizer = PreTrainedTokenizerFast(tokenizer_file='/content/reskill_tokenizer/tokenizer.json')

text = 'skills: python, html, javascript, sql server, c#, .net core, mongodb, azuredevops, hp application lifecycle management (alm), quality center, quix, jira, postman, soap ui, putty, winscp, hpsim, vsphere client''sql server, c#, .net core, html, javascript, mongodb, azuredevops, sterling integrator, edi, ansi x12, edifact, idoc, xml, csv, ftp, sftp, as2, http, oracle, db2, windows, linux'

tokenizer.tokenize(text)

Exception: No such file or directory (os error 2)