In [28]:
import tiktoken
from datasets import load_dataset

In [29]:
solidity_keywords = [
 '!=',
 '%=',
 '&=',
 '*=',
 '-=',
 '// SPDX-License-Identifier:',
 '/=',
 '<<=',
 '<=',
 '>=',
 '>>=',
 '>>>=',
 '?',
 '[',
 ']',
 '^',
 '^=',
 'abstract',
 'add',
 'addmod',
 'address',
 'and',
 'anonymous',
 'assembly',
 'assert',
 'at',
 'balance',
 'basefee',
 'blobbasefee',
 'blobhash',
 'block',
 'blockhash',
 'bool',
 'break',
 'byte',
 'bytes',
 'bytes1',
 'bytes10',
 'bytes11',
 'bytes12',
 'bytes13',
 'bytes14',
 'bytes15',
 'bytes16',
 'bytes17',
 'bytes18',
 'bytes19',
 'bytes2',
 'bytes20',
 'bytes21',
 'bytes22',
 'bytes23',
 'bytes24',
 'bytes25',
 'bytes26',
 'bytes27',
 'bytes28',
 'bytes29',
 'bytes3',
 'bytes30',
 'bytes31',
 'bytes32',
 'bytes4',
 'bytes5',
 'bytes6',
 'bytes7',
 'bytes8',
 'bytes9',
 'call',
 'callcode',
 'calldata',
 'calldataload,calldatasize,calldatacopy,extcodesize',
 'caller',
 'callvalue',
 'case',
 'catch',
 'chainid',
 'clz',
 'coinbase',
 'constant',
 'constructor',
 'continue',
 'contract',
 'create',
 'create2',
 'days',
 'default',
 'define',
 'delegatecall,staticcall',
 'delete',
 'difficulty',
 'div',
 'do',
 'else',
 'emit',
 'enum',
 'eq',
 'error',
 'ether',
 'event',
 'exp',
 'extcodecopy',
 'extcodehash',
 'external',
 'fallback',
 'false',
 'final',
 'fixed',
 'for',
 'from',
 'function',
 'gas',
 'gaslimit',
 'gasprice',
 'global',
 'gt',
 'gwei',
 'hex',
 'hours',
 'if',
 'in',
 'indexed',
 'inline',
 'int',
 'int104',
 'int112',
 'int120',
 'int128',
 'int136',
 'int144',
 'int152',
 'int16',
 'int160',
 'int168',
 'int176',
 'int184',
 'int192',
 'int200',
 'int208',
 'int216',
 'int224',
 'int232',
 'int24',
 'int240',
 'int248',
 'int256',
 'int32',
 'int40',
 'int48',
 'int56',
 'int64',
 'int72',
 'int8',
 'int80',
 'int88',
 'int96',
 'interface',
 'internal',
 'iszero',
 'keccak256',
 'layout',
 'leave',
 'let',
 'library',
 'log0',
 'log1',
 'log2',
 'log3',
 'log4',
 'lt',
 'macro',
 'mapping',
 'match',
 'memory',
 'minutes',
 'mload',
 'mod',
 'modifier',
 'msg',
 'msize',
 'mstore',
 'mstore8',
 'mul',
 'mulmod',
 'mutable',
 'new',
 'not',
 'null',
 'number',
 'of',
 'or',
 'origin',
 'override',
 'partial',
 'payable',
 'pop',
 'pragma',
 'prevrandao',
 'private',
 'public',
 'pure',
 'receive',
 'relocatable',
 'require',
 'return',
 'returndatasi,zereturndataco,pymcopy',
 'returns',
 'revert',
 'sar',
 'sdiv',
 'seconds',
 'selfbalance',
 'selfdestruct,invalid',
 'sender',
 'sgt',
 'shl',
 'shr',
 'signextend',
 'sload',
 'slt',
 'smod',
 'solidity',
 'sstore',
 'static',
 'stop',
 'storage',
 'string',
 'struct',
 'sub',
 'super',
 'switch',
 'this',
 'timestamp',
 'tload',
 'transient',
 'true',
 'try',
 'tstore',
 'tx',
 'ufixed',
 'uint',
 'uint104',
 'uint112',
 'uint120',
 'uint128',
 'uint136',
 'uint144',
 'uint152',
 'uint16',
 'uint160',
 'uint168',
 'uint176',
 'uint184',
 'uint192',
 'uint200',
 'uint208',
 'uint216',
 'uint224',
 'uint232',
 'uint24',
 'uint240',
 'uint248',
 'uint256',
 'uint32',
 'uint40',
 'uint48',
 'uint56',
 'uint64',
 'uint72',
 'uint8',
 'uint80',
 'uint88',
 'uint96',
 'using',
 'var',
 'view',
 'virtual',
 'weeks',
 'wei',
 'while',
 'xor',
 'years',
 '{',
 '|',
 '|=',
 '||',
 '}',
 '~']

print(len(solidity_keywords))

288


In [12]:
# find out which are not represented by a single token, if not represented by single token add as a special token
gpt2_base = tiktoken.get_encoding('gpt2')
special_tokens = []

for keyword in solidity_keywords:
  enc = gpt2_base.encode(keyword)
  if len(enc) > 1:
    special_tokens.append(keyword)

print(len(special_tokens))

185


In [13]:
special_tokens[:10], special_tokens[-10:]

(['!=',
  '%=',
  '&=',
  '*=',
  '-=',
  '// SPDX-License-Identifier:',
  '/=',
  '<<=',
  '<=',
  '>='],
 ['uint56',
  'uint64',
  'uint72',
  'uint8',
  'uint80',
  'uint88',
  'uint96',
  'weeks',
  'xor',
  '|='])

In [14]:
simple_solidity = '''// SPDX-License-Identifier: MIT
pragma solidity ^0.8.26;
contract SimpleStorage {
    uint256 public num;

    function set(uint256 _num) public {
        num = _num;
    }
    
    function get() public view returns (uint256) {
        return num;
    }
}'''

In [15]:
special_tokens_ids = { token: gpt2_base.n_vocab + i for i, token in enumerate(special_tokens)}
special_tokens_ids

{'!=': 50257,
 '%=': 50258,
 '&=': 50259,
 '*=': 50260,
 '-=': 50261,
 '// SPDX-License-Identifier:': 50262,
 '/=': 50263,
 '<<=': 50264,
 '<=': 50265,
 '>=': 50266,
 '>>=': 50267,
 '>>>=': 50268,
 '^=': 50269,
 'abstract': 50270,
 'addmod': 50271,
 'anonymous': 50272,
 'basefee': 50273,
 'blobbasefee': 50274,
 'blobhash': 50275,
 'blockhash': 50276,
 'bytes1': 50277,
 'bytes10': 50278,
 'bytes11': 50279,
 'bytes12': 50280,
 'bytes13': 50281,
 'bytes14': 50282,
 'bytes15': 50283,
 'bytes16': 50284,
 'bytes17': 50285,
 'bytes18': 50286,
 'bytes19': 50287,
 'bytes2': 50288,
 'bytes20': 50289,
 'bytes21': 50290,
 'bytes22': 50291,
 'bytes23': 50292,
 'bytes24': 50293,
 'bytes25': 50294,
 'bytes26': 50295,
 'bytes27': 50296,
 'bytes28': 50297,
 'bytes29': 50298,
 'bytes3': 50299,
 'bytes30': 50300,
 'bytes31': 50301,
 'bytes32': 50302,
 'bytes4': 50303,
 'bytes5': 50304,
 'bytes6': 50305,
 'bytes7': 50306,
 'bytes8': 50307,
 'bytes9': 50308,
 'callcode': 50309,
 'calldata': 50310,
 'callda

In [16]:
enc = tiktoken.Encoding(
  name='sol_gpt2',
  pat_str=gpt2_base._pat_str,
  mergeable_ranks=gpt2_base._mergeable_ranks,
  special_tokens={
        **gpt2_base._special_tokens,
        **special_tokens_ids
    }
)

In [17]:
gpt2_base = tiktoken.get_encoding('gpt2')
encoding = gpt2_base.encode(simple_solidity)
print(encoding)
print(len(encoding))

[1003, 30628, 55, 12, 34156, 12, 33234, 7483, 25, 17168, 198, 1050, 363, 2611, 4735, 414, 10563, 15, 13, 23, 13, 2075, 26, 198, 28484, 17427, 31425, 1391, 198, 220, 220, 220, 20398, 11645, 1171, 997, 26, 628, 220, 220, 220, 2163, 900, 7, 28611, 11645, 4808, 22510, 8, 1171, 1391, 198, 220, 220, 220, 220, 220, 220, 220, 997, 796, 4808, 22510, 26, 198, 220, 220, 220, 1782, 628, 220, 220, 220, 2163, 651, 3419, 1171, 1570, 5860, 357, 28611, 11645, 8, 1391, 198, 220, 220, 220, 220, 220, 220, 220, 1441, 997, 26, 198, 220, 220, 220, 1782, 198, 92]
102


In [18]:
encoding_custom = enc.encode(simple_solidity, allowed_special=set(special_tokens))
print(encoding_custom)
print(len(encoding_custom))

[50262, 17168, 198, 50382, 220, 50400, 10563, 15, 13, 23, 13, 2075, 26, 198, 28484, 17427, 31425, 1391, 198, 220, 220, 220, 220, 50428, 1171, 997, 26, 628, 220, 220, 220, 2163, 900, 7, 50428, 4808, 22510, 8, 1171, 1391, 198, 220, 220, 220, 220, 220, 220, 220, 997, 796, 4808, 22510, 26, 198, 220, 220, 220, 1782, 628, 220, 220, 220, 2163, 651, 3419, 1171, 1570, 220, 50387, 357, 50428, 8, 1391, 198, 220, 220, 220, 220, 220, 220, 220, 1441, 997, 26, 198, 220, 220, 220, 1782, 198, 92]
91


In [19]:
enc.encode('Hello World'), gpt2_base.encode('Hello World')

([15496, 2159], [15496, 2159])

In [20]:
enc.encode_batch(['Hello', ' World'], num_threads=8 ), gpt2_base.encode_batch(['Hello', ' World'], num_threads=8)

([[15496], [2159]], [[15496], [2159]])

In [21]:
gpt2_base.encode('uint256'), enc.encode('uint256', allowed_special=set(special_tokens))

([28611, 11645], [50428])

In [22]:
enc.n_vocab

50442

In [23]:
gpt2_base.special_tokens_set

{'<|endoftext|>'}

In [33]:
special_tokens_set = len(set(special_tokens).union(set(gpt2_base.special_tokens_set)))
special_tokens_set

186

In [25]:
enc.eot_token

50256

In [26]:
dataset = load_dataset('mwritescode/slither-audited-smart-contracts', 'all-multilabel', num_proc=5)

Using the latest cached version of the dataset since mwritescode/slither-audited-smart-contracts couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'all-multilabel' at /Users/arpankumarnandi/.cache/huggingface/datasets/mwritescode___slither-audited-smart-contracts/all-multilabel/1.1.0/4cf503b59ce9d3157914e47f6253de773b7ab828f46642685d4b470b88ca1f13 (last modified on Mon Jan 19 17:14:44 2026).


In [27]:
split_dataset = dataset["train"].train_test_split(test_size=0.010, seed=1337, shuffle=True)
split_dataset['val'] = split_dataset.pop('test')
split_dataset

DatasetDict({
    train: Dataset({
        features: ['address', 'source_code', 'bytecode', 'slither'],
        num_rows: 119401
    })
    val: Dataset({
        features: ['address', 'source_code', 'bytecode', 'slither'],
        num_rows: 1207
    })
})

In [34]:
def process(dataset_col):
    ids = enc.encode(dataset_col['source_code'], allowed_special=set(special_tokens).union(set(gpt2_base.special_tokens_set)))
    out = {'ids': ids, 'len': len(ids)}
    return out
  
# tokenize the dataset
tokenized_dataset = split_dataset.map(
    process,
    remove_columns=['source_code'],
    desc="tokenizing the splits",
    num_proc=5,
  )

tokenized_dataset

tokenizing the splits (num_proc=5): 100%|██████████| 119401/119401 [01:18<00:00, 1528.17 examples/s]
tokenizing the splits (num_proc=5): 100%|██████████| 1207/1207 [00:02<00:00, 434.11 examples/s]


DatasetDict({
    train: Dataset({
        features: ['address', 'bytecode', 'slither', 'ids', 'len'],
        num_rows: 119401
    })
    val: Dataset({
        features: ['address', 'bytecode', 'slither', 'ids', 'len'],
        num_rows: 1207
    })
})

In [41]:
import numpy as np
import os
from tqdm import tqdm

for split, dset in tokenized_dataset.items():
    print(f'{split=}')
    total_token_length = np.sum(dset['len'],  dtype=np.uint64)
    print(f'{total_token_length=}')
    filename = f'{split}.bin'
    print(f'{filename=}')
    dtype = np.uint16
    arr = np.memmap(filename, dtype=dtype, mode='w+', shape=(total_token_length,))
    total_batches = 1024

    idx = 0
    for batch_idx in tqdm(range(total_batches), desc=f'writing {filename}'):
      batch = dset.shard(num_shards=total_batches, index=batch_idx, contiguous=True).with_format('numpy')
      array_batch = np.concatenate(batch['ids'])
      arr[idx: idx + len(array_batch)] = array_batch
      idx += len(array_batch)

    arr.flush() # complete writing the file

split='train'
total_token_length=np.uint64(1269618024)
filename='train.bin'


writing train.bin: 100%|██████████| 1024/1024 [00:33<00:00, 30.31it/s]


split='val'
total_token_length=np.uint64(12534342)
filename='val.bin'


writing val.bin: 100%|██████████| 1024/1024 [00:01<00:00, 598.96it/s]
