In [4]:
import pickle
from itertools import islice
from pathlib import Path
from typing import Union

from datasets import Dataset
from datasets import load_from_disk
from gensim.models import Word2Vec

def load_corpus_dataset(corpus_path: Union[str, Path]) -> Dataset:
    """Load the pickled token corpus at *corpus_path* as a ``Dataset``.

    A processed copy is kept in a sibling directory named
    ``<corpus stem>_processed``. When that directory exists it is loaded with
    ``load_from_disk`` and returned without touching the pickle file.

    Otherwise the file is read as a sequence of pickle records (repeated
    ``pickle.load`` calls until ``EOFError``). Records that are lists are
    treated as batches; every list entry inside a batch is joined with single
    spaces and emitted as ``{"text": ...}``. Records and entries of any other
    type are skipped.

    If reading stops on any error other than a clean end-of-file, the
    samples collected so far are still returned, but they are NOT written to
    the processed directory, so a truncated corpus is never picked up as the
    cache by a later call.

    Args:
        corpus_path: Path to the pickle file holding the corpus.

    Returns:
        The dataset with a single ``"text"`` column.
    """
    corpus_path = Path(corpus_path)
    processed_path = corpus_path.parent / f"{corpus_path.stem}_processed"
    if processed_path.exists():
        print(f"Loading processed dataset from cache: {processed_path}")

        dataset = load_from_disk(str(processed_path))
        print(f"Loaded processed dataset: {len(dataset)} samples (memory-mapped)")
        return dataset

    print(f"Processing dataset from: {corpus_path}")
    print("Using generator to avoid loading all data into RAM at once...")

    # Set by the generator when it stops for any reason other than reaching
    # the end of the file; checked below before the result is persisted.
    read_failed = False

    def data_generator():
        nonlocal read_failed
        with open(corpus_path, 'rb') as f:
            while True:
                try:
                    # SECURITY: pickle.load can execute arbitrary code while
                    # deserialising; only use this on corpus files you trust.
                    corpus_batch = pickle.load(f)
                    if isinstance(corpus_batch, list):
                        for sentence_tokens in corpus_batch:
                            if isinstance(sentence_tokens, list):
                                yield {"text": " ".join(sentence_tokens)}
                except EOFError:
                    # Normal termination: no more pickle records in the file.
                    break
                except Exception as e:
                    # Best effort: keep what was read so far, but remember
                    # that the corpus is incomplete.
                    print(f"Error reading batch: {e}")
                    read_failed = True
                    break

    dataset = Dataset.from_generator(data_generator)

    # NOTE(review): this relies on from_generator running data_generator in
    # this process. If the library serves a previously built result from its
    # own cache, the generator is not re-run and the flag stays False - confirm.
    if read_failed:
        print(
            f"Corpus was not read completely ({len(dataset)} samples); "
            f"skipping cache write to: {processed_path}"
        )
        return dataset

    print(f"Saving processed dataset to cache: {processed_path}")
    dataset.save_to_disk(str(processed_path))
    print(f"Dataset cached: {len(dataset)} samples (memory-mapped format)")

    return dataset

# NOTE(review): machine-specific absolute path; adjust when running elsewhere.
corpus_path = '/home/tommy/Project/PcodeBERT/outputs/pcode_corpus_x86_64_new_data.pkl'

dataset = load_corpus_dataset(corpus_path)

print(type(dataset))

# Preview: split the first few samples back into token lists.
# islice stops after preview_size items (or earlier if the dataset is
# shorter), replacing the hand-maintained counter and break.
preview_size = 5
sentence = [item["text"].split() for item in islice(dataset, preview_size)]
print(sentence)

Loading processed dataset from cache: /home/tommy/Project/PcodeBERT/outputs/pcode_corpus_x86_64_new_data_processed
Loaded processed dataset: 1069216 samples (memory-mapped)
<class 'datasets.arrow_dataset.Dataset'>
[['LOAD', 'UNIQUE', 'CONST', 'REG', 'INT_ZEXT', 'REG', 'UNIQUE', 'INT_ADD', 'UNIQUE', 'REG', 'CONST', 'LOAD', 'UNIQUE', 'CONST', 'UNIQUE', 'CAST', 'UNIQUE', 'UNIQUE', 'COPY', 'STACK', 'UNIQUE', 'LOAD', 'UNIQUE', 'CONST', 'UNIQUE', 'PTRSUB', 'UNIQUE', 'CONST', 'CONST', 'PTRADD', 'UNIQUE', 'UNIQUE', 'REG', 'CONST', 'INT_LESSEQUAL', 'UNIQUE', 'REG', 'UNIQUE', 'CBRANCH', 'MEM', 'UNIQUE', 'CAST', 'UNIQUE', 'UNIQUE', 'INT_EQUAL', 'REG', 'UNIQUE', 'CONST', 'LOAD', 'UNIQUE', 'CONST', 'UNIQUE', 'PTRADD', 'UNIQUE', 'REG', 'CONST', 'CONST', 'CAST', 'UNIQUE', 'UNIQUE', 'CBRANCH', 'MEM', 'REG', 'INT_NOTEQUAL', 'REG', 'UNIQUE', 'CONST', 'CBRANCH', 'MEM', 'REG', 'LOAD', 'UNIQUE', 'CONST', 'UNIQUE', 'INT_AND', 'UNIQUE', 'UNIQUE', 'CONST', 'STORE', 'CONST', 'UNIQUE', 'UNIQUE', 'INDIRECT', 'ME