In [1]:
import pandas as pd

# Location of the KernelBook "permissive" parquet file (hf:// dataset URL).
KERNELBOOK_PARQUET = "hf://datasets/GPUMODE/KernelBook/dataset_permissive.parquet"
df = pd.read_parquet(KERNELBOOK_PARQUET)

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Number of rows and columns in `df`, shown as a (rows, columns) tuple.
df.shape

(18162, 12)

In [5]:
# Print the column names, non-null counts and dtypes of `df`.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18162 entries, 0 to 18161
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   entry_point           18162 non-null  object
 1   original_triton_code  18162 non-null  object
 2   python_code           18162 non-null  object
 3   triton_code           18162 non-null  object
 4   repo_name             18162 non-null  object
 5   module_name           18162 non-null  object
 6   synthetic             18162 non-null  bool  
 7   uuid                  18162 non-null  int64 
 8   licenses              18162 non-null  object
 9   stars                 18162 non-null  int64 
 10  sha                   18162 non-null  object
 11  repo_link             18162 non-null  object
dtypes: bool(1), int64(2), object(9)
memory usage: 1.5+ MB


In [7]:
# Keep only the two columns needed to build the PyTorch -> Triton pairs.
# Selecting by name (rather than dropping the other ten) keeps this cell re-runnable:
# calling `drop` again on columns that are already gone raises KeyError.
# `.copy()` yields an independent frame, so adding a column to `df` later does not
# trigger pandas' "setting a value on a copy of a slice" warning.
df = df[["python_code", "triton_code"]].copy()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18162 entries, 0 to 18161
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   python_code  18162 non-null  object
 1   triton_code  18162 non-null  object
dtypes: object(2)
memory usage: 283.9+ KB


In [9]:
import json

def create_messages(row):
    """Build the system / user / assistant chat transcript for one row.

    Args:
        row: Object indexable by ``"python_code"`` and ``"triton_code"``
            (for example a DataFrame row or a plain dict).

    Returns:
        A list of three dicts, each with the keys ``'role'`` and ``'content'``
        (in that order): the fixed system prompt, a user turn embedding the
        PyTorch source in a python code fence, and an assistant turn embedding
        the Triton source in a python code fence.
    """
    system_prompt = (
        'You are an expert in optimizing PyTorch code to Triton GPU kernels. '
        'Convert the provided PyTorch implementations to efficient Triton code.'
    )
    user_prompt = f'Convert the following PyTorch code to Triton:\n\n```python\n{row["python_code"]}\n```'
    assistant_reply = f'```python\n{row["triton_code"]}\n```'

    # Conversation order matters: system first, then the request, then the answer.
    turns = (
        ('system', system_prompt),
        ('user', user_prompt),
        ('assistant', assistant_reply),
    )
    return [{'role': role, 'content': content} for role, content in turns]

# One chat transcript per row, stored as a list of role/content dicts.
df['messages'] = df.apply(create_messages, axis=1)

# Pretty-print the first transcript as a quick sanity check.
print(
    "First row messages:",
    json.dumps(df['messages'].iat[0], indent=2),
    sep="\n",
)

First row messages:
[
  {
    "role": "system",
    "content": "You are an expert in optimizing PyTorch code to Triton GPU kernels. Convert the provided PyTorch implementations to efficient Triton code."
  },
  {
    "role": "user",
    "content": "Convert the following PyTorch code to Triton:\n\n```python\nimport torch\nimport torch.nn as nn\n\n\nclass SumAggregator(nn.Module):\n\n    def __init__(self):\n        super(SumAggregator, self).__init__()\n\n    def forward(self, neighbor):\n        return torch.sum(neighbor, dim=1)\n\n\ndef get_inputs():\n    return [torch.rand([4, 4, 4, 4])]\n\n\ndef get_init_inputs():\n    return [[], {}]\n\n```"
  },
  {
    "role": "assistant",
    "content": "```python\nimport torch\nimport triton\nimport triton.language as tl\nfrom torch._inductor.runtime.triton_heuristics import grid\nfrom torch._C import _cuda_getCurrentRawStream as get_raw_stream\nimport torch.nn as nn\nassert_size_stride = torch._C._dynamo.guards.assert_size_stride\nempty_stride

In [10]:
# Convert messages to JSON strings for CSV storage.
# Values that are already strings are left untouched so that re-running this cell
# does not JSON-encode the column a second time: json.dumps() on a str returns a
# quoted, escaped string, which json.loads() would turn back into a str, not a list.
df['messages'] = df['messages'].apply(
    lambda msgs: msgs if isinstance(msgs, str) else json.dumps(msgs)
)
df.to_csv('kernelbook.csv', index=False)

print(f"Dataset saved to kernelbook.csv with {len(df)} rows")
print(f"Columns: {list(df.columns)}")

Dataset saved to kernelbook.csv with 18162 rows
Columns: ['python_code', 'triton_code', 'messages']


In [11]:
# Example: reload kernelbook.csv and decode each JSON string in `messages`
# back into a list of role/content dicts.
df_loaded = pd.read_csv('kernelbook.csv').assign(
    messages=lambda frame: frame['messages'].map(json.loads)
)
print(df_loaded['messages'].iat[0])


[{'role': 'system', 'content': 'You are an expert in optimizing PyTorch code to Triton GPU kernels. Convert the provided PyTorch implementations to efficient Triton code.'}, {'role': 'user', 'content': 'Convert the following PyTorch code to Triton:\n\n```python\nimport torch\nimport torch.nn as nn\n\n\nclass SumAggregator(nn.Module):\n\n    def __init__(self):\n        super(SumAggregator, self).__init__()\n\n    def forward(self, neighbor):\n        return torch.sum(neighbor, dim=1)\n\n\ndef get_inputs():\n    return [torch.rand([4, 4, 4, 4])]\n\n\ndef get_init_inputs():\n    return [[], {}]\n\n```'}, {'role': 'assistant', 'content': '```python\nimport torch\nimport triton\nimport triton.language as tl\nfrom torch._inductor.runtime.triton_heuristics import grid\nfrom torch._C import _cuda_getCurrentRawStream as get_raw_stream\nimport torch.nn as nn\nassert_size_stride = torch._C._dynamo.guards.assert_size_stride\nempty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda\n\n\n@t