In [5]:
from datasets import load_dataset


ds = load_dataset("archit11/hyperswitch-flat")
df = ds["train"].to_pandas()

In [6]:
df

Unnamed: 0,repo,file_path,extension,content,token_count
0,hyperswitch,Dockerfile,none,FROM rust:bookworm as builder\n\nARG EXTRA_FEA...,632
1,hyperswitch,generate_code_coverage.sh,.sh,"#! /bin/bash\n\nCOV_PROGRAM=""grcov""\nINSTALL_C...",227
2,hyperswitch,add_connector.md,.md,# Guide to Integrate a Connector\n\n## Introdu...,8108
3,hyperswitch,justfile,none,# List available recipes\nlist:\n @just --l...,2305
4,hyperswitch,INSTALL_dependencies.sh,.sh,#!/usr/bin/env bash\n#\n# Description: One cli...,2100
...,...,...,...,...,...
10771,hyperswitch-client-core,reactNativeWeb/babel.config.js,.js,module.exports = {\n presets: [\n [\n ...,191
10772,hyperswitch-client-core,reactNativeWeb/DemoApp/DemoAppIndex.html,.html,<!DOCTYPE html>\n<html>\n <head>\n <meta c...,218
10773,hyperswitch-client-core,reactNativeWeb/DemoApp/webpack.config.js,.js,const path = require('path');\nconst webpack =...,199
10774,hyperswitch-client-core,reactNativeWeb/DemoApp/DemoAppIndex.js,.js,"let defaultProps = {\n local: false,\n confi...",839


In [14]:
from datasets import load_dataset
import pandas as pd
from pathlib import Path

# Load dataset
ds = load_dataset("archit11/hyperswitch-code-only")
df = ds["train"].to_pandas()

# Handle missing values and extract extensions
df['token_count'] = df['token_count'].fillna(0)
df['extension'] = df['file_path'].apply(lambda x: Path(x).suffix if pd.notna(x) else 'none')
df['extension'] = df['extension'].replace('', 'none')

# Extension distribution for each repository
repo_extension_distribution = (
   df.groupby(['repo', 'extension'])
   .agg({
       'extension': 'size',
       'token_count': 'sum'
   })
   .rename(columns={'extension': 'file_count', 'token_count': 'token_sum'})
   .reset_index()
)

print("=== Token Count Distribution by Repository and Extension ===")
print(repo_extension_distribution.sort_values(['repo', 'token_sum'], ascending=[True, False]))

# Pivot table for better visualization
pivot_table = df.pivot_table(
   index='repo', 
   columns='extension', 
   values='token_count', 
   aggfunc='sum', 
   fill_value=0
)

print("\n=== Pivot Table: Repositories vs Extensions (Token Sums) ===")
print(pivot_table)

# For each repo, show extension breakdown with percentages
print("\n=== Per Repository Extension Breakdown ===")
for repo in df['repo'].unique():
   repo_df = df[df['repo'] == repo]
   
   extension_breakdown = (
       repo_df.groupby('extension')
       .agg({
           'extension': 'size',
           'token_count': 'sum'
       })
       .rename(columns={'extension': 'file_count', 'token_count': 'token_sum'})
       .reset_index()
   )
   
   total_tokens = extension_breakdown['token_sum'].sum()
   extension_breakdown['percentage'] = (extension_breakdown['token_sum'] / total_tokens * 100).round(2)
   extension_breakdown = extension_breakdown.sort_values('token_sum', ascending=False)
   
   print(f"\n--- {repo} ---")
   print(extension_breakdown)

README.md:   0%|          | 0.00/474 [00:00<?, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/12.9M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/10345 [00:00<?, ? examples/s]

=== Token Count Distribution by Repository and Extension ===
                          repo extension  file_count  token_sum
9                  hyperswitch       .rs        1566    4835455
5                  hyperswitch     .json        3937    3585747
4                  hyperswitch       .js        2410    1204842
6                  hyperswitch       .md          46     450000
12                 hyperswitch     .toml          55     355118
11                 hyperswitch      .sql         736      76021
0                  hyperswitch     .conf           2      26216
14                 hyperswitch      .yml          22      20217
10                 hyperswitch       .sh          14      15995
2                  hyperswitch     .html          15      14706
3                  hyperswitch      .ini           1      11050
1                  hyperswitch      .css           7       8008
13                 hyperswitch     .yaml          16       7654
15                 hyperswitch      none   

In [10]:
from datasets import load_dataset, Dataset
import pandas as pd
from pathlib import Path

# Load original dataset
ds = load_dataset("archit11/hyperswitch-flat")
df = ds["train"].to_pandas()

# Handle missing values and extract extensions
df['token_count'] = df['token_count'].fillna(0)
df['extension'] = df['file_path'].apply(lambda x: Path(x).suffix if pd.notna(x) else 'none')
df['extension'] = df['extension'].replace('', 'none')

# Define code-related extensions to keep
code_extensions = {
    '.rs',      # Rust
    '.js',      # JavaScript
    '.ts',      # TypeScript
    '.tsx',     # TypeScript JSX
    '.json',    # JSON (config/data files)
    '.md',      # Markdown (documentation)
    '.toml',    # TOML config
    '.sql',     # SQL
    '.sh',      # Shell scripts
    '.html',    # HTML
    '.css',     # CSS
    '.yaml',    # YAML config
    '.yml',     # YAML config
    '.proto',   # Protocol buffers
    '.conf',    # Config files
    '.ini',     # Config files
    '.res',     # ReScript
    '.resi',    # ReScript interface
    '.ml',      # OCaml
    '.mli',     # OCaml interface
    '.mjs',     # ES modules
    '.nix',     # Nix files
    'none'      # Files without extension (often config)
}

# Extensions to remove (non-code files)
remove_extensions = {
    '.svg',      # Images
    '.wasm',     # WebAssembly binaries
    '.woff2',    # Font files
    '.patch',    # Patch files
    '.txt',      # Plain text files
    '.Dockerfile' # Could keep this, but it's minimal
}

print("Original dataset size:", len(df))
print("\nExtension distribution before filtering:")
print(df['extension'].value_counts())

# Filter to keep only code files
df_filtered = df[df['extension'].isin(code_extensions)].copy()

print(f"\nFiltered dataset size: {len(df_filtered)} (removed {len(df) - len(df_filtered)} files)")
print(f"Token count before: {df['token_count'].sum():,}")
print(f"Token count after: {df_filtered['token_count'].sum():,}")

print("\nExtension distribution after filtering:")
print(df_filtered['extension'].value_counts())

# Show what was removed
removed_df = df[~df['extension'].isin(code_extensions)]
print(f"\nRemoved {len(removed_df)} files with extensions:")
print(removed_df['extension'].value_counts())
print(f"Removed {removed_df['token_count'].sum():,} tokens")

# Create new dataset
filtered_dataset = Dataset.from_pandas(df_filtered)

# Push to Hugging Face (you'll need to be logged in)
# filtered_dataset.push_to_hub("your-username/hyperswitch-code-only")

# Or save locally first
df_filtered.to_csv("hyperswitch_code_only.csv", index=False)
print("\nFiltered dataset saved to 'hyperswitch_code_only.csv'")

# Show final stats per repo
print("\n=== Final Repository Stats (Code Only) ===")
for repo in df_filtered['repo'].unique():
    repo_df = df_filtered[df_filtered['repo'] == repo]
    print(f"{repo}: {len(repo_df)} files, {repo_df['token_count'].sum():,} tokens")

Original dataset size: 10776

Extension distribution before filtering:
extension
.json          3983
.js            2494
.rs            1566
.res           1266
.sql            736
.svg            218
.mdx            188
.md              63
.toml            57
.resi            32
.yml             23
.html            21
none             21
.sh              20
.woff2           18
.ts              17
.yaml            16
.css             14
.proto            4
.patch            2
.mjs              2
.conf             2
.tsx              2
.txt              2
.Dockerfile       2
.ml               2
.mli              2
.nix              1
.ini              1
.wasm             1
Name: count, dtype: int64

Filtered dataset size: 10345 (removed 431 files)
Token count before: 31,911,449
Token count after: 14,362,546

Extension distribution after filtering:
extension
.json     3983
.js       2494
.rs       1566
.res      1266
.sql       736
.md         63
.toml       57
.resi       32
.yml       

In [13]:
from datasets import load_dataset, Dataset
import pandas as pd
from pathlib import Path
from huggingface_hub import login

# Step 1: Login to Hugging Face (you'll need your token)
# Get your token from: https://huggingface.co/settings/tokens

# Step 2: Load and filter the dataset
ds = load_dataset("archit11/hyperswitch-flat")
df = ds["train"].to_pandas()

# Handle missing values and extract extensions
df['token_count'] = df['token_count'].fillna(0)
df['extension'] = df['file_path'].apply(lambda x: Path(x).suffix if pd.notna(x) else 'none')
df['extension'] = df['extension'].replace('', 'none')

# Define code-related extensions to keep
code_extensions = {
    '.rs', '.js', '.ts', '.tsx', '.json', '.md', '.toml', '.sql', 
    '.sh', '.html', '.css', '.yaml', '.yml', '.proto', '.conf', 
    '.ini', '.res', '.resi', '.ml', '.mli', '.mjs', '.nix', 'none'
}

# Filter dataset
df_filtered = df[df['extension'].isin(code_extensions)].copy()

print(f"Original: {len(df)} files, {df['token_count'].sum():,} tokens")
print(f"Filtered: {len(df_filtered)} files, {df_filtered['token_count'].sum():,} tokens")
print(f"Removed: {len(df) - len(df_filtered)} files")

# Step 3: Create and push the dataset
filtered_dataset = Dataset.from_pandas(df_filtered)

# Push to your Hugging Face account
dataset_name = "hyperswitch-code-only"  # Choose your dataset name
filtered_dataset.push_to_hub(f"archit11/{dataset_name}")

print(f"\nTo push the dataset, uncomment the lines above and:")
print(f"1. Get your HF token from: https://huggingface.co/settings/tokens")
print(f"2. Replace 'your-username' with your actual HF username")
print(f"3. Replace 'your_hf_token_here' with your actual token")
print(f"4. Run the script")

# Alternative: Save locally first to review
df_filtered.to_parquet("hyperswitch_code_only.parquet")
print(f"\nDataset also saved locally as 'hyperswitch_code_only.parquet'")

Original: 10776 files, 31,911,449 tokens
Filtered: 10345 files, 14,362,546 tokens
Removed: 431 files


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/11 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :   4%|4         |  557kB / 12.9MB            


To push the dataset, uncomment the lines above and:
1. Get your HF token from: https://huggingface.co/settings/tokens
2. Replace 'your-username' with your actual HF username
3. Replace 'your_hf_token_here' with your actual token
4. Run the script

Dataset also saved locally as 'hyperswitch_code_only.parquet'


In [8]:
from datasets import load_dataset


ds = load_dataset("archit11/hyperswitch-flat")
df = ds["train"].to_pandas()# ...existing code...
# cell id: acdf1899 — show counts of unique values in `extension` plus token_count sums and cumulative totals
import pandas as pd

# make sure token_count has no NaNs (treat missing as 0)
df['token_count'] = df['token_count'].fillna(0)

# treat missing extensions explicitly so they appear in the summary
tmp = df.copy()
tmp['extension'] = tmp['extension'].fillna('<NA>')

summary = (
    tmp.groupby('extension', sort=False)
       .agg(count=('extension', 'size'),
            token_count_sum=('token_count', 'sum'))
       .sort_values('count', ascending=False)
       .assign(cumulative_token_count=lambda x: x['token_count_sum'].cumsum(),
               cumulative_count=lambda x: x['count'].cumsum())
       .reset_index()
)

summary



Unnamed: 0,extension,count,token_count_sum,cumulative_token_count,cumulative_count
0,.json,3983,5239482,5239482,3983
1,.js,2494,1292330,6531812,6477
2,.rs,1566,4835455,11367267,8043
3,.res,1266,1690252,13057519,9309
4,.sql,736,76021,13133540,10045
5,.svg,218,14915637,28049177,10263
6,.mdx,188,15595,28064772,10451
7,.md,63,703584,28768356,10514
8,.toml,57,355881,29124237,10571
9,.resi,32,17563,29141800,10603
