In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

## Build training and validation split

In [None]:
# One CSV per PEP-8 indentation rule; paths are relative to the notebook's directory.
files = [
    '../data/dataset_E101.csv',
    '../data/dataset_E111.csv',
    '../data/dataset_E112.csv',
]

# Read every file and stack the rows into a single frame with a fresh 0..n-1 index.
df = pd.concat(list(map(pd.read_csv, files)), ignore_index=True)

In [None]:
# The split is stratified on the third column of `df` (the rule column), selected
# by position, so each rule category is represented in both the training and the
# validation set.
rule_column = df.columns[2]

# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
train_pd_ds, val_pd_ds = train_test_split(
    df,
    test_size=0.2,
    random_state=48,
    stratify=df[rule_column],
)

## Create HuggingFace dataset

In [None]:
from datasets import Dataset, DatasetDict, ClassLabel

# Binary target: integer 0 maps to "non-compliant", 1 to "compliant".
label_feature = ClassLabel(num_classes=2, names=["non-compliant", "compliant"])

# Drop the shuffled pandas index left by the split; otherwise from_pandas would
# carry it over as an extra "__index_level_0__" column.
train_pd_ds = train_pd_ds.reset_index(drop=True)
val_pd_ds = val_pd_ds.reset_index(drop=True)

# Convert each pandas split and cast its 'label' column to the ClassLabel feature.
train_hf_ds = Dataset.from_pandas(train_pd_ds).cast_column('label', label_feature)
val_hf_ds = Dataset.from_pandas(val_pd_ds).cast_column('label', label_feature)

# Bundle both splits under the names they will be published with.
dataset_dict = DatasetDict({
    'train': train_hf_ds,
    'validation': val_hf_ds
})


## Push to HuggingFace hub

In [None]:
import os

from huggingface_hub import create_repo, upload_file

dataset_name = 'pep8_indentation_compliance'
username = 'aholovko'

# Never hardcode the access token in the notebook. Read it from the HF_TOKEN
# environment variable; when it is unset (or empty) pass None so that
# huggingface_hub falls back to the locally stored login instead of sending an
# empty credential.
token = os.environ.get('HF_TOKEN') or None

# create the dataset repository
# repo_type must be "dataset": create_repo defaults to a *model* repository.
# exist_ok=True keeps the cell re-runnable once the repository exists.
repo_id = f"{username}/{dataset_name}"
create_repo(repo_id, token=token, repo_type="dataset", exist_ok=True)

# save dataset_dict to the hub (same credentials as the other hub calls)
dataset_dict.push_to_hub(repo_id, token=token)

# create a dataset card markdown file
# The literal starts directly with `---` (note the line continuation) so the YAML
# front matter is the very first thing in README.md.
# NOTE(review): the split sizes in `dataset_info` are hardcoded; confirm they
# still match the data pushed above whenever the source CSVs change.
dataset_card_markdown = """\
---
dataset_info:
  features:
  - name: code
    dtype: string
  - name: label
    dtype:
      class_label:
        names:
          '0': non-compliant
          '1': compliant
  - name: rule
    dtype: string
  splits:
  - name: train
    num_bytes: 7127
    num_examples: 96
  - name: validation
    num_bytes: 1726
    num_examples: 24
  download_size: 7422
  dataset_size: 8853
configs:
- config_name: default
  data_files:
  - split: train
    path: data/train-*
  - split: validation
    path: data/validation-*
---

# Dataset Name

This dataset contains Python code samples and their compliance with specific PEP-8 indentation rules (E101, E111, and E112).

## Dataset Structure

**Features:**
  - `code`: Python code snippet.
  - `rule`: PEP-8 rule (E101, E111, or E112) that the code snippet is associated with.
  - `label`: A binary label indicating compliance ('non-compliant' or 'compliant').

## Usage

This dataset can be used for training machine learning models for code compliance and style enforcement tasks.
"""

# save the dataset card to README.md file
# Explicit encoding so the written bytes do not depend on the platform default.
with open("README.md", "w", encoding="utf-8") as f:
    f.write(dataset_card_markdown)

# upload README.md to the repo
# repo_type="dataset" targets the dataset repository; without it the card would
# be uploaded to a model repository of the same name.
upload_file(
    path_or_fileobj="README.md",
    path_in_repo="README.md",
    repo_id=repo_id,
    repo_type="dataset",
    token=token
)
