### BigCode python code dataset validation

We find that in a subset of 10k files from the **bigcode/python_safe_license** dataset, only 0.24% of the files fail to compile (under both Python 2 and Python 3) because of syntax errors.

In [1]:
from datasets import load_dataset

In [3]:
# Open the "train" split in streaming mode; the remaining flags are passed through unchanged.
data = load_dataset(
    'bigcode/python_safe_license',
    streaming=True,
    split="train",
    use_auth_token=True,
    ignore_verifications=True,
)

Using custom data configuration bigcode--python_safe_license-05b1c2b44aa5d114


In [18]:
import tempfile
import subprocess
from tqdm import tqdm

def compile_python_code(sample):
    """Check whether a sample's source compiles under Python 3 or Python 2.

    The text in ``sample["content"]`` is written to a ``.py`` file inside a
    throw-away directory and byte-compiled with ``python3 -m py_compile`` and,
    if that fails, ``python2 -m py_compile``. The interpreters' output is not
    captured, so compiler diagnostics appear in the notebook cell output.

    Args:
        sample: Mapping with a ``"content"`` key holding the source code as a string.

    Returns:
        True if at least one available interpreter compiles the file, else False.

    Raises:
        FileNotFoundError: If neither ``python3`` nor ``python2`` can be launched,
            so the check could not be performed at all.
    """
    import os  # local import: keeps this cell runnable without touching the import cell

    source = sample["content"]
    interpreters_run = 0

    # A temporary directory (instead of a bare temp file) also removes any
    # byte-code that py_compile writes next to the source file.
    with tempfile.TemporaryDirectory() as workdir:
        source_path = os.path.join(workdir, "sample.py")
        # Explicit UTF-8 and newline="" write the content verbatim, independent
        # of the platform's locale and line-ending conventions.
        with open(source_path, "w", encoding="utf-8", newline="") as handle:
            handle.write(source)

        for version in (3, 2):
            # Argument list (not a split string) so paths with spaces/braces are safe.
            command = [f"python{version}", "-m", "py_compile", source_path]
            try:
                subprocess.check_call(command)
            except FileNotFoundError:
                # This interpreter is not installed; fall back to the other one.
                continue
            except subprocess.CalledProcessError:
                interpreters_run += 1
                continue
            # One successful compilation is enough for the file to count as valid.
            return True

    if interpreters_run == 0:
        raise FileNotFoundError("neither python3 nor python2 could be launched")
    return False

In [32]:
# Shuffle with a fixed seed, then materialise 10k samples from the stream as a list.
subset = list(
    data
    .shuffle(seed=42)
    .take(10_000)
)

In [None]:
# One boolean per sample, in the same order as `subset`.
python_checks = []
for sample in tqdm(subset):
    is_valid = compile_python_code(sample)
    python_checks.append(is_valid)

In [26]:
# Summarise the results: count of compiling files and share of failing ones.
n_total = len(python_checks)
n_valid = sum(python_checks)
invalid_pct = (n_total - n_valid) * 100 / n_total
print(f"number of valid python files in the first 10k: {n_valid}")
print(f"percentage of non valid files: {invalid_pct}%")

number of valid python files in the first 10k: 9976
percentage of non valid files: 0.24%


### Examples of files that don't compile

In [27]:
# Print the position (within `python_checks`) of every file that failed to compile.
invalid_indices = [idx for idx, ok in enumerate(python_checks) if not ok]
for idx in invalid_indices:
    print(idx)

705
1412
1490
1584
1962
2378
2431
2555
2581
2659
3044
3060
3069
3711
3837
4629
6101
6901
6917
6920
7794
8084
8560
9495


In [28]:
# Re-run the check on sample 705 so the interpreters' error messages show up in the cell output.
compile_python_code(subset[705])

  File "/var/folders/yr/snw4cq790975mv7_mbwkxm500000gn/T/tmp5r13si_2", line 11
    %matplotlib inline
    ^
SyntaxError: invalid syntax
  File "/var/folders/yr/snw4cq790975mv7_mbwkxm500000gn/T/tmp5r13si_2", line 11
    %matplotlib inline
    ^
SyntaxError: invalid syntax



False

In [29]:
# Re-run the check on sample 1412 so the interpreters' error messages show up in the cell output.
compile_python_code(subset[1412])

  File "/var/folders/yr/snw4cq790975mv7_mbwkxm500000gn/T/tmpgk6cray7", line 11
    from tensorflow.keras.metrics import AUC, MeanSquaredError, RootMeanSquaredError,
                                                                                     ^
SyntaxError: trailing comma not allowed without surrounding parentheses
Sorry: IndentationError: unexpected indent (tmpgk6cray7, line 12)


False

In [30]:
# Re-run the check on sample 1490 so the interpreters' error messages show up in the cell output.
compile_python_code(subset[1490])

  File "/var/folders/yr/snw4cq790975mv7_mbwkxm500000gn/T/tmp2zt_z_cm", line 167
    elif:
        ^
SyntaxError: invalid syntax
  File "/var/folders/yr/snw4cq790975mv7_mbwkxm500000gn/T/tmp2zt_z_cm", line 167
    elif:
        ^
SyntaxError: invalid syntax



False