# Preparing the CodeSearchNet Challenge dataset as CSV files

## Ready to be loaded into the models

In [1]:
# (Re)load IPython's autoreload extension; `reload_ext` keeps this cell safe to re-run.
%reload_ext autoreload
# Mode 2: re-import modules before each cell runs, so edits to imported .py files are picked up.
%autoreload 2
# Render matplotlib figures inline in the notebook.
# NOTE(review): no plotting is visible in this notebook -- confirm this magic is still needed.
%matplotlib inline

In [2]:
# Root folder of the dataset checkout; every other path below is derived from it,
# so pointing the notebook at another location only requires changing this line.
codeSearchNet_challenge_path = './CodeSearchNet_challenge_dataset'
# CSV holding the train + test + valid files together.
all_csv_path = f'{codeSearchNet_challenge_path}/CodeSearchNet_challenge_dataset.csv'
# One CSV per split, stored next to that split's source files.
train_csv_path = f'{codeSearchNet_challenge_path}/python/final/jsonl/train/train_CodeSearchNet_challenge_dataset.csv'
test_csv_path = f'{codeSearchNet_challenge_path}/python/final/jsonl/test/test_CodeSearchNet_challenge_dataset.csv'
valid_csv_path = f'{codeSearchNet_challenge_path}/python/final/jsonl/valid/valid_CodeSearchNet_challenge_dataset.csv'

In [3]:
import pandas as pd
import glob
import os
from pathlib import Path

In [4]:
def get_train_py_files(dataset_path=codeSearchNet_challenge_path):
    """Return every `*.py` file found recursively in the train split.

    Args:
        dataset_path: Root folder of the dataset; the train split is looked
            up under `<dataset_path>/python/final/jsonl/train`.

    Returns:
        A list of `pathlib.Path` objects, in the order `rglob` yields them.
    """
    train_dir = Path(f'{dataset_path}/python/final/jsonl/train')
    return list(train_dir.rglob('*.py'))

def get_test_py_files(dataset_path=codeSearchNet_challenge_path):
    """Return every `*.py` file found recursively in the test split.

    Args:
        dataset_path: Root folder of the dataset; the test split is looked
            up under `<dataset_path>/python/final/jsonl/test`.

    Returns:
        A list of `pathlib.Path` objects, in the order `rglob` yields them.
    """
    test_dir = Path(f'{dataset_path}/python/final/jsonl/test')
    return list(test_dir.rglob('*.py'))

def get_valid_py_files(dataset_path=codeSearchNet_challenge_path):
    """Return every `*.py` file found recursively in the valid split.

    Args:
        dataset_path: Root folder of the dataset; the valid split is looked
            up under `<dataset_path>/python/final/jsonl/valid`.

    Returns:
        A list of `pathlib.Path` objects, in the order `rglob` yields them.
    """
    valid_dir = Path(f'{dataset_path}/python/final/jsonl/valid')
    return list(valid_dir.rglob('*.py'))


def get_dataset_py_files(dataset_path=codeSearchNet_challenge_path):
    """Collect the Python files of the three dataset splits.

    Args:
        dataset_path: Root folder of the dataset, forwarded to each
            per-split helper.

    Returns:
        A `(train_files, test_files, valid_files)` tuple of lists of paths.
    """
    # Forward dataset_path explicitly: without it the helpers fall back to
    # their own default and a caller-supplied root is silently ignored.
    train_files = get_train_py_files(dataset_path)
    test_files = get_test_py_files(dataset_path)
    valid_files = get_valid_py_files(dataset_path)

    return train_files, test_files, valid_files

In [5]:
def export_py_files_into_csv(_files, csv_file_path):
    """Write the contents of source files into a two-column CSV.

    Each readable file becomes one row with the columns `filename` and
    `text`. Files that cannot be opened or decoded as UTF-8 are skipped and
    counted. The CSV is then read back and rows that pandas parses as
    missing (for example files with empty content) are dropped, so the file
    on disk loads without NaN values. A preview and the final shape are
    displayed.

    Args:
        _files: Iterable of file paths (str or `pathlib.Path`) to ingest.
        csv_file_path: Destination path of the CSV file.

    Returns:
        True when a CSV was written, False when no file could be read
        (in which case nothing is written to `csv_file_path`).
    """
    records = []
    skipped = 0
    for filename in _files:
        try:
            # Explicit UTF-8 so decoding does not depend on the platform locale.
            with open(filename, encoding='utf-8') as file:
                records.append({'filename': str(filename), 'text': file.read()})
        except (OSError, UnicodeError):
            # Unreadable or undecodable file: skip it, but keep count so the
            # loss is visible instead of silent.
            skipped += 1

    if skipped:
        print(f'Skipped {skipped} unreadable file(s)')

    if not records:
        # Nothing to export; do not create an empty CSV, because callers
        # that guard on the file's existence would then never retry.
        return False

    df = pd.DataFrame(records, columns=['filename', 'text'])
    df.to_csv(csv_file_path, index=False)
    # Round-trip through read_csv: values that pandas parses as NaN (e.g.
    # empty text) only show up after reloading, so drop them and rewrite.
    df = pd.read_csv(csv_file_path).dropna()
    df.to_csv(csv_file_path, index=False)
    display(df.head())
    display(df.shape)
    return True

In [6]:
train_files, test_files, valid_files = get_dataset_py_files()

# Per-split exports: (progress message, files to ingest, destination CSV).
# A split is only exported when its CSV is not already on disk.
split_exports = (
    ('Exporting /train python files to CSV', train_files, train_csv_path),
    ('Exporting /test python files to CSV', test_files, test_csv_path),
    ('Exporting /valid python files to CSV', valid_files, valid_csv_path),
)
for message, split_files, target_csv in split_exports:
    if os.path.exists(target_csv):
        continue
    print(message)
    export_py_files_into_csv(split_files, target_csv)

# Combined export of all three splits, again only when the CSV is missing.
if not os.path.exists(all_csv_path):
    print('Exporting every python files to CSV')
    all_files = train_files + test_files + valid_files
    export_py_files_into_csv(all_files, all_csv_path)


Exporting /train python files to CSV


Unnamed: 0,filename,text
0,CodeSearchNet_challenge_dataset/python/final/j...,"def show_slug_with_level(context, page, lang=N..."
1,CodeSearchNet_challenge_dataset/python/final/j...,"def public(self):\n """"""True if the Slot is ..."
2,CodeSearchNet_challenge_dataset/python/final/j...,def zmq_device(self):\n '''\n Multiproce...
3,CodeSearchNet_challenge_dataset/python/final/j...,"def get_datetime_sorted_rows(dbconn, table_nam..."
4,CodeSearchNet_challenge_dataset/python/final/j...,"def export_ruptures_csv(ekey, dstore):\n """"..."


(14, 2)

Exporting /test python files to CSV


Unnamed: 0,filename,text
0,CodeSearchNet_challenge_dataset/python/final/j...,"def get_vid_from_url(url):\n """"""Extracts vi..."


(1, 2)

Exporting /valid python files to CSV


Unnamed: 0,filename,text
0,CodeSearchNet_challenge_dataset/python/final/j...,"def learn(env,\n network,\n ..."


(1, 2)

Exporting every python files to CSV


Unnamed: 0,filename,text
0,CodeSearchNet_challenge_dataset/python/final/j...,"def show_slug_with_level(context, page, lang=N..."
1,CodeSearchNet_challenge_dataset/python/final/j...,"def public(self):\n """"""True if the Slot is ..."
2,CodeSearchNet_challenge_dataset/python/final/j...,def zmq_device(self):\n '''\n Multiproce...
3,CodeSearchNet_challenge_dataset/python/final/j...,"def get_datetime_sorted_rows(dbconn, table_nam..."
4,CodeSearchNet_challenge_dataset/python/final/j...,"def export_ruptures_csv(ekey, dstore):\n """"..."


(16, 2)

In [7]:
# Each pipeline below: find every *.py file, concatenate them, delete lines that
# are empty or whitespace-only (sed), and count what is left (wc -l).
# `-print0 | xargs -0` passes NUL-delimited names so paths containing spaces or
# quotes are not split into several arguments.
print("Total lines of Python code in the dataset (after removing empty lines)")
!find ./CodeSearchNet_challenge_dataset/ -name '*.py' -print0 | xargs -0 cat | sed '/^\s*$/d' | wc -l

print("\n\nTotal lines of Python code in the dataset's train folder (after removing empty lines)")
!find ./CodeSearchNet_challenge_dataset/python/final/jsonl/train/ -name '*.py' -print0 | xargs -0 cat | sed '/^\s*$/d' | wc -l

print("\n\nTotal lines of Python in the dataset's test folder (after removing empty lines)")
!find ./CodeSearchNet_challenge_dataset/python/final/jsonl/test/ -name '*.py' -print0 | xargs -0 cat | sed '/^\s*$/d' | wc -l

# NOTE: this label omits "(after removing empty lines)", but the same sed
# filter is applied, so the count is comparable with the ones above.
print("\n\nTotal lines of Python in the dataset's valid folder")
!find ./CodeSearchNet_challenge_dataset/python/final/jsonl/valid/ -name '*.py' -print0 | xargs -0 cat | sed '/^\s*$/d' | wc -l

Total lines of Python code in the dataset (after removing empty lines)
11206193


Total lines of Python code in the dataset's train folder (after removing empty lines)
10054266


Total lines of Python in the dataset's test folder (after removing empty lines)
541338


Total lines of Python in the dataset's valid folder
610589
