# Downloading and preparing the CodeSearchNet Challenge dataset 

## Download it if it does not exist at the expected path, then remove comments and autoformat the code with autopep8

Take care: the `autopep8` process can take anywhere from hours up to days.

In [1]:
# Reload edited modules automatically before each cell is executed.
%reload_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

## Downloading the dataset if it does not exist locally

In [2]:
import os

# Directory the downloaded archive is extracted into; it is also the default
# root that the helper functions in this notebook search.
codeSearchNet_challenge_path = './CodeSearchNet_challenge_dataset'

In [3]:
def check_codesearchnet_dataset_exists(dataset_path=codeSearchNet_challenge_path):
    """Tell whether the dataset directory is already populated.

    Args:
        dataset_path: Path expected to hold the extracted dataset.

    Returns:
        True when ``dataset_path`` exists and lists at least one entry,
        False otherwise.
    """
    # Guard clause: nothing on disk means nothing has been downloaded yet.
    if not os.path.exists(dataset_path):
        return False
    entries = os.listdir(dataset_path)
    return len(entries) > 0

def download_codesearchnet_dataset():
    """Download the CodeSearchNet Python archive and extract it.

    Both steps are shell commands issued through IPython's ``!`` syntax, so
    this function only works inside an IPython/Jupyter session, and it needs
    the ``wget`` and ``unzip`` executables to be available.
    """
    # Fetch the archive; wget saves it as python.zip in the working directory.
    !wget https://s3.amazonaws.com/code-search-net/CodeSearchNet/v2/python.zip
    # Extract into the directory named by codeSearchNet_challenge_path.
    !unzip python.zip -d {codeSearchNet_challenge_path}

In [4]:
# Download and extract the archive only when no populated local copy is found.
if not check_codesearchnet_dataset_exists():
    download_codesearchnet_dataset()

--2020-08-11 16:00:18--  https://s3.amazonaws.com/code-search-net/CodeSearchNet/v2/python.zip
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.217.46.238
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.217.46.238|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 940909997 (897M) [application/zip]
Saving to: ‘python.zip’


2020-08-11 16:00:29 (82.4 MB/s) - ‘python.zip’ saved [940909997/940909997]

Archive:  python.zip
   creating: ./CodeSearchNet_challenge_dataset/python/
   creating: ./CodeSearchNet_challenge_dataset/python/final/
   creating: ./CodeSearchNet_challenge_dataset/python/final/jsonl/
   creating: ./CodeSearchNet_challenge_dataset/python/final/jsonl/train/
  inflating: ./CodeSearchNet_challenge_dataset/python/final/jsonl/train/python_train_9.jsonl.gz  
  inflating: ./CodeSearchNet_challenge_dataset/python/final/jsonl/train/python_train_12.jsonl.gz  
  inflating: ./CodeSearchNet_challenge_dataset/python/final/jsonl/train/python_train_10.jsonl.

## Unpacking the jsonl.gz files from the dataset and cleaning them if needed

In [5]:
import gzip
import json
from pathlib import Path

In [6]:
def check_python_files_exist_dataset(dataset_path=codeSearchNet_challenge_path):
    """Tell whether any ``.py`` file exists anywhere under ``dataset_path``.

    Args:
        dataset_path: Root directory that is searched recursively.

    Returns:
        True when at least one ``*.py`` path is found, False otherwise.
    """
    found_py_files = [str(candidate) for candidate in Path(dataset_path).rglob('*.py')]
    return bool(found_py_files)
    
def unpack_jsonl_files_dataset(dataset_path=codeSearchNet_challenge_path):
    """Decompress every ``*.jsonl.gz`` archive under ``dataset_path`` into a ``.py`` file.

    Each non-blank line of an archive is parsed as a JSON object and its
    ``original_string`` value is written, followed by a blank line, to a
    sibling file named like the archive with the ``.jsonl.gz`` suffix
    replaced by ``.py``.

    The output file is opened in write mode, so calling this function again
    regenerates the file instead of appending a second copy of every entry.

    Args:
        dataset_path: Root directory that is searched recursively for archives.

    Raises:
        KeyError: If a JSON record has no ``original_string`` key.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    suffix = '.jsonl.gz'
    for _file in Path(dataset_path).rglob('*' + suffix):
        # Slice the suffix off. str.strip() treats its argument as a set of
        # characters and would also eat matching characters from both ends of
        # the path (e.g. a stem ending in "log" or a path starting with ".").
        new_filename = str(_file)[:-len(suffix)]
        print(f"Decompressing file: {_file} -> {new_filename}.py")
        # Explicit UTF-8 on the output matches the UTF-8 decoding of the input
        # and avoids platform-default encodings failing on non-ASCII sources.
        with gzip.open(str(_file), 'rb') as gzipped_file, \
                open(f"{new_filename}.py", "w", encoding="utf-8") as output_py_file:
            for line in gzipped_file:
                decoded_line = line.decode("utf-8")
                if not decoded_line.strip():
                    # Tolerate blank lines instead of failing in json.loads.
                    continue
                record = json.loads(decoded_line)
                output_py_file.write(record['original_string'])
                output_py_file.write("\n\n")

                    
# Retrieved from https://stackoverflow.com/a/62074206
import io, tokenize, re
def remove_comments_and_docstrings(source):
    """Return ``source`` with comments and statement-level strings removed.

    The text is tokenized and rebuilt token by token, re-creating the
    horizontal gaps between tokens with spaces. Comment tokens are dropped.
    A string token is dropped when the previous token was an INDENT or a
    NEWLINE, or when it starts at column 0; any other string is kept.
    Lines that end up empty or whitespace-only are removed from the result.

    Args:
        source: Python source code as a single string.

    Returns:
        The rebuilt source text, without a trailing newline.
    """
    pieces = []
    previous_type = tokenize.INDENT
    previous_end_line = -1
    previous_end_col = 0
    read_line = io.StringIO(source).readline
    for token in tokenize.generate_tokens(read_line):
        kind, text = token.type, token.string
        begin_line, begin_col = token.start
        finish_line, finish_col = token.end
        # The first token on a new line is positioned relative to column 0.
        if begin_line > previous_end_line:
            previous_end_col = 0
        gap = begin_col - previous_end_col
        if gap > 0:
            pieces.append(" " * gap)
        if kind == tokenize.STRING:
            # A string right after INDENT/NEWLINE, or at column 0, is treated
            # as a docstring-like statement and left out.
            opens_statement = previous_type in (tokenize.INDENT, tokenize.NEWLINE)
            if not opens_statement and begin_col > 0:
                pieces.append(text)
        elif kind != tokenize.COMMENT:
            pieces.append(text)
        previous_type = kind
        previous_end_col = finish_col
        previous_end_line = finish_line
    kept_lines = [line for line in "".join(pieces).splitlines() if line.strip()]
    return '\n'.join(kept_lines)

import subprocess
def clean_and_autoformat_py_files(dataset_path=codeSearchNet_challenge_path):
    """Strip comments/docstrings from every ``.py`` file, then run autopep8 on it.

    For each ``*.py`` file found recursively under ``dataset_path`` the file
    content is read, passed through ``remove_comments_and_docstrings`` and
    written back in place. If that step fails the file is left as it was and
    the error is printed. An ``autopep8 --in-place --aggressive`` process is
    then started for the file; all processes are awaited before returning and
    any non-zero exit code is reported.

    Args:
        dataset_path: Root directory that is searched recursively.

    Returns:
        Always True, once every autopep8 process has finished.
    """
    _subprocesses = []
    for py_file_path in Path(dataset_path).rglob('*.py'):
        print(f'Removing comments and formatting file: {py_file_path}')
        try:
            # The cleaner works on source text, not on a path: read the file,
            # clean its content and persist the result.
            original_source = py_file_path.read_text(encoding="utf-8")
            cleaned_source = remove_comments_and_docstrings(original_source)
            py_file_path.write_text(cleaned_source + "\n", encoding="utf-8")
        except Exception as e:
            # Exception rather than BaseException, so Ctrl-C can still stop
            # a run that may last for hours.
            print(f'Unable to remove comments and docstrings from {str(py_file_path)} because of {e}')

        # Started only after the file has been written and closed above.
        process = subprocess.Popen(["autopep8", "--in-place", "--aggressive", str(py_file_path)])
        _subprocesses.append((py_file_path, process))

    for py_file_path, process in _subprocesses:
        exit_code = process.wait()
        if exit_code != 0:
            print(f'autopep8 exited with code {exit_code} for {py_file_path}')
    print("\n\nDone!")

    return True

In [7]:
# Unpack and clean only when no .py file exists anywhere under the dataset path.
if not check_python_files_exist_dataset():
    unpack_jsonl_files_dataset()
    clean_and_autoformat_py_files()

Decompressing file: CodeSearchNet_challenge_dataset/python/final/jsonl/valid/python_valid_0.jsonl.gz -> CodeSearchNet_challenge_dataset/python/final/jsonl/valid/python_valid_0.py
Decompressing file: CodeSearchNet_challenge_dataset/python/final/jsonl/train/python_train_12.jsonl.gz -> CodeSearchNet_challenge_dataset/python/final/jsonl/train/python_train_12.py
Decompressing file: CodeSearchNet_challenge_dataset/python/final/jsonl/train/python_train_1.jsonl.gz -> CodeSearchNet_challenge_dataset/python/final/jsonl/train/python_train_1.py
Decompressing file: CodeSearchNet_challenge_dataset/python/final/jsonl/train/python_train_5.jsonl.gz -> CodeSearchNet_challenge_dataset/python/final/jsonl/train/python_train_5.py
Decompressing file: CodeSearchNet_challenge_dataset/python/final/jsonl/train/python_train_3.jsonl.gz -> CodeSearchNet_challenge_dataset/python/final/jsonl/train/python_train_3.py
Decompressing file: CodeSearchNet_challenge_dataset/python/final/jsonl/train/python_train_10.jsonl.gz -

In [8]:
# Printed unconditionally; it does not verify that the earlier steps succeeded.
print("CodeSearchNet challenge dataset ready to use!")

CodeSearchNet challenge dataset ready to use!
