In [1]:
import pandas as pd
from question_answering.paths import generative_qa_paths

In [2]:
def read_feature_file_as_list(filename: str):
    """Read a UTF-8 text file and return its lines, stripped of surrounding whitespace.

    Args:
        filename: Path of the file to read. Anything accepted by ``open``
            works, including the path objects the callers in this file
            build with the ``/`` operator.

    Returns:
        A list with one string per line of the file, each passed through
        ``str.strip()``. Blank lines are kept as empty strings so that line
        positions stay aligned across parallel feature files.
    """
    # The context manager closes the handle even if reading raises, instead
    # of leaving it open until garbage collection.
    with open(filename, "r", encoding="utf-8") as feature_file:
        return [line.strip() for line in feature_file]


def extract_data(original_set_type_name: str, final_set_type_name: str, language: str) -> None:
    """Combine one split's line-aligned feature files into a single CSV file.

    Reads ``<name>.question``, ``<name>.answer``, ``<name>.code`` and
    ``<name>.code.original`` (with ``<name>`` = ``original_set_type_name``)
    from ``code_qa_data_dir / language / original_set_type_name`` and writes
    them as the columns ``questions``, ``answers``, ``code`` and
    ``original_code`` of
    ``code_qa_dataset_dir / language / <final_set_type_name>.csv``.

    Args:
        original_set_type_name: Name of the split in the source data; used
            both as the sub-directory name and as the feature-file stem.
        final_set_type_name: Stem of the CSV file that is written.
        language: Sub-directory name used under both the source and the
            output root directories.

    Raises:
        ValueError: Raised by ``pd.DataFrame`` if the four feature files do
            not contain the same number of lines.
    """
    base_path = generative_qa_paths.code_qa_data_dir / language
    base_features_path = base_path / original_set_type_name

    questions = read_feature_file_as_list(base_features_path / f"{original_set_type_name}.question")
    answers = read_feature_file_as_list(base_features_path / f"{original_set_type_name}.answer")
    code = read_feature_file_as_list(base_features_path / f"{original_set_type_name}.code")
    original_code = read_feature_file_as_list(base_features_path / f"{original_set_type_name}.code.original")

    # Line i of every feature file becomes row i of the frame.
    df = pd.DataFrame(
        {
            "questions": questions,
            "answers": answers,
            "code": code,
            "original_code": original_code,
        }
    )

    save_path = generative_qa_paths.code_qa_dataset_dir / language

    # exist_ok=True makes directory creation a single race-free call instead
    # of a separate existence check followed by mkdir.
    save_path.mkdir(parents=True, exist_ok=True)

    # The row number is written as an explicit "index" column.
    df.to_csv(save_path / f"{final_set_type_name}.csv", index=True, index_label="index", escapechar='\\')
    
    
def extract_java_data():
    """Export the Java train, dev and test splits as train/val/test CSV files."""
    # (split name in the source data, stem of the CSV that gets written);
    # the source "dev" split is written out as "val".
    split_names = (("train", "train"), ("dev", "val"), ("test", "test"))
    for source_split, target_split in split_names:
        extract_data(
            original_set_type_name=source_split,
            final_set_type_name=target_split,
            language="java",
        )


def extract_python_data():
    """Export the Python train, dev and test splits as train/val/test CSV files."""
    # (split name in the source data, stem of the CSV that gets written);
    # the source "dev" split is written out as "val".
    split_names = (("train", "train"), ("dev", "val"), ("test", "test"))
    for source_split, target_split in split_names:
        extract_data(
            original_set_type_name=source_split,
            final_set_type_name=target_split,
            language="python",
        )

In [3]:
# Write train.csv, val.csv and test.csv for the Java data.
extract_java_data()

In [4]:
# Write train.csv, val.csv and test.csv for the Python data.
extract_python_data()

In [5]:
def decrease_test_dataset_size(language: str, max_rows: int = 2500) -> None:
    """Shrink ``test.csv`` to its first ``max_rows`` rows, keeping the full set aside.

    Reads ``code_qa_dataset_dir / language / test.csv``, drops rows that
    contain missing values, saves the result as ``test_large.csv`` and
    overwrites ``test.csv`` with only the first ``max_rows`` of those rows.
    Both files get an ``index`` column holding the row labels of the frame
    as read, so labels of dropped rows leave gaps.

    Args:
        language: Sub-directory of ``code_qa_dataset_dir`` to process.
        max_rows: Maximum number of rows kept in the reduced ``test.csv``.

    Raises:
        ValueError: If ``max_rows`` is negative.
    """
    if max_rows < 0:
        # DataFrame.head(-n) means "all but the last n rows", which is never
        # what a row cap is supposed to do.
        raise ValueError(f"max_rows must be non-negative, got {max_rows}")

    dataset_path = generative_qa_paths.code_qa_dataset_dir / language
    test_path = dataset_path / "test.csv"
    test_large_path = dataset_path / "test_large.csv"

    # Read with the same escapechar the file was written with; otherwise
    # backslashes escaped by the writer stay doubled in memory and are
    # escaped a second time when the frames are written back below.
    df_test_large = pd.read_csv(test_path, escapechar='\\').dropna()

    # The stored "index" column is dropped here and regenerated on write from
    # the frame's own row labels. errors="ignore" covers files without it.
    df_test_large = df_test_large.drop(columns=['index'], errors='ignore')
    df_test_small = df_test_large.head(max_rows)

    # If test.csv already fits within max_rows and a test_large.csv exists,
    # this is most likely a re-run on an already reduced test.csv. Rewriting
    # test_large.csv would then replace the full set with the reduced one.
    nothing_to_truncate = len(df_test_large) <= max_rows
    if not (nothing_to_truncate and test_large_path.exists()):
        df_test_large.to_csv(test_large_path, index=True, index_label="index", escapechar='\\')
    df_test_small.to_csv(test_path, index=True, index_label="index", escapechar='\\')

In [6]:
# Cut the Java test.csv down in size; the untruncated rows are also saved as test_large.csv.
decrease_test_dataset_size("java")

In [7]:
# Cut the Python test.csv down in size; the untruncated rows are also saved as test_large.csv.
decrease_test_dataset_size("python")