In [6]:
import pandas as pd
from question_answering.paths import generative_qa_paths

In [7]:
def read_feature_file_as_list(filename: str) -> list:
    """Read a UTF-8 text file and return its lines with surrounding whitespace removed.

    Args:
        filename: Location of the file to read. The value is handed straight
            to ``open``, so a string or a path-like object both work.

    Returns:
        One string per line of the file, in file order. Each entry has its
        leading and trailing whitespace (including the line terminator)
        stripped. Blank lines are kept as empty strings, so the position of
        an entry in the list matches its line position in the file.
    """
    # The context manager guarantees the handle is closed, even if reading
    # or decoding raises part-way through the file.
    with open(filename, "r", encoding="utf-8") as feature_file:
        return [line.strip() for line in feature_file]


def extract_data(set_type: str, language: str) -> None:
    """Combine the per-feature text files of one split into a single CSV file.

    Reads four files from ``code_qa_data_dir / language / set_type``, named
    ``<set_type>.question``, ``<set_type>.answer``, ``<set_type>.code`` and
    ``<set_type>.code.original``. Each file becomes one DataFrame column,
    with one row per line. The result is written to
    ``code_qa_dataset_dir / language / <set_type>.csv``.

    Args:
        set_type: Name of the split. It is used both as the sub-directory
            name and as the file-name stem (the calls in this notebook pass
            "train", "val" and "test").
        language: Name of the language sub-directory (the calls in this
            notebook pass "java" and "python").

    Returns:
        None. The only result is the CSV file written to disk.

    Note:
        The four files must contain the same number of lines, because
        pandas rejects dict columns of differing lengths.
    """
    features_dir = generative_qa_paths.code_qa_data_dir / language / set_type

    # Output column name -> suffix of the feature file that feeds it.
    # Dict order is the column order of the resulting CSV.
    suffix_by_column = {
        "questions": "question",
        "answers": "answer",
        "code": "code",
        "original_code": "code.original",
    }

    df = pd.DataFrame(
        {
            column: read_feature_file_as_list(features_dir / f"{set_type}.{suffix}")
            for column, suffix in suffix_by_column.items()
        }
    )

    save_path = generative_qa_paths.code_qa_dataset_dir / language
    # exist_ok=True avoids the check-then-create race of a separate exists() test.
    save_path.mkdir(parents=True, exist_ok=True)

    # The row index is written as a leading column labelled "index";
    # backslash is the escape character passed to the CSV writer.
    df.to_csv(save_path / f"{set_type}.csv", index=True, index_label="index", escapechar="\\")

In [8]:
# Export every split for every language: Java first, then Python,
# each in train -> val -> test order.
for language in ("java", "python"):
    for set_type in ("train", "val", "test"):
        extract_data(set_type=set_type, language=language)