In [2]:
# Imports
import os
from pathlib import Path
import re
import numpy as np
import csv

# File locations
# The project root is taken to be the parent of the current working directory.
ROOTDIR = str(Path.cwd().parent)
NPY_DATA_PATH = f"{ROOTDIR}/data/new_test_data_nolabel"

In [4]:
def find_npy_files(path):
    """
    Recursively collect the paths of every ``.npy`` file below a directory.

    :param path: root directory to walk
    :return: list of file paths (directory joined with file name), in the
             order ``os.walk`` yields them; empty if nothing matches
    """
    return [
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(path)
        for name in names
        if name.endswith(".npy")
    ]

def remove_lines(string):
    """
    Do a little clean up on the string (or on every item of a list):
    1. remove new lines
    2. turn non-breaking spaces into regular spaces and collapse every run
       of spaces into a single space
    3. replace period + semicolon (".;" / ". ;") with a plain semicolon
    4. drop a leading space, a trailing "; " separator and a trailing space
    TODO:
    1. fix typos?

    :param string: a str, a list, or any other object
    :return: the cleaned str; for a list, a new list with each item cleaned
             recursively; any other value is returned as ``str(value)``
    """
    if isinstance(string, list):
        return [remove_lines(item) for item in string]
    if not isinstance(string, str):
        return str(string)

    cleaned = string.replace("\n", "").replace("\xa0", " ")
    # A single replace("  ", " ") pass only halves longer runs of spaces
    # ("a   b" -> "a  b"), so squeeze each whole run in one go.
    cleaned = re.sub(" {2,}", " ", cleaned)
    cleaned = cleaned.replace(".;", ";").replace(". ;", ";")
    cleaned = cleaned.lstrip(" ")
    # The trailing "; " separator must be checked BEFORE the trailing space is
    # trimmed; otherwise the space is gone and the check can never match.
    if cleaned.endswith("; "):
        cleaned = cleaned[:-2]
    return cleaned.rstrip(" ")


def extra_spaces(string):
    """
    Remove spaces at the start and end of string

    Every leading and trailing space character is removed, not just one per
    side; other whitespace (tabs, newlines) is left untouched.

    :param string: value to trim
    :return: the trimmed str; a non-str value is returned as ``str(value)``
             without trimming
    """
    if not isinstance(string, str):
        return str(string)
    return string.strip(" ")


def npy_to_txt(npy_path, txt_path):
    """
    Converts an npy file to a txt file

    Each element of the loaded array is written as a "Question N:" block with
    one "key: value" line per key, followed by blank lines. List values are
    joined with "; " before being cleaned up.

    :param npy_path: path to the npy file to read
    :param txt_path: path of the txt file to write (overwritten if present)
    """
    # NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects,
    # which can execute code; only use this on files from a trusted source.
    npy = np.load(npy_path, allow_pickle=True)
    # Explicit UTF-8 so the output does not depend on the platform's locale
    # encoding (non-ASCII text could otherwise fail to encode).
    with open(txt_path, 'w', encoding="utf-8") as f:
        for number, record in enumerate(npy, start=1):
            f.write("Question " + str(number) + ":\n")
            # presumably each record is a dict-like mapping — TODO confirm
            for key in record:
                value = record[key]
                f.write(remove_lines(key) + ": ")
                if isinstance(value, list):
                    line = "".join(extra_spaces(x) + "; " for x in value)
                    f.write(remove_lines(line) + "\n")
                else:
                    f.write(remove_lines(value) + "\n")
            f.write("\n\n")

In [7]:
# Gather every .npy file under NPY_DATA_PATH and print the first path found.
# NOTE(review): the order of `files` follows os.walk's directory listing, so
# which file ends up at index 0 may differ between machines — confirm before
# relying on fixed indices.
files = find_npy_files(NPY_DATA_PATH)
print(files[0])

/Users/alvinchen/Documents/GitHub/brainteaser-data/data/new_test_data_nolabel/WP_new_test.npy


In [9]:
# Dump the second discovered .npy file to a .txt file alongside it.
# splitext swaps only the final extension; str.replace(".npy", ".txt") would
# also rewrite any ".npy" occurring earlier in the path (e.g. a folder name).
npy_to_txt(files[1], os.path.splitext(files[1])[0] + ".txt")

In [12]:
# Load the first two discovered files as arrays: index 0 -> wp, index 1 -> sp.
# NOTE(review): allow_pickle=True unpickles whatever the files contain; only
# use it with trusted data.
wp = np.load(files[0], allow_pickle=True)
sp = np.load(files[1], allow_pickle=True)

In [13]:
# Number of entries in each loaded array, one per line (sp first, then wp)
print(len(sp), len(wp), sep="\n")

120
96
