In [1]:
"""Template for Jupyter notebooks created within this directory. Adds the project root (the grandparent of the working directory) to sys.path and enables autoreload."""

'Template for Jupyter notebooks created within this directory. Adds the project root (the grandparent of the working directory) to sys.path and enables autoreload.'

In [1]:
# Load IPython's autoreload extension and switch it to mode 2, so imported
# modules are reloaded before each cell runs and edits to project code are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2


In [2]:
import os
import sys

# Resolve the project root by walking two directory levels up from the
# directory the kernel is currently running in.
cwd = os.getcwd()
parent_dir = os.path.split(cwd)[0]
root_dir = os.path.split(parent_dir)[0]

# Register the root on the import path only if it is not already listed, so
# re-running this cell does not pile up duplicate entries.
if sys.path.count(root_dir) == 0:
    sys.path.append(root_dir)

print(f"Root directory: {root_dir} is added to sys.path")


Root directory: /Users/aag/Documents/proteinfolding is added to sys.path


In [4]:
# Import the helpers by name rather than with a wildcard, so it is clear where
# each function comes from and the notebook namespace is not silently filled
# (or shadowed) by everything the module exports. Extend this list if more
# helpers from data_processing are needed.
from proteinfolding.data_processing import (
    clean_intermediate_data,
    clean_json_data,
    load_json_files,
)
from proteinfolding.paths import SCP_PRODUCTION_RUNS_DIR, XY_QAOA_DATA_DIR

# Identifier of the simulation run processed by this notebook; it names both
# the input sub-directory and the output sub-directory / file.
SIMULATION_ID = "A127566"
# Directory the raw JSON results are read from.
RAW_DATA_DIR = os.path.join(SCP_PRODUCTION_RUNS_DIR, SIMULATION_ID)
# Directory the processed CSV is written to.
PROCESSED_DATA_DIR = os.path.join(XY_QAOA_DATA_DIR, SIMULATION_ID)

In [None]:
# Read the raw results for this simulation from RAW_DATA_DIR.
# (Author's note: all JSON files are loaded into one list.)
raw_data = load_json_files(RAW_DATA_DIR)

# Convert the raw records into the tabular `cleaned_data` used by the rest of
# the notebook. (Author's note: the params are broken down into five columns;
# TODO confirm the count against clean_json_data.)
cleaned_data = clean_json_data(raw_data)


In [6]:
# Preview the first rows of the cleaned table before any further processing.
cleaned_data.head()

Unnamed: 0,num_res,num_rot,alpha,shots,p,pos,transverse_field,bitstrings,energy,intermediate_data,parameters,all_bitstrings,all_unrestricted_bitstrings,sorted_bitstrings,sorted_unrestricted_bitstrings
0,4,5,0.2,45,4,0,1,"[10000001000000100010, 10000010000000100010, 1...",-35.78897,"[[{'34884': 0.022222222222222223, '35080': 0.0...","[[[-0.1, -1.0, -0.03333333333333334, -0.333333...",{'00001000010000110000': {'probability': 0.177...,{'00001000010000110000': {'probability': 0.177...,"[[10000001000000100010, {'probability': 0.2888...","[[10000001000000100010, {'probability': 0.2888..."
1,4,5,0.2,30,4,0,1,"[10000001000000100010, 10000010000000100010, 1...",-35.78897,"[[{'34888': 0.03333333333333333, '35330': 0.03...","[[[-0.1, -1.0, -0.03333333333333334, -0.333333...",{'00001000010000100010': {'probability': 0.066...,{'00001000010000100010': {'probability': 0.066...,"[[10000001000000100010, {'probability': 0.6, '...","[[10000001000000100010, {'probability': 0.6, '..."
2,5,5,0.2,45,4,0,1,"[0100000100000010010000010, 000010010000001001...",-56.036883,"[[{'1083460': 0.022222222222222223, '1116417':...","[[[-0.1, -1.0, -0.03333333333333334, -0.333333...",{'0000100010000010001000100': {'probability': ...,{'0000100010000010001000100': {'probability': ...,"[[0100000100000010010000010, {'probability': 0...","[[0100000100000010010000010, {'probability': 0..."
3,4,4,0.2,35,4,0,1,"[1000001010000001, 1000010010000001, 100000011...",-21.897148,"[[{'4417': 0.02857142857142857, '4420': 0.0285...","[[[-0.1, -1.0, -0.03333333333333334, -0.333333...",{'0001001000100010': {'probability': 0.6857142...,{'0001001000100010': {'probability': 0.6857142...,"[[1000001010000001, {'probability': 0.11428571...","[[1000001010000001, {'probability': 0.11428571..."
4,5,3,0.2,40,4,0,1,"[010010100001001, 010010100001100, 01001001000...",-25.63222,"[[{'4684': 0.025, '4772': 0.025, '4874': 0.025...","[[[-0.1, -1.0, -0.03333333333333334, -0.333333...",{'001001010001001': {'probability': 0.24999999...,{'001001010001001': {'probability': 0.24999999...,"[[010010100001001, {'probability': 3.974999999...","[[010010100001001, {'probability': 3.974999999..."


In [7]:
### Clean the intermediate_data column of the dataframe
# NOTE(review): this rebinds `cleaned_data` to the result of a call on itself,
# so re-running the cell feeds already-processed data back into
# clean_intermediate_data. Run it exactly once after the loading cell
# (or restart the kernel and run all cells).

cleaned_data = clean_intermediate_data(cleaned_data)

In [8]:
# Preview the first rows again, now that clean_intermediate_data has been applied.
cleaned_data.head()

Unnamed: 0,num_res,num_rot,alpha,shots,p,pos,transverse_field,bitstrings,energy,intermediate_data,parameters,all_bitstrings,all_unrestricted_bitstrings,sorted_bitstrings,sorted_unrestricted_bitstrings
0,4,5,0.2,45,4,0,1,"[10000001000000100010, 10000010000000100010, 1...",-35.78897,"[{'34884': 0.022222222222222223, '35080': 0.02...","[[[-0.1, -1.0, -0.03333333333333334, -0.333333...",{'00001000010000110000': {'probability': 0.177...,{'00001000010000110000': {'probability': 0.177...,"[[10000001000000100010, {'probability': 0.2888...","[[10000001000000100010, {'probability': 0.2888..."
1,4,5,0.2,30,4,0,1,"[10000001000000100010, 10000010000000100010, 1...",-35.78897,"[{'34888': 0.03333333333333333, '35330': 0.033...","[[[-0.1, -1.0, -0.03333333333333334, -0.333333...",{'00001000010000100010': {'probability': 0.066...,{'00001000010000100010': {'probability': 0.066...,"[[10000001000000100010, {'probability': 0.6, '...","[[10000001000000100010, {'probability': 0.6, '..."
2,5,5,0.2,45,4,0,1,"[0100000100000010010000010, 000010010000001001...",-56.036883,"[{'1083460': 0.022222222222222223, '1116417': ...","[[[-0.1, -1.0, -0.03333333333333334, -0.333333...",{'0000100010000010001000100': {'probability': ...,{'0000100010000010001000100': {'probability': ...,"[[0100000100000010010000010, {'probability': 0...","[[0100000100000010010000010, {'probability': 0..."
3,4,4,0.2,35,4,0,1,"[1000001010000001, 1000010010000001, 100000011...",-21.897148,"[{'4417': 0.02857142857142857, '4420': 0.02857...","[[[-0.1, -1.0, -0.03333333333333334, -0.333333...",{'0001001000100010': {'probability': 0.6857142...,{'0001001000100010': {'probability': 0.6857142...,"[[1000001010000001, {'probability': 0.11428571...","[[1000001010000001, {'probability': 0.11428571..."
4,5,3,0.2,40,4,0,1,"[010010100001001, 010010100001100, 01001001000...",-25.63222,"[{'4684': 0.025, '4772': 0.025, '4874': 0.025,...","[[[-0.1, -1.0, -0.03333333333333334, -0.333333...",{'001001010001001': {'probability': 0.24999999...,{'001001010001001': {'probability': 0.24999999...,"[[010010100001001, {'probability': 3.974999999...","[[010010100001001, {'probability': 3.974999999..."


In [9]:
# Leave the frame as the cell's last expression so Jupyter renders it with its
# rich table display instead of the wrapped plain-text dump that print() emits.
cleaned_data

    num_res  num_rot  alpha  shots  p  pos  transverse_field  \
0         4        5    0.2     45  4    0                 1   
1         4        5    0.2     30  4    0                 1   
2         5        5    0.2     45  4    0                 1   
3         4        4    0.2     35  4    0                 1   
4         5        3    0.2     40  4    0                 1   
..      ...      ...    ...    ... ..  ...               ...   
58        4        2    0.2     40  4    0                 1   
59        4        2    0.2     15  4    0                 1   
60        4        2    0.2     50  4    0                 1   
61        5        2    0.2     15  4    0                 1   
62        4        2    0.2     45  4    0                 1   

                                           bitstrings     energy  \
0   [10000001000000100010, 10000010000000100010, 1... -35.788970   
1   [10000001000000100010, 10000010000000100010, 1... -35.788970   
2   [0100000100000010010000

In [10]:
# Save the processed data as a gzip-compressed CSV.
# exist_ok=True creates the output directory (and any missing parents) when
# needed and does nothing when it already exists, which avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(PROCESSED_DATA_DIR, exist_ok=True)

cleaned_data.to_csv(os.path.join(PROCESSED_DATA_DIR, f"{SIMULATION_ID}.csv.gz"), index=False, compression="gzip")