In [None]:
import boto3

s3 = boto3.client("s3")
bucket_name = "icdd-extacted-info"

# Walk the bucket listing page by page through a paginator
paginator = s3.get_paginator("list_objects_v2")

# Print the key of every listed object whose name ends in ".pickle"
for page in paginator.paginate(Bucket=bucket_name):
    # A page may have no 'Contents' entry, hence the empty-list default
    pickle_keys = [
        entry['Key']
        for entry in page.get('Contents', [])
        if entry['Key'].endswith(".pickle")
    ]
    for pickle_key in pickle_keys:
        print(pickle_key)


all_subfile.pickle
cellpar.pickle
cform_search.pickle
comments_melting_point.pickle
comments_optical_data.pickle
compositions.pickle
crossref_alternate.pickle
full_comments.pickle
litref_all.pickle
litref_pf_st.pickle
lpf_coordinates.pickle
mod_coordinates.pickle
mod_powderpattern.pickle
mod_sfactors.pickle
pd3.pickle
pd3_instparam.pickle
pearson_colors.pickle
pressure.pickle
spacegroup_sg_lookup.pickle
spacings00.pickle
spacings01.pickle
spacings1.pickle
spacings10.pickle
spacings2.pickle
spacings3.pickle
spacings4.pickle
spacings5.pickle
spacings6.pickle
spacings7.pickle
spacings8.pickle
spacings9.pickle
strong8_spacings.pickle


In [6]:
from io import BytesIO
import os
import pickle
from pathlib import Path
import boto3

s3 = boto3.client("s3")
bucket_name = "icdd-extacted-info"

# Keys of the pickle files to copy from the bucket into the local data directory
tables = [
    "full_comments.pickle",
    "compositions.pickle",
    "comments_optical_data.pickle",
    "comments_melting_point.pickle"
]

# Create the Data directory if it doesn't exist
data_dir = Path('../Data')
data_dir.mkdir(parents=True, exist_ok=True)

for key in tables:
    try:
        # Read from the bucket named in `bucket_name` so that changing the
        # variable actually changes where the data comes from.
        response = s3.get_object(Bucket=bucket_name, Key=key)

        # SECURITY NOTE(review): pickle.load can execute arbitrary code embedded
        # in the payload. This is only acceptable if the bucket contents are
        # trusted -- confirm before pointing this at any other bucket.
        data = pickle.load(BytesIO(response['Body'].read()))

        # Save the pickle file locally
        file_path = data_dir / key
        with open(file_path, "wb") as f:
            pickle.dump(data, f)
        print(f"Saved {key}")
    except Exception as e:
        # Best effort: report the failure and carry on with the remaining keys
        print(f"Error processing {key}:{e}")

Saved full_comments.pickle
Saved compositions.pickle
Saved comments_optical_data.pickle
Saved comments_melting_point.pickle


In [57]:
import pandas as pd

# Maps table name (file name with ".pickle" removed) -> DataFrame
dfs = {}

for table in tables:
    try:
        file_path = data_dir / table
        with file_path.open('rb') as f:
            data = pickle.load(f)
        frame_name = table.replace(".pickle", "")
        dfs[frame_name] = pd.DataFrame(data)
    except Exception as e:
        # Report the failure and continue with the next table
        print(f"Error loading table {table} : {e}")

In [58]:
# Show the names under which tables are stored in `dfs`
dfs.keys()

dict_keys(['full_comments', 'compositions', 'comments_optical_data', 'comments_melting_point'])

In [59]:
# Pretty-print the last row of the full_comments table as JSON
import json
full_comments_df = dfs['full_comments']
row = full_comments_df.iloc[-1].to_dict()
# ensure_ascii=False keeps non-ASCII characters readable instead of \u-escaping them
print(json.dumps(row, indent=2, ensure_ascii=False))

{
  "0": "050104886",
  "1": "Color: colourless. General Comments: Reported GOF=0.954. Habit: Prismatic. Structures: 10.1107/S2056989019007989. Article Title: Crystal structure and DFT study of benzyl 1-benzyl-2-oxo-1,2-dihydroquinoline-4-carboxylate. Temperature of Data Collection: Pattern taken at 296 K. Unit Cell Data Source: Single Crystal."
}


In [60]:
# explore compositions table: pretty-print its last row as JSON
import json
row = dfs['compositions'].iloc[-1].to_dict()
print(json.dumps(row, indent=2, ensure_ascii=False))

{
  "0": "050105353",
  "1": "B1.61 C57.07 F28.21 H4.34 Mg1.80 O2.38 P4.60",
  "2": "B1.34 C42.95 F13.42 H38.93 Mg0.67 O1.34 P1.34",
  "3": "4.34",
  "4": "38.93",
  "5": null,
  "6": null,
  "7": null,
  "8": null,
  "9": null,
  "10": null,
  "11": "1.61",
  "12": "1.34",
  "13": "57.07",
  "14": "42.95",
  "15": null,
  "16": null,
  "17": "2.38",
  "18": "1.34",
  "19": "28.21",
  "20": "13.42",
  "21": null,
  "22": null,
  "23": null,
  "24": null,
  "25": "1.80",
  "26": ".67",
  "27": null,
  "28": null,
  "29": null,
  "30": null,
  "31": "4.60",
  "32": "1.34",
  "33": null,
  "34": null,
  "35": null,
  "36": null,
  "37": null,
  "38": null,
  "39": null,
  "40": null,
  "41": null,
  "42": null,
  "43": null,
  "44": null,
  "45": null,
  "46": null,
  "47": null,
  "48": null,
  "49": null,
  "50": null,
  "51": null,
  "52": null,
  "53": null,
  "54": null,
  "55": null,
  "56": null,
  "57": null,
  "58": null,
  "59": null,
  "60": null,
  "61": null,
  "62": null,
  

In [64]:
# Step 1: Build the labels
# The first three columns are ProductID, AllWtPercent and AllAtPercent (0, 1, 2).
# After that, every symbol below contributes a "<symbol>_WtPercent" /
# "<symbol>_AtPercent" pair, in exactly this order. The order matters because
# labels are assigned to columns by position.
# NOTE(review): the trailing "D" is presumably deuterium -- confirm against the
# source schema.
element_symbols = """
    H  He Li Be B  C  N  O  F  Ne
    Na Mg Al Si P  S  Cl Ar K  Ca
    Sc Ti V  Cr Mn Fe Co Ni Cu Zn
    Ga Ge As Se Br Kr Rb Sr Y  Zr
    Nb Mo Tc Ru Rh Pd Ag Cd In Sn
    Sb Te I  Xe Cs Ba La Ce Pr Nd
    Pm Sm Eu Gd Tb Dy Ho Er Tm Yb
    Lu Hf Ta W  Re Os Ir Pt Au Hg
    Tl Pb Bi Po At Rn Fr Ra Ac Th
    Pa U  Np Pu Am Cm Bk Cf Es Fm
    Md No Lr Rf Db
    D
""".split()

column_labels = ["ProductID", "AllWtPercent", "AllAtPercent"] + [
    f"{symbol}_{measure}"
    for symbol in element_symbols
    for measure in ("WtPercent", "AtPercent")
]

# Step 2: Create the mapping (positional column index -> readable label)
column_map = dict(enumerate(column_labels))

# Step 3: Rename the columns of the whole DataFrame and store the result back
# in `dfs`. rename() returns a new frame; keys of column_map that are not
# present as columns are ignored.
dfs["compositions"] = dfs["compositions"].rename(columns=column_map)
df = dfs["compositions"]

# Check a row with readable column names
import json
last_row = df.iloc[-1].to_dict()
print(json.dumps(last_row, indent=2, ensure_ascii=False))

{
  "ProductID": "050105353",
  "AllWtPercent": "B1.61 C57.07 F28.21 H4.34 Mg1.80 O2.38 P4.60",
  "AllAtPercent": "B1.34 C42.95 F13.42 H38.93 Mg0.67 O1.34 P1.34",
  "H_WtPercent": "4.34",
  "H_AtPercent": "38.93",
  "He_WtPercent": null,
  "He_AtPercent": null,
  "Li_WtPercent": null,
  "Li_AtPercent": null,
  "Be_WtPercent": null,
  "Be_AtPercent": null,
  "B_WtPercent": "1.61",
  "B_AtPercent": "1.34",
  "C_WtPercent": "57.07",
  "C_AtPercent": "42.95",
  "N_WtPercent": null,
  "N_AtPercent": null,
  "O_WtPercent": "2.38",
  "O_AtPercent": "1.34",
  "F_WtPercent": "28.21",
  "F_AtPercent": "13.42",
  "Ne_WtPercent": null,
  "Ne_AtPercent": null,
  "Na_WtPercent": null,
  "Na_AtPercent": null,
  "Mg_WtPercent": "1.80",
  "Mg_AtPercent": ".67",
  "Al_WtPercent": null,
  "Al_AtPercent": null,
  "Si_WtPercent": null,
  "Si_AtPercent": null,
  "P_WtPercent": "4.60",
  "P_AtPercent": "1.34",
  "S_WtPercent": null,
  "S_AtPercent": null,
  "Cl_WtPercent": null,
  "Cl_AtPercent": null,
  "Ar

In [65]:
# Path to save the updated DataFrame. It is built from `data_dir` (the same
# directory object the notebook reads from) so the write and the later reload
# cannot drift apart if the data directory is changed.
output_path = data_dir / "compositions_renamed.pickle"

# Save using pickle
with open(output_path, "wb") as f:
    pickle.dump(dfs["compositions"], f)

print(f"Pickle file saved to: {output_path}")

Pickle file saved to: ../Data/compositions_renamed.pickle


In [69]:
# Read the renamed compositions table back from the data directory
table = 'compositions_renamed.pickle'
file_path = data_dir / table

with file_path.open('rb') as f:
    data = pickle.load(f)

# Wrap the unpickled object in a DataFrame for inspection
df = pd.DataFrame(data)

In [None]:
# NOTE(review): a stray, never-executed expression `d` was removed from this cell.
# No visible cell defines `d`, so it would raise NameError on Restart & Run All.

In [70]:
# Display the first row of `df` as a Series (column label -> value)
df.iloc[0]

ProductID                         000010001
AllWtPercent     Bi56.77 C32.63 H1.92 O8.69
AllAtPercent    Bi5.00 C50.00 H35.00 O10.00
H_WtPercent                            1.92
H_AtPercent                           35.00
                           ...             
Rf_AtPercent                           None
Db_WtPercent                           None
Db_AtPercent                           None
D_WtPercent                            None
D_AtPercent                            None
Name: 0, Length: 215, dtype: object

In [71]:
# Pretty-print the last row of the full_comments table as JSON
import json
row = dfs['full_comments'].iloc[-1].to_dict()
row_json = json.dumps(row, indent=2, ensure_ascii=False)
print(row_json)

{
  "0": "050104886",
  "1": "Color: colourless. General Comments: Reported GOF=0.954. Habit: Prismatic. Structures: 10.1107/S2056989019007989. Article Title: Crystal structure and DFT study of benzyl 1-benzyl-2-oxo-1,2-dihydroquinoline-4-carboxylate. Temperature of Data Collection: Pattern taken at 296 K. Unit Cell Data Source: Single Crystal."
}


In [None]:
# Pretty-print the second-to-last row of the full_comments table as JSON
full_comments_df = dfs['full_comments']
row = full_comments_df.iloc[-2].to_dict()
print(json.dumps(row, indent=2, ensure_ascii=False))

{
  "0": "050104885",
  "1": "Color: dark blue. General Comments: Reported GOF=1.036. Habit: Plate. Structures: 10.1107/S2056989019008053. Article Title: Synthesis, characterization, and crystal structure of aquabis(4,4`-dimethoxy-2,2`-bipyridine)(mu-(2R,3R)-tartrato(4âˆ’))dicopper(II) octahydrate. Temperature of Data Collection: Pattern taken at 150.00 K. Unit Cell Data Source: Single Crystal."
}


In [76]:
# Pretty-print the row 100 positions from the end of the full_comments table
full_comments_df = dfs['full_comments']
row = full_comments_df.iloc[-100].to_dict()
print(json.dumps(row, indent=2, ensure_ascii=False))

{
  "0": "050104787",
}


In [77]:
# Give the two full_comments columns readable names.
# NOTE: `column_labels`, `column_map` and `df` are rebound here; after this
# cell they no longer refer to the compositions labels/frame.
column_labels = ["ProductID", "FullComment"]

# Positional column index -> readable label
column_map = {i: name for i, name in enumerate(column_labels)}

df = dfs["full_comments"]

# rename() returns a new frame; store it back so `dfs` holds the labelled version
df = df.rename(columns=column_map)

dfs["full_comments"] = df

# Path to save the updated DataFrame. It is built from `data_dir` (the same
# directory object the notebook reads from) so the write and the later reload
# cannot drift apart if the data directory is changed.
output_path = data_dir / "full_comments_renamed.pickle"

# Save using pickle
with open(output_path, "wb") as f:
    pickle.dump(df, f)

print(f"Pickle file saved to: {output_path}")

Pickle file saved to: ../Data/full_comments_renamed.pickle


In [78]:
# Read the renamed full_comments table back from the data directory
table = 'full_comments_renamed.pickle'
file_path = data_dir / table

with file_path.open('rb') as f:
    data = pickle.load(f)

# Wrap the unpickled object in a DataFrame and preview the first rows
df = pd.DataFrame(data)
df.head()

Unnamed: 0,ProductID,FullComment
0,10001,Warning: Unindexed pattern. Unit Cell Data Sou...
1,10002,Warning: Unindexed pattern. Unit Cell Data Sou...
2,10003,Warning: Unindexed pattern. Unit Cell Data Sou...
3,10004,Warning: Unindexed pattern. Unit Cell Data Sou...
4,10005,Warning: Unindexed pattern. Unit Cell Data Sou...


In [81]:
# Value in the last row, last column of `df`, taken with one positional lookup.
# The chained form `df.iloc[-1][-1]` applies an integer key to a string-labelled
# Series, which pandas treats as deprecated positional indexing and warns about.
df.iloc[-1, -1]

  df.iloc[-1][-1]


'Color: colourless. General Comments: Reported GOF=0.954. Habit: Prismatic. Structures: 10.1107/S2056989019007989. Article Title: Crystal structure and DFT study of benzyl 1-benzyl-2-oxo-1,2-dihydroquinoline-4-carboxylate. Temperature of Data Collection: Pattern taken at 296 K. Unit Cell Data Source: Single Crystal.'