## Notes

 - Visit the Crystallography Open Database (COD) at https://www.crystallography.net/cod/ to browse crystal structure data.

## Globals and Utilities

In [161]:
## Module Installs
# pip install mp_api
# pip install python-dotenv
# pip install requests
# pip install python-slugify

In [199]:
## Import modules
import os
import io
import json
import lzma
import random
import zipfile
import requests
from tqdm import tqdm
from slugify import slugify
from mp_api.client import MPRester
from emmet.core.summary import HasProps
from dotenv import load_dotenv

load_dotenv()

True

In [170]:
## Notebook-wide configuration
# Materials Project API key taken from the environment (None when the variable is not set)
MPKEY = os.environ.get('MPKEY')
# Directory the kernel is currently running in; base for the paths built below
ROOT_DIR = os.getcwd()
# Scratch directory used for downloads and extracted archives
TEMP_DIR = os.path.join(ROOT_DIR, 'tmp')

In [171]:
# Create the scratch directory on first run so later cells can write into it
temp_dir_missing = not os.path.exists(TEMP_DIR)
if temp_dir_missing:
    print("Not found temp dir, creating one...")
    os.makedirs(TEMP_DIR)

In [172]:
def pretty_file_size(size_bytes: int) -> str:
    """Format a byte count as a human-readable string using binary (1024-based) units.

    Args:
        size_bytes: Size in bytes.

    Returns:
        The size with two decimals and the largest fitting unit among B, KB, MB,
        GB and TB, e.g. ``"1.50 KB"``. Zero or negative sizes return ``"0 B"``;
        sizes beyond the TB range are still expressed in TB.
    """
    if size_bytes <= 0:
        return "0 B"

    units = ['B', 'KB', 'MB', 'GB', 'TB']
    size = float(size_bytes)
    unit_index = 0
    # Step up one unit per factor of 1024, stopping at the largest unit available
    while size >= 1024 and unit_index < len(units) - 1:
        size /= 1024
        unit_index += 1
    return f"{size:.2f} {units[unit_index]}"

In [202]:
def zip_dir(directory_path: str, zip_filename: str = None) -> str:
    """Pack a directory tree into an LZMA-compressed zip archive.

    Members are stored with paths relative to ``directory_path``. The archive is
    written straight to disk with ``zipfile``'s LZMA method, so the result is a
    regular zip file that ``zipfile.ZipFile`` can open again.

    Args:
        directory_path: Directory whose files are archived (recursively).
        zip_filename: Path of the archive to create. When omitted, the archive is
            placed next to the source directory and named after it (slugified
            with '_' as separator, '.zip' appended).

    Returns:
        The path of the archive that was written.

    Raises:
        ValueError: If ``directory_path`` is not an existing directory.
    """
    # Check if the input is a valid directory
    if not os.path.isdir(directory_path):
        raise ValueError(f"'{directory_path}' is not a valid directory.")

    if zip_filename is None:
        # Normalise first so a trailing separator does not produce an empty directory name
        parent_dir_path, source_dir_name = os.path.split(os.path.normpath(directory_path))
        zip_filename = os.path.join(parent_dir_path, f"{slugify(source_dir_name, separator='_')}.zip")

    archive_abs_path = os.path.abspath(zip_filename)

    # Create the zip file and write its contents to it, compressing each member with LZMA
    print("Creating the zip file and write its contents to it...")
    with zipfile.ZipFile(zip_filename, 'w', compression=zipfile.ZIP_LZMA) as zip_file:
        for root, _dirs, files in os.walk(directory_path):
            for file in files:
                file_path = os.path.join(root, file)
                # Never add the archive to itself when it lives inside the source directory
                if os.path.abspath(file_path) == archive_abs_path:
                    continue
                rel_path = os.path.relpath(file_path, directory_path)
                zip_file.write(file_path, rel_path)

    return zip_filename

In [173]:
def unzip(path_to_zip_file: str, directory_to_extract_to: str = None) -> str:
    """Extract every member of a zip archive and return the extraction directory.

    Args:
        path_to_zip_file: Path of the archive to read.
        directory_to_extract_to: Where to extract. When omitted, a folder next to
            the archive is used, named after the archive (extension dropped,
            slugified with '_' as separator).

    Returns:
        The directory the files were extracted into.
    """
    target_dir = directory_to_extract_to

    ## No destination given: derive a sibling folder from the archive name
    if target_dir is None:
        archive_stem = os.path.splitext(os.path.basename(path_to_zip_file))[0]
        target_dir = os.path.join(
            os.path.dirname(path_to_zip_file),
            slugify(archive_stem, separator='_'),
        )

    ## Make sure the destination exists before extracting into it
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)

    with zipfile.ZipFile(path_to_zip_file, 'r') as archive:
        archive.extractall(target_dir)

    print(f"Finished extracting files to: {target_dir}")
    return target_dir

In [174]:
def download_file(url: str, file_name: str, destination: str = None, overwrite: bool = False) -> str:
    """Stream a remote file to disk, printing progress, and return the local path.

    Args:
        url: Address of the file to download.
        file_name: Name to save the file under. The stem is slugified (with '_' as
            separator) and the extension is kept.
        destination: Directory to save into; created when missing. Defaults to
            ``TEMP_DIR``.
        overwrite: When False, an existing local file is kept if its size matches
            the remote size, or if the remote size cannot be determined. When True
            the file is always downloaded again.

    Returns:
        The path of the local file.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    ## Slugify the file name but keep its extension: slugify() replaces dots too,
    ## which would turn 'battery_data.zip' into 'battery_data_zip'
    stem, extension = os.path.splitext(file_name)
    file_name = slugify(stem, separator='_')
    if len(extension) > 1:
        file_name = f"{file_name}.{slugify(extension[1:], separator='_')}"

    ## Resolve the download directory and create it when missing
    target_dir = destination if destination else TEMP_DIR
    if not os.path.isdir(target_dir):
        print(f"Creating download destination directories... [{target_dir}]")
        os.makedirs(target_dir, exist_ok=True)
    download_path = os.path.join(target_dir, file_name)

    with requests.get(url, stream=True) as response:
        response.raise_for_status()
        total_size = int(response.headers.get('content-length', 0))
        if response.headers.get('content-encoding'):
            ## Content-Length counts the encoded body while iter_content() yields
            ## decoded bytes, so treat the size as unknown rather than compare them
            total_size = 0
        downloaded_size = 0
        block_size = 8192  # 8 Kilobytes

        ## Skip download if file exists of same size and no overwrite flag
        if os.path.exists(download_path) and not overwrite:
            local_size = os.path.getsize(download_path)
            if total_size == 0:
                print("File already exists, the remote file size is unknown and overwrite is set to False\nSkipping Download...")
                return download_path
            if local_size == total_size:
                print("File of the same size as remote file already exists and overwrite is set to False\nSkipping Download...")
                return download_path
            print(
                f"Existing file ({pretty_file_size(local_size)}) does not match the remote size "
                f"({pretty_file_size(total_size)}), downloading again..."
            )

        ## Write to a side file first so an interrupted download never leaves a
        ## truncated file under the final name
        partial_path = f"{download_path}.part"
        with open(partial_path, 'wb') as file:
            for chunk in response.iter_content(chunk_size=block_size):
                file.write(chunk)
                downloaded_size += len(chunk)
                if total_size > 0:
                    print(f"\rDownloading: {pretty_file_size(downloaded_size)} of {pretty_file_size(total_size)} ({downloaded_size * 100.0 / total_size:.2f}%)", end='')
                else:
                    ## pretty_file_size() already appends the unit
                    print(f"\rDownloading: {pretty_file_size(downloaded_size)}", end='')
        os.replace(partial_path, download_path)

    print(f"\nFinished downloading file: {file_name}")
    print(
        f"Destination: {download_path}\n"
        f"File: {file_name}\n"
        f"File Size: {pretty_file_size(total_size)}\n"
        f"Downloaded Size: {pretty_file_size(os.path.getsize(download_path))}\n"
    )

    return download_path

## This notebook will focus on the collection of training data for the project based on our four areas of focus.

Areas of focus:
 - Battery materials.
 - Photovoltaic materials.
 - Carbon capture.
 - Photocatalytic materials.

## A: Battery materials.

#### 1. Materials Project

In [11]:
## Fetch every material that has an insertion_electrodes property (from the battery materials MP page),
## requesting only the MP material ID and the pretty formula
battery_query = dict(
    has_props=[HasProps.insertion_electrodes],
    fields=["material_id", "formula_pretty"],
)
with MPRester(MPKEY) as mpr:
    docs = mpr.materials.summary.search(**battery_query)

Retrieving SummaryDoc documents:   0%|          | 0/11206 [00:00<?, ?it/s]

In [91]:
## Show the properties that could have been queried but were not requested
first_doc = docs[0]
print(first_doc.fields_not_requested)



In [102]:
## Preview up to 5 distinct materials picked at random.
## random.sample draws without replacement, so the same document is never shown twice
## (random.choices draws with replacement); the size is capped so short lists do not raise.
random_materials = random.sample(docs, k=min(5, len(docs)))
[m.formula_pretty for m in random_materials]

['V3OF11', 'Li5Mn2Ni3O10', 'MgV2O4', 'LiLa4NiO8', 'Li7Cr2O8']

In [103]:
## Report how many documents the battery-materials query returned
battery_material_count = len(docs)
print("Found {:,} battery materials from the Materials project DB".format(battery_material_count))

Found 11,206 battery materials from the Materials project DB


#### 2. ChemDataExtractor Data

A database of battery materials auto-generated using ChemDataExtractor.

 > Listed from [this](https://www.nature.com/articles/s41597-020-00602-2) paper.

The dataset includes materials extracted from academic papers that are related to energy storage.

The dataset has the following properties extracted/scraped:
 - Conductivity [CentiMeter^(-1.0)  Siemens^(1.0)]
 - Coulombic Efficiency [Percent^(1.0)]
 - Voltage [Volt^(1.0)]
 - Capacity [Ampere^(1.0)  Hour^(1.0)  KiloGram^(-1.0)]
 - Energy [KiloGram^(-1.0)  WattHour^(1.0)]

In [108]:
## Download the battery database archive file.
## No destination is passed, so download_file() saves it under TEMP_DIR.
url = "https://figshare.com/ndownloader/files/22789760"
battery_data_zip_file_path = download_file(url, 'battery_data.zip')

File of the same size as remote file already exists and overwrite is set to False
Skipping Download...


In [124]:
## Extract the data from the zip archive.
## No destination is passed, so unzip() extracts into a folder next to the archive, named after it.
extracted_chem_data_path = unzip(battery_data_zip_file_path)

Finished extracting files to: /home/alen/projects/Inverse-Design-of-Materials-with-AI/tmp/battery_data


In [133]:
## Load the merged battery records from the extracted archive.
## The encoding is pinned to UTF-8 (the standard encoding for JSON) so parsing does not
## depend on the platform's default locale encoding.
with open(os.path.join(extracted_chem_data_path, 'battery_merged.json'), encoding='utf-8') as batt_data_1_fo:
    chem_extractor_data = json.load(batt_data_1_fo)

In [152]:
## Summarise the loaded dataset: container type, record count and one random record
print("Data type: {}".format(type(chem_extractor_data)))
print("Number of materials extracted: {:,}".format(len(chem_extractor_data)))
example_record = random.choice(chem_extractor_data)
print("Example data: {}".format(example_record))

Data type: <class 'list'>
Number of materials extracted: 214,617


## B: Carbon capture materials.

[ref](https://www.sciencedirect.com/science/article/pii/S2589234723001690) Generally, the most widely used solid materials for CO2 capture are silica, zeolites, alumina, amine-based materials, metal oxides, metal-organic frameworks (MOFs), polymers, and carbon materials (activated carbon, graphite, graphene, fullerene, carbon nanotubes, biochar, and hydrochar).

#### 1. [CCDC](https://www.ccdc.cam.ac.uk/) - MOFs

The dataset was downloaded from The Cambridge Crystallographic Data Centre website.

The dataset is of metal-organic frameworks (MOFs).

The dataset archive was manually downloaded and uploaded to this project's working directory as an account is required to download the data from the CCDC website.

The dataset archive has been repackaged and can be downloaded [here](http://197.137.67.2:7010/files/Inverse-Design-of-Materials-with-AI/tmp/csd_mof_collection.zip?_xsrf=2%7C1aa66def%7C52e4121104952bacf3ace88923de87a4%7C1722332477)

In [240]:
## Assuming the zip file is downloaded and extracted, get the CCDC MOF data.
csd_mof_file_path = os.path.join(TEMP_DIR, "csd_mof_collection")

# List the directory once and split the entries into cif files and everything else.
# Matching on the file extension (instead of a '.cif' substring) keeps names such as
# 'x.cif.bak' out of the cif collection.
csd_mof_entries = os.listdir(csd_mof_file_path)
csd_mof_cif_collection = [f for f in csd_mof_entries if f.lower().endswith('.cif')]
other_files = [f for f in csd_mof_entries if not f.lower().endswith('.cif')]

# List the collection stats
print(f"Number of cif files: {len(csd_mof_cif_collection):,}")
print(f"Other files in the collection:")
for f in sorted(other_files): print(f"\t - {f}")
print(f"Sample of cif files:")
# Cap the sample size so a collection with fewer than 5 cif files does not raise ValueError
for f in random.sample(csd_mof_cif_collection, min(5, len(csd_mof_cif_collection))): print(f"\t - {f}")

Number of cif files: 10,636
Other files in the collection:
	 - 00_README.txt
	 - Charged frameworks.csv
	 - Framework details.csv
	 - Frameworks with hydrogen added.csv
	 - Suspect chemistry frameworks.csv
Sample of cif files:
	 - uxiqoj_P1.cif
	 - navjaw_P1.cif
	 - udiwuc_P1.cif
	 - isikif_P1_H.cif
	 - huwluj_P1_charged.cif


#### 2. 