In [26]:
import json
import os
from pathlib import Path as pt
from typing import TypedDict, Literal, Optional, List, Dict, Union

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from loguru import logger

In [13]:
# Root of the synced data directory. The default is the location this notebook
# was originally written against; set the ML_PROPERTIES_ROOT environment
# variable to point the notebook at a copy of the data on another machine
# without editing this cell.
root_loc = pt(os.environ.get(
    'ML_PROPERTIES_ROOT',
    '/Users/aravindhnivas/Library/CloudStorage/OneDrive-MassachusettsInstituteofTechnology/',
))
base_loc = root_loc / 'ML properties/Analysed/[PHYSICAL CONSTANTS OF ORGANIC COMPOUNDS]/'

# The three selectors below pick which pretrained model is inspected; they are
# used both in the directory layout and in the file-name stem.
model = 'lgbm'
embeddings = 'mol2vec_embeddings'
method = 'Optuna'
pre_trained_filename = f'{model}_{embeddings}_pretrained_model_{method}'

# Directory holding the artefacts for the selected model/embeddings/method.
loc = (
    base_loc
    / 'tmp_C_processed_data/analysis_data/filtered/tmpC_topelements_processed_data'
    / 'pretrained_models'
    / model
    / embeddings
    / method
)

resultsfile = loc / f'{pre_trained_filename}.results.json'
datfile = loc / f'{pre_trained_filename}.dat.json'

# Report (rather than raise) whether the two input files are present.
print(f'resultsfile exists: {resultsfile.exists()}')
print(f'datfile exists: {datfile.exists()}')

resultsfile exists: True
datfile exists: True


In [27]:
# Define the structure of the inner dictionaries
class DataEntry(TypedDict):
    """Three float lists stored for one data split (see DataType).

    All three keys are required.
    """
    # NOTE(review): presumably the observed target values — confirm against the
    # code that writes the .dat.json file.
    y_true: List[float]
    # NOTE(review): presumably the model predictions for the same samples — confirm.
    y_pred: List[float]
    # NOTE(review): presumably a linear fit through the true/predicted pairs — confirm.
    y_linear_fit: List[float]
    
class DataType(TypedDict):
    """Top-level layout of the `.dat.json` payload: one DataEntry per split.

    Both keys are required. This is the annotation used for the object loaded
    from `datfile`.
    """
    test: DataEntry
    train: DataEntry


# Allowed values of the `embedding` field of MLResults.
# NOTE(review): the `embeddings` variable used to build file paths in this
# notebook holds 'mol2vec_embeddings', which is not a member of this Literal;
# presumably that is a directory/file-name component rather than this field —
# confirm.
Embedding = Literal['mol2vec', 'VICGAE']

# Define the MLStats TypedDict
class MLStats(TypedDict):
    """Four scalar float metrics, all required.

    Used as the type of both `train_stats` and `test_stats` in MLResults.
    NOTE(review): the key names presumably stand for R^2, mean squared error,
    root mean squared error and mean absolute error — confirm against the code
    that produces them.
    """
    r2: float
    mse: float
    rmse: float
    mae: float

# Define the CVScores TypedDict
class CVScores(TypedDict):
    """Summary of a list of scores; all five keys are required.

    Used as the leaf value type of LearningCurveData and CVScoresData.
    """
    mean: float
    std: float
    # NOTE(review): presumably the lower/upper bounds of a confidence interval
    # around `mean`; the confidence level is not visible here — confirm.
    ci_lower: float
    ci_upper: float
    # NOTE(review): presumably the individual scores that the summary fields
    # above are derived from — confirm.
    scores: List[float]

# Names a cross-validation score can be keyed by; the same four names are the
# fields of MLStats.
CV_scoring_methods = Literal['r2', 'mse', 'rmse', 'mae']

# Nested mapping: outer string key -> {'test' | 'train'} -> CVScores.
# NOTE(review): the meaning of the outer string key is not shown in this
# notebook (presumably a training-set size) — confirm against the writer.
LearningCurveData = Dict[str, Dict[Literal['test', 'train'], CVScores]]

# Nested mapping: {'test' | 'train'} -> scoring-method name -> CVScores.
# Used as the type of MLResults.cv_scores.
CVScoresData = Dict[Literal['test', 'train'], Dict[CV_scoring_methods, CVScores]]

# Define the PlotData and Layout TypedDicts (assuming simplified structures)
class PlotData(TypedDict, total=False):
    """One plot trace; `total=False` makes every key optional.

    Deliberately simplified: only these four keys are typed. Appears inside
    MLResults.learning_curve_plotly_data.
    """
    x: List[float]
    y: List[float]
    type: str
    name: str

class Layout(TypedDict, total=False):
    """Plot layout settings; `total=False` makes every key optional.

    Only a title and two axis dictionaries (string keys mapping to str, int or
    float values) are typed. Appears inside
    MLResults.learning_curve_plotly_data.
    """
    title: str
    xaxis: Dict[str, Union[str, int, float]]
    yaxis: Dict[str, Union[str, int, float]]

# Define the MLResults TypedDict
class _MLResultsRequired(TypedDict):
    """Keys of a `.results.json` payload that are treated as always present.

    Private base of MLResults. It exists only so that required and optional
    keys can be declared separately without relying on `typing.NotRequired`.
    """
    data_shapes: Dict[str, List[int]]
    train_stats: MLStats
    test_stats: MLStats
    bootstrap: bool
    cross_validation: bool
    cv_fold: Optional[int]
    cv_scores: Optional[CVScoresData]
    # NOTE(review): the value union has no `float`; confirm that stored
    # hyperparameters are never floating-point numbers.
    best_params: Optional[Dict[str, Union[str, int, bool, None]]]
    timestamp: str
    time: str


class MLResults(_MLResultsRequired, total=False):
    """Layout of the `.results.json` payload loaded from `resultsfile`.

    Inherits the required keys from _MLResultsRequired. The keys declared
    directly on this class are optional (`total=False`).

    NOTE(review): the key listing recorded in this notebook's output does not
    include the six keys below, so declaring them as required did not match the
    file actually loaded. Presumably they are written only by some runs —
    confirm against the code that writes the results file.
    """
    learning_curve_plotly_data: Optional[Dict[str, Union[List[PlotData], Layout]]]
    embedding: Embedding
    PCA: bool
    model: str
    bootstrap_nsamples: Optional[int]
    best_score: Optional[float]
    

In [28]:
# Read both JSON payloads inside context managers so each file handle is closed
# as soon as parsing finishes, and decode explicitly as UTF-8 instead of relying
# on the platform's default text encoding.
with resultsfile.open('r', encoding='utf-8') as results_fh:
    results: MLResults = json.load(results_fh)
with datfile.open('r', encoding='utf-8') as data_fh:
    data: DataType = json.load(data_fh)

In [32]:
# Show the loaded results mapping as this cell's output.
# NOTE(review): the output stored with this cell is a list of key names rather
# than the mapping itself, so it looks stale relative to this source — re-run
# the cell to refresh it.
results

['data_shapes',
 'test_stats',
 'train_stats',
 'bootstrap',
 'cross_validation',
 'cv_fold',
 'cv_scores',
 'best_params',
 'timestamp',
 'time']