In [None]:

import numpy as np
from scipy.interpolate import interp1d, BarycentricInterpolator
import csv
import json
import torch
from torch_geometric.data import Data
import code.util.utils as ut

In [38]:

def extrapolate_matrix_to_zero(freq_array, matrix_array):
    """
    Extrapolate a frequency-dependent 3x3 matrix to freq=0 with a cubic spline.

    Each of the 9 matrix elements is treated as an independent 1D function of
    frequency. All of them are fitted in one vectorised ``interp1d`` call
    along the frequency axis and then evaluated at 0.

    Parameters
    ----------
    freq_array : array-like, shape (N,)
        Discrete frequencies, e.g. [1.55, 1.62, 1.70, ..., 6.19]. They do not
        have to be sorted, but they must be distinct.
    matrix_array : array-like, shape (N, 3, 3)
        ``matrix_array[i]`` is the 3x3 matrix at ``freq_array[i]``.

    Returns
    -------
    numpy.ndarray, shape (3, 3), dtype float
        The (extrapolated) matrix at freq=0.

    Raises
    ------
    ValueError
        If the shapes are inconsistent, if fewer than 4 samples are supplied
        (a cubic fit needs at least 4 points), or if a frequency is repeated.

    Notes
    -----
    This is a purely mathematical extrapolation and may not be physically
    valid if freq=0 is well outside your data range.
    """
    freq_array = np.asarray(freq_array, dtype=float)
    matrix_array = np.asarray(matrix_array)

    if freq_array.ndim != 1:
        raise ValueError("freq_array must be a 1D array of shape (N,).")
    if freq_array.shape[0] != matrix_array.shape[0]:
        raise ValueError("freq_array and matrix_array must have same length in first dimension!")
    if matrix_array.shape[1:] != (3,3):
        raise ValueError("matrix_array must be shape (N,3,3).")

    # Fail early with a readable message instead of letting the spline
    # construction reject the data with a low-level error.
    n_samples = freq_array.shape[0]
    if n_samples < 4:
        raise ValueError(
            f"Cubic extrapolation needs at least 4 frequencies, got {n_samples}."
        )
    if np.unique(freq_array).size != n_samples:
        raise ValueError("freq_array must not contain duplicate frequencies.")

    # One interpolator over axis 0 fits all 9 elements at once; this replaces
    # the former 3x3 loop that built a separate interpolator per element.
    spline = interp1d(freq_array, matrix_array, kind='cubic', axis=0,
                      fill_value='extrapolate', assume_sorted=False)

    # Evaluating at a scalar frequency yields an array shaped like one sample,
    # i.e. (3, 3); copy it into a plain float array for the caller.
    return np.array(spline(0.0), dtype=float)

In [39]:
import csv
import json
import numpy as np
from collections import defaultdict
from scipy.interpolate import interp1d


def load_smiles_freq_matrices(csv_path):
    """
    Load per-frequency 3x3 matrices from a CSV file, grouped by SMILES.

    The CSV must have a header row containing the columns:
      - smiles
      - frequency
      - matrix_real   (a JSON-encoded 3x3 array)
    Other columns may be present and are ignored.

    Rows that cannot be used are skipped rather than aborting the load:
    empty rows, rows too short to contain the required columns, rows whose
    frequency is not parseable as a float, and rows whose matrix is invalid
    JSON, non-numeric, or not of shape (3, 3).

    Parameters
    ----------
    csv_path : path-like
        Location of the input CSV file.

    Returns
    -------
    collections.defaultdict
        data_dict[smiles] = list of (freq_val, 3x3 numpy array), in file order.

    Raises
    ------
    ValueError
        If the file is empty or the header lacks a required column.
    """
    required_columns = ("smiles", "frequency", "matrix_real")
    data_dict = defaultdict(list)

    # 'utf-8-sig' decodes plain UTF-8 unchanged but also drops a leading BOM,
    # which would otherwise stay attached to the first header name and make
    # the column lookup fail.
    with open(csv_path, newline='', encoding='utf-8-sig') as csvfile:
        reader = csv.reader(csvfile, delimiter=',')
        header = next(reader, None)
        if header is None:
            raise ValueError("Empty CSV or missing header.")

        missing = [col for col in required_columns if col not in header]
        if missing:
            raise ValueError(f"CSV header is missing required column(s): {missing}")

        smiles_idx, freq_idx, matrix_idx = (header.index(col) for col in required_columns)
        min_row_len = max(smiles_idx, freq_idx, matrix_idx) + 1

        for row in reader:
            if not row:
                continue
            if len(row) < min_row_len:
                # Indexing a truncated row would raise IndexError.
                print(f"Skipping truncated row with {len(row)} field(s): {row}")
                continue

            smiles = row[smiles_idx]

            # parse freq
            try:
                freq_val = float(row[freq_idx])
            except ValueError:
                # skip invalid freq
                continue

            # parse matrix
            try:
                arr_3x3 = np.array(json.loads(row[matrix_idx]), dtype=float)
            except json.JSONDecodeError:
                print(f"Skipping row with invalid JSON for SMILES={smiles}")
                continue
            except (TypeError, ValueError):
                # Valid JSON that is not a rectangular numeric array (ragged
                # lists, strings, objects) makes the float conversion fail.
                print(f"Skipping row with non-numeric matrix for SMILES={smiles}")
                continue

            if arr_3x3.shape != (3,3):
                print(f"Skipping row with invalid shape {arr_3x3.shape} for SMILES={smiles}")
                continue

            data_dict[smiles].append((freq_val, arr_3x3))

    return data_dict


def process_and_write(csv_in, csv_out):
    """
    Extrapolate every molecule's matrices to freq=0 and write them to a CSV.

    Steps:
      1) Load the input CSV grouped by SMILES.
      2) For each SMILES, gather freq + matrix and sort by freq.
      3) Extrapolate to freq=0 with the cubic fit.
      4) Write one row per SMILES with columns
         "smiles", "frequency", "matrix_real" (frequency is always 0.0 and
         the matrix is JSON-encoded as a list of lists).

    SMILES with too few frequencies for a cubic fit, or for which the
    extrapolation raises ValueError, are reported on stdout and skipped.

    Parameters
    ----------
    csv_in : path-like
        Input CSV, read via ``load_smiles_freq_matrices``.
    csv_out : path-like
        Output CSV; created or overwritten.
    """
    # A cubic fit needs at least 4 sample points. The earlier threshold of 2
    # let 2- and 3-point molecules fall through to the extrapolation, where
    # they were only rejected by a caught ValueError.
    min_cubic_points = 4

    data_dict = load_smiles_freq_matrices(csv_in)

    with open(csv_out, 'w', newline='', encoding='utf-8') as outfile:
        writer = csv.writer(outfile)
        writer.writerow(["smiles", "frequency", "matrix_real"])

        for smiles, freq_mats in data_dict.items():
            # freq_mats is a list of (freq_val, 3x3 np array)
            if len(freq_mats) < min_cubic_points:
                print(
                    f"Skipping SMILES={smiles}, only {len(freq_mats)} freq found "
                    f"(need at least {min_cubic_points} for cubic extrapolation)."
                )
                continue

            # build arrays
            freq_array = np.array([freq for freq, _ in freq_mats], dtype=float)
            mat_stack = np.stack([mat for _, mat in freq_mats], axis=0)  # shape [N,3,3]

            # sort both arrays by ascending frequency with one permutation
            order = np.argsort(freq_array)

            try:
                mat_at_zero = extrapolate_matrix_to_zero(freq_array[order], mat_stack[order])
            except ValueError as e:
                print(f"Interpolation error for SMILES={smiles}, skipping. Reason: {e}")
                continue

            # ndarray -> nested lists so that json can serialise it
            writer.writerow([smiles, 0.0, json.dumps(mat_at_zero.tolist())])


if __name__ == "__main__":
    # Hard-coded, machine-specific locations: the source table to read and
    # the file that receives the freq=0 extrapolations.
    input_csv  = "/media/maria/work_space/dyn-detanet/data/ee_polarizabilities.csv"
    output_csv = "/media/maria/work_space/dyn-detanet/data/ee_extrapolated_freq0_cubic.csv"

    # Run the full load -> extrapolate -> write pipeline, then report where
    # the result was written.
    process_and_write(csv_in=input_csv, csv_out=output_csv)
    print("Done. Output in:", output_csv)


Done. Output in: /media/maria/work_space/dyn-detanet/data/ee_extrapolated_freq0_cubic.csv
