Here is the code to construct two feature matrices (the part before partitioning). One matrix uses the coordinates, the lattice vectors, and `a` (the scalar read from the second line of each POSCAR file) as features. The other uses the pairwise distances, the lattice vectors, and `a` as features.

Feature matrix using coords, lattice vectors, and a

In [None]:
import numpy as np
import os

Extract data from POSCAR files

In [None]:
def parse_poscar(file_path, num_atoms=27 + 162, coordinates_start=7):
    """Read one energy-annotated POSCAR file and return its raw features.

    Expected layout (0-based line indices):

    * line 0: text containing ``=`` followed by the energy,
    * line 1: the scalar ``a``,
    * lines 2-4: the three lattice vectors, one per line,
    * lines ``coordinates_start`` .. ``coordinates_start + num_atoms - 1``:
      one atom per line; only the first three columns are read.

    Parameters
    ----------
    file_path : str or path-like
        Location of the POSCAR file.
    num_atoms : int, optional
        Number of coordinate lines to read. Defaults to 189 (27 + 162), the
        value that was previously hard-coded.
    coordinates_start : int, optional
        0-based index of the first coordinate line. Defaults to 7.

    Returns
    -------
    tuple
        ``(coordinates, lattice_vectors, a, energy)`` where ``coordinates`` is
        a flat array of length ``3 * num_atoms``, ``lattice_vectors`` is the
        flattened lattice array, and ``a`` and ``energy`` are floats.

    Raises
    ------
    ValueError
        If the file has fewer lines than required, or a value cannot be
        converted to ``float`` / a coordinate line has fewer than 3 columns.
    """
    with open(file_path, 'r') as f:
        lines = f.readlines()

    coordinates_end = coordinates_start + num_atoms
    required_lines = max(5, coordinates_end)
    if len(lines) < required_lines:
        # Fail with the file name instead of a bare IndexError further down.
        raise ValueError(
            f"{file_path}: expected at least {required_lines} lines "
            f"({num_atoms} atoms starting at line index {coordinates_start}), "
            f"found {len(lines)}"
        )

    # Header: energy, a, lattice vectors.
    energy = float(lines[0].split('=')[1])
    a = float(lines[1])
    lattice_vectors = np.array(
        [[float(tok) for tok in line.split()] for line in lines[2:5]],
        dtype=float,
    )

    # Coordinates: keep the first three columns and ignore anything after them.
    # dtype=float makes a ragged (malformed) block raise instead of silently
    # producing an object array.
    coordinates = np.array(
        [[float(tok) for tok in line.split()[:3]]
         for line in lines[coordinates_start:coordinates_end]],
        dtype=float,
    )
    return coordinates.flatten(), lattice_vectors.flatten(), a, energy

Read all POSCAR files

In [None]:
# Root directory containing the sub-folders listed in `folders`.
# Set the ML_LIC6_DATA_DIR environment variable to use another location;
# when it is unset the original hard-coded path is used.
base_dir = os.environ.get(
    'ML_LIC6_DATA_DIR',
    r'C:\Users\Administrator\Desktop\UCL\research project\data\ML_LiC6',
)
# Sub-folders of `base_dir` that are scanned for POSCAR files.
folders = ['MD_T300', 'MD_T500', 'MD_T700', 'MD_T1000']

In [None]:
# Accumulators filled by the loading loop: one feature vector per file in X,
# the matching energy in y.
X, y = [], []

In [None]:
# Turn every POSCAR file of every folder into one row of X and one entry of y.
for folder in folders:
    folder_path = os.path.join(base_dir, folder)
    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order of X and y reproducible across runs and machines.
    poscar_files = sorted(
        os.path.join(folder_path, f)
        for f in os.listdir(folder_path)
        if f.startswith('POSCAR')
    )
    for file in poscar_files:
        coords, lattice_vectors, a, energy = parse_poscar(file)
        # Feature vector: flattened coordinates, flattened lattice vectors, a.
        X.append(np.concatenate([coords, lattice_vectors, [a]]))
        y.append(energy)

In [None]:
# Convert the accumulated Python lists into NumPy arrays.
X, y = np.array(X), np.array(y)

Feature matrix using distances, lattice vectors, and a as features

In [None]:
def parse_poscar(file_path, num_atoms=27 + 162, coordinates_start=7):
    """Read one energy-annotated POSCAR file and return its raw features.

    Expected layout (0-based line indices):

    * line 0: text containing ``=`` followed by the energy,
    * line 1: the scalar ``a``,
    * lines 2-4: the three lattice vectors, one per line,
    * lines ``coordinates_start`` .. ``coordinates_start + num_atoms - 1``:
      one atom per line; only the first three columns are read.

    Parameters
    ----------
    file_path : str or path-like
        Location of the POSCAR file.
    num_atoms : int, optional
        Number of coordinate lines to read. Defaults to 189 (27 + 162), the
        value that was previously hard-coded.
    coordinates_start : int, optional
        0-based index of the first coordinate line. Defaults to 7.

    Returns
    -------
    tuple
        ``(coordinates, lattice_vectors, a, energy)`` where ``coordinates`` is
        a flat array of length ``3 * num_atoms``, ``lattice_vectors`` is the
        flattened lattice array, and ``a`` and ``energy`` are floats.

    Raises
    ------
    ValueError
        If the file has fewer lines than required, or a value cannot be
        converted to ``float`` / a coordinate line has fewer than 3 columns.
    """
    with open(file_path, 'r') as f:
        lines = f.readlines()

    coordinates_end = coordinates_start + num_atoms
    required_lines = max(5, coordinates_end)
    if len(lines) < required_lines:
        # Fail with the file name instead of a bare IndexError further down.
        raise ValueError(
            f"{file_path}: expected at least {required_lines} lines "
            f"({num_atoms} atoms starting at line index {coordinates_start}), "
            f"found {len(lines)}"
        )

    # Header: energy, a, lattice vectors.
    energy = float(lines[0].split('=')[1])
    a = float(lines[1])
    lattice_vectors = np.array(
        [[float(tok) for tok in line.split()] for line in lines[2:5]],
        dtype=float,
    )

    # Coordinates: keep the first three columns and ignore anything after them.
    # dtype=float makes a ragged (malformed) block raise instead of silently
    # producing an object array.
    coordinates = np.array(
        [[float(tok) for tok in line.split()[:3]]
         for line in lines[coordinates_start:coordinates_end]],
        dtype=float,
    )
    return coordinates.flatten(), lattice_vectors.flatten(), a, energy

Extract distances

In [None]:
from scipy.spatial.distance import pdist

In [None]:
def compute_distances(coordinates):
    """Return every pairwise Euclidean distance between rows of ``coordinates``.

    The result is SciPy's condensed distance vector: one entry per unordered
    pair of rows, i.e. ``n * (n - 1) / 2`` values for ``n`` rows.
    """
    # NOTE(review): distances are computed on the coordinates exactly as given;
    # no lattice transformation or periodic-image handling is applied here.
    # Confirm the input coordinate convention is the intended one.
    return pdist(coordinates, 'euclidean')

In [None]:
def extract_features(coordinates, lattice_vectors, a):
    """Assemble the distance-based feature vector of one structure.

    ``coordinates`` is reshaped to rows of three values, the pairwise
    distances between those rows are computed, and the result is joined with
    the flattened ``lattice_vectors`` and the scalar ``a`` (in that order)
    into a single 1-D array.
    """
    xyz = coordinates.reshape(-1, 3)
    parts = (compute_distances(xyz), lattice_vectors.flatten(), [a])
    return np.concatenate(parts)

Read all POSCAR files

In [None]:
# Root directory containing the sub-folders listed in `folders`.
# Set the ML_LIC6_DATA_DIR environment variable to use another location;
# when it is unset the original hard-coded path is used.
base_dir = os.environ.get(
    'ML_LIC6_DATA_DIR',
    r'C:\Users\Administrator\Desktop\UCL\research project\data\ML_LiC6',
)
# Sub-folders of `base_dir` that are scanned for POSCAR files.
folders = ['MD_T300', 'MD_T500', 'MD_T700', 'MD_T1000']

In [None]:
# Fresh accumulators for the distance-based matrix: one feature vector per
# file in X, the matching energy in y.
X, y = [], []

In [None]:
# Turn every POSCAR file of every folder into one row of X and one entry of y.
for folder in folders:
    folder_path = os.path.join(base_dir, folder)
    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order of X and y reproducible across runs and machines.
    poscar_files = sorted(
        os.path.join(folder_path, f)
        for f in os.listdir(folder_path)
        if f.startswith('POSCAR')
    )
    for file in poscar_files:
        coords, lattice_vectors, a, energy = parse_poscar(file)
        # Feature vector: pairwise distances, flattened lattice vectors, a.
        features = extract_features(coords, lattice_vectors, a)
        X.append(features)
        y.append(energy)

In [None]:
# Convert the accumulated Python lists into NumPy arrays.
X, y = np.array(X), np.array(y)

This is the code to turn the feature matrix `X` and the energies `y` into a single DataFrame.

In [None]:
import pandas as pd

In [None]:
# One generically named column per feature, with the target energy appended
# as the last column.
feature_columns = [f"feature_{col}" for col in range(X.shape[1])]
df = pd.DataFrame(data=X, columns=feature_columns).assign(energy=y)

Export to CSV file

In [None]:
# Write features and energy to matrix.csv in the working directory, without
# the DataFrame index column.
df.to_csv(path_or_buf="matrix.csv", index=False)