In [None]:
import os
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
import matplotlib.pyplot as plt

Extract data from POSCAR files

In [None]:
def parse_poscar(file_path, num_atoms=27 + 162, coordinates_start=7):
    """Parse one POSCAR-style structure file into flat feature arrays.

    The layout read by this function is:

    * line 1: a header that contains ``=`` followed by the energy,
    * line 2: a single float ``a`` (the scaling factor in the POSCAR
      convention),
    * lines 3-5: the three lattice vectors, one per line,
    * from 0-based line index ``coordinates_start`` onwards: one atom per
      line, of which only the first three whitespace-separated fields are
      used (any trailing fields are ignored).

    Parameters
    ----------
    file_path : str or path-like
        Path of the file to read.
    num_atoms : int, optional
        Number of coordinate lines to read. Defaults to ``27 + 162`` (189),
        the value that was previously hard-coded
        (presumably 27 Li + 162 C for the LiC6 cell -- TODO confirm).
    coordinates_start : int, optional
        0-based index of the first coordinate line. Defaults to 7.

    Returns
    -------
    tuple
        ``(coordinates, lattice_vectors, a, energy)`` where ``coordinates``
        is a 1-D array of length ``3 * num_atoms``, ``lattice_vectors`` is
        the flattened 3-row lattice block, and ``a`` and ``energy`` are
        floats.

    Raises
    ------
    ValueError
        If the file has fewer lines than the requested layout needs, or if
        a numeric field cannot be converted to ``float``.
    IndexError
        If the header line does not contain ``=``.
    """
    with open(file_path, 'r') as f:
        lines = f.readlines()

    # Fail early with a readable message instead of an IndexError raised from
    # the middle of a list comprehension when a file is truncated.
    coordinates_end = coordinates_start + num_atoms
    required_lines = max(5, coordinates_end)
    if len(lines) < required_lines:
        raise ValueError(
            f"{file_path}: expected at least {required_lines} lines "
            f"({num_atoms} atoms starting at line index {coordinates_start}), "
            f"found {len(lines)}"
        )

    # Header: energy after the '=' sign, then the scalar a.
    energy = float(lines[0].split('=')[1].strip())
    a = float(lines[1].strip())

    # Lattice block: lines 3-5 (0-based indices 2..4).
    lattice_vectors = np.array(
        [[float(value) for value in lines[i].split()] for i in range(2, 5)]
    )

    # Atom block: keep only the first three fields of each coordinate line.
    coordinates = np.array(
        [[float(value) for value in lines[i].split()[:3]]
         for i in range(coordinates_start, coordinates_end)]
    )
    return coordinates.flatten(), lattice_vectors.flatten(), a, energy

Read all POSCAR files

In [None]:
# Root directory that holds one sub-folder of POSCAR files per MD run.
# Set the ML_LIC6_DATA_DIR environment variable to point the notebook at the
# data on another machine; without it the original local path is used.
base_dir = os.environ.get(
    'ML_LIC6_DATA_DIR',
    r'C:\Users\Administrator\Desktop\UCL\research project\data\ML_LiC6',
)
# Sub-folders of base_dir that are scanned for POSCAR files.
folders = ['MD_T300', 'MD_T500', 'MD_T700', 'MD_T1000']

In [None]:
# Accumulators for the dataset: X collects one feature vector per structure,
# y collects the matching energy.
X, y = [], []

In [None]:
# Start from empty accumulators so that re-running this cell rebuilds the
# dataset instead of appending duplicates (or failing because X / y have
# already been converted to arrays further down).
X = []
y = []

for folder in folders:
    folder_path = os.path.join(base_dir, folder)
    # os.listdir() returns entries in arbitrary order. Sorting makes the
    # sample order, and therefore the seeded train/test split, reproducible.
    poscar_files = sorted(
        os.path.join(folder_path, f)
        for f in os.listdir(folder_path)
        if f.startswith('POSCAR')
    )
    for file in poscar_files:
        coords, lattice_vectors, a, energy = parse_poscar(file)
        # Feature vector: flattened coordinates, flattened lattice, then a.
        X.append(np.concatenate([coords, lattice_vectors, [a]]))
        y.append(energy)

In [None]:
# Convert the accumulated Python lists into NumPy arrays:
# X from the list of feature vectors, y from the list of energies.
X, y = map(np.array, (X, y))

Partitioning into training and test sets

In [None]:
# Hold out 20% of the samples for testing; the fixed random_state makes the
# shuffle repeatable for a given sample order.
(X_train, X_test,
 y_train, y_test) = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

Machine learning: degree-2 polynomial regression

In [None]:
# Learn the degree-2 polynomial expansion from the training features only,
# then apply that same fitted expansion to both splits.
poly_reg = PolynomialFeatures(degree=2, interaction_only=False).fit(X_train)
X_poly = poly_reg.transform(X_train)
# NOTE(review): X_test is overwritten with its expanded form, so running this
# cell a second time would feed already-expanded features back into transform().
# Re-run the train/test split cell first if this cell needs to be repeated.
X_test = poly_reg.transform(X_test)

In [None]:
# Ordinary least squares on the polynomial-expanded training features.
# fit() returns the estimator itself, so construction and fitting are chained.
polynomial_regressor = LinearRegression().fit(X_poly, y_train)
# Keep the fitted estimator as the cell's displayed value.
polynomial_regressor

Predict the test set

In [None]:
# Predict energies for the held-out samples. X_test here is the
# polynomial-expanded test matrix (it was reassigned in the feature cell).
y_pred = polynomial_regressor.predict(X_test)

Plot the actual vs. predicted values

In [None]:
# Parity plot: predicted value against actual value for the test set.
fig, ax = plt.subplots()
ax.scatter(y_test, y_pred)

# Span the y = x reference line over the data actually plotted, rather than a
# fixed numeric range that only fits one particular dataset.
x_start = min(np.min(y_test), np.min(y_pred))
x_end = max(np.max(y_test), np.max(y_pred))
ax.plot([x_start, x_end], [x_start, x_end], color='red')

ax.set_xlabel('Actual Values')
ax.set_ylabel('Predicted Values')
ax.set_title('Actual vs. Predicted Values')
# Equal scaling so the reference line sits at 45 degrees.
ax.axis('equal')
plt.show()

In [None]:
# Score the test-set predictions: coefficient of determination and mean
# squared error, then report both (MSE first).
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
for label, score in (("Mean Squared Error: ", mse), ("R-squared: ", r2)):
    print(label, score)