#Title (This is the extraction of the data from xsf files.)

In [1]:
import os
import glob
import numpy as np 
import math
import pandas as pd



def read_xsf(xsf_file):
    atoms = []
    coords = []
    gradients = []
    total_energy = None
    with open(xsf_file, 'r') as f:
        lines = f.readlines()

        # Extract total energy
        for line in lines:
            if line.startswith('# total energy'):
                total_energy = float(line.split('=')[1].strip().split()[0])
                break

        # Read ATOMS section
        atoms_section = False
        for line in lines:
            if line.strip() == "ATOMS":
                atoms_section = True
                continue
            if atoms_section:
                parts = line.split()
                if len(parts) == 4:
                    atom_symbol = parts[0]
                    coord = [float(x) for x in parts[1:4]]
                   # grad = [float(x) for x in parts[4:]]
                    atoms.append(atom_symbol)
                    coords.append(coord)
                  #  gradients.append(grad)

    return atoms, coords, gradients, total_energy

def distance(i,j,coords):
    coord1 = np.array(coords[i][:])
    coord2 = np.array(coords[j][:])
    distance = np.linalg.norm(coord2 - coord1)
    return distance
    
def get_CM(natoms,atoms, coords):
    coulomb_matrix= np.zeros((7, 7))

    atomic_number_dict = {
    'H': 1, 'He': 2, 'Li': 3, 'Be': 4, 'B': 5, 'C': 6, 'N': 7, 'O': 8, 'F': 9, 'Ne': 10,
    'Na': 11, 'Mg': 12, 'Al': 13, 'Si': 14, 'P': 15, 'S': 16, 'Cl': 17, 'Ar': 18, 'K': 19, 'Ca': 20,
    'Sc': 21, 'Ti': 22, 'V': 23, 'Cr': 24, 'Mn': 25, 'Fe': 26, 'Co': 27, 'Ni': 28, 'Cu': 29, 'Zn': 30,
    'Ga': 31, 'Ge': 32, 'As': 33, 'Se': 34, 'Br': 35, 'Kr': 36, 'Rb': 37, 'Sr': 38, 'Y': 39, 'Zr': 40,
    'Nb': 41, 'Mo': 42, 'Tc': 43, 'Ru': 44, 'Rh': 45, 'Pd': 46, 'Ag': 47, 'Cd': 48, 'In': 49, 'Sn': 50,
    'Sb': 51, 'I': 53, 'Te': 52, 'Xe': 54, 'Cs': 55, 'Ba': 56, 'La': 57, 'Ce': 58, 'Pr': 59, 'Nd': 60,
    'Pm': 61, 'Sm': 62, 'Eu': 63, 'Gd': 64, 'Tb': 65, 'Dy': 66, 'Ho': 67, 'Er': 68, 'Tm': 69, 'Yb': 70,
    'Lu': 71, 'Hf': 72, 'Ta': 73, 'W': 74, 'Re': 75, 'Os': 76, 'Ir': 77, 'Pt': 78, 'Au': 79, 'Hg': 80,
    'Tl': 81, 'Pb': 82, 'Bi': 83, 'Po': 84, 'At': 85, 'Rn': 86, 'Fr': 87, 'Ra': 88, 'Ac': 89, 'Th': 90,
    'Pa': 91, 'U': 92, 'Np': 93, 'Pu': 94, 'Am': 95, 'Cm': 96, 'Bk': 97, 'Cf': 98, 'Es': 99, 'Fm': 100,
    'Md': 101, 'No': 102, 'Lr': 103, 'Rf': 104, 'Db': 105, 'Sg': 106, 'Bh': 107, 'Hs': 108, 'Mt': 109,
    'Ds': 110, 'Rg': 111, 'Cn': 112, 'Nh': 113, 'Fl': 114, 'Mc': 115, 'Lv': 116, 'Ts': 117, 'Og': 118
}
    for i in range(natoms):
        for j in range(natoms):
            if i==j:
                coulomb_matrix[i][j]= 0.5*(atomic_number_dict[atoms[i]])**2.4
            if i!=j:
                coulomb_matrix[i][j]= (atomic_number_dict[atoms[i]])*(atomic_number_dict[atoms[j]])/distance(i,j,coords)
    return coulomb_matrix
                
    
#xsf_directory = './data'
xsf_directory = "./coords"
xsf_files = [f for f in os.listdir(xsf_directory) if f.endswith(".xsf")]

data=[]
for file_name in xsf_files:
    # Read the .xsf file
    atoms, coords, gradients, total_energy = read_xsf(os.path.join(xsf_directory, file_name))
    natoms=len(atoms)
    CM=get_CM(natoms,atoms,coords)
    reshape_CM=(CM.reshape(-1))
    row = [file_name] + reshape_CM.tolist()
#    reshape_CM= reshape_CM.strip('[]')
#    reshape_CM= reshape_CM.replace('\n',', ')
#    reshape_CM= reshape_CM.replace(' ',' ')
#    string=" ".join((file_name,reshape_CM))
#    string = string.strip('[]')
#    string = string.split()
    data.append(row)
#    print(file_name,'done picakchu')
df = pd.DataFrame(data)



df.to_csv('data.csv', index=False)
    
#    coulomb_matrix=get_CM(natoms,atoms,coords)


In [2]:
import pandas as pd
import os
import glob
import numpy as np 
import math
df=pd.read_csv('data.csv')
#df.iloc[0,1] = df.iloc[0,1].apply(ast.literal_eval)
#df.iloc[0,1] = df.iloc[0,1].apply(np.array)

coord_indx=df.iloc[:,0]
coord_indx

0        coord_143819.xsf
1        coord_350180.xsf
2        coord_324863.xsf
3        coord_321517.xsf
4         coord_41829.xsf
               ...       
81709    coord_210799.xsf
81710    coord_119816.xsf
81711    coord_333785.xsf
81712     coord_59755.xsf
81713    coord_362243.xsf
Name: 0, Length: 81714, dtype: object

In [77]:
    print(df.iloc[247,0],df.iloc[55737,0],df.iloc[78827,0])

coord_28433.xsf coord_152701.xsf coord_259476.xsf


In [3]:
def nint(x):
    return math.floor(x + 0.5) if x > 0 else math.ceil(x - 0.5)


In [4]:


X=df.iloc[0:,1:]
#all=np.array(df.iloc[0:,1:])
Y=pd.DataFrame(np.zeros((X.shape[0],X.shape[1])))

divs=10
i=0


for i in range(X.shape[1]):
    rho_min = X.iloc[:, i].min()  
    rho_max = X.iloc[:, i].max()  
    
    # Looping through each row (samples)
    for j in range(X.shape[0]):
        if rho_min == rho_max:
            Y.iloc[:, i] = 0  
        else:
            Y.iloc[j, i] = nint(divs*((rho_max - X.iloc[j, i]) / (rho_max - rho_min)))
          #  print(Y.iloc[j, i])

Y=Y.astype(int)
print(Y)
Y.iloc[0].astype(str).to_string(index=False).replace('\n','').replace(' ', '')

       0   1   2   3   4   5   6   7   8   9   ...  39  40  41  42  43  44  \
0       0   6   5   2   9   9   4   6   0   6  ...   4   0   2   4   7   8   
1       0   6   3   8   8   5   5   6   0   5  ...   5   0   3   5   8   3   
2       0   7   8   9   9   9   7   7   0  10  ...   4   0   3   7   8   5   
3       0   3   8   9   9   3   2   3   0  10  ...   8   0   5   2   5   4   
4       0   6   1   7   8   2   4   6   0   3  ...   6   0   3   4   7   3   
...    ..  ..  ..  ..  ..  ..  ..  ..  ..  ..  ...  ..  ..  ..  ..  ..  ..   
81709   0   6   6   9   2   9   4   6   0   8  ...   8   0   2   4   7   5   
81710   0   4   5   8   2   8   7   4   0   7  ...   6   0   2   7   9   5   
81711   0   7   5   7   9   9   6   7   0   7  ...   3   0   2   6   8   5   
81712   0   3   7   3   8   9   8   3   0   9  ...   3   0   2   8   9   4   
81713   0   6   5   9   3   8   3   6   0   3  ...   7   0   3   3   3   4   

       45  46  47  48  
0       4   4   2   0  
1       3   2  

'065299460671077560842827808849104804497284024784420'

In [5]:
from multiprocessing import Pool, cpu_count
# Function to process one column
def process_column(args):
    i, X_col, divs = args  # Unpack arguments
    rho_min = X_col.min()
    rho_max = X_col.max()
    Y_col = np.zeros_like(X_col, dtype=int)
    
    if rho_min != rho_max:
        Y_col = [nint(divs * ((rho_max - x) / (rho_max - rho_min))) for x in X_col]
    return i, Y_col

# Multiprocessing main function
def parallel_process(X, divs):
    num_columns = X.shape[1]
    args = [(i, X.iloc[:, i], divs) for i in range(num_columns)]
    
    with Pool(cpu_count()) as pool:
        results = pool.map(process_column, args)
    
    # Create an empty DataFrame for results
    Y = pd.DataFrame(np.zeros_like(X, dtype=int), columns=X.columns)
    for i, Y_col in results:
        Y.iloc[:, i] = Y_col
    
    return Y
divs=10
Y = parallel_process(X, divs)
print(Y)

       1  2  3  4  5  6  7  8  9  10  ...  40  41  42  43  44  45  46  47  48  \
0      0  6  5  2  9  9  4  6  0   6  ...   4   0   2   4   7   8   4   4   2   
1      0  6  3  8  8  5  5  6  0   5  ...   5   0   3   5   8   3   3   2   3   
2      0  7  8  9  9  9  7  7  0  10  ...   4   0   3   7   8   5   3   3   3   
3      0  3  8  9  9  3  2  3  0  10  ...   8   0   5   2   5   4   2   3   5   
4      0  6  1  7  8  2  4  6  0   3  ...   6   0   3   4   7   3   3   3   3   
...   .. .. .. .. .. .. .. .. ..  ..  ...  ..  ..  ..  ..  ..  ..  ..  ..  ..   
81709  0  6  6  9  2  9  4  6  0   8  ...   8   0   2   4   7   5   3   3   2   
81710  0  4  5  8  2  8  7  4  0   7  ...   6   0   2   7   9   5   3   7   2   
81711  0  7  5  7  9  9  6  7  0   7  ...   3   0   2   6   8   5   2   3   2   
81712  0  3  7  3  8  9  8  3  0   9  ...   3   0   2   8   9   4   7   3   2   
81713  0  6  5  9  3  8  3  6  0   3  ...   7   0   3   3   3   4   2   6   3   

       49  
0       0  
1  

In [6]:
import hashlib
import random

hash_table={}

for entry in range(X.shape[0]):
    bin_string=Y.iloc[entry].astype(str).to_string(index=False).replace('\n','').replace(' ', '')
    hash_value=hashlib.sha256(bin_string.encode())
    hash_value=str(hash_value.hexdigest())
    hash_table.setdefault(hash_value,[])
    hash_table[hash_value].append(entry)
    

with open("hashtable.txt", 'w') as f:    # save entire hash_table
    for key, value in hash_table.items():
        f.write('%s  :  %s\n' % (key, value))


random_indx=[]
with open("random_str.txt", 'w') as f:   # take out one structure randomly from each bin
    for key,value in hash_table.items():
        value=random.choice(value)
        random_index=random_indx.append(value)
        f.write('%s\n' % (value))


In [8]:
random_indx=[]
#with open("random_str.txt", 'w') as f:   # take out one structure randomly from each bin
for key,value in hash_table.items():
    value=random.choice(value)
    random_indx.append(value)
#        f.write('%s\n' % (value))
filtered_rows = df.iloc[random_indx]
filtered_rows = np.array(filtered_rows.iloc[:,1:])
filtered_rows

array([[  36.8581052 ,   35.88405773,   11.01392891, ...,   17.773772  ,
          19.59529427, 6874.35714489],
       [  36.8581052 ,   35.86354223,   13.29078613, ...,   20.60260753,
          18.82768993, 6874.35714489],
       [  36.8581052 ,   34.76491469,    8.43594256, ...,   19.213876  ,
          18.68325721, 6874.35714489],
       ...,
       [  36.8581052 ,   36.61610017,   10.93931616, ...,   12.73280379,
          19.36976119, 6874.35714489],
       [  36.8581052 ,   34.63499195,   11.20472719, ...,   19.99495211,
          20.3147736 , 6874.35714489],
       [  36.8581052 ,   35.6477958 ,   11.37251196, ...,   13.52696643,
          18.84353283, 6874.35714489]])

In [16]:
from skmatter.sample_selection import FPS
import numpy as np
selector = FPS(
    n_to_select=10000,
    # int or 'random', default=0
    # Index of the first selection.
    # If ‘random’, picks a random value when fit starts.
    initialize='random',
)
X = filtered_rows
selector.fit(X)
selected_index = selector.selected_idx_
print(selected_index)

[68268 21163 30032 ... 37033 25911 43672]


In [19]:
def takeout_coordindex(random_index):
    coord=df.iloc[random_index,0]
    return coord


In [20]:
hentry=hash_table['9aaa3b8feaa662f0797cebaf38953f49e36a07d0c07817521fb8393ce9503184']
coord_indx=takeout_coordindex(hentry)

KeyError: '9aaa3b8feaa662f0797cebaf38953f49e36a07d0c07817521fb8393ce9503184'

In [21]:
np.set_printoptions(suppress=True)
coord_indx=np.array([],dtype='int16')
i=0
while i <= selector.n_to_select-1 :
      coord=takeout_coordindex(selected_index[i])
      coord_indx=np.append(coord_indx,coord)
      i=i+1 

C=coord_indx.reshape(-1,1)


In [22]:
file = open("coords_tobe_extracted.dat", "w+")
import sys
import numpy as np
string = "\n ".join(map(str, coord_indx))
#np.set_printoptions(threshold=sys.maxsize)
#string=str(coord_indx.reshape(-1,1))
##string=string.strip('[]')
print(string)
file.write(string)
b=takeout_coordindex(146)
b
file.close()

coord_343329.xsf
 coord_236635.xsf
 coord_141958.xsf
 coord_338543.xsf
 coord_67752.xsf
 coord_253298.xsf
 coord_29858.xsf
 coord_147586.xsf
 coord_181177.xsf
 coord_280418.xsf
 coord_308976.xsf
 coord_86005.xsf
 coord_325031.xsf
 coord_239913.xsf
 coord_93807.xsf
 coord_105297.xsf
 coord_280049.xsf
 coord_99166.xsf
 coord_362885.xsf
 coord_257462.xsf
 coord_23622.xsf
 coord_295020.xsf
 coord_331172.xsf
 coord_95591.xsf
 coord_38696.xsf
 coord_78955.xsf
 coord_116174.xsf
 coord_164746.xsf
 coord_352363.xsf
 coord_87180.xsf
 coord_300902.xsf
 coord_230477.xsf
 coord_350446.xsf
 coord_18286.xsf
 coord_107343.xsf
 coord_356315.xsf
 coord_54143.xsf
 coord_97158.xsf
 coord_192370.xsf
 coord_142527.xsf
 coord_248772.xsf
 coord_45968.xsf
 coord_116114.xsf
 coord_298396.xsf
 coord_131939.xsf
 coord_142330.xsf
 coord_156266.xsf
 coord_100148.xsf
 coord_115154.xsf
 coord_233263.xsf
 coord_198365.xsf
 coord_254337.xsf
 coord_74063.xsf
 coord_94523.xsf
 coord_251298.xsf
 coord_42758.xsf
 coord_167

In [13]:
len(coord_indx)

5000

In [82]:
xsf_directory='../7/completed'
with open('animation.axsf','w') as file:
     file.write(f"ANIMSTEPS {len(coord_indx)}\n")
     n=0
     for file_name in coord_indx:
         atoms, coords, gradients, total_energy = read_xsf(os.path.join(xsf_directory, file_name))
         n=n+1
         file.write(f"ATOMS {n}\n")
         for atom, coord in zip(atoms, coords):
             file.write(f"{atom} {coord[0]:.4f} {coord[1]:.4f} {coord[2]:.4f}\n")
    

In [23]:
xsf_directory='./coords'
with open('animation.xyz','w') as file:
     #file.write(f"ANIMSTEPS {len(coord_indx)}\n")
     n=0
     for file_name in coord_indx:
         atoms, coords, gradients, total_energy = read_xsf(os.path.join(xsf_directory, file_name))
         n=n+1
         file.write(f"{len(atoms)}\n")
         file.write(f"{file_name}\n")
         for atom, coord in zip(atoms, coords):
             file.write(f"{atom} {coord[0]:.4f} {coord[1]:.4f} {coord[2]:.4f}\n")
    