# Predition de l'energie des molécules

#### imporation des librairies

In [64]:
import pandas as pd
import numpy as np
# ase est une librairie pour lire les fichiers .xyz (souvent utilisé pour representer des atomes.) site : https://wiki.fysik.dtu.dk/
from ase.io.xyz import read_xyz
from ase.io import read
from ase.visualize import view
import os 
import plotly.express as px

### Prise en main de la librairie ase

Dans cette partie, nous prenons en main la librairie ASE et regardons les différentes fonctions qui pourrait nous être utile pour la suite.

In [29]:
test = read("./train/atoms/train/id_1.xyz")

In [44]:
view(test, viewer='x3d')

#### Listing des infos (fonctions) complémentaires potentiellement interessantes pour une molécule

In [32]:
test.get_all_distances().shape

(19, 19)

In [37]:
test.get_masses()

array([14.007, 12.011, 12.011, 12.011, 12.011, 12.011, 14.007,  1.008,
        1.008,  1.008,  1.008,  1.008,  1.008,  1.008,  1.008,  1.008,
        1.008,  1.008,  1.008])

In [34]:
test.get_chemical_formula(mode='hill', empirical=False)

'C5H12N2'

In [40]:
test.get_global_number_of_atoms()

19

In [39]:
test.get_positions()

array([[ 0.857135,  1.766441,  1.943662],
       [ 0.099524,  0.525085,  1.758627],
       [-0.233978,  0.294832,  0.290564],
       [ 0.740909, -0.375661, -0.628106],
       [-0.519742, -1.108375, -0.204441],
       [-1.656612, -1.365652, -1.170557],
       [ 0.916179,  0.131485, -1.980716],
       [ 1.022234,  1.944131,  2.930746],
       [ 0.353962,  2.557052,  1.549804],
       [-0.841044,  0.528336,  2.354028],
       [ 0.698684, -0.314844,  2.141217],
       [-0.841061,  1.081979, -0.162159],
       [ 1.675159, -0.731132, -0.180879],
       [-0.377184, -1.900361,  0.534083],
       [-1.740849, -0.565663, -1.903374],
       [-2.610844, -1.439292, -0.64818 ],
       [-1.507099, -2.299673, -1.712636],
       [ 0.103102,  0.661861, -2.28071 ],
       [ 1.071134, -0.629252, -2.639089]])

In [41]:
test.get_center_of_mass(scaled=False)

array([0.02960112, 0.01087163, 0.00017221])

In [51]:
test.set_calculator()

In [56]:
# Necessite de donner un calculateur à la molécule
try :
    test.get_charges()
except Exception as e: 
    print(e)
    print("Erreur : Necessite de donner un calculateur à la molécule")

Atoms object has no calculator.
Erreur : Necessite de donner un calculateur à la molécule


In [53]:
test.euler_rotate(phi=0.0, theta=0.0, psi=0.0, center=(0, 0, 0))

In [54]:
# Necessite de donner un calculateur à la molécule
try :
    test.get_dipole_moment()
except : 
    print("Erreur : Necessite de donner un calculateur à la molécule")

Erreur : Necessite de donner un calculateur à la molécule


In [49]:
# Necessite de donner un calculateur à la molécule
try :
    test.get_forces(apply_constraint=True, md=False)
except : 
    print("Erreur : Necessite de donner un calculateur à la molécule")

Erreur : Necessite de donner un calculateur à la molécule


In [50]:
test.get_moments_of_inertia(vectors=False)

array([121.81724473, 305.64416583, 383.56495588])

### Chargement des données et mise en forme

In [125]:
df_train_energies = pd.read_csv("./train/energies/train.csv",sep=",",dtype = {"id":"str"})

In [126]:
# Dictionnaire regroupant les différents atomes.
def get_dict_atoms(path) :
    """
    input : path of the file
    return : dictionnary of atoms
    """
    dict_atoms_train = {}
    l = os.scandir(path)
    l = [e.name for e in l]
    for e in l : 
        dict_atoms_train[e.split(".")[0].split("_")[1]] = read(path+e)
    return dict_atoms_train

In [127]:
dict_atoms_train = get_dict_atoms("./train/atoms/train/")
dict_atoms_test = get_dict_atoms("./train/atoms/test/")

In [131]:
# nombre de molécule uniques 
df_train = pd.DataFrame({"id":dict_atoms_train.keys(),"Molecule":dict_atoms_train.values()})

In [133]:
def getParams(x) :
    return x.get_chemical_formula(mode='hill', empirical=False),x.get_global_number_of_atoms()

In [134]:
df_train[["Molecule_formula","Number_of_molecule"]] = None
df_train[["Molecule_formula","Number_of_molecule"]] = df_train.apply(lambda x : getParams(x["Molecule"]), axis=1,result_type='expand')

In [138]:
df_train = df_train.merge(df_train_energies,on="id",how="left")

### Exploratory Data Analysis

In [128]:
df_train_energies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6770 entries, 0 to 6769
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   id      6770 non-null   object 
 1   energy  6770 non-null   float64
dtypes: float64(1), object(1)
memory usage: 105.9+ KB


In [129]:
df_train_energies.describe()

Unnamed: 0,energy
count,6770.0
mean,-78.077716
std,11.69214
min,-103.413076
25%,-89.656981
50%,-77.649109
75%,-69.314706
max,-19.938731


In [130]:
px.histogram(df_train_energies["energy"].values)

In [132]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6773 entries, 0 to 6772
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        6773 non-null   object
 1   Molecule  6773 non-null   object
dtypes: object(2)
memory usage: 106.0+ KB


In [136]:
df_train.shape

(6773, 4)

In [135]:
print("Nombre de molécules différentes : "+str(len(df_train["Molecule_formula"].unique())))

Nombre de molécules différentes : 107


In [145]:
df_train

Unnamed: 0,id,Molecule,Molecule_formula,Number_of_molecule,energy
0,2337,"(Atom('C', [-2.09372, 1.837671, -1.484257], in...",C7H12,19,-90.905125
1,3572,"(Atom('C', [-0.138336, -0.083298, 2.268276], i...",C6H12,18,-83.108256
2,5406,"(Atom('O', [-1.031679, 0.743757, 1.530765], in...",C4H8O2,14,-66.612301
3,2736,"(Atom('N', [-1.385826, -0.816217, 1.59968], in...",C5H10N2,17,-76.819860
4,6560,"(Atom('C', [-2.382196, 0.792269, 0.941563], in...",C7H12,19,-90.627658
...,...,...,...,...,...
6768,2957,"(Atom('C', [-1.820669, -1.574578, -0.564349], ...",C5H11N,17,-76.053972
6769,4821,"(Atom('O', [0.371446, -1.285156, 0.263895], in...",C4H5NO,11,-55.605303
6770,1309,"(Atom('C', [-0.583074, -2.312918, 0.885891], i...",C7H14,21,-97.257312
6771,5934,"(Atom('O', [-1.408811, -1.916292, -0.432555], ...",C4H6O2,12,-61.909691


In [148]:
px.scatter(df_train,x="Number_of_molecule",y="energy",hover_data = ["energy","Molecule_formula"])