# Fichier Exemple : utilisation des modèles développés
### 1. Modèle IA pour TC
### 2. Modèle IA pour PC
### 3. Modèle IA pour ACEN
### 4. Modèle IA pour NBP - normal boiling point
### 5. Modèle IA pour TTR - point triple
### 6. Modèle IA pour VC
### Copyright - LRGP - Nancy 2024 - Roda Bounaceur


## -- Step 1 : Appel de la classe modele et importation des différents modèles IA développés

In [5]:
#
# Avec l'approche "Ensemble Learning", Le modèle finale est une moyenne de plusieurs sous-modèles
# La classe ainsi développée permet de faire cette moyenne automatiquement
#
from sklearn.base import BaseEstimator, RegressorMixin

class MetaModel(BaseEstimator, RegressorMixin):
    def __init__(self, models):
        self.models = models
    
    def fit(self, X, y):
        for model in self.models:
            model.fit(X, y)
        return self
    
    def predict(self, X):
        predictions = [model.predict(X) for model in self.models]
        return sum(predictions) / len(self.models)

In [18]:
#
# Chargement des modèles IA
#
#
import joblib
#
TC = joblib.load('./01_modele_final_TC.joblib')      # [K]
PC = joblib.load('./02_modele_final_PC.joblib')      # [bar]
ACEN = joblib.load('./03_modele_final_ACEN.joblib')  # [-]
NBP = joblib.load('./04_modele_final_NBP.joblib')    # [K]
TTR = joblib.load('./05_modele_final_TTR.joblib')    # [K]
VC = joblib.load('./06_modele_final_VC.joblib')      # [m3/kmol]

In [7]:
#
# Appel de la liste des descripteurs à conserver
# Cette liste a été déterminé après l'analyse statistique de la database complète
# En se focalisant sur l'étude des TC, 247 descripteurs ont été retenus
#
# Lire les noms de colonnes d'un fichier texte dans une liste
with open('noms_colonnes_247_TC.txt', 'r') as f:
    noms_colonnes_247_TC = [ligne.strip() for ligne in f]
del noms_colonnes_247_TC[0] 
# noms_colonnes_247_TC
#

## -- Step 2 : Importation des modules pour Mordred et RDkit et des fonctions associées

In [8]:
#
# Importation des bibliothèques RDKIT + Mordred
#
from rdkit.Chem import AllChem
from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.ML.Descriptors import MoleculeDescriptors
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole
import rdkit.Chem.inchi
#
from mordred import Calculator, descriptors
import mordred

#
# Ecriture des fonctions nécessaires
#
def All_Mordred_descriptors(data): # Fonction d'appelle des descripteurs
    calc = Calculator(descriptors, ignore_3D=False)
    mols = [Chem.MolFromSmiles(smi) for smi in data]
   
    # pandas df
    df = calc.pandas(mols)
    
    #return mols # et commenter df pour tester les molecules
    return df

#
# Fonction pour obtenir la formule brute + notation InChiKey
#
def smiles_to_Inchikey_and_molecule_1(SMILES): 
    mol = Chem.MolFromSmiles(SMILES)
    smiles = Chem.MolToSmiles(mol)
    Inchikey = rdkit.Chem.inchi.MolToInchiKey(mol)
    descriptors = All_Mordred_descriptors_1(SMILES)
    nC = descriptors["nC"].iloc[0]
    nH = descriptors["nH"].iloc[0]
    nO = descriptors["nO"].iloc[0]
    nN = descriptors["nN"].iloc[0]
    return Inchikey,smiles,nC,nH,nO,nN

#
# Fonction ecriture de la notation smile canonique
#
def canonical_smiles(smiles): 
    mols = [Chem.MolFromSmiles(smi) for smi in smiles] 
    smiles = [Chem.MolToSmiles(mol) for mol in mols]
    return smiles

#
# Même fonction mais ne renvoie que l'Inchikey
# Il évite la latence due à l'appel de Mordred
#
def smiles_to_Inchikey(SMILES):
    
    mol = Chem.MolFromSmiles(SMILES)
    smiles = Chem.MolToSmiles(mol)
    Inchikey = rdkit.Chem.inchi.MolToInchiKey(mol)
    return(Inchikey)

## -- Step 4 : Importation des modules python de base

In [9]:
#
# Importation des bibliothèques de bases - Pandas et Numpy - pour manipuler les data, etc ...
#
import pandas as pd  
import numpy as np
#

## -- Step 3 : Exemple de calcul

In [10]:
#
# Importation d'un fichier de data smile
#
df =  pd.read_csv('Liste_Alcanes.txt',sep='*')
df

Unnamed: 0,SMILES
0,CC
1,CCC
2,CCCC
3,CCCCC
4,CCCCCC
...,...
140,CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC...
141,CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC...
142,CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC...
143,CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC...


In [13]:
#
# Calcul de tous les descripteurs possibles avec la méthode Mordred
#
import warnings
warnings.simplefilter("ignore", category=RuntimeWarning)
#
df_all_descriptors = All_Mordred_descriptors(df['SMILES'])

 21%|██        | 30/145 [00:54<00:51,  2.25it/s] 

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 22%|██▏       | 32/145 [00:55<00:38,  2.90it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 23%|██▎       | 34/145 [00:58<01:00,  1.83it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 29%|██▉       | 42/145 [00:59<00:52,  1.97it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 30%|███       | 44/145 [00:59<00:30,  3.31it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 36%|███▌      | 52/145 [01:03<00:50,  1.86it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 51%|█████     | 74/145 [01:04<00:24,  2.91it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwarg

 52%|█████▏    | 76/145 [01:05<00:10,  6.35it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 55%|█████▌    | 80/145 [01:07<00:12,  5.17it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 57%|█████▋    | 82/145 [01:08<00:12,  4.98it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 57%|█████▋    | 83/145 [01:08<00:13,  4.45it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 59%|█████▉    | 86/145 [01:10<00:21,  2.74it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 61%|██████    | 88/145 [01:11<00:16,  3.37it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 63%|██████▎   | 92/145 [01:11<00:14,  3.61it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 65%|██████▍   | 94/145 [01:11<00:10,  4.81it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 67%|██████▋   | 97/145 [01:13<00:16,  2.87it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 73%|███████▎  | 106/145 [01:14<00:11,  3.38it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 78%|███████▊  | 113/145 [01:16<00:07,  4.17it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 79%|███████▉  | 115/145 [01:16<00:06,  4.98it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 81%|████████  | 117/145 [01:17<00:05,  4.93it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 81%|████████▏ | 118/145 [01:17<00:05,  4.81it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 83%|████████▎ | 120/145 [01:18<00:05,  4.24it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 84%|████████▍ | 122/145 [01:18<00:04,  5.38it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 85%|████████▍ | 123/145 [01:20<00:11,  1.97it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 89%|████████▉ | 129/145 [01:23<00:09,  1.67it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 90%|█████████ | 131/145 [01:24<00:04,  3.11it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 92%|█████████▏| 134/145 [01:24<00:03,  3.44it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


100%|██████████| 145/145 [01:25<00:00,  1.70it/s]


  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


In [14]:
#
# Réduction du nombre de descripteurs à ceux définis par l'analyse statistique - 245 data
#
X_Alcane = df_all_descriptors[noms_colonnes_247_TC]
#

In [15]:
#
# Nettoyage des data - A faire obligatoirement car parfois du texte subsiste ou des NaN, etc.
#
for col in X_Alcane.columns:
    # Convertir la colonne en float en remplaçant les valeurs non convertibles par zéro
    X_Alcane.loc[:, col] = pd.to_numeric(X_Alcane[col], errors='coerce').fillna(0)
#
X_Alcane = X_Alcane.fillna(0)
X_Alcane = X_Alcane.astype(float)
#
X_Alcane

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)


Unnamed: 0,SpAbs_A,SpDiam_A,VR2_A,nH,nO,nX,ATS0dv,ATS6s,ATS0Z,ATS2Z,...,VSA_EState4,VSA_EState7,AMID_h,MID_N,piPC2,n6Ring,TopoPSA,GGI4,WPath,mZagreb1
0,2.000000,2.000000,0.707107,6.0,0.0,0.0,2.0,0.00,78.0,42.0,...,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.00,1.0,2.00
1,2.828427,2.828427,1.121195,8.0,0.0,0.0,6.0,0.00,116.0,103.0,...,0.0,1.250000,0.0,0.0,0.693147,0.0,0.0,0.00,4.0,2.25
2,4.472136,3.236068,1.472998,10.0,0.0,0.0,10.0,0.00,154.0,164.0,...,0.0,2.638889,0.0,0.0,1.098612,0.0,0.0,0.00,10.0,2.50
3,5.464102,3.464102,1.797343,12.0,0.0,0.0,14.0,9.00,192.0,225.0,...,0.0,4.076389,0.0,0.0,1.386294,0.0,0.0,0.00,20.0,2.75
4,6.987918,3.603875,2.104810,14.0,0.0,0.0,18.0,24.00,230.0,286.0,...,0.0,5.536389,0.0,0.0,1.609438,0.0,0.0,0.08,35.0,3.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
140,185.169775,3.999087,25.047227,294.0,0.0,0.0,578.0,1739.50,5550.0,8826.0,...,0.0,215.361939,0.0,0.0,4.976734,0.0,0.0,0.08,518665.0,38.00
141,186.432377,3.999099,25.168490,296.0,0.0,0.0,582.0,1751.75,5588.0,8887.0,...,0.0,216.861892,0.0,0.0,4.983607,0.0,0.0,0.08,529396.0,38.25
142,187.716206,3.999111,25.289469,298.0,0.0,0.0,586.0,1764.00,5626.0,8948.0,...,0.0,218.361846,0.0,0.0,4.990433,0.0,0.0,0.08,540274.0,38.50
143,188.978950,3.999123,25.410169,300.0,0.0,0.0,590.0,1776.25,5664.0,9009.0,...,0.0,219.861800,0.0,0.0,4.997212,0.0,0.0,0.08,551300.0,38.75


In [16]:
#
# Estimation des Propriétés Thermo
#
XX = X_Alcane
#
TC_predicted   = []
PC_predicted   = []
ACEN_predicted = []
NBP_predicted  = []
TTR_predicted  = []
VC_predicted   = []
#
TC_predicted   = TC.predict( XX )
PC_predicted   = PC.predict( XX )
ACEN_predicted = ACEN.predict( XX )
NBP_predicted  = NBP.predict( XX )
TTR_predicted  = TTR.predict( XX )
VC_predicted   = VC.predict( XX )
#

In [17]:
#
# Après on fait ce que l'on veut - Ici affichage simple des résultats
#
for index in range(len(TC_predicted)):
    print(TC_predicted[index] , PC_predicted[index] , ACEN_predicted[index] , NBP_predicted[index] , TTR_predicted[index] , VC_predicted[index] )

305.1141802777416 48.74638002810309 0.10042144484589088 185.52199740941447 95.89518648526901 0.14550192850353097
365.7209919557075 42.545905981991645 0.13970939253233716 232.618626931064 88.9377980661026 0.20800396970875118
421.6671552466249 38.074123141814944 0.18855984536507686 272.5385096334604 135.58145313095093 0.2602416757628265
469.61490313708 33.74210187962281 0.25101485723841843 309.6218432402569 143.71735282335504 0.31532354604436225
507.618898675859 30.050925671992836 0.3003288866277668 341.8817506786439 175.73246135476347 0.3667433164608363
541.3335596217 27.424218959684648 0.35340694853738436 372.4498200464385 185.07656541439823 0.4272118911136407
568.4017078005301 24.98420095532942 0.3987271892488397 398.67245244831145 209.2914806355334 0.48611429433247383
594.3844009674369 22.934469560770168 0.44434534854514535 424.2645532153512 224.67081484543868 0.5476400778093892
617.7365030913918 21.148763964065026 0.490808142076529 447.64438496298646 239.68586479114182 0.61689655402