# Pre-processing

In [1]:
# Import the necessary libraries
import numpy as np 
import pandas as pd 
import pubchempy as pcp
import time
import requests
from requests.utils import quote
from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.Chem import Descriptors3D
from rdkit.Chem import rdchem
from rdkit.Chem import AllChem
from rdkit import DataStructs
from rdkit.ML.Descriptors import MoleculeDescriptors
from rdkit.Chem.rdchem import PeriodicTable, GetPeriodicTable
from rdkit.Chem import Fragments
from rdkit.Chem.rdchem import EditableMol
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem.Draw.MolDrawing import MolDrawing, DrawingOptions
from rdkit.Chem import PyMol
from mol2vec.features import mol2alt_sentence, mol2sentence, MolSentence, DfVec, sentences2vec
from gensim.models import word2vec
from sklearn import preprocessing
import re
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [2]:
import numpy
numpy.__version__

'1.24.4'

In [3]:
import gensim 
gensim.__version__

'4.3.3'

## 1. Digital representations for the chemical compounds that make up the wine
First: obtain digital representations of chemical compounds using the mol2vec library

In [4]:
# Forming a list of chemical compounds
chem_list = ['Acetoine',
       'Acetaldehyde', 'Methanol', '1-propanol', 'Ethyl lactate',
       'Isobutanol', '1-butanol', '2-butanol', '2-methyl-1-butanol',
       '3-methyl-1-butanol', '2-methyl-1-propanol', '1-pentanol', '1-hexanol',
       '2,3-butanediol', '2-phenylethanol', '3-(Methylthio)-1-propanol',
       'Hexyl acetate', 'Ethyl octanoate', 'Diethyl succinate',
       '2-phenylethyl acetate', 'Diethyl malate', 'Ethyl decanoate',
       'Isoamyl alcohol', 'Isoamyl acetate', '1-heptanol', 'Ethyl acetate',
       'Isopropyl acetate', 'Isobutyl acetate', 'Ethyl butyrate',
       'Ethyl hexanoate', 'Ethyl isovalerate', 'Pentyl acetate',
       'Phenyl acetate', 'Ethyl caprylate', 'alpha-TERPINEOL', 'Linalool ',
       'Nerol', 'Citronellol', 'Geraniol', '1-octanol', 'Hexanoic acid',
       'Octanoic acid', 'Decanoic acid', 'Propanoic acid', 'butyric acid',
       '2-methylpropanoic acid', '2-methylbutyric acid',
       '3-methylbutyric acid', 'Dodecanoic acid']

In [5]:
# Create a new table and add the SMILES of all components to it
smiles_df = pd.DataFrame(chem_list, columns=['Name'])
smiles_df.head()

Unnamed: 0,Name
0,Acetoine
1,Acetaldehyde
2,Methanol
3,1-propanol
4,Ethyl lactate


In [6]:
smiles_df = pd.DataFrame(chem_list, columns=['Name'])
smiles_df['Name'] = smiles_df['Name'].astype(str).str.strip()

alias = {
    'Acetoine': 'Acetoin',
    'alpha-TERPINEOL': 'alpha-Terpineol',
    'butyric acid': 'Butanoic acid',
    'Hexanoic acid': 'Caproic acid',
    'Octanoic acid': 'Caprylic acid',
    'Decanoic acid': 'Capric acid',
    'Dodecanoic acid': 'Lauric acid',
    '2-methylpropanoic acid': 'Isobutyric acid',
    '3-methylbutyric acid': 'Isovaleric acid',
    'Linalool': 'Linalool',
}
smiles_df['Name_query'] = smiles_df['Name'].map(lambda x: alias.get(x, x))

# Offline fallback mapping (name -> SMILES) used only when external resolvers
# (PubChem / NCI Cactus) are unavailable or rate-limited.
local_map = {
    'Acetoin': 'CC(=O)C(C)O',
    'Acetaldehyde': 'CC=O',
    'Methanol': 'CO',
    '1-propanol': 'CCCO',
    'Ethyl lactate': 'CCOC(=O)C(O)C',
    'Isobutanol': 'CC(C)CO',
    '1-butanol': 'CCCCO',
    '2-butanol': 'CCC(O)C',
    '2-methyl-1-butanol': 'CCC(C)CO',
    '3-methyl-1-butanol': 'CC(C)CCO',
    '2-methyl-1-propanol': 'CC(C)CO',
    '1-pentanol': 'CCCCCO',
    '1-hexanol': 'CCCCCCO',
    '2,3-butanediol': 'CC(C)(O)CO',
    '2-phenylethanol': 'OCc1ccccc1',
    '3-(Methylthio)-1-propanol': 'CSCCCO',
    'Hexyl acetate': 'CCCCCCOC(=O)C',
    'Ethyl octanoate': 'CCCCCCCOC(=O)CC',
    'Diethyl succinate': 'CCOC(=O)CCC(=O)OCC',
    '2-phenylethyl acetate': 'CC(=O)OCCc1ccccc1',
    'Diethyl malate': 'CCOC(=O)CH(OH)CH2C(=O)OCC',
    'Ethyl decanoate': 'CCCCCCCCCOC(=O)CC',
    'Isoamyl alcohol': 'CC(C)CCO',
    'Isoamyl acetate': 'CC(=O)OCC(C)C',
    '1-heptanol': 'CCCCCCCO',
    'Ethyl acetate': 'CCOC(=O)C',
    'Isopropyl acetate': 'CC(=O)OC(C)C',
    'Isobutyl acetate': 'CC(=O)OCC(C)C',
    'Ethyl butyrate': 'CCCC(=O)OCC',
    'Ethyl hexanoate': 'CCCCCC(=O)OCC',
    'Ethyl isovalerate': 'CC(C)CC(=O)OCC',
    'Pentyl acetate': 'CC(=O)OCCCCC',
    'Phenyl acetate': 'CC(=O)Oc1ccccc1',
    'Ethyl caprylate': 'CCCCCCCC(=O)OCC',
    'alpha-Terpineol': 'CC1=CCC(CC1)(C)C(C)(O)C',
    'Linalool': 'CC(C)=CCC=C(C)CO',
    'Nerol': 'CC(C)=CCC/C(C)=C/CO',
    'Citronellol': 'CC(C)=CCCC(C)CO',
    'Geraniol': 'CC(C)=CCC/C(C)=C/CO',
    '1-octanol': 'CCCCCCCCO',
    'Caproic acid': 'CCCCCC(=O)O',
    'Caprylic acid': 'CCCCCCCC(=O)O',
    'Capric acid': 'CCCCCCCCCC(=O)O',
    'Propanoic acid': 'CCC(=O)O',
    'Butanoic acid': 'CCCC(=O)O',
    'Isobutyric acid': 'CC(C)C(=O)O',
    '2-Methylbutanoic acid': 'CCC(C)C(=O)O',
    'Isovaleric acid': 'CC(C)CC(=O)O',
    'Lauric acid': 'CCCCCCCCCCCC(=O)O',
}

# --- HTTP session ---
session = requests.Session()
session.headers.update({"User-Agent": "Mozilla/5.0 (PubChem/NCI client)"})

def f_isomeric_smiles(name: str, sleep=0.12, timeout=10):
    if not isinstance(name, str) or not name.strip():
        return None
    qname = alias.get(name.strip(), name.strip())
    time.sleep(sleep)  
    
    try:
        u = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/{quote(qname, safe='')}/property/IsomericSMILES,CanonicalSMILES/JSON"
        r = session.get(u, timeout=timeout)
        data = r.json()
        if "PropertyTable" in data:
            props = (data.get("PropertyTable") or {}).get("Properties") or []
            if props:
                rec = props[0] or {}
                iso = rec.get("IsomericSMILES")
                can = rec.get("CanonicalSMILES")
                if iso or can:
                    return iso or can
    except Exception:
        pass

    # PubChem: name -> CID -> properties
    try:
        u = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/{quote(qname, safe='')}/cids/JSON"
        r = session.get(u, timeout=timeout)
        cids = ((r.json().get("IdentifierList") or {}).get("CID")) or []
        if cids:
            cid = cids[0]
            u = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/{cid}/property/IsomericSMILES,CanonicalSMILES/JSON"
            r = session.get(u, timeout=timeout)
            data = r.json()
            props = (data.get("PropertyTable") or {}).get("Properties") or []
            if props:
                rec = props[0] or {}
                iso = rec.get("IsomericSMILES")
                can = rec.get("CanonicalSMILES")
                if iso or can:
                    return iso or can
    except Exception:
        pass

    try:
        u = f"https://cactus.nci.nih.gov/chemical/structure/{quote(qname, safe='')}/smiles"
        r = session.get(u, timeout=timeout)
        if r.status_code == 200:
            txt = r.text.strip()
            if txt and "Not Found" not in txt and "<" not in txt:
                return txt
    except Exception:
        pass

    return local_map.get(qname)

smiles_df['IsomericSMILES'] = smiles_df['Name_query'].apply(f_isomeric_smiles)

print("Not found:", smiles_df['IsomericSMILES'].isna().sum())
smiles_df.head(10)


Not found: 0


Unnamed: 0,Name,Name_query,IsomericSMILES
0,Acetoine,Acetoin,CC(O)C(C)=O
1,Acetaldehyde,Acetaldehyde,CC=O
2,Methanol,Methanol,CO
3,1-propanol,1-propanol,CCCO
4,Ethyl lactate,Ethyl lactate,CCOC(=O)C(C)O
5,Isobutanol,Isobutanol,CC(C)CO
6,1-butanol,1-butanol,CCCCO
7,2-butanol,2-butanol,CCC(C)O
8,2-methyl-1-butanol,2-methyl-1-butanol,CCC(C)CO
9,3-methyl-1-butanol,3-methyl-1-butanol,CC(C)CCO


In [7]:
smiles_df.index = smiles_df['IsomericSMILES']
smiles_df.head()

Unnamed: 0_level_0,Name,Name_query,IsomericSMILES
IsomericSMILES,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
CC(O)C(C)=O,Acetoine,Acetoin,CC(O)C(C)=O
CC=O,Acetaldehyde,Acetaldehyde,CC=O
CO,Methanol,Methanol,CO
CCCO,1-propanol,1-propanol,CCCO
CCOC(=O)C(C)O,Ethyl lactate,Ethyl lactate,CCOC(=O)C(C)O


In [8]:
smiles_df['mol'] = smiles_df['IsomericSMILES'].apply(lambda s: Chem.MolFromSmiles(s) if isinstance(s, str) else None)
smiles_df['sentence'] = smiles_df['mol'].apply(lambda m: MolSentence(mol2alt_sentence(m, 1)) if m is not None else MolSentence([]))

model = word2vec.Word2Vec.load('model_300dim.pkl')
dim_src = model.wv.vector_size 

def sentences2vec(sentences, model, unseen='UNK'):
    keys = set(model.wv.index_to_key)
    unseen_vec = model.wv.get_vector(unseen) if unseen in keys else np.zeros(model.wv.vector_size, dtype=np.float32)
    out = []
    for sent in sentences:
        toks = list(sent)
        if not toks:
            out.append(unseen_vec.copy())
            continue
        acc = np.zeros(model.wv.vector_size, dtype=np.float32)
        for t in toks:
            acc += model.wv.get_vector(t) if t in keys else unseen_vec
        out.append(acc)
    return np.vstack(out).astype(np.float32)

vecs_300 = sentences2vec(smiles_df['sentence'], model, unseen='UNK')

# --- PCA (V×300, V≫100) ---
vocab_mat = model.wv.vectors 
scaler = StandardScaler(with_mean=True, with_std=True)
vocab_scaled = scaler.fit_transform(vocab_mat) 
pca = PCA(n_components=100, svd_solver='randomized', random_state=42)
pca.fit(vocab_scaled)

vecs_scaled = scaler.transform(vecs_300)       
vecs_100 = pca.transform(vecs_scaled).astype(np.float32)

smiles_df['mol2vec'] = [DfVec(v) for v in vecs_100]

Mol2vec_list = [f"Mol2vec{i}" for i in range(100)]
smiles_df[Mol2vec_list] = vecs_100

  smiles_df[Mol2vec_list] = vecs_100
  smiles_df[Mol2vec_list] = vecs_100
  smiles_df[Mol2vec_list] = vecs_100
  smiles_df[Mol2vec_list] = vecs_100
  smiles_df[Mol2vec_list] = vecs_100
  smiles_df[Mol2vec_list] = vecs_100


In [9]:
smiles_df.head()

Unnamed: 0_level_0,Name,Name_query,IsomericSMILES,mol,sentence,mol2vec,Mol2vec0,Mol2vec1,Mol2vec2,Mol2vec3,...,Mol2vec90,Mol2vec91,Mol2vec92,Mol2vec93,Mol2vec94,Mol2vec95,Mol2vec96,Mol2vec97,Mol2vec98,Mol2vec99
IsomericSMILES,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
CC(O)C(C)=O,Acetoine,Acetoin,CC(O)C(C)=O,<rdkit.Chem.rdchem.Mol object at 0x00000209492...,"(2246728737, 3537119515, 2245273601, 257963457...","(100,) dimensional vector",4.015188,32.161507,6.612415,-18.856762,...,-1.496407,-2.350022,5.844427,-3.546087,-12.798881,-3.380084,-8.348577,0.134422,-8.949853,-8.625219
CC=O,Acetaldehyde,Acetaldehyde,CC=O,<rdkit.Chem.rdchem.Mol object at 0x00000209492...,"(2246728737, 3545353036, 2246703798, 446538036...","(100,) dimensional vector",-2.071191,8.975573,12.101973,-13.984548,...,-0.841639,-2.737437,5.109271,-2.387955,-0.513017,-0.124627,-2.238647,-3.149096,-3.508213,-2.376474
CO,Methanol,Methanol,CO,<rdkit.Chem.rdchem.Mol object at 0x00000209492...,"(2246728737, 864662311, 1533899907)","(100,) dimensional vector",-0.625881,4.537026,1.408581,-2.789965,...,-0.819517,-0.780651,1.686044,0.055773,-2.10084,-1.208281,-0.397336,-0.40837,-1.969918,-0.917015
CCCO,1-propanol,1-propanol,CCCO,<rdkit.Chem.rdchem.Mol object at 0x00000209492...,"(2246728737, 3542456614, 2245384272, 117312591...","(100,) dimensional vector",0.254139,15.224995,0.893574,-4.234551,...,-7.746664,-1.868781,3.099017,-4.963908,-9.283813,4.429289,-2.659184,-1.415973,-6.830268,-9.272933
CCOC(=O)C(C)O,Ethyl lactate,Ethyl lactate,CCOC(=O)C(C)O,<rdkit.Chem.rdchem.Mol object at 0x00000209492...,"(2246728737, 3542456614, 2245384272, 399408866...","(100,) dimensional vector",3.31885,38.945786,8.586362,-18.389406,...,-1.607992,-5.572074,3.771054,-5.962752,-17.407959,-0.533512,-2.914109,-2.180466,-15.222984,-8.718189


## 2. Adding Concentrations
At this stage, matrices for each wine will be created; a dataset containing concentrations will be imported to construct an array of matrices characterizing each wine, and an array of target values will also be created

### 2.1. Working with concentrations

In [10]:
df_aroma = pd.read_csv('WineAroma.csv')
# Rename columns to make it easier to search
df_aroma = df_aroma.rename(columns={"3-methyltiopropanol": "3-(Methylthio)-1-propanol", 
                          "α - terpineol": "alpha-TERPINEOL", 
                          "β – citronellol": "Citronellol", 
                          "Butyric acid": "butyric acid",
                          "Acetaldehyde, mg/dm3": "Acetaldehyde"})
df_aroma = df_aroma.fillna(0) # Filling in missing values with zeros

In [11]:
# Remove columns with zero concentrations
df_aroma = df_aroma.drop(columns=['Ethyl butyrate', 'alpha-TERPINEOL', 'Linalool ', 'Nerol', 'Citronellol'])

In [12]:
conc_df = df_aroma.loc[:, 'Acetoine' : 'Dodecanoic acid']
conc_df = conc_df.applymap(lambda x: 0 if isinstance(x, str) else x)
conc_df = conc_df.astype(float)
conc_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 449 entries, 0 to 448
Data columns (total 44 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Acetoine                   449 non-null    float64
 1   Acetaldehyde               449 non-null    float64
 2   Methanol                   449 non-null    float64
 3   1-propanol                 449 non-null    float64
 4   Ethyl lactate              449 non-null    float64
 5   Isobutanol                 449 non-null    float64
 6   1-butanol                  449 non-null    float64
 7   2-butanol                  449 non-null    float64
 8   2-methyl-1-butanol         449 non-null    float64
 9   3-methyl-1-butanol         449 non-null    float64
 10  2-methyl-1-propanol        449 non-null    float64
 11  1-pentanol                 449 non-null    float64
 12  1-hexanol                  449 non-null    float64
 13  2,3-butanediol             449 non-null    float64

  conc_df = conc_df.applymap(lambda x: 0 if isinstance(x, str) else x)


In [13]:
# Normalization of concentration
sc_conc = preprocessing.MinMaxScaler()
conc_df_norm = sc_conc.fit_transform(conc_df)
conc_df_norm = pd.DataFrame(conc_df_norm, columns = conc_df.columns)
conc_df_norm.head()

Unnamed: 0,Acetoine,Acetaldehyde,Methanol,1-propanol,Ethyl lactate,Isobutanol,1-butanol,2-butanol,2-methyl-1-butanol,3-methyl-1-butanol,...,1-octanol,Hexanoic acid,Octanoic acid,Decanoic acid,Propanoic acid,butyric acid,2-methylpropanoic acid,2-methylbutyric acid,3-methylbutyric acid,Dodecanoic acid
0,0.0,0.097291,0.683416,0.0,0.0,0.0,0.0,0.006178,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.06951,0.881677,0.0,0.0,0.0,0.0,0.006178,0.198434,0.482396,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.006514,0.00012,...,0.361538,0.277135,0.833348,0.095229,0.0,0.0,0.0,0.0,0.0,0.516129
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.034824,6.1e-05,...,0.153846,0.204604,0.667595,0.080752,0.0,0.0,0.0,0.0,0.0,0.458065


### 2.2. Working with the digital representation of molecules

In [14]:
smiles_df.index = smiles_df['Name']
smiles_df = smiles_df.drop('Name_query', axis=1)
smiles_name = smiles_df['Name']
smiles_df.head()

Unnamed: 0_level_0,Name,IsomericSMILES,mol,sentence,mol2vec,Mol2vec0,Mol2vec1,Mol2vec2,Mol2vec3,Mol2vec4,...,Mol2vec90,Mol2vec91,Mol2vec92,Mol2vec93,Mol2vec94,Mol2vec95,Mol2vec96,Mol2vec97,Mol2vec98,Mol2vec99
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Acetoine,Acetoine,CC(O)C(C)=O,<rdkit.Chem.rdchem.Mol object at 0x00000209492...,"(2246728737, 3537119515, 2245273601, 257963457...","(100,) dimensional vector",4.015188,32.161507,6.612415,-18.856762,13.955299,...,-1.496407,-2.350022,5.844427,-3.546087,-12.798881,-3.380084,-8.348577,0.134422,-8.949853,-8.625219
Acetaldehyde,Acetaldehyde,CC=O,<rdkit.Chem.rdchem.Mol object at 0x00000209492...,"(2246728737, 3545353036, 2246703798, 446538036...","(100,) dimensional vector",-2.071191,8.975573,12.101973,-13.984548,5.626438,...,-0.841639,-2.737437,5.109271,-2.387955,-0.513017,-0.124627,-2.238647,-3.149096,-3.508213,-2.376474
Methanol,Methanol,CO,<rdkit.Chem.rdchem.Mol object at 0x00000209492...,"(2246728737, 864662311, 1533899907)","(100,) dimensional vector",-0.625881,4.537026,1.408581,-2.789965,1.486079,...,-0.819517,-0.780651,1.686044,0.055773,-2.10084,-1.208281,-0.397336,-0.40837,-1.969918,-0.917015
1-propanol,1-propanol,CCCO,<rdkit.Chem.rdchem.Mol object at 0x00000209492...,"(2246728737, 3542456614, 2245384272, 117312591...","(100,) dimensional vector",0.254139,15.224995,0.893574,-4.234551,3.849184,...,-7.746664,-1.868781,3.099017,-4.963908,-9.283813,4.429289,-2.659184,-1.415973,-6.830268,-9.272933
Ethyl lactate,Ethyl lactate,CCOC(=O)C(C)O,<rdkit.Chem.rdchem.Mol object at 0x00000209492...,"(2246728737, 3542456614, 2245384272, 399408866...","(100,) dimensional vector",3.31885,38.945786,8.586362,-18.389406,13.419109,...,-1.607992,-5.572074,3.771054,-5.962752,-17.407959,-0.533512,-2.914109,-2.180466,-15.222984,-8.718189


In [15]:
# Normalization of digital representation
smiles_df = smiles_df.loc[:, 'Mol2vec0':]
sc_smiles = preprocessing.MinMaxScaler()
smiles_df_norm = sc_smiles.fit_transform(smiles_df)
smiles_df_norm = pd.DataFrame(smiles_df_norm, columns = smiles_df[Mol2vec_list].columns)
smiles_df_norm.index = smiles_name
smiles_df_norm.head()

Unnamed: 0_level_0,Mol2vec0,Mol2vec1,Mol2vec2,Mol2vec3,Mol2vec4,Mol2vec5,Mol2vec6,Mol2vec7,Mol2vec8,Mol2vec9,...,Mol2vec90,Mol2vec91,Mol2vec92,Mol2vec93,Mol2vec94,Mol2vec95,Mol2vec96,Mol2vec97,Mol2vec98,Mol2vec99
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Acetoine,0.801877,0.409614,0.246569,0.631153,0.269415,0.208739,0.36884,0.4438,0.477199,0.993605,...,0.840515,0.411997,0.388204,0.736687,0.563851,0.093956,0.157428,0.796222,0.761179,0.511559
Acetaldehyde,0.670903,0.065814,0.327185,0.743005,0.089458,0.512639,0.585742,0.574972,0.671702,0.892888,...,0.862723,0.382643,0.340616,0.819577,0.996026,0.248313,0.407434,0.60907,0.947367,0.907519
Methanol,0.702005,0.0,0.170149,1.0,0.0,0.412479,0.519154,0.723864,0.515124,0.73229,...,0.863474,0.53091,0.119027,0.99448,0.940172,0.196932,0.482777,0.765284,1.0,1.0
1-propanol,0.720942,0.158481,0.162586,0.966836,0.051058,0.490786,0.542737,0.650676,0.562896,0.532614,...,0.628521,0.448461,0.21049,0.63521,0.687499,0.464237,0.390227,0.707853,0.833701,0.470516
Ethyl lactate,0.786892,0.510211,0.275557,0.641882,0.25783,0.293533,0.259866,0.520377,0.542122,0.836678,...,0.83673,0.16786,0.253992,0.56372,0.401719,0.228926,0.379796,0.664279,0.546542,0.505668


In [16]:
final_df = conc_df.copy()
final_df['Matrix'] = 0
final_df.head()

Unnamed: 0,Acetoine,Acetaldehyde,Methanol,1-propanol,Ethyl lactate,Isobutanol,1-butanol,2-butanol,2-methyl-1-butanol,3-methyl-1-butanol,...,Hexanoic acid,Octanoic acid,Decanoic acid,Propanoic acid,butyric acid,2-methylpropanoic acid,2-methylbutyric acid,3-methylbutyric acid,Dodecanoic acid,Matrix
0,0.0,33.76,110.03,0.0,0.0,0.0,0.0,0.05,373.02,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,0.0,24.12,141.95,0.0,0.0,0.0,0.0,0.05,74.02,173.18,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.43,0.043,...,2.155,9.276,2.914,0.0,0.0,0.0,0.0,0.0,0.08,0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12.99,0.022,...,1.591,7.431,2.471,0.0,0.0,0.0,0.0,0.0,0.071,0


### 2.3. Creating matrices for a neural network


In [17]:
# Compilation of matrices for further neural processing by multiplying concentrations and digital representations
final_array = []
for ind in conc_df.index:
    new_matrix = []
    for col in conc_df.columns:
        new_str = smiles_df_norm.loc[col, 'Mol2vec0':].to_list()
        new_str = [elem * conc_df.loc[ind, col] for elem in new_str]
        new_matrix.append(new_str)

    final_array.append(new_matrix)


In [18]:
# Normalization of the obtained matrices

X_array = np.array(final_array, dtype=np.float64)
# find the minimum and maximum in the entire array
min_val = np.min(X_array)
max_val = np.max(X_array)

# Normalize the array
X_array_norm = (X_array - min_val) / (max_val - min_val)


In [19]:
np.save('X_array', X_array_norm) # Formation of X

## 3. Formation of a list of aroma (target value)

In [20]:
df_y = pd.read_csv('WineAroma.csv')
df_y = df_y.fillna(0) # Replacing missing values with zeros
df_y

Unnamed: 0,Wine,Grape sort,Year,Region,Country,Acetoine,Acetaldehyde,Methanol,1-propanol,Ethyl lactate,...,Herbs and spices,Tobacco/Smoke,Wood,Berries,Citrus,Fruits,Nuts,Coffee,Chocolate/Cacao,Flowers
0,Nobile,Rubin,2017,0,Bulgaria,0.0,33.76,110.03,0.0,0.000,...,0,1,0,1,0,0,0,0,1,0
1,Vidinska Gamza,Storgozia,2017,0,Bulgaria,0.0,24.12,141.95,0.0,0.000,...,1,1,0,1,0,0,0,0,1,0
2,Traversa,Tannat,2000,0,Uruguay,0.0,0.00,0.00,0.0,0.000,...,0,0,0,1,0,0,0,0,0,0
3,La Comtesse,Albarino,2015,Pontevedra,Spain,0.0,0.00,0.00,0.0,0.000,...,1,1,0,1,0,1,0,1,1,0
4,Armas de Lanzos,Albarino,2015,Pontevedra,Spain,0.0,0.00,0.00,0.0,0.000,...,1,0,0,0,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
444,Brisas del Este,Tannat,2005,Canelones,Uruguay,0.0,0.00,0.00,0.0,3.022,...,0,0,0,1,0,1,0,0,0,0
445,Bouza,Tannat,2005,Canelones,Uruguay,0.0,0.00,0.00,0.0,4.892,...,0,0,0,1,0,0,0,0,0,0
446,Salida,Tannat,2005,Canelones,Uruguay,0.0,0.00,0.00,0.0,6.256,...,1,0,0,1,0,0,0,0,0,0
447,Single Vineyard,Tannat,2005,Canelones,Uruguay,0.0,0.00,0.00,0.0,118.317,...,0,1,0,1,0,0,1,0,1,1


In [21]:
df_y.columns 

Index(['Wine', 'Grape sort', 'Year', 'Region', 'Country', 'Acetoine',
       'Acetaldehyde', 'Methanol', '1-propanol', 'Ethyl lactate', 'Isobutanol',
       '1-butanol', '2-butanol', '2-methyl-1-butanol', '3-methyl-1-butanol',
       '2-methyl-1-propanol', '1-pentanol', '1-hexanol', '2,3-butanediol',
       '2-phenylethanol', '3-(Methylthio)-1-propanol', 'Hexyl acetate',
       'Ethyl octanoate', 'Diethyl succinate', '2-phenylethyl acetate',
       'Diethyl malate', 'Ethyl decanoate', 'Isoamyl alcohol',
       'Isoamyl acetate', '1-heptanol', 'Ethyl acetate', 'Isopropyl acetate',
       'Isobutyl acetate', 'Ethyl butyrate', 'Ethyl hexanoate',
       'Ethyl isovalerate', 'Pentyl acetate', 'Phenyl acetate',
       'Ethyl caprylate', 'alpha-TERPINEOL', 'Linalool ', 'Nerol',
       'Citronellol', 'Geraniol', '1-octanol', 'Hexanoic acid',
       'Octanoic acid', 'Decanoic acid', 'Propanoic acid', 'butyric acid',
       '2-methylpropanoic acid', '2-methylbutyric acid',
       '3-methylbutyri

In [22]:
# Making a list of flavors for each wine
y_array = []
for ind in df_y.index:
    new_str = df_y.loc[ind, 'Herbs and spices':].to_list()
    new_str = [float(i) for i in new_str]
    y_array.append(new_str)


In [23]:
Y_array = np.array(y_array, dtype=np.float64)
np.save('Y_array', Y_array) # Formation of Y