# Pre-processing

In [1]:
# Import the necessary libraries
import numpy as np 
import pandas as pd 
import pubchempy as pcp
from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.Chem import Descriptors3D
from rdkit.Chem import rdchem
from rdkit.Chem import AllChem
from rdkit import DataStructs
from rdkit.ML.Descriptors import MoleculeDescriptors
from rdkit.Chem.rdchem import PeriodicTable, GetPeriodicTable
from rdkit.Chem import Fragments
from rdkit.Chem.rdchem import EditableMol
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem.Draw.MolDrawing import MolDrawing, DrawingOptions
from rdkit.Chem import PyMol
from mol2vec.features import mol2alt_sentence, mol2sentence, MolSentence, DfVec, sentences2vec
from gensim.models import word2vec
from sklearn import preprocessing
import re
from mol2vec.features import mol2alt_sentence, mol2sentence, MolSentence, DfVec, sentences2vec
from gensim.models import word2vec


In [2]:
import numpy
numpy.__version__

'1.26.2'

In [3]:
import gensim 
gensim.__version__

'4.3.2'

## 1. Digital representations for the chemical compounds that make up the wine
First, we will obtain digital representations of chemical compounds using the mol2vec library

In [4]:
# Forming a list of chemical compounds
chem_list = ['Acetoine',
       'Acetaldehyde', 'Methanol', '1-propanol', 'Ethyl lactate',
       'Isobutanol', '1-butanol', '2-butanol', '2-methyl-1-butanol',
       '3-methyl-1-butanol', '2-methyl-1-propanol', '1-pentanol', '1-hexanol',
       '2,3-butanediol', '2-phenylethanol', '3-(Methylthio)-1-propanol',
       'Hexyl acetate', 'Ethyl octanoate', 'Diethyl succinate',
       '2-phenylethyl acetate', 'Diethyl malate', 'Ethyl decanoate',
       'Isoamyl alcohol', 'Isoamyl acetate', '1-heptanol', 'Ethyl acetate',
       'Isopropyl acetate', 'Isobutyl acetate', 'Ethyl butyrate',
       'Ethyl hexanoate', 'Ethyl isovalerate', 'Pentyl acetate',
       'Phenyl acetate', 'Ethyl caprylate', 'alpha-TERPINEOL', 'Linalool ',
       'Nerol', 'Citronellol', 'Geraniol', '1-octanol', 'Hexanoic acid',
       'Octanoic acid', 'Decanoic acid', 'Propanoic acid', 'butyric acid',
       '2-methylpropanoic acid', '2-methylbutyric acid',
       '3-methylbutyric acid', 'Dodecanoic acid']

In [5]:
# Create a new table and add the SMILES of all components to it
smiles_df = pd.DataFrame(chem_list, columns=['Name'])
smiles_df.head()


Unnamed: 0,Name
0,Acetoine
1,Acetaldehyde
2,Methanol
3,1-propanol
4,Ethyl lactate


In [6]:
# Function for getting IsomericSMILES from pcp
f_isomeric_smiles = lambda x: pcp.get_properties('IsomericSMILES', x, 'name')[0]['IsomericSMILES'] if isinstance(x, str) else None
# Adding a new column using a previously written function
smiles_df['IsomericSMILES'] = smiles_df['Name'].apply(f_isomeric_smiles)
smiles_df.head()

Unnamed: 0,Name,IsomericSMILES
0,Acetoine,CC(C(=O)C)O
1,Acetaldehyde,CC=O
2,Methanol,CO
3,1-propanol,CCCO
4,Ethyl lactate,CCOC(=O)C(C)O


In [7]:
smiles_df.index = smiles_df['IsomericSMILES']
smiles_df.head()

Unnamed: 0_level_0,Name,IsomericSMILES
IsomericSMILES,Unnamed: 1_level_1,Unnamed: 2_level_1
CC(C(=O)C)O,Acetoine,CC(C(=O)C)O
CC=O,Acetaldehyde,CC=O
CO,Methanol,CO
CCCO,1-propanol,CCCO
CCOC(=O)C(C)O,Ethyl lactate,CCOC(=O)C(C)O


In [8]:
# Adding a vector representation of each chemical component

smiles_df['mol'] = smiles_df['IsomericSMILES'].apply(lambda x: Chem.MolFromSmiles(x))
smiles_df['sentence'] = smiles_df['mol'].apply(lambda x: MolSentence(mol2alt_sentence(x, 1)))

model = word2vec.Word2Vec.load('C:/Users/V/Downloads/model_300dim.pkl')

def sentences2vec(sentences, model, unseen=None):
    keys = set(model.wv.index_to_key)  # Updated line
    vec = []
    if unseen:
        unseen_vec = model.wv.get_vector(unseen) if unseen in keys else None  # Fixed line
    for sentence in sentences:
        if unseen:
            vec.append(np.sum([model.wv.get_vector(token) if token in keys else unseen_vec for token in sentence], axis=0))
        else:
            vec.append(np.sum([model.wv.get_vector(token) for token in sentence if token in keys], axis=0))
    return np.array(vec)

smiles_df['mol2vec'] = [DfVec(x) for x in sentences2vec(smiles_df['sentence'], model, unseen='UNK')]
Mol2vec_list = ['Mol2vec' + str(x) for x in range(100)]
smiles_df[Mol2vec_list] = 0

for ind in smiles_df.index:
    try:
        MV_Comp = smiles_df.loc[ind, 'mol2vec'].vec.tolist()

    except:
        MV_Comp = smiles_df.loc[ind, 'mol2vec'][0].vec.tolist()
    
    for MV in Mol2vec_list:
        smiles_df.loc[ind, MV] = MV_Comp[Mol2vec_list.index(MV)]



  smiles_df[Mol2vec_list] = 0
  smiles_df[Mol2vec_list] = 0
  smiles_df[Mol2vec_list] = 0
  smiles_df[Mol2vec_list] = 0
  smiles_df[Mol2vec_list] = 0
  smiles_df.loc[ind, MV] = MV_Comp[Mol2vec_list.index(MV)]
  smiles_df.loc[ind, MV] = MV_Comp[Mol2vec_list.index(MV)]
  smiles_df.loc[ind, MV] = MV_Comp[Mol2vec_list.index(MV)]
  smiles_df.loc[ind, MV] = MV_Comp[Mol2vec_list.index(MV)]
  smiles_df.loc[ind, MV] = MV_Comp[Mol2vec_list.index(MV)]
  smiles_df.loc[ind, MV] = MV_Comp[Mol2vec_list.index(MV)]
  smiles_df.loc[ind, MV] = MV_Comp[Mol2vec_list.index(MV)]
  smiles_df.loc[ind, MV] = MV_Comp[Mol2vec_list.index(MV)]
  smiles_df.loc[ind, MV] = MV_Comp[Mol2vec_list.index(MV)]
  smiles_df.loc[ind, MV] = MV_Comp[Mol2vec_list.index(MV)]
  smiles_df.loc[ind, MV] = MV_Comp[Mol2vec_list.index(MV)]
  smiles_df.loc[ind, MV] = MV_Comp[Mol2vec_list.index(MV)]
  smiles_df.loc[ind, MV] = MV_Comp[Mol2vec_list.index(MV)]
  smiles_df.loc[ind, MV] = MV_Comp[Mol2vec_list.index(MV)]
  smiles_df.loc[ind, MV]

In [9]:
smiles_df.head()

Unnamed: 0_level_0,Name,IsomericSMILES,mol,sentence,mol2vec,Mol2vec0,Mol2vec1,Mol2vec2,Mol2vec3,Mol2vec4,...,Mol2vec90,Mol2vec91,Mol2vec92,Mol2vec93,Mol2vec94,Mol2vec95,Mol2vec96,Mol2vec97,Mol2vec98,Mol2vec99
IsomericSMILES,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
CC(C(=O)C)O,Acetoine,CC(C(=O)C)O,<rdkit.Chem.rdchem.Mol object at 0x0000028F966...,"(2246728737, 3537119515, 2245273601, 257963457...","(100,) dimensional vector",2.165806,-2.663183,-2.364369,0.574677,-0.391675,...,5.363455,-0.34001,2.675683,-3.28533,2.845707,-3.660943,3.207551,4.787437,-7.774351,-0.589282
CC=O,Acetaldehyde,CC=O,<rdkit.Chem.rdchem.Mol object at 0x0000028F966...,"(2246728737, 3545353036, 2246703798, 446538036...","(100,) dimensional vector",0.311404,-0.194385,-1.674949,0.386866,-0.280317,...,4.051633,-0.967849,-0.224607,-2.040269,1.296507,-1.318476,0.155246,1.844942,-2.502734,-0.553457
CO,Methanol,CO,<rdkit.Chem.rdchem.Mol object at 0x0000028F966...,"(2246728737, 864662311, 1533899907)","(100,) dimensional vector",-0.008597,0.102382,-0.601383,-0.555249,-0.372062,...,0.993908,-0.471332,-0.538349,-0.847326,1.177116,-1.265408,0.216298,0.995225,-1.587683,-0.42592
CCCO,1-propanol,CCCO,<rdkit.Chem.rdchem.Mol object at 0x0000028F966...,"(2246728737, 3542456614, 2245384272, 117312591...","(100,) dimensional vector",1.944748,0.032287,-0.226522,-0.322337,-1.552486,...,2.966865,2.513301,0.597738,-4.438979,2.601466,-1.00453,-1.72722,1.734237,-4.190338,-2.040512
CCOC(=O)C(C)O,Ethyl lactate,CCOC(=O)C(C)O,<rdkit.Chem.rdchem.Mol object at 0x0000028F966...,"(2246728737, 3542456614, 2245384272, 399408866...","(100,) dimensional vector",4.583887,-1.319448,-1.892589,-0.676893,0.215674,...,6.587327,1.254742,1.413754,-3.993537,3.934108,-4.057415,2.761223,6.622914,-8.496894,-2.779251


## 2. Adding Concentrations
At this stage, we will create matrices for each wine, for this we will import a dataset containing concentrations and create an array of matrices characterizing each wine, we will also create an array of target values

### 2.1. Working with concentrations

In [10]:
df_aroma = pd.read_csv('WineAroma.csv')
# Rename columns to make it easier to search
df_aroma = df_aroma.rename(columns={"3-methyltiopropanol": "3-(Methylthio)-1-propanol", 
                          "α - terpineol": "alpha-TERPINEOL", 
                          "β – citronellol": "Citronellol", 
                          "Butyric acid": "butyric acid",
                          "Acetaldehyde, mg/dm3": "Acetaldehyde"})
df_aroma = df_aroma.fillna(0) # Filling in missing values with zeros

In [11]:
# We remove columns with zero concentrations
df_aroma = df_aroma.drop(columns=['Ethyl butyrate', 'alpha-TERPINEOL', 'Linalool ', 'Nerol', 'Citronellol'])

In [12]:
conc_df = df_aroma.loc[:, 'Acetoine' : 'Dodecanoic acid']
conc_df = conc_df.applymap(lambda x: 0 if isinstance(x, str) else x)
conc_df = conc_df.astype(float)
conc_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 449 entries, 0 to 448
Data columns (total 44 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Acetoine                   449 non-null    float64
 1   Acetaldehyde               449 non-null    float64
 2   Methanol                   449 non-null    float64
 3   1-propanol                 449 non-null    float64
 4   Ethyl lactate              449 non-null    float64
 5   Isobutanol                 449 non-null    float64
 6   1-butanol                  449 non-null    float64
 7   2-butanol                  449 non-null    float64
 8   2-methyl-1-butanol         449 non-null    float64
 9   3-methyl-1-butanol         449 non-null    float64
 10  2-methyl-1-propanol        449 non-null    float64
 11  1-pentanol                 449 non-null    float64
 12  1-hexanol                  449 non-null    float64
 13  2,3-butanediol             449 non-null    float64

  conc_df = conc_df.applymap(lambda x: 0 if isinstance(x, str) else x)


In [13]:
# Normalization of concentration
sc_conc = preprocessing.MinMaxScaler()
conc_df_norm = sc_conc.fit_transform(conc_df)
conc_df_norm = pd.DataFrame(conc_df_norm, columns = conc_df.columns)
conc_df_norm.head()

Unnamed: 0,Acetoine,Acetaldehyde,Methanol,1-propanol,Ethyl lactate,Isobutanol,1-butanol,2-butanol,2-methyl-1-butanol,3-methyl-1-butanol,...,1-octanol,Hexanoic acid,Octanoic acid,Decanoic acid,Propanoic acid,butyric acid,2-methylpropanoic acid,2-methylbutyric acid,3-methylbutyric acid,Dodecanoic acid
0,0.0,0.097291,0.683416,0.0,0.0,0.0,0.0,0.006178,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.06951,0.881677,0.0,0.0,0.0,0.0,0.006178,0.198434,0.482396,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.006514,0.00012,...,0.361538,0.277135,0.833348,0.095229,0.0,0.0,0.0,0.0,0.0,0.516129
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.034824,6.1e-05,...,0.153846,0.204604,0.667595,0.080752,0.0,0.0,0.0,0.0,0.0,0.458065


### 2.2. Working with the digital representation of molecules

In [14]:
smiles_df.index = smiles_df['Name']
smiles_name = smiles_df['Name']
smiles_df.head()

Unnamed: 0_level_0,Name,IsomericSMILES,mol,sentence,mol2vec,Mol2vec0,Mol2vec1,Mol2vec2,Mol2vec3,Mol2vec4,...,Mol2vec90,Mol2vec91,Mol2vec92,Mol2vec93,Mol2vec94,Mol2vec95,Mol2vec96,Mol2vec97,Mol2vec98,Mol2vec99
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Acetoine,Acetoine,CC(C(=O)C)O,<rdkit.Chem.rdchem.Mol object at 0x0000028F966...,"(2246728737, 3537119515, 2245273601, 257963457...","(100,) dimensional vector",2.165806,-2.663183,-2.364369,0.574677,-0.391675,...,5.363455,-0.34001,2.675683,-3.28533,2.845707,-3.660943,3.207551,4.787437,-7.774351,-0.589282
Acetaldehyde,Acetaldehyde,CC=O,<rdkit.Chem.rdchem.Mol object at 0x0000028F966...,"(2246728737, 3545353036, 2246703798, 446538036...","(100,) dimensional vector",0.311404,-0.194385,-1.674949,0.386866,-0.280317,...,4.051633,-0.967849,-0.224607,-2.040269,1.296507,-1.318476,0.155246,1.844942,-2.502734,-0.553457
Methanol,Methanol,CO,<rdkit.Chem.rdchem.Mol object at 0x0000028F966...,"(2246728737, 864662311, 1533899907)","(100,) dimensional vector",-0.008597,0.102382,-0.601383,-0.555249,-0.372062,...,0.993908,-0.471332,-0.538349,-0.847326,1.177116,-1.265408,0.216298,0.995225,-1.587683,-0.42592
1-propanol,1-propanol,CCCO,<rdkit.Chem.rdchem.Mol object at 0x0000028F966...,"(2246728737, 3542456614, 2245384272, 117312591...","(100,) dimensional vector",1.944748,0.032287,-0.226522,-0.322337,-1.552486,...,2.966865,2.513301,0.597738,-4.438979,2.601466,-1.00453,-1.72722,1.734237,-4.190338,-2.040512
Ethyl lactate,Ethyl lactate,CCOC(=O)C(C)O,<rdkit.Chem.rdchem.Mol object at 0x0000028F966...,"(2246728737, 3542456614, 2245384272, 399408866...","(100,) dimensional vector",4.583887,-1.319448,-1.892589,-0.676893,0.215674,...,6.587327,1.254742,1.413754,-3.993537,3.934108,-4.057415,2.761223,6.622914,-8.496894,-2.779251


In [15]:
# Normalization of digital representation
smiles_df = smiles_df.loc[:, 'Mol2vec0':]
sc_smiles = preprocessing.MinMaxScaler()
smiles_df_norm = sc_smiles.fit_transform(smiles_df)
smiles_df_norm = pd.DataFrame(smiles_df_norm, columns = smiles_df[Mol2vec_list].columns)
smiles_df_norm.index = smiles_name
smiles_df_norm.head()

Unnamed: 0_level_0,Mol2vec0,Mol2vec1,Mol2vec2,Mol2vec3,Mol2vec4,Mol2vec5,Mol2vec6,Mol2vec7,Mol2vec8,Mol2vec9,...,Mol2vec90,Mol2vec91,Mol2vec92,Mol2vec93,Mol2vec94,Mol2vec95,Mol2vec96,Mol2vec97,Mol2vec98,Mol2vec99
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Acetoine,0.342303,0.0,0.50582,0.64861,0.762878,0.606743,0.673172,0.845706,0.817423,0.491881,...,0.353867,0.313466,0.554307,0.863411,0.195936,0.667635,0.862589,0.42864,0.418414,0.862557
Acetaldehyde,0.138071,0.364761,0.606131,0.60117,0.781662,0.880098,0.952085,0.776525,0.724816,0.818206,...,0.247629,0.242691,0.263071,0.933166,0.01402,0.96072,0.380375,0.096045,0.913979,0.868594
Methanol,0.102829,0.408608,0.762334,0.363199,0.766187,1.0,0.852851,0.871195,0.626047,0.92114,...,0.0,0.298662,0.231566,1.0,0.0,0.96736,0.39002,0.0,1.0,0.890086
1-propanol,0.317957,0.398251,0.816877,0.422031,0.567074,0.877962,0.644565,0.807199,0.737231,0.594659,...,0.15978,0.635116,0.345648,0.798778,0.167256,1.0,0.082976,0.083532,0.755334,0.618005
Ethyl lactate,0.608614,0.198535,0.574464,0.332473,0.865326,0.530536,0.526669,0.62487,0.899514,0.509371,...,0.452982,0.49324,0.427589,0.823734,0.323743,0.618029,0.792076,0.636108,0.350491,0.493518


In [16]:
final_df = conc_df.copy()
final_df['Matrix'] = 0
final_df.head()

Unnamed: 0,Acetoine,Acetaldehyde,Methanol,1-propanol,Ethyl lactate,Isobutanol,1-butanol,2-butanol,2-methyl-1-butanol,3-methyl-1-butanol,...,Hexanoic acid,Octanoic acid,Decanoic acid,Propanoic acid,butyric acid,2-methylpropanoic acid,2-methylbutyric acid,3-methylbutyric acid,Dodecanoic acid,Matrix
0,0.0,33.76,110.03,0.0,0.0,0.0,0.0,0.05,373.02,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,0.0,24.12,141.95,0.0,0.0,0.0,0.0,0.05,74.02,173.18,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.43,0.043,...,2.155,9.276,2.914,0.0,0.0,0.0,0.0,0.0,0.08,0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12.99,0.022,...,1.591,7.431,2.471,0.0,0.0,0.0,0.0,0.0,0.071,0


### 2.3. Creating matrices for a neural network


In [17]:
# Compilation of matrices for further neural processing by multiplying concentrations and digital representations
final_array = []
for ind in conc_df.index:
    new_matrix = []
    for col in conc_df.columns:
        new_str = smiles_df_norm.loc[col, 'Mol2vec0':].to_list()
        new_str = [elem * conc_df.loc[ind, col] for elem in new_str]
        new_matrix.append(new_str)

    final_array.append(new_matrix)


In [18]:
# Normalization of the obtained matrices

X_array = np.array(final_array, dtype=np.float64)
# We find the minimum and maximum in the entire array
min_val = np.min(X_array)
max_val = np.max(X_array)

# Normalize the array
X_array_norm = (X_array - min_val) / (max_val - min_val)


In [19]:
np.save('X_array', X_array_norm) # Formation of X

## 3. Formation of a list of aroma (target value)

In [20]:
df_y = pd.read_csv('WineAroma.csv')
df_y = df_y.fillna(0) # Replacing missing values with zeros
df_y

Unnamed: 0,Wine,Grape sort,Year,Region,Country,Acetoine,Acetaldehyde,Methanol,1-propanol,Ethyl lactate,...,Herbs and spices,Tobacco/Smoke,Wood,Berries,Citrus,Fruits,Nuts,Coffee,Chocolate/Cacao,Flowers
0,Nobile,Rubin,2017,0,Bulgaria,0.0,33.76,110.03,0.0,0.000,...,0,1,0,1,0,0,0,0,1,0
1,Vidinska Gamza,Storgozia,2017,0,Bulgaria,0.0,24.12,141.95,0.0,0.000,...,1,1,0,1,0,0,0,0,1,0
2,Traversa,Tannat,2000,0,Uruguay,0.0,0.00,0.00,0.0,0.000,...,0,0,0,1,0,0,0,0,0,0
3,La Comtesse,Albarino,2015,Pontevedra,Spain,0.0,0.00,0.00,0.0,0.000,...,1,1,0,1,0,1,0,1,1,0
4,Armas de Lanzos,Albarino,2015,Pontevedra,Spain,0.0,0.00,0.00,0.0,0.000,...,1,0,0,0,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
444,Brisas del Este,Tannat,2005,Canelones,Uruguay,0.0,0.00,0.00,0.0,3.022,...,0,0,0,1,0,1,0,0,0,0
445,Bouza,Tannat,2005,Canelones,Uruguay,0.0,0.00,0.00,0.0,4.892,...,0,0,0,1,0,0,0,0,0,0
446,Salida,Tannat,2005,Canelones,Uruguay,0.0,0.00,0.00,0.0,6.256,...,1,0,0,1,0,0,0,0,0,0
447,Single Vineyard,Tannat,2005,Canelones,Uruguay,0.0,0.00,0.00,0.0,118.317,...,0,1,0,1,0,0,1,0,1,1


In [None]:
df_y.columns 

Index(['Wine', 'Grape sort', 'Year', 'Region', 'Country', 'Acetoine',
       'Acetaldehyde', 'Methanol', '1-propanol', 'Ethyl lactate', 'Isobutanol',
       '1-butanol', '2-butanol', '2-methyl-1-butanol', '3-methyl-1-butanol',
       '2-methyl-1-propanol', '1-pentanol', '1-hexanol', '2,3-butanediol',
       '2-phenylethanol', '3-(Methylthio)-1-propanol', 'Hexyl acetate',
       'Ethyl octanoate', 'Diethyl succinate', '2-phenylethyl acetate',
       'Diethyl malate', 'Ethyl decanoate', 'Isoamyl alcohol',
       'Isoamyl acetate', '1-heptanol', 'Ethyl acetate', 'Isopropyl acetate',
       'Isobutyl acetate', 'Ethyl butyrate', 'Ethyl hexanoate',
       'Ethyl isovalerate', 'Pentyl acetate', 'Phenyl acetate',
       'Ethyl caprylate', 'alpha-TERPINEOL', 'Linalool ', 'Nerol',
       'Citronellol', 'Geraniol', '1-octanol', 'Hexanoic acid',
       'Octanoic acid', 'Decanoic acid', 'Propanoic acid', 'butyric acid',
       '2-methylpropanoic acid', '2-methylbutyric acid',
       '3-methylbutyri

In [22]:
# Making a list of flavors for each wine
y_array = []
for ind in df_y.index:
    new_str = df_y.loc[ind, 'Herbs and spices':].to_list()
    new_str = [float(i) for i in new_str]
    y_array.append(new_str)


In [23]:
Y_array = np.array(y_array, dtype=np.float64)
np.save('Y_array', Y_array) # Formation of Y