# Data transformation

After cleaning the data and obtaining what we could call our initial set of variables, there are numerous transformations that can or need to be done. The most essential one is separating the data into training and test sets, but we can also operate on and transform the different features to create new ones, or try different models to improve our predictive power. 

In [1]:
import pandas as pd
datapath = 'C:\\Users\\stefano\\Dropbox\\Aprendizaje util\\Programacion learning\\modulo machine learning\\python\\winequality.csv' 
# place here the path of the file you are loading, if you have it on your local computer

winedata = pd.read_csv(datapath,sep = ';') # make sure to check the separator for your case

winedata.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,color
0,5.2,0.34,0.0,1.8,0.05,27.0,63.0,0.9916,3.68,0.79,14.0,6,red
1,6.2,0.55,0.45,12.0,0.049,27.0,186.0,0.9974,3.17,0.5,9.3,6,white
2,7.15,0.17,0.24,9.6,0.119,56.0,178.0,0.99578,3.15,0.44,10.2,6,white
3,6.7,0.64,0.23,2.1,0.08,11.0,119.0,0.99538,3.36,0.7,10.9,5,red
4,7.6,0.23,0.34,1.6,0.043,24.0,129.0,0.99305,3.12,0.7,10.4,5,white


## Separate data in training and test samples

In [2]:
from sklearn.model_selection import train_test_split

# Split the rows at random: 80% go to the training set and the remaining 20% to the
# test set. train_test_split returns both dataframes together (training first, test
# second), so the result is stored in one variable and unpacked right below.
# random_state is fixed so that re-running the cell reproduces the same partition.
dataset_separado = train_test_split(winedata, train_size=0.8, test_size=0.2, random_state=42)

# train: the training dataset; it will also be used for validation further on in the process.
# test: the testing dataset.
train, test = dataset_separado

### Method 2: stratified split, to keep the proportions of one variable's values in both the training and test sets

In [3]:
from sklearn.model_selection import StratifiedShuffleSplit
# Use a stratified split if we care about keeping the percentage of each answer
stratified_splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
split = stratified_splitter

# We pass the dataset and the variable whose proportions must be respected when splitting.
# With n_splits=1 the generator yields a single pair of positional index arrays.
train_index, test_index = next(split.split(winedata, winedata["quality"]))
strat_train_set = winedata.iloc[train_index]
strat_test_set = winedata.iloc[test_index]
    
strat_train_set

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,color
5080,6.9,0.280,0.40,8.20,0.036,15.0,95.0,0.99440,3.17,0.33,10.200000,5,white
3627,6.6,0.200,0.38,7.90,0.052,30.0,145.0,0.99470,3.32,0.56,11.000000,7,white
2836,8.8,0.270,0.25,5.00,0.024,52.0,99.0,0.99250,2.87,0.49,11.400000,5,white
2916,6.6,0.500,0.04,2.10,0.068,6.0,14.0,0.99550,3.39,0.64,9.400000,6,red
5867,5.5,0.140,0.27,4.60,0.029,22.0,104.0,0.99490,3.34,0.44,9.000000,5,white
3796,7.5,0.240,0.31,13.10,0.050,26.0,180.0,0.99884,3.05,0.53,9.100000,6,white
5646,6.4,0.620,0.12,4.70,0.060,33.0,196.0,0.99556,3.22,0.48,8.900000,5,white
6133,7.7,0.260,0.51,2.60,0.045,26.0,159.0,0.99126,3.00,0.50,11.200000,6,white
2869,7.8,0.430,0.49,13.00,0.033,37.0,158.0,0.99550,3.14,0.35,11.300000,6,white
5716,6.5,0.290,0.26,7.00,0.040,18.0,113.0,0.99366,3.17,0.38,10.200000,6,white


## Data standardization

Before standardizing, we need to make sure all features are numerical.

In [4]:
# Encode a categorical column as numbers through an explicit mapping.
def categoricaltonum(dataset, category, mapdictionary):
    """Replace the values of one categorical column with their mapped numbers.

    Args:
        dataset: DataFrame holding the column; it is modified in place.
        category: name of the column to transform.
        mapdictionary: dict that gives, for each category value, the number
            that will replace it.

    Returns:
        The same ``dataset`` object, with ``category`` overwritten by the
        result of ``Series.map(mapdictionary)``.
    """
    encoded_column = dataset[category].map(mapdictionary)
    dataset[category] = encoded_column
    return dataset

# this is equivalent to : winedata['color'] = winedata['color'].map({'red': 1, 'white': 0})


categoricaltonum(winedata,"color",{'red': 1, 'white': 0})

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,color
0,5.20,0.340,0.00,1.80,0.050,27.0,63.0,0.99160,3.68,0.79,14.0,6,1
1,6.20,0.550,0.45,12.00,0.049,27.0,186.0,0.99740,3.17,0.50,9.3,6,0
2,7.15,0.170,0.24,9.60,0.119,56.0,178.0,0.99578,3.15,0.44,10.2,6,0
3,6.70,0.640,0.23,2.10,0.080,11.0,119.0,0.99538,3.36,0.70,10.9,5,1
4,7.60,0.230,0.34,1.60,0.043,24.0,129.0,0.99305,3.12,0.70,10.4,5,0
5,5.70,0.220,0.20,16.00,0.044,41.0,113.0,0.99862,3.22,0.46,8.9,6,0
6,7.10,0.470,0.29,14.80,0.024,22.0,142.0,0.99518,3.12,0.48,12.0,8,0
7,9.70,0.310,0.47,1.60,0.062,13.0,33.0,0.99830,3.27,0.66,10.0,6,1
8,7.60,0.210,0.44,1.90,0.036,10.0,119.0,0.99130,3.01,0.70,12.8,6,0
9,5.80,0.320,0.28,4.30,0.032,46.0,115.0,0.98946,3.16,0.57,13.0,8,0


In [5]:
from sklearn.preprocessing import StandardScaler

# Create the StandardScaler object and fit it on our dataset: fit() computes the mean
# and the standard deviation of each feature, after which the scaler is ready to be applied.
# NOTE(review): the scaler is fitted on the whole dataset here; in a real workflow it is
# usually fitted on the training split only - confirm this is intended for the demo.
scaler = StandardScaler().fit(winedata)

# transform() rescales every feature to a mean of 0 and a std of 1.
# The result is a NumPy array, not a dataframe.
normalwinedata = scaler.transform(winedata)

normalwinedata


  return self.partial_fit(X, y)
  # This is added back by InteractiveShellApp.init_path()


array([[-1.55462025e+00,  2.02887007e-03, -2.19283252e+00, ...,
         2.94159020e+00,  2.07999049e-01,  1.75018984e+00],
       [-7.83214109e-01,  1.27766457e+00,  9.04065689e-01, ...,
        -9.99313172e-01,  2.07999049e-01, -5.71366589e-01],
       [-5.03782711e-02, -1.03062860e+00, -5.41153475e-01, ...,
        -2.44672102e-01,  2.07999049e-01, -5.71366589e-01],
       ...,
       [-2.43229807e-01, -1.27360683e+00, -2.65873634e-01, ...,
        -3.28521110e-01,  2.07999049e-01, -5.71366589e-01],
       [-1.66089193e-01, -6.05416699e-01,  2.84686047e-01, ...,
         2.58421945e-01,  2.07999049e-01, -5.71366589e-01],
       [-8.89485784e-02,  2.02887007e-03, -2.65873634e-01, ...,
        -9.15464164e-01, -9.37229614e-01,  1.75018984e+00]])

In [6]:
# scaler.transform returned a NumPy array, so we wrap it in a dataframe again,
# giving it the same columns that the original dataframe had.
normalwinedata = pd.DataFrame(normalwinedata, columns=winedata.columns)

normalwinedata # Et voala. 

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,color
0,-1.554620,0.002029,-2.192833,-0.765798,-0.172244,-0.198632,-0.933243,-1.032748,2.870469,1.738854,2.941590,0.207999,1.750190
1,-0.783214,1.277665,0.904066,1.378213,-0.200790,-0.198632,1.243074,0.901590,-0.301669,-0.210144,-0.999313,0.207999,-0.571367
2,-0.050378,-1.030629,-0.541153,0.873740,1.797445,1.435352,1.101525,0.361310,-0.426067,-0.613385,-0.244672,0.207999,-0.571367
3,-0.397511,1.824366,-0.609973,-0.702739,0.684143,-1.100140,0.057600,0.227907,0.880108,1.133992,0.342271,-0.937230,1.750190
4,0.296754,-0.666161,0.147046,-0.807837,-0.372068,-0.367664,0.234537,-0.549163,-0.612663,1.133992,-0.076974,-0.937230,-0.571367
5,-1.168917,-0.726906,-0.816433,2.219002,-0.343521,0.590188,-0.048561,1.308468,0.009325,-0.478971,-1.334709,0.207999,-0.571367
6,-0.088949,0.791708,-0.197054,1.966766,-0.914446,-0.480353,0.464554,0.161206,-0.612663,-0.344558,1.264610,2.498456,-0.571367
7,1.916707,-0.180205,1.041706,-0.807837,0.170311,-0.987451,-1.464052,1.201746,0.320319,0.865165,-0.412370,0.207999,1.750190
8,0.296754,-0.787650,0.835246,-0.744778,-0.571891,-1.156484,0.057600,-1.132799,-1.296850,1.133992,1.935402,0.207999,-0.571367
9,-1.091777,-0.119460,-0.265874,-0.240305,-0.686076,0.871910,-0.013174,-1.746451,-0.363868,0.260304,2.103100,2.498456,-0.571367


In [7]:
# The whole standardization can be wrapped in one small function like this one:
def dataframenormalizer(dataframe):
    """Return a standardized copy of an all-numeric DataFrame.

    Every column is rescaled with scikit-learn's StandardScaler, fitted on
    ``dataframe`` itself.

    Args:
        dataframe: DataFrame whose columns are all numerical.

    Returns:
        A new DataFrame with the scaled values, carrying the same column
        names and the same row index as ``dataframe``.
    """
    from sklearn.preprocessing import StandardScaler

    scaled_values = StandardScaler().fit_transform(dataframe)
    # The scaler hands back a bare NumPy array, so the labels are restored here.
    # Passing the index as well keeps the rows aligned with the input (for
    # example a shuffled training subset) instead of renumbering them 0..n-1.
    return pd.DataFrame(
        scaled_values,
        columns=dataframe.columns,
        index=dataframe.index,
    )
    
    
dataframenormalizer(winedata)

  return self.partial_fit(X, y)
  


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,color
0,-1.554620,0.002029,-2.192833,-0.765798,-0.172244,-0.198632,-0.933243,-1.032748,2.870469,1.738854,2.941590,0.207999,1.750190
1,-0.783214,1.277665,0.904066,1.378213,-0.200790,-0.198632,1.243074,0.901590,-0.301669,-0.210144,-0.999313,0.207999,-0.571367
2,-0.050378,-1.030629,-0.541153,0.873740,1.797445,1.435352,1.101525,0.361310,-0.426067,-0.613385,-0.244672,0.207999,-0.571367
3,-0.397511,1.824366,-0.609973,-0.702739,0.684143,-1.100140,0.057600,0.227907,0.880108,1.133992,0.342271,-0.937230,1.750190
4,0.296754,-0.666161,0.147046,-0.807837,-0.372068,-0.367664,0.234537,-0.549163,-0.612663,1.133992,-0.076974,-0.937230,-0.571367
5,-1.168917,-0.726906,-0.816433,2.219002,-0.343521,0.590188,-0.048561,1.308468,0.009325,-0.478971,-1.334709,0.207999,-0.571367
6,-0.088949,0.791708,-0.197054,1.966766,-0.914446,-0.480353,0.464554,0.161206,-0.612663,-0.344558,1.264610,2.498456,-0.571367
7,1.916707,-0.180205,1.041706,-0.807837,0.170311,-0.987451,-1.464052,1.201746,0.320319,0.865165,-0.412370,0.207999,1.750190
8,0.296754,-0.787650,0.835246,-0.744778,-0.571891,-1.156484,0.057600,-1.132799,-1.296850,1.133992,1.935402,0.207999,-0.571367
9,-1.091777,-0.119460,-0.265874,-0.240305,-0.686076,0.871910,-0.013174,-1.746451,-0.363868,0.260304,2.103100,2.498456,-0.571367


## Feature engineering 

### Transforming numerical data, into categorical data 

In [8]:
import pandas as pd

winedata = pd.read_csv(datapath,sep = ';') # we renew the variable to operate it from scratch in this document


# We use the cut function using a bin argument as a number, to separate in equal length bins 

winedata["quality"] = pd.cut(winedata["quality"], bins = 4)
print(winedata["quality"])


0       (4.5, 6.0]
1       (4.5, 6.0]
2       (4.5, 6.0]
3       (4.5, 6.0]
4       (4.5, 6.0]
5       (4.5, 6.0]
6       (7.5, 9.0]
7       (4.5, 6.0]
8       (4.5, 6.0]
9       (7.5, 9.0]
10      (4.5, 6.0]
11      (4.5, 6.0]
12      (4.5, 6.0]
13      (4.5, 6.0]
14      (6.0, 7.5]
15      (4.5, 6.0]
16      (4.5, 6.0]
17      (4.5, 6.0]
18      (4.5, 6.0]
19      (4.5, 6.0]
20      (6.0, 7.5]
21      (4.5, 6.0]
22      (4.5, 6.0]
23      (4.5, 6.0]
24      (4.5, 6.0]
25      (4.5, 6.0]
26      (4.5, 6.0]
27      (6.0, 7.5]
28      (4.5, 6.0]
29      (6.0, 7.5]
           ...    
6467    (4.5, 6.0]
6468    (4.5, 6.0]
6469    (4.5, 6.0]
6470    (4.5, 6.0]
6471    (6.0, 7.5]
6472    (6.0, 7.5]
6473    (6.0, 7.5]
6474    (4.5, 6.0]
6475    (4.5, 6.0]
6476    (4.5, 6.0]
6477    (4.5, 6.0]
6478    (6.0, 7.5]
6479    (4.5, 6.0]
6480    (4.5, 6.0]
6481    (4.5, 6.0]
6482    (6.0, 7.5]
6483    (4.5, 6.0]
6484    (4.5, 6.0]
6485    (4.5, 6.0]
6486    (4.5, 6.0]
6487    (4.5, 6.0]
6488    (7.5

In [9]:
winedata = pd.read_csv(datapath,sep = ';') # reload the data so this cell starts again from the original values

# With explicit bin edges and right = False, every interval is closed on the left and open on the
# right: [0, 2) -> "a", [2, 5) -> "b", [5, 7) -> "c", [7, 9) -> "d", [9, 11) -> "e".
# So the first interval does not include 2, a quality of 5 falls in "c" and a quality of 7 falls
# in "d". The labels argument gives each interval its name.
winedata["quality"] = pd.cut(winedata["quality"] , right = False, bins = (0,2,5,7,9,11),labels = ["a","b","c","d","e"]) 
print(winedata["quality"])

0       c
1       c
2       c
3       c
4       c
5       c
6       d
7       c
8       c
9       d
10      c
11      c
12      c
13      c
14      d
15      c
16      c
17      c
18      c
19      c
20      d
21      c
22      c
23      c
24      c
25      c
26      c
27      d
28      c
29      d
       ..
6467    c
6468    c
6469    c
6470    c
6471    d
6472    d
6473    d
6474    c
6475    c
6476    c
6477    c
6478    d
6479    c
6480    c
6481    c
6482    d
6483    c
6484    c
6485    c
6486    c
6487    c
6488    d
6489    c
6490    c
6491    c
6492    c
6493    c
6494    c
6495    c
6496    c
Name: quality, Length: 6497, dtype: category
Categories (5, object): [a < b < c < d < e]


### Transform a continuous numerical variable into a discrete numerical one 


In [10]:
import numpy as np
np.ceil(winedata["fixed acidity"])

0        6.0
1        7.0
2        8.0
3        7.0
4        8.0
5        6.0
6        8.0
7       10.0
8        8.0
9        6.0
10       6.0
11       9.0
12       8.0
13       6.0
14       7.0
15       7.0
16       8.0
17       7.0
18       8.0
19       7.0
20       6.0
21       7.0
22       8.0
23       7.0
24      13.0
25       8.0
26       6.0
27       7.0
28       7.0
29       7.0
        ... 
6467     7.0
6468     7.0
6469     7.0
6470    10.0
6471     9.0
6472     7.0
6473     7.0
6474     8.0
6475     8.0
6476     6.0
6477     6.0
6478     8.0
6479     8.0
6480     7.0
6481     7.0
6482     8.0
6483     8.0
6484     8.0
6485     8.0
6486     7.0
6487     7.0
6488     7.0
6489     6.0
6490     8.0
6491     7.0
6492     8.0
6493     7.0
6494     7.0
6495     7.0
6496     8.0
Name: fixed acidity, Length: 6497, dtype: float64

### Transforming categorical data into numerical data 

In [11]:
# Encode a categorical column as numbers through an explicit mapping.
def categoricaltonum(dataset, category, mapdictionary):
    """Replace the values of one categorical column with their mapped numbers.

    Args:
        dataset: DataFrame holding the column; it is modified in place.
        category: name of the column to transform.
        mapdictionary: dict that gives, for each category value, the number
            that will replace it.

    Returns:
        The same ``dataset`` object, with ``category`` overwritten by the
        result of ``Series.map(mapdictionary)``.
    """
    encoded_column = dataset[category].map(mapdictionary)
    dataset[category] = encoded_column
    return dataset

# this is equivalent to : winedata['color'] = winedata['color'].map({'red': 1, 'white': 0})


categoricaltonum(winedata,"color",{'red': 1, 'white': 0})

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,color
0,5.20,0.340,0.00,1.80,0.050,27.0,63.0,0.99160,3.68,0.79,14.0,c,1
1,6.20,0.550,0.45,12.00,0.049,27.0,186.0,0.99740,3.17,0.50,9.3,c,0
2,7.15,0.170,0.24,9.60,0.119,56.0,178.0,0.99578,3.15,0.44,10.2,c,0
3,6.70,0.640,0.23,2.10,0.080,11.0,119.0,0.99538,3.36,0.70,10.9,c,1
4,7.60,0.230,0.34,1.60,0.043,24.0,129.0,0.99305,3.12,0.70,10.4,c,0
5,5.70,0.220,0.20,16.00,0.044,41.0,113.0,0.99862,3.22,0.46,8.9,c,0
6,7.10,0.470,0.29,14.80,0.024,22.0,142.0,0.99518,3.12,0.48,12.0,d,0
7,9.70,0.310,0.47,1.60,0.062,13.0,33.0,0.99830,3.27,0.66,10.0,c,1
8,7.60,0.210,0.44,1.90,0.036,10.0,119.0,0.99130,3.01,0.70,12.8,c,0
9,5.80,0.320,0.28,4.30,0.032,46.0,115.0,0.98946,3.16,0.57,13.0,d,0


### Transforming a categorical variable into several binary (dummy) variables 

In [45]:
import pandas as pd

winedata = pd.read_csv(datapath,sep = ';') # make sure to check the separator for your case

winedata.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,color
0,5.2,0.34,0.0,1.8,0.05,27.0,63.0,0.9916,3.68,0.79,14.0,6,red
1,6.2,0.55,0.45,12.0,0.049,27.0,186.0,0.9974,3.17,0.5,9.3,6,white
2,7.15,0.17,0.24,9.6,0.119,56.0,178.0,0.99578,3.15,0.44,10.2,6,white
3,6.7,0.64,0.23,2.1,0.08,11.0,119.0,0.99538,3.36,0.7,10.9,5,red
4,7.6,0.23,0.34,1.6,0.043,24.0,129.0,0.99305,3.12,0.7,10.4,5,white


In [47]:
winedata = pd.get_dummies(winedata)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,color_red,color_white
0,5.20,0.340,0.00,1.80,0.050,27.0,63.0,0.99160,3.68,0.79,14.0,6,1,0
1,6.20,0.550,0.45,12.00,0.049,27.0,186.0,0.99740,3.17,0.50,9.3,6,0,1
2,7.15,0.170,0.24,9.60,0.119,56.0,178.0,0.99578,3.15,0.44,10.2,6,0,1
3,6.70,0.640,0.23,2.10,0.080,11.0,119.0,0.99538,3.36,0.70,10.9,5,1,0
4,7.60,0.230,0.34,1.60,0.043,24.0,129.0,0.99305,3.12,0.70,10.4,5,0,1
5,5.70,0.220,0.20,16.00,0.044,41.0,113.0,0.99862,3.22,0.46,8.9,6,0,1
6,7.10,0.470,0.29,14.80,0.024,22.0,142.0,0.99518,3.12,0.48,12.0,8,0,1
7,9.70,0.310,0.47,1.60,0.062,13.0,33.0,0.99830,3.27,0.66,10.0,6,1,0
8,7.60,0.210,0.44,1.90,0.036,10.0,119.0,0.99130,3.01,0.70,12.8,6,0,1
9,5.80,0.320,0.28,4.30,0.032,46.0,115.0,0.98946,3.16,0.57,13.0,8,0,1


### Creating new variables through numerical operations

In [12]:
winedata["fixed acidity by volatile acidity"] = winedata["fixed acidity"] / winedata["volatile acidity"]

winedata["alcohol x residual sugar"] = winedata["alcohol"] * winedata["residual sugar"]


winedata.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,color,fixed acidity by volatile acidity,alcohol x residual sugar
0,5.2,0.34,0.0,1.8,0.05,27.0,63.0,0.9916,3.68,0.79,14.0,c,1,15.294118,25.2
1,6.2,0.55,0.45,12.0,0.049,27.0,186.0,0.9974,3.17,0.5,9.3,c,0,11.272727,111.6
2,7.15,0.17,0.24,9.6,0.119,56.0,178.0,0.99578,3.15,0.44,10.2,c,0,42.058824,97.92
3,6.7,0.64,0.23,2.1,0.08,11.0,119.0,0.99538,3.36,0.7,10.9,c,1,10.46875,22.89
4,7.6,0.23,0.34,1.6,0.043,24.0,129.0,0.99305,3.12,0.7,10.4,c,0,33.043478,16.64


## Feature selection 
https://towardsdatascience.com/why-how-and-when-to-apply-feature-selection-e9c69adfabf2

## Variance threshold

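Selecting variables depending on how big their variance is. The logic behind this is pretty simple: a variable that barely varies will not have a big impact on the predictive model. Keep in mind that variance depends on the scale of each variable, so a single threshold is only comparable across features measured on similar scales. 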

In [20]:
import pandas as pd
datapath = 'C:\\Users\\stefano\\Dropbox\\Aprendizaje util\\Programacion learning\\modulo machine learning\\python\\winequality.csv' 
# place here the path of the file you are loading, if you have it on your local computer

winedata = pd.read_csv(datapath,sep = ';') # make sure to check the separator for your case


winefeatures = winedata.drop("color", axis = 1)

winetarget = winedata["color"]

winefeatures.head()

winetarget.head()

0      red
1    white
2    white
3      red
4    white
Name: color, dtype: object

In [22]:
from sklearn.feature_selection import VarianceThreshold

# First we establish the minimum variance we want: only features whose variance
# exceeds 5.0 will be kept.
filter_var = VarianceThreshold(threshold=5.0)

# fit_transform first measures the variance of every column of winefeatures and
# then returns the data with the low-variance columns removed.
winefeatures_var_filtered = filter_var.fit_transform(winefeatures)

In [23]:

winefeatures_var_filtered # some features have been dropped, but the result is a bare array, so the column names are gone and we
# cannot tell which ones were kept. The following function rebuilds the remaining features that survived the filter as a
# dataframe with their respective column names 

array([[  1.8,  27. ,  63. ],
       [ 12. ,  27. , 186. ],
       [  9.6,  56. , 178. ],
       ...,
       [ 13.3,  47. , 132. ],
       [  4.9,  10. , 133. ],
       [  2. ,  31. ,  68. ]])

In [39]:
def recover_columns(dataframe_original, input_array):
    """Rebuild a labelled DataFrame from an array of selected feature columns.

    Each column of ``dataframe_original`` is compared, value by value, with
    every column of ``input_array``; the original columns that are identical
    to one of them are copied, with their names, into the result.

    Args:
        dataframe_original: DataFrame the selection was computed from.
        input_array: array-like (or DataFrame) holding a subset of the
            columns of ``dataframe_original``, with the same rows in the same
            order.

    Returns:
        A DataFrame containing the matching columns of
        ``dataframe_original``, in their original order. It is empty when no
        column matches.
    """
    reduced = pd.DataFrame(input_array)
    # Extract the reduced columns once instead of on every comparison.
    reduced_columns = [reduced[col].values for col in reduced.columns]

    resultado = pd.DataFrame()
    for col_orig in dataframe_original.columns:
        original_values = dataframe_original[col_orig].values
        if any(np.all(original_values == candidate) for candidate in reduced_columns):
            resultado[col_orig] = dataframe_original[col_orig]

    # If it is a hierarchical DataFrame, restore the MultiIndex columns.
    # The length guard avoids an IndexError when no column matched at all.
    if len(resultado.columns) > 0 and isinstance(resultado.columns[0], tuple):
        resultado.columns = pd.MultiIndex.from_tuples(resultado.columns)
    return resultado

winefeatures_var_filtered = recover_columns(winefeatures,winefeatures_var_filtered) 
    
winefeatures_var_filtered


Unnamed: 0,residual sugar,free sulfur dioxide,total sulfur dioxide
0,1.80,27.0,63.0
1,12.00,27.0,186.0
2,9.60,56.0,178.0
3,2.10,11.0,119.0
4,1.60,24.0,129.0
5,16.00,41.0,113.0
6,14.80,22.0,142.0
7,1.60,13.0,33.0
8,1.90,10.0,119.0
9,4.30,46.0,115.0


## Univariate selection   
When we apply univariate linear regression models, we can measure the relation or effect that a single variable has 
on the target; for classification problems we use chi-square analysis instead. We can use the individual relation of each feature to the target (not taking into account the relations among the features themselves) as a criterion for feature filtering. We use SelectKBest for this, specifying how many features we actually want, so that only the best-scoring ones are kept.



In [40]:
import pandas as pd
datapath = 'C:\\Users\\stefano\\Dropbox\\Aprendizaje util\\Programacion learning\\modulo machine learning\\python\\winequality.csv' 
# place here the path of the file you are loading, if you have it on your local computer

winedata = pd.read_csv(datapath,sep = ';') # make sure to check the separator for your case


winefeatures = winedata.drop("color", axis = 1)

winetarget = winedata["color"]

winefeatures.head()

winetarget.head()

0      red
1    white
2    white
3      red
4    white
Name: color, dtype: object

In [42]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
from sklearn.feature_selection import chi2
from sklearn.feature_selection import f_classif

# we need to specify K and a score function, which can be f_regression when the target variable is continuous, or
# chi2 / f_classif when it's categorical. More details here 
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.f_regression.html#sklearn-feature-selection-f-regression
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.chi2.html#sklearn-feature-selection-chi2
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.f_classif.html#sklearn-feature-selection-f-classif


# We establish the criteria for the filtering: this is a classification problem
# (so we score with f_classif) and we want to keep only 4 variables.
kb = SelectKBest(score_func=f_classif, k=4)

# fit_transform scores every feature against the target and returns the data
# without the less relevant variables.
wine_KBest = kb.fit_transform(winefeatures, winetarget)

# Just like before, we lost the column names, so let's get them back.
wine_KBest = recover_columns(winefeatures, wine_KBest)

wine_KBest


Unnamed: 0,volatile acidity,chlorides,total sulfur dioxide,sulphates
0,0.340,0.050,63.0,0.79
1,0.550,0.049,186.0,0.50
2,0.170,0.119,178.0,0.44
3,0.640,0.080,119.0,0.70
4,0.230,0.043,129.0,0.70
5,0.220,0.044,113.0,0.46
6,0.470,0.024,142.0,0.48
7,0.310,0.062,33.0,0.66
8,0.210,0.036,119.0,0.70
9,0.320,0.032,115.0,0.57


In [43]:
# finally we would add the target to the new dataframe

wine_KBest["color"] = winetarget

wine_KBest

Unnamed: 0,volatile acidity,chlorides,total sulfur dioxide,sulphates,color
0,0.340,0.050,63.0,0.79,red
1,0.550,0.049,186.0,0.50,white
2,0.170,0.119,178.0,0.44,white
3,0.640,0.080,119.0,0.70,red
4,0.230,0.043,129.0,0.70,white
5,0.220,0.044,113.0,0.46,white
6,0.470,0.024,142.0,0.48,white
7,0.310,0.062,33.0,0.66,red
8,0.210,0.036,119.0,0.70,white
9,0.320,0.032,115.0,0.57,white


## Using decision trees or other models for variable selection 

We can use models that already rank variables by their relevance to filter features. The most common model
used for this purpose is the decision tree. 

In [44]:
import pandas as pd
datapath = 'C:\\Users\\stefano\\Dropbox\\Aprendizaje util\\Programacion learning\\modulo machine learning\\python\\winequality.csv' 
# place here the path of the file you are loading, if you have it on your local computer

winedata = pd.read_csv(datapath,sep = ';') # make sure to check the separator for your case


#winefeatures = winedata.drop("color", axis = 1)

#winetarget = winedata["color"]

#winefeatures.head()

#winetarget.head()

winedata.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,color
0,5.2,0.34,0.0,1.8,0.05,27.0,63.0,0.9916,3.68,0.79,14.0,6,red
1,6.2,0.55,0.45,12.0,0.049,27.0,186.0,0.9974,3.17,0.5,9.3,6,white
2,7.15,0.17,0.24,9.6,0.119,56.0,178.0,0.99578,3.15,0.44,10.2,6,white
3,6.7,0.64,0.23,2.1,0.08,11.0,119.0,0.99538,3.36,0.7,10.9,5,red
4,7.6,0.23,0.34,1.6,0.043,24.0,129.0,0.99305,3.12,0.7,10.4,5,white


## Using mutual information and F-test to filter variables 

The F-test measures the linear correlation between one of the features and the target (for normally distributed or standardized data), while the mutual information coefficient can give us insight into dependencies of any type. We can use either of them as a criterion to filter variables. 

In [58]:
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.mutual_info_classif.html
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.mutual_info_regression.html 

    
import pandas as pd
datapath = 'C:\\Users\\stefano\\Dropbox\\Aprendizaje util\\Programacion learning\\modulo machine learning\\python\\winequality.csv' 
# place here the path of the file you are loading, if you have it on your local computer

winedata = pd.read_csv(datapath,sep = ';') # make sure to check the separator for your case






In [59]:
# Encode a categorical column as numbers through an explicit mapping.
def categoricaltonum(dataset, category, mapdictionary):
    """Replace the values of one categorical column with their mapped numbers.

    Args:
        dataset: DataFrame holding the column; it is modified in place.
        category: name of the column to transform.
        mapdictionary: dict that gives, for each category value, the number
            that will replace it.

    Returns:
        The same ``dataset`` object, with ``category`` overwritten by the
        result of ``Series.map(mapdictionary)``.
    """
    encoded_column = dataset[category].map(mapdictionary)
    dataset[category] = encoded_column
    return dataset

# this is equivalent to : winedata['color'] = winedata['color'].map({'red': 1, 'white': 0})


categoricaltonum(winedata,"color",{'red': 1, 'white': 0})




Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,color
0,5.20,0.340,0.00,1.80,0.050,27.0,63.0,0.99160,3.68,0.79,14.0,6,1
1,6.20,0.550,0.45,12.00,0.049,27.0,186.0,0.99740,3.17,0.50,9.3,6,0
2,7.15,0.170,0.24,9.60,0.119,56.0,178.0,0.99578,3.15,0.44,10.2,6,0
3,6.70,0.640,0.23,2.10,0.080,11.0,119.0,0.99538,3.36,0.70,10.9,5,1
4,7.60,0.230,0.34,1.60,0.043,24.0,129.0,0.99305,3.12,0.70,10.4,5,0
5,5.70,0.220,0.20,16.00,0.044,41.0,113.0,0.99862,3.22,0.46,8.9,6,0
6,7.10,0.470,0.29,14.80,0.024,22.0,142.0,0.99518,3.12,0.48,12.0,8,0
7,9.70,0.310,0.47,1.60,0.062,13.0,33.0,0.99830,3.27,0.66,10.0,6,1
8,7.60,0.210,0.44,1.90,0.036,10.0,119.0,0.99130,3.01,0.70,12.8,6,0
9,5.80,0.320,0.28,4.30,0.032,46.0,115.0,0.98946,3.16,0.57,13.0,8,0


In [60]:

winefeatures = winedata.drop("color", axis = 1)

winetarget = winedata["color"]

winefeatures.head()

winetarget.head()

0    1
1    0
2    0
3    1
4    0
Name: color, dtype: int64

In [77]:
import numpy as np
from sklearn.feature_selection import mutual_info_classif
import pandas as pd 

def mutualinfodata(featuredata, targetdata, random_state=None):
    """Tabulate the mutual information between each feature and the target.

    Args:
        featuredata: DataFrame with one column per feature.
        targetdata: target values. mutual_info_classif is used, which is
            meant for a discrete target; for a continuous target
            mutual_info_regression would have to be used instead.
        random_state: optional seed forwarded to mutual_info_classif. The
            estimator is randomised, so passing a fixed seed makes the
            scores reproducible between runs. Defaults to None (unseeded).

    Returns:
        DataFrame with the columns "feature" and "mutual_info_to_target",
        one row per column of ``featuredata``, in column order.
    """
    scores = mutual_info_classif(featuredata, targetdata, random_state=random_state)
    rows = [
        {"feature": name, "mutual_info_to_target": score}
        for name, score in zip(featuredata.columns, scores)
    ]
    return pd.DataFrame(rows)
        

mutualinfodata(winefeatures,winetarget)

Unnamed: 0,feature,mutual_info_to_target
0,fixed acidity,0.129824
1,volatile acidity,0.233853
2,citric acid,0.103777
3,residual sugar,0.217205
4,chlorides,0.355485
5,free sulfur dioxide,0.152709
6,total sulfur dioxide,0.349693
7,density,0.148627
8,pH,0.060958
9,sulphates,0.148794
