In [1]:
import pandas as pd
from sklearn.datasets import load_wine
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split

In [2]:
# Load the scikit-learn wine dataset and assemble a single frame: the feature
# columns first, with the class label appended as the last column, `target`.
data = load_wine()
df = pd.DataFrame(data=data.data, columns=data.feature_names).assign(
    target=data.target
)
# NOTE(review): the link below points to the Kaggle Santander
# customer-satisfaction data, not the wine data loaded above — presumably a
# leftover from another notebook; confirm before relying on it.
# https://www.kaggle.com/c/santander-customer-satisfaction/data?select=train.csv

In [3]:
# Names of the target classes provided by the loaded dataset.
data.target_names

array(['class_0', 'class_1', 'class_2'], dtype='<U7')

In [4]:
# Names of the feature columns (the same labels used as the columns of `df`).
data.feature_names

['alcohol',
 'malic_acid',
 'ash',
 'alcalinity_of_ash',
 'magnesium',
 'total_phenols',
 'flavanoids',
 'nonflavanoid_phenols',
 'proanthocyanins',
 'color_intensity',
 'hue',
 'od280/od315_of_diluted_wines',
 'proline']

In [5]:
# (rows, columns) of the frame; the column count includes the added `target` column.
df.shape

(178, 14)

In [6]:
# Preview the first rows to sanity-check the values and the column layout.
df.head()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,target
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0,0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0,0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0,0
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0,0
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0,0


In [7]:
# Per-column summary statistics. The features live on very different scales,
# which is worth keeping in mind for any variance-based feature selection.
df.describe()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,target
count,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0
mean,13.000618,2.336348,2.366517,19.494944,99.741573,2.295112,2.02927,0.361854,1.590899,5.05809,0.957449,2.611685,746.893258,0.938202
std,0.811827,1.117146,0.274344,3.339564,14.282484,0.625851,0.998859,0.124453,0.572359,2.318286,0.228572,0.70999,314.907474,0.775035
min,11.03,0.74,1.36,10.6,70.0,0.98,0.34,0.13,0.41,1.28,0.48,1.27,278.0,0.0
25%,12.3625,1.6025,2.21,17.2,88.0,1.7425,1.205,0.27,1.25,3.22,0.7825,1.9375,500.5,0.0
50%,13.05,1.865,2.36,19.5,98.0,2.355,2.135,0.34,1.555,4.69,0.965,2.78,673.5,1.0
75%,13.6775,3.0825,2.5575,21.5,107.0,2.8,2.875,0.4375,1.95,6.2,1.12,3.17,985.0,2.0
max,14.83,5.8,3.23,30.0,162.0,3.88,5.08,0.66,3.58,13.0,1.71,4.0,1680.0,2.0


In [8]:
# Separate the feature matrix from the label vector.
X = df.drop(columns=["target"])
y = df.loc[:, "target"]

In [9]:
# Hold out 30% of the rows for testing; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0
)

# Row/column counts of the two feature partitions.
X_train.shape, X_test.shape

((124, 13), (54, 13))

In [10]:
# Preview the training features; the index keeps the original row labels from `df`.
X_train.head()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
22,13.71,1.86,2.36,16.6,101.0,2.61,2.88,0.27,1.69,3.8,1.11,4.0,1035.0
108,12.22,1.29,1.94,19.0,92.0,2.36,2.04,0.39,2.08,2.7,0.86,3.02,312.0
175,13.27,4.28,2.26,20.0,120.0,1.59,0.69,0.43,1.35,10.2,0.59,1.56,835.0
145,13.16,3.57,2.15,21.0,102.0,1.5,0.55,0.43,1.3,4.0,0.6,1.68,830.0
71,13.86,1.51,2.67,25.0,86.0,2.95,2.86,0.21,1.87,3.38,1.36,3.16,410.0


## Removing Low-Variance Features

In [11]:
# Fit the selector on the training split only. With threshold=1 a feature is
# kept only when its training variance exceeds 1.
# NOTE(review): that is much broader than "constant" (variance 0) and depends
# on each feature's units, since nothing is scaled beforehand — confirm that
# this threshold is intended.
var_thres = VarianceThreshold(threshold=1).fit(X_train)
var_thres

VarianceThreshold(threshold=1)

In [12]:
# Number of features the fitted selector keeps, i.e. the True entries in its
# support mask.
var_thres.get_support().sum()

5

In [13]:
# Columns rejected by the selector, read directly from the inverted support
# mask. Mask indexing evaluates get_support() once (instead of once per column)
# and stays positionally correct even if two columns were to share a label.
# NOTE: despite the variable name, these are the columns at or below the
# selector's variance threshold, not strictly constant ones; the name is kept
# because later cells refer to it.
constant_columns = X_train.columns[~var_thres.get_support()].tolist()

print(len(constant_columns))

8


In [14]:
# List the rejected column names, one per line.
for rejected_name in constant_columns:
    print(rejected_name)

alcohol
ash
total_phenols
flavanoids
nonflavanoid_phenols
proanthocyanins
hue
od280/od315_of_diluted_wines


In [15]:
# DataFrame.drop returns a new frame, so bind the result to its own name
# instead of discarding it, and apply the same column removal to the test
# split so both partitions keep identical columns. X_train and X_test
# themselves are left untouched.
X_train_reduced = X_train.drop(columns=constant_columns)
X_test_reduced = X_test.drop(columns=constant_columns)

X_train_reduced

Unnamed: 0,malic_acid,alcalinity_of_ash,magnesium,color_intensity,proline
22,1.86,16.6,101.0,3.80,1035.0
108,1.29,19.0,92.0,2.70,312.0
175,4.28,20.0,120.0,10.20,835.0
145,3.57,21.0,102.0,4.00,830.0
71,1.51,25.0,86.0,3.38,410.0
...,...,...,...,...,...
103,1.72,19.5,86.0,2.06,415.0
67,1.17,19.6,78.0,4.68,510.0
117,1.61,22.5,108.0,2.06,345.0
47,1.68,16.0,101.0,6.10,985.0


## Removing Duplicated Features

In [16]:
# Transpose the feature matrix so that each feature becomes a row; features
# can then be compared with row-wise duplicate detection.
train_features_T = X_train.transpose()
train_features_T

Unnamed: 0,22,108,175,145,71,162,156,27,83,134,...,70,87,36,21,9,103,67,117,47,172
alcohol,13.71,12.22,13.27,13.16,13.86,12.85,13.84,13.3,13.05,12.51,...,12.29,11.65,13.28,12.93,13.86,11.82,12.37,12.42,13.9,14.16
malic_acid,1.86,1.29,4.28,3.57,1.51,3.27,4.12,1.72,3.86,1.24,...,1.61,1.67,1.64,3.8,1.35,1.72,1.17,1.61,1.68,2.51
ash,2.36,1.94,2.26,2.15,2.67,2.58,2.38,2.14,2.32,2.25,...,2.21,2.62,2.84,2.65,2.27,1.88,1.92,2.19,2.12,2.48
alcalinity_of_ash,16.6,19.0,20.0,21.0,25.0,22.0,19.5,17.0,22.5,17.5,...,20.4,26.0,15.5,18.6,16.0,19.5,19.6,22.5,16.0,20.0
magnesium,101.0,92.0,120.0,102.0,86.0,106.0,89.0,94.0,85.0,85.0,...,103.0,88.0,110.0,102.0,98.0,86.0,78.0,108.0,101.0,91.0
total_phenols,2.61,2.36,1.59,1.5,2.95,1.65,1.8,2.4,1.65,2.0,...,1.1,1.92,2.6,2.41,2.98,2.5,2.11,2.0,3.1,1.68
flavanoids,2.88,2.04,0.69,0.55,2.86,0.6,0.83,2.19,1.59,0.58,...,1.02,1.61,2.68,2.41,3.15,1.64,2.0,2.09,3.39,0.7
nonflavanoid_phenols,0.27,0.39,0.43,0.43,0.21,0.6,0.48,0.27,0.61,0.6,...,0.37,0.4,0.34,0.25,0.22,0.37,0.27,0.34,0.21,0.44
proanthocyanins,1.69,2.08,1.35,1.3,1.87,0.96,1.56,1.35,1.62,1.25,...,1.46,1.34,1.36,1.98,1.85,1.42,1.04,1.61,2.14,1.24
color_intensity,3.8,2.7,10.2,4.0,3.38,5.58,9.01,3.95,4.8,5.45,...,3.05,2.6,4.6,4.5,7.22,2.06,4.68,2.06,6.1,9.7


In [17]:
# Report how many feature rows exactly repeat an earlier row
# (the first occurrence itself is not counted as a duplicate).
print(train_features_T.duplicated(keep="first").sum())

0


In [18]:
# Names of the features flagged as duplicates, taken from the index labels of
# the flagged rows of the transposed frame.
duplicated_columns = (
    train_features_T
    .loc[train_features_T.duplicated()]
    .index
    .values
)

In [20]:
# Remove the duplicated features from both splits. Rebinding the names to the
# frames returned by drop() avoids mutating the split outputs in place, which
# can raise pandas' SettingWithCopyWarning on some versions, and keeps the
# cell's effect explicit.
X_train = X_train.drop(columns=duplicated_columns)
X_test = X_test.drop(columns=duplicated_columns)