In [79]:
import pandas as pd
from io import StringIO
from sklearn.impute import SimpleImputer
import numpy as np


In [80]:
# Small in-memory CSV: column C is missing a value in the second data row and
# column D is missing a value in the third data row.
csv_data = '''
A,B,C,D
1.0,2.0,3.0,4.0
5.0,6.0,,8.0
10.0,11.0,12.0,
'''
# StringIO lets read_csv consume the string as if it were a file on disk.
csv_buffer = StringIO(csv_data)
df = pd.read_csv(csv_buffer)
df

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,10.0,11.0,12.0,


In [81]:
# Flag every missing cell, then sum the flags column-wise to get the number of
# missing values per column.
df.isna().sum()

A    0
B    0
C    1
D    1
dtype: int64

In [82]:
# Remove every row that contains at least one missing value
# (returns a new frame; df itself is left untouched).
df.dropna(axis="index")

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0


In [83]:
# Remove every column that contains at least one missing value
# (returns a new frame; df itself is left untouched).
df.dropna(axis="columns")

Unnamed: 0,A,B
0,1.0,2.0
1,5.0,6.0
2,10.0,11.0


In [84]:
# Imputing missing values
#   - mean imputation: each NaN is replaced with the mean of its column.
#     Other available strategies include 'median' and 'most_frequent'.
imr = SimpleImputer(missing_values=np.nan, strategy='mean')
# Learn the column means and fill the gaps in a single step.
imputed_data = imr.fit_transform(df.values)
imputed_data

array([[ 1. ,  2. ,  3. ,  4. ],
       [ 5. ,  6. ,  7.5,  8. ],
       [10. , 11. , 12. ,  6. ]])

In [85]:
# The same mean imputation can be done directly in pandas: fillna accepts the
# per-column means and fills each column with its own mean.
df.fillna(value=df.mean())

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,7.5,8.0
2,10.0,11.0,12.0,6.0


In [86]:
# Categorical features: a toy frame with a nominal column (color), an ordinal
# column (size), a numeric column (price) and a class label.
df = pd.DataFrame(
    [
        ['green', 'M', 10.1, 'class2'],
        ['red', 'L', 13.5, 'class1'],
        ['blue', 'XL', 15.3, 'class2'],
    ],
    columns=['color', 'size', 'price', 'classlabel'],
)
# There is no function that can automatically derive the correct order of the
# size labels, so the ordinal mapping is spelled out by hand.
size_mapping = {"XL": 3, "L": 2, "M": 1}
df['size'] = df['size'].map(size_mapping)
df

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,class2
1,red,2,13.5,class1
2,blue,3,15.3,class2


In [87]:
# To get the original size labels back, invert the mapping (integer -> label)
# and apply it with map.
inv_size_mapping = {code: label for label, code in size_mapping.items()}
# Left commented out so that df keeps its integer sizes:
# df['size']=df['size'].map(inv_size_mapping)
df

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,class2
1,red,2,13.5,class1
2,blue,3,15.3,class2


In [88]:
# Encoding class labels
# It is considered good practice to provide class labels as integer arrays to
# avoid technical glitches.

# Snapshot the labels as they are *before* the in-place mapping below, so that
# the LabelEncoder demo is fitted on the same input as the manual mapping
# rather than on already-encoded integers.
class_labels = df['classlabel'].values.copy()

# Manual approach: enumerate the sorted unique labels and map them to integers.
class_mapping = {label: idx for idx, label in enumerate(np.unique(class_labels))}
df['classlabel'] = df['classlabel'].map(class_mapping)


# Alternatively, there is LabelEncoder from scikit-learn
from sklearn.preprocessing import LabelEncoder
class_le = LabelEncoder()
y = class_le.fit_transform(class_labels)
# inverse_transform maps the integer codes back to the labels the encoder was
# fitted on; check the round trip instead of silently discarding the result.
assert (class_le.inverse_transform(y) == class_labels).all()
y

array([1, 0, 1])

In [89]:
# One-hot encoding
#  The idea behind this approach is to create a new dummy feature for each
#  unique value in the nominal feature column.

from sklearn.preprocessing import OneHotEncoder
X = df[['color', "size", "price"]].values
# The encoder expects 2-D input, so select the color column as an (n, 1) array.
color_column = X[:, [0]]
color_ohe = OneHotEncoder()
# fit_transform returns a sparse matrix; toarray() makes it dense for display.
color_ohe.fit_transform(color_column).toarray()



array([[0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.]])

In [90]:
# To transform only some columns of a multi-feature array, use ColumnTransformer,
# which accepts a list of (name, transformer, column(s)) tuples.
from sklearn.compose import ColumnTransformer
X = df[['color', 'size', 'price']].values
column_steps = [
    ('onehot', OneHotEncoder(), [0]),       # one-hot encode column 0 (color)
    ('nothing', 'passthrough', [1, 2]),     # leave columns 1 and 2 (size, price) as they are
]
c_transf = ColumnTransformer(column_steps)
c_transf.fit_transform(X).astype(float)

array([[ 0. ,  1. ,  0. ,  1. , 10.1],
       [ 0. ,  0. ,  1. ,  2. , 13.5],
       [ 1. ,  0. ,  0. ,  3. , 15.3]])

In [91]:
# An even more convenient way to create the dummy features is the get_dummies
# function in pandas. Applied to a DataFrame, it only converts string columns
# and leaves all other columns unchanged.
dummy_input_columns = ["price", 'color', 'size']
pd.get_dummies(df[dummy_input_columns])

Unnamed: 0,price,size,color_blue,color_green,color_red
0,10.1,1,False,True,False
1,13.5,2,False,False,True
2,15.3,3,True,False,False


In [92]:
# To reduce the correlation among the dummy variables, drop one of the one-hot
# columns. No information is lost: a row that is neither green nor red must be blue.
pd.get_dummies(df[["price", 'color', 'size']], drop_first=True)
# To drop the redundant column with OneHotEncoder instead, set drop='first'
# together with categories='auto':
# OneHotEncoder(categories='auto', drop='first')

Unnamed: 0,price,size,color_green,color_red
0,10.1,1,True,False
1,13.5,2,False,True
2,15.3,3,False,False


#### Other Encoding Techniques

- Binary encoding, which produces multiple binary features similar to one-hot
  encoding but requires fewer feature columns, i.e., log2(K) instead of K - 1, where
  K is the number of unique categories. In binary encoding, the category numbers
  are first converted into their binary representations, and then each binary
  digit position forms a new feature column.
- Count or frequency encoding, which replaces the label of each category by the
  number of times or frequency it occurs in the training set.


In [99]:
# Partitioning the dataset into training and test sets
# Load the wine data. header=None makes pandas treat the first line as data,
# so the column names are assigned explicitly below.
df_wine = pd.read_csv("./wine-dataset/wine.data", header=None)
df_wine.columns = [
    'Class label', 'Alcohol',
    'Malic acid', 'Ash',
    'Alcalinity of ash', 'Magnesium',
    'Total phenols', 'Flavanoids',
    'Nonflavanoid phenols',
    'Proanthocyanins',
    'Color intensity', 'Hue',
    'OD280/OD315 of diluted wines',
    'Proline',
]

# Show which class labels occur, then preview the first rows of the frame.
print('Class labels', np.unique(df_wine['Class label']))
df_wine.head()


Class labels [1 2 3]


Unnamed: 0,Class label,Alcohol,Malic acid,Ash,Alcalinity of ash,Magnesium,Total phenols,Flavanoids,Nonflavanoid phenols,Proanthocyanins,Color intensity,Hue,OD280/OD315 of diluted wines,Proline
0,1,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,1,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


In [101]:
# Split the wine data into 70% training and 30% test samples.
from sklearn.model_selection import train_test_split
# Column 0 holds the class label; every remaining column is a feature.
y = df_wine.iloc[:, 0].values
X = df_wine.iloc[:, 1:].values
# stratify=y is passed so the split is stratified on the class labels;
# random_state fixes the shuffle for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    random_state=0,
    stratify=y,
)

In [None]:
# Feature scaling
#  - Standardization: center each feature column at mean 0 with standard
#    deviation 1, so that the columns have the same parameters as a standard
#    normal distribution (zero mean and unit variance).
#  - Normalization: rescale each feature to the range [0, 1], which is a
#    special case of min-max scaling.
# In both cases the scaler is fitted on the training data only and then
# applied to the training and the test data.

from sklearn.preprocessing import StandardScaler
stdsc = StandardScaler()
stdsc.fit(X_train)
X_train_std = stdsc.transform(X_train)
X_test_std = stdsc.transform(X_test)

from sklearn.preprocessing import MinMaxScaler
mms = MinMaxScaler()
mms.fit(X_train)
X_train_norm = mms.transform(X_train)
X_test_norm = mms.transform(X_test)