# Ch04 - Data Preprocessing

In [6]:
# Dealing with Missing Data
import pandas as pd
from io import StringIO
# Create a sample dataframe which contains some missing data in some columns
# unicode is only needed for python 2.7
csv_data = unicode('''A,B,C,D
            1.0,2.0,3.0,4.0
            5.0,6.0,,8.0
            0.0,11.0,12.0,''')
df = pd.read_csv(StringIO(csv_data))
df

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,0.0,11.0,12.0,


In [36]:
# check for the total number of null values in each column
print df.isnull().sum()
print ''
# underlying values can be always accessed as a numpy array from a pandas dataframe
print df.values  
print ''
# drops the rows having null values
print df.dropna()
print ''
# to drop the columns with null values
print df.dropna(axis=1)
print ''
df.dropna(how='all') # drop all rows where all columns are NAN
df.dropna(thresh=4) # have not atleast 4 non-NAN values
df.dropna(subset=['C']) # drop from a particular column

A    0
B    0
C    1
D    1
dtype: int64

[[  1.   2.   3.   4.]
 [  5.   6.  nan   8.]
 [  0.  11.  12.  nan]]

     A    B    C    D
0  1.0  2.0  3.0  4.0

     A     B
0  1.0   2.0
1  5.0   6.0
2  0.0  11.0



Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,0.0,11.0,12.0,


### Imputing Missing Values

In [40]:
#imputing missing values
from sklearn.preprocessing import Imputer
imr = Imputer(missing_values='NaN', strategy='mean', axis=0) # strategy can be median or most_frequent, 
# axis = 0 by columns and axis = 1 is by rows.
imr.fit(df)
imputed_data = imr.transform(df.values)
imputed_data

array([[  1. ,   2. ,   3. ,   4. ],
       [  5. ,   6. ,   7.5,   8. ],
       [  0. ,  11. ,  12. ,   6. ]])

### Categorical features handling - Ordinal or Nominal

In [94]:
# Categorical features handling - Ordinal or Nominal
import pandas as pd
import numpy as np
df_ord_nom = pd.DataFrame([['green', 'M', 10.1, 'class1'],
            ['red', 'L', 20.1, 'class2'],
            ['blue', 'XL', 30.1, 'class1']])
df_ord_nom.columns = ["color","size","price", "label"]
df_ord_nom

# define a map for the sizes and then convert them to the numberical values
size_mapping = {'M':1, 'L':2, 'XL':3}
df_ord_nom['size'] = df_ord_nom['size'].map(size_mapping)
print df_ord_nom
print ''

# doing the reverse mapping
reverse_mapping_size = {v:k for k,v in size_mapping.items()}
df_ord_nom['size'] = df_ord_nom['size'].map(reverse_mapping_size)
print df_ord_nom
print ''

   color  size  price   label
0  green     1   10.1  class1
1    red     2   20.1  class2
2   blue     3   30.1  class1



In [85]:
# convert class labels to numerical values
class_mapping = {label:idx for idx,label in enumerate(np.unique(df_ord_nom['label']))}
df_ord_nom['label'] = df_ord_nom['label'].map(class_mapping)
print df_ord_nom
print ''

#doing the reverse class conversion
reverse_class_mapping = {v:k for k,v in class_mapping.items()}
df_ord_nom['label'] = df_ord_nom['label'].map(reverse_class_mapping)
print df_ord_nom
print ''

   color size  price  label
0  green    M   10.1      0
1    red    L   20.1      1
2   blue   XL   30.1      0

   color size  price  label
0  green    M   10.1      0
1    red    L   20.1      1
2   blue   XL   30.1      0



In [86]:
# the same thing can be achieved by using Sklearn's LabelEncoder
from sklearn.preprocessing import LabelEncoder
class_le = LabelEncoder()
df_ord_nom['label'] = class_le.fit_transform(df_ord_nom['label'])
print df_ord_nom
df_ord_nom['label'] = class_le.inverse_transform(df_ord_nom['label'])
print df_ord_nom

   color size  price  label
0  green    M   10.1      0
1    red    L   20.1      1
2   blue   XL   30.1      0
   color size  price  label
0  green    M   10.1      0
1    red    L   20.1      1
2   blue   XL   30.1      0


### perform One-hot encoding

In [106]:
# perform one-hot encoding
X = df_ord_nom[['color','size', 'price']].values
color_le = LabelEncoder()
X[:,0] = color_le.fit_transform(X[:,0])
print X

# instead use the one-hot encoding technique to transform the color column
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder(categorical_features=[0]) # can use sparse=False if the toarray() need to be removed
ohe.fit_transform(X).toarray()
print X

# pandas get_dummies only converts the string values to a one hot encoding values
# pd.get_dummies(df_ord_nom[['price','color','size']])

[[1 1 10.1]
 [2 2 20.1]
 [0 3 30.1]]
[[1 1 10.1]
 [2 2 20.1]
 [0 3 30.1]]


### Partitioning the Dataset into train and test - Wine DataSet

In [115]:
#Partition the dataset into train and test
df_wine = pd.read_csv('/Users/tkmacl9/Desktop/Patents_Research_Papers_Personalization/All_Machine_Learning/wine_UCI.csv', header=None)
df_wine.columns = ['Class label', 'Alcohol','Malic acid', 'Ash', 
                   'Alcalinity of ash', 'Magnesium', 'Total phenols', 
                   'Flavanoids','Nonflavanoid phenols', 'Proanthocyanins', 
                   'Color intensity', 'Hue', 'OD280/OD315 of diluted wines', 'Proline']

print np.unique(df_wine['Class label'])
print df_wine.head()

[1 2 3]
   Class label  Alcohol  Malic acid   Ash  Alcalinity of ash  Magnesium  \
0            1    14.23        1.71  2.43               15.6        127   
1            1    13.20        1.78  2.14               11.2        100   
2            1    13.16        2.36  2.67               18.6        101   
3            1    14.37        1.95  2.50               16.8        113   
4            1    13.24        2.59  2.87               21.0        118   

   Total phenols  Flavanoids  Nonflavanoid phenols  Proanthocyanins  \
0           2.80        3.06                  0.28             2.29   
1           2.65        2.76                  0.26             1.28   
2           2.80        3.24                  0.30             2.81   
3           3.85        3.49                  0.24             2.18   
4           2.80        2.69                  0.39             1.82   

   Color intensity   Hue  OD280/OD315 of diluted wines  Proline  
0             5.64  1.04                        

In [126]:
from sklearn.cross_validation import train_test_split
# iloc provides the specific columns to be picked up
X = df_wine.iloc[:,1:].values
y = df_wine.iloc[:,0].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
print X_train.shape
print X_test.shape
print y_train.shape
print y_test.shape


(124, 13)
(54, 13)
(124,)
(54,)


### Bringing Features to the same scale - Normalization and Standardization

In [135]:
# Bringing Features to the same scale - Normalization and Standardization
# This is the Normalization - MinMaxScaler which uses the min max scaling 
from sklearn.preprocessing import MinMaxScaler
stdsc = MinMaxScaler()
X_train_std = stdsc.fit_transform(X_train)
X_test_std = stdsc.transform(X_test)
print X_train_std[:4,:2]
print ''
print X_test_std[:4,:2]

[[ 0.72043011  0.20378151]
 [ 0.31989247  0.08403361]
 [ 0.60215054  0.71218487]
 [ 0.57258065  0.56302521]]

[[ 0.72849462  0.16386555]
 [ 0.47311828  0.37394958]
 [ 0.36021505  0.05042017]
 [ 0.68010753  0.17647059]]


In [136]:
# Bringing Features to the same scale - Normalization and Standardization
# This is the Normalization - which uses the min max scaling 
from sklearn.preprocessing import StandardScaler
stdsc = StandardScaler()
X_train_std = stdsc.fit_transform(X_train)
X_test_std = stdsc.transform(X_test)
print X_train_std[:4,:2]
print ''
print X_test_std[:4,:2]

[[ 0.91083058 -0.46259897]
 [-0.95609928 -0.96608672]
 [ 0.35952243  1.67501572]
 [ 0.22169539  1.0478643 ]]

[[ 0.94841977 -0.63042822]
 [-0.24190464  0.25288364]
 [-0.76815332 -1.10741662]
 [ 0.72288462 -0.57742951]]


### Selecting Meaningful Features

In [144]:
# LR intercept runs one vs all classifiers for the 3 classes
# LR coefficients generate weights for all 13 wine featurs for all the 3 classes
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(penalty='l1', C=0.1)
lr.fit(X_train_std,y_train)
print('Training Accuracy : ', lr.score(X_test_std, y_test))
print('Training Accuracy : ', lr.score(X_test_std, y_test))
print lr.intercept_
print ''
print lr.coef_

('Training Accuracy : ', 0.98148148148148151)
[-0.38380571 -0.15815566 -0.70038665]

[[ 0.27995866  0.          0.         -0.02782167  0.          0.
   0.7099735   0.          0.          0.          0.          0.
   1.23661684]
 [-0.64367427 -0.06891062 -0.05719491  0.          0.          0.          0.
   0.          0.         -0.92725086  0.05981741  0.         -0.37096993]
 [ 0.          0.06146911  0.          0.          0.          0.
  -0.63712137  0.          0.          0.4985422  -0.35791726 -0.57041736
   0.        ]]
