# Convert pandas categorical data for scikit-learn 

In [1]:
from sklearn import preprocessing 
import pandas as pd

# Build a small example frame; 'score' is the categorical column to encode
raw_data = {
    'patient': [1, 1, 1, 2, 2],
    'obs': [1, 2, 3, 1, 2],
    'treatment': [0, 1, 0, 1, 0],
    'score': ['strong', 'weak', 'normal', 'weak', 'strong'],
}
column_order = ['patient', 'obs', 'treatment', 'score']
df = pd.DataFrame(raw_data, columns=column_order)

# The categorical column that gets encoded below
score_column = df['score']

# Instantiate the label encoder and fit it to the pandas column
le = preprocessing.LabelEncoder()
le.fit(score_column)

# Labels the fitted encoder knows about
list(le.classes_)

# Turn every category in the column into its integer code
le.transform(score_column)

# Turn a few integer codes back into their category names
list(le.inverse_transform([2, 2, 1]))


# Deleting observations with missing values

In [9]:
import numpy as np 
import pandas as pd

# Build a 5x2 feature matrix; the last row is missing its first feature
X = np.array([
    [1.1, 11.1],
    [2.2, 22.2],
    [3.3, 33.3],
    [4.4, 44.4],
    [np.nan, 55],
])

# NumPy route: flag every row that contains a NaN, then keep the other rows
rows_with_nan = np.isnan(X).any(axis=1)
X[np.logical_not(rows_with_nan)]

# pandas route: load the same matrix as a data frame ...
df = pd.DataFrame(data=X, columns=['feature_1', 'feature_2'])

# ... and drop incomplete observations (the result is not assigned, so df keeps all rows)
df.dropna()

Unnamed: 0,feature_1,feature_2
0,1.1,11.1
1,2.2,22.2
2,3.3,33.3
3,4.4,44.4


# Detecting outliers
EllipticEnvelope assumes the data is normally distributed and, based on that assumption, “draws” an ellipse around the data, classifying any observation inside the ellipse as an inlier (labeled as 1) and any observation outside the ellipse as an outlier (labeled as -1). A major limitation of this approach is the need to specify a contamination parameter, which is the proportion of observations that are outliers — a value that we don’t know.

In [11]:
import numpy as np
from sklearn.covariance import EllipticEnvelope
from sklearn.datasets import make_blobs


# Simulate ten two-feature observations around a single centre
X, _ = make_blobs(n_samples=10, n_features=2, centers=1, random_state=1)

# Overwrite both feature values of the first observation with an extreme value
X[0, :] = 10000

# Create the detector with the contamination parameter set to 0.1
outlier_detector = EllipticEnvelope(contamination=0.1)

# Fit the detector to the simulated data
outlier_detector.fit(X)

# Predict a label for every observation (-1 marks an outlier, 1 an inlier)
outlier_detector.predict(X)



array([-1,  1,  1,  1,  1,  1,  1,  1,  1,  1])

# Discretize features

In [14]:
from sklearn.preprocessing import Binarizer
import numpy as np

# Create feature: a single column of ages, one row per observation
age = np.array([[6],
                [12],
                [20],
                [36],
                [65]])

# Option 1: Binarize the feature, using 18 as the cut-off.
# The threshold is passed by keyword because Binarizer's constructor
# parameters are keyword-only in current scikit-learn releases; the
# positional form Binarizer(18) raises a TypeError there.
binarizer = Binarizer(threshold=18)
binarizer.fit_transform(age)

# Option 2: Break into bins; every value is replaced by the index of the bin
# it falls into, with bin edges at 20, 30 and 40
np.digitize(age, bins=[20, 30, 40])

array([[0],
       [0],
       [1],
       [2],
       [3]])

# Encoding Ordinal Categorical Features


In [17]:
import pandas as pd

# Create features: one ordinal categorical column
df = pd.DataFrame({'Score': ['Low',
                             'Low',
                             'Medium',
                             'Medium',
                             'High']})

# View data frame
df

# Create a scale map: ordinal label -> integer rank
scale_mapper = {'Low': 1, 'Medium': 2, 'High': 3}

# Map feature values to the scale.
# Series.map is used rather than Series.replace: replacing strings with
# integers on an object column depends on pandas silently downcasting the
# result, which newer pandas versions warn about. map builds the integer
# column directly. Note that a label missing from scale_mapper becomes NaN
# instead of being left unchanged.
df['Scale'] = df['Score'].map(scale_mapper)

# View data frame
df

Unnamed: 0,Score,Scale
0,Low,1
1,Low,1
2,Medium,2
3,Medium,2
4,High,3


# Handling Imbalanced Classes with Downsampling & Upsampling
In downsampling, we randomly sample without replacement from the majority class (i.e. the class with more observations) to create a new subset of observations equal in size to the minority class.



In [23]:
import numpy as np 
from sklearn.datasets import load_iris

# Load iris data
iris = load_iris()

# Create feature matrix
X = iris.data

# Create target vector
y = iris.target

# Make iris imbalanced by removing the first 40 observations
X = X[40:, :]
y = y[40:]

# Create binary target vector: 0 where the original class is 0, otherwise 1
y = np.where((y == 0), 0, 1)

# Look at the imbalanced target vector
y

# Seeded random generator so that the samples drawn below are identical on
# every run of the notebook (the unseeded global np.random state is not)
rng = np.random.default_rng(0)

## DOWNSAMPLING
# Downsample the majority class to match the minority class

# Indices of each class' observations
i_class0 = np.where(y == 0)[0]
i_class1 = np.where(y == 1)[0]

# Number of observations in each class
n_class0 = len(i_class0)
n_class1 = len(i_class1)

# For every observation of class 0, randomly sample from class 1 without replacement
i_class1_downsampled = rng.choice(i_class1, size=n_class0, replace=False)

# Join together class 0's target vector with the downsampled class 1's target vector
np.hstack((y[i_class0], y[i_class1_downsampled]))

## UPSAMPLING
# In upsampling, for every observation in the majority class, we randomly
# select an observation from the minority class with replacement. The end
# result is the same number of observations from the minority and majority
# classes.

# For every observation in class 1, randomly sample from class 0 with replacement
i_class0_upsampled = rng.choice(i_class0, size=n_class1, replace=True)

# Join together class 0's upsampled target vector with class 1's target vector
np.concatenate((y[i_class0_upsampled], y[i_class1]))




array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

# Handling outliers: drop, mark or re-scale?

In [27]:
# Load libraries (both imports kept together at the top of the cell)
import numpy as np
import pandas as pd

# Create DataFrame; the last row holds deliberately extreme values
houses = pd.DataFrame()
houses['Price'] = [534433, 392333, 293222, 4322032]
houses['Bathrooms'] = [2, 3.5, 2, 116]
houses['Square_Feet'] = [1500, 2500, 1500, 48000]

# OPTION 1: Drop
# Keep only observations below some value (the result is displayed, not stored)
houses[houses['Bathrooms'] < 20]

# OPTION 2: Mark
# Create a 0/1 feature based on a boolean condition: 1 flags the outlier rows
houses['Outlier'] = np.where(houses['Bathrooms'] < 20, 0, 1)

# Show data
houses

# OPTION 3: Rescale
# Log feature: np.log is applied to the whole column at once instead of
# looping over its values in Python; the resulting numbers are the same
houses['Log_of_Square_feet'] = np.log(houses['Square_Feet'])
houses


Unnamed: 0,Price,Bathrooms,Square_Feet,Outlier,Log_of_Square_feet
0,534433,2.0,1500,0,7.31322
1,392333,3.5,2500,0,7.824046
2,293222,2.0,1500,0,7.31322
3,4322032,116.0,48000,1,10.778956


# Impute missing values with means

Mean imputation replaces missing values with the mean value of that feature/variable. Mean imputation is one of the most ‘naive’ imputation methods because unlike more complex methods like k-nearest neighbors imputation, it does not use the information we have about an observation to estimate a value for it.

In [28]:
import numpy as np
import pandas as pd
# sklearn.preprocessing.Imputer has been removed from scikit-learn;
# SimpleImputer in sklearn.impute is its replacement.
from sklearn.impute import SimpleImputer

# Create data frame
df = pd.DataFrame()

# Create two variables called x0 and x1. Make the first value of x1 a missing value
df['x0'] = [0.3051,0.4949,0.6974,0.3769,0.2231,0.341,0.4436,0.5897,0.6308,0.5]
df['x1'] = [np.nan,0.2654,0.2615,0.5846,0.4615,0.8308,0.4962,0.3269,0.5346,0.6731]

# View the dataset
df

# Create an imputer object that looks for NaN values and replaces them with
# the mean value of their feature. SimpleImputer always works column by
# column, so the old axis=0 argument has no equivalent and is dropped.
mean_imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

# Train the imputer on the df dataset. The same array form (df.values) is
# used for fit and transform so the two calls see identically typed input.
mean_imputer = mean_imputer.fit(df.values)

# Apply the imputer
imputed_df = mean_imputer.transform(df.values)

imputed_df


array([[ 0.3051    ,  0.49273333],
       [ 0.4949    ,  0.2654    ],
       [ 0.6974    ,  0.2615    ],
       [ 0.3769    ,  0.5846    ],
       [ 0.2231    ,  0.4615    ],
       [ 0.341     ,  0.8308    ],
       [ 0.4436    ,  0.4962    ],
       [ 0.5897    ,  0.3269    ],
       [ 0.6308    ,  0.5346    ],
       [ 0.5       ,  0.6731    ]])

# Imputing missing class labels 


In [29]:
import numpy as np
# sklearn.preprocessing.Imputer has been removed from scikit-learn;
# SimpleImputer in sklearn.impute is its replacement.
from sklearn.impute import SimpleImputer

# Create feature matrix with categorical feature (first column); the last
# two observations are missing their class
X = np.array([[0, 2.10, 1.45],
              [1, 1.18, 1.33],
              [0, 1.22, 1.27],
              [0, -0.21, -1.19],
              [np.nan, 0.87, 1.31],
              [np.nan, -0.67, -0.22]])

# Create imputer object. SimpleImputer always works column by column, so the
# old axis=0 argument has no equivalent and is dropped.
imputer = SimpleImputer(strategy='most_frequent')

# Fill missing values with the most frequent class
imputer.fit_transform(X)

array([[ 0.  ,  2.1 ,  1.45],
       [ 1.  ,  1.18,  1.33],
       [ 0.  ,  1.22,  1.27],
       [ 0.  , -0.21, -1.19],
       [ 0.  ,  0.87,  1.31],
       [ 0.  , -0.67, -0.22]])

# Imputing missing class labels with k-nearest neighbours


In [30]:
import numpy as np 
from sklearn.neighbors import KNeighborsClassifier

# Observations whose categorical feature (first column) is known
X = np.array([
    [0, 2.10, 1.45],
    [1, 1.18, 1.33],
    [0, 1.22, 1.27],
    [1, -0.21, -1.19],
])

# Observations whose categorical feature is missing
X_with_nan = np.array([
    [np.nan, 0.87, 1.31],
    [np.nan, -0.67, -0.22],
])

# Train a distance-weighted 3-nearest-neighbours learner: the remaining
# columns are the predictors, the first column is the class to learn
clf = KNeighborsClassifier(n_neighbors=3, weights='distance')
trained_model = clf.fit(X[:, 1:], X[:, 0])

# Predict the class of the observations where it is missing
imputed_values = trained_model.predict(X_with_nan[:, 1:])

# Put the predicted class back in front of the other features as a column
X_with_imputed = np.column_stack((imputed_values, X_with_nan[:, 1:]))

# Stack the imputed rows on top of the rows whose class was known
np.concatenate((X_with_imputed, X), axis=0)


array([[ 0.  ,  0.87,  1.31],
       [ 1.  , -0.67, -0.22],
       [ 0.  ,  2.1 ,  1.45],
       [ 1.  ,  1.18,  1.33],
       [ 0.  ,  1.22,  1.27],
       [ 1.  , -0.21, -1.19]])

# Normalize observations 
Normalizer rescales the values of each individual observation so that the observation has unit norm (with the L2 norm used below, the Euclidean length of each row is one).



In [34]:
from sklearn.preprocessing import Normalizer
import numpy as np 

# Feature matrix: five observations with two features each
X = np.array([
    [0.5, 0.5],
    [1.1, 3.4],
    [1.5, 20.2],
    [1.63, 34.4],
    [10.9, 3.3],
])

# Normalizer object configured for the L2 norm
normalizer = Normalizer(norm='l2')

# Rescale the feature matrix; transform is called directly, without a prior fit
normalizer.transform(X)

array([[ 0.70710678,  0.70710678],
       [ 0.30782029,  0.95144452],
       [ 0.07405353,  0.99725427],
       [ 0.04733062,  0.99887928],
       [ 0.95709822,  0.28976368]])

# One-hot encode features with multiple labels

In [40]:
# Load libraries
from sklearn.preprocessing import MultiLabelBinarizer
import numpy as np

# Multi-label target: a list with one tuple of labels per observation
y = [
    ('Texas', 'Florida'),
    ('California', 'Alabama'),
    ('Texas', 'Florida'),
    ('Delware', 'Florida'),
    ('Texas', 'Alabama'),
]

# Encoder that handles several labels per observation
one_hot = MultiLabelBinarizer()

# Fit the encoder and one-hot encode the data in a single step
one_hot.fit_transform(y)

# Classes the fitted encoder found
one_hot.classes_


array(['Alabama', 'California', 'Delware', 'Florida', 'Texas'], dtype=object)

# One-hot encode nominal categorical features

In [43]:
# Load libraries
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelBinarizer

# Create NumPy array: a single column holding one nominal label per row
x = np.array([['Texas'],
              ['California'],
              ['Texas'],
              ['Delaware'],
              ['Texas']])

# Method 1: scikit-learn
# Create label binarizer object
one_hot = LabelBinarizer()

# One-hot encode data
one_hot.fit_transform(x)

# Method 2: pandas get_dummies
# Dummy feature. dtype=int is requested explicitly so the indicator columns
# are shown as 0/1: pandas 2.x returns boolean (True/False) columns by default.
pd.get_dummies(x[:, 0], dtype=int)

Unnamed: 0,California,Delaware,Texas
0,0,0,1
1,1,0,0
2,0,0,1
3,0,1,0
4,0,0,1


# Iris dataset pre-processing

In [45]:
from sklearn import datasets
import numpy as np
# sklearn.cross_validation has been removed from scikit-learn;
# train_test_split now lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load the iris data
iris = datasets.load_iris()

# Create a variable for the feature data
X = iris.data

# Create a variable for the target data
y = iris.target

# Split the dataset into a training part and a held-out test part.
# Randomly split the data into four new datasets: training features, training outcome,
# test features, and test outcome. Set the size of the test data to be 30% of the
# full dataset; random_state fixes the split so it is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Load the standard scaler
sc = StandardScaler()

# Compute the mean and standard deviation based on the training data only
sc.fit(X_train)

# Scale the training data to be of mean 0 and of unit variance
X_train_std = sc.transform(X_train)

# Scale the test data with the statistics learned from the training data
# (so its own mean and variance are only approximately 0 and 1)
X_test_std = sc.transform(X_test)

# Feature test data, non-standardized
X_test[0:5]

# Feature test data, standardized
X_test_std[0:5]

array([[ 0.3100623 , -0.49582097,  0.48403749, -0.05143998],
       [-0.17225683,  1.92563026, -1.26851205, -1.26670948],
       [ 2.23933883, -0.98011121,  1.76924049,  1.43388941],
       [ 0.18948252, -0.25367584,  0.36720086,  0.35364985],
       [ 1.15412078, -0.49582097,  0.54245581,  0.21861991]])

# Rescaling a feature 

In [49]:
# Load libraries. They are imported here so that this cell runs on its own,
# like every other section, instead of relying on imports made in earlier cells.
import numpy as np
from sklearn import preprocessing

# Create feature: a single column of values
x = np.array([[-500.5],
              [-100.1],
              [0],
              [100.1],
              [900.9]])

# Create scaler that rescales the feature to the range 0 to 1
minmax_scale = preprocessing.MinMaxScaler(feature_range=(0, 1))

# Scale feature
x_scale = minmax_scale.fit_transform(x)

# Show feature
x_scale

array([[ 0.        ],
       [ 0.28571429],
       [ 0.35714286],
       [ 0.42857143],
       [ 1.        ]])