# Reading Data from UCI Archive

In [2]:
# Load the raw wdbc.data file (breast-cancer-wisconsin) from the UCI archive.
# The file has no header row and contains 32 columns:
#   column 0     -> patient ID
#   column 1     -> tumor type, i.e. malignant ('M') or benign ('B')
#   columns 2-31 -> the 30 features obtained from each patient

import pandas as pd

WDBC_URL = 'https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data'
cancer_set = pd.read_csv(WDBC_URL, header=None)
cancer_set.head()


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,1.095,0.9053,8.589,153.4,0.006399,0.04904,0.05373,0.01587,0.03003,0.006193,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,0.5435,0.7339,3.398,74.08,0.005225,0.01308,0.0186,0.0134,0.01389,0.003532,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,0.7456,0.7869,4.585,94.03,0.00615,0.04006,0.03832,0.02058,0.0225,0.004571,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,0.4956,1.156,3.445,27.23,0.00911,0.07458,0.05661,0.01867,0.05963,0.009208,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,0.7572,0.7813,5.438,94.44,0.01149,0.02461,0.05688,0.01885,0.01756,0.005115,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


# Extracting Features from Raw Set

In [4]:
# Keep every row, but only the columns from position 2 onward (the 30 features);
# positions 0 and 1 hold the patient ID and the tumor type.
feature_columns = slice(2, None)
cancer_features = cancer_set.iloc[:, feature_columns]
cancer_features.head()

Unnamed: 0,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,1.095,0.9053,8.589,153.4,0.006399,0.04904,0.05373,0.01587,0.03003,0.006193,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,0.5435,0.7339,3.398,74.08,0.005225,0.01308,0.0186,0.0134,0.01389,0.003532,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,0.7456,0.7869,4.585,94.03,0.00615,0.04006,0.03832,0.02058,0.0225,0.004571,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,0.4956,1.156,3.445,27.23,0.00911,0.07458,0.05661,0.01867,0.05963,0.009208,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,0.7572,0.7813,5.438,94.44,0.01149,0.02461,0.05688,0.01885,0.01756,0.005115,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [0]:
# Names of the 30 features: ten base measurements, each listed first as a
# "mean", then as an "error", then as a "worst" value.
_base_measurements = [
    'radius', 'texture', 'perimeter', 'area', 'smoothness',
    'compactness', 'concavity', 'concave points', 'symmetry',
    'fractal dimension',
]

cancer_features_names = (
    ['mean ' + measurement for measurement in _base_measurements]
    + [measurement + ' error' for measurement in _base_measurements]
    + ['worst ' + measurement for measurement in _base_measurements]
)

In [5]:
# The cancer_features and cancer_target obtained this way can be used by an ML algorithm.

# Column 1 of the raw set holds the tumor type as a string label.
cancer_labels = cancer_set.iloc[:, 1]

# Encode 'M' as 0 and 'B' as 1.
# An explicit mapping is used instead of Series.replace(): replacing strings with
# integers in an object column depends on implicit downcasting, which recent pandas
# releases deprecate, and without it the result would stay an object array.
label_codes = {'M': 0, 'B': 1}
cancer_target = cancer_labels.map(label_codes)

# map() turns any label that is missing from the mapping into NaN; stop here
# instead of silently carrying an invalid target forward.
if cancer_target.isna().any():
    unexpected = cancer_labels[cancer_target.isna()].unique().tolist()
    raise ValueError('Unexpected tumor type label(s): {}'.format(unexpected))

# Converting to an integer numpy array
cancer_target = cancer_target.astype('int64').values

print(type(cancer_target))
print(cancer_target.shape)

<class 'numpy.ndarray'>
(569,)


# Reading Cancer Data from scikit-learn

In [14]:
from sklearn import datasets

# Load the breast cancer dataset shipped with scikit-learn, then print the shape
# of its data, the shape of its target, and the keys it exposes.
cancer = datasets.load_breast_cancer()
for summary in (cancer.data.shape, cancer.target.shape, cancer.keys()):
    print(summary)

(569, 30)
(569,)
dict_keys(['data', 'target', 'target_names', 'DESCR', 'feature_names', 'filename'])


# Preprocessing using scikit-learn

In [17]:
# Standardization
# Standardization (mean removal and variance scaling) shifts and rescales each feature
# so that it has mean 0 and variance 1. It is a linear rescaling: it does not turn a
# feature into a normal distribution.
import sklearn.preprocessing as preprocessing
breast_cancer = datasets.load_breast_cancer()

# fit_transform learns the per-feature mean/std and applies them in one call;
# `standardizer` is left fitted for later reuse.
standardizer = preprocessing.StandardScaler()
breast_cancer_standardized = standardizer.fit_transform(breast_cancer.data)

feature_means = breast_cancer_standardized.mean(axis=0)
feature_stds = breast_cancer_standardized.std(axis=0)

# The means print as values around 1e-15, i.e. zero up to floating-point error.
print('Mean of each feature after Standardization :\n\n')
print(feature_means)
print('\nStd. of each feature after Standardization :\n\n')
print(feature_stds)


Mean of each feature after Standardization :


[-3.16286735e-15 -6.53060890e-15 -7.07889127e-16 -8.79983452e-16
  6.13217737e-15 -1.12036918e-15 -4.42138027e-16  9.73249991e-16
 -1.97167024e-15 -1.45363120e-15 -9.07641468e-16 -8.85349205e-16
  1.77367396e-15 -8.29155139e-16 -7.54180940e-16 -3.92187747e-16
  7.91789988e-16 -2.73946068e-16 -3.10823423e-16 -3.36676596e-16
 -2.33322442e-15  1.76367415e-15 -1.19802625e-15  5.04966114e-16
 -5.21317026e-15 -2.17478837e-15  6.85645643e-16 -1.41265636e-16
 -2.28956670e-15  2.57517109e-15]

Std. of each feature after Standardization :


[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1.]


In [21]:
# Scaling transforms the existing data values so that they lie between a minimum and a maximum value.
breast_cancer = datasets.load_breast_cancer()

# MinMaxScaler: rescale every feature to the range 0 to 10.
min_max_scaler = preprocessing.MinMaxScaler(feature_range=(0, 10))
breast_cancer_minmaxscaled10 = min_max_scaler.fit_transform(breast_cancer.data)
print(breast_cancer_minmaxscaled10.mean(axis=0))
print(type(breast_cancer_minmaxscaled10))


# MaxAbsScaler: by default it transforms data to the range -1 to 1.
max_abs_scaler = preprocessing.MaxAbsScaler()
breast_cancer_maxabsscaled = max_abs_scaler.fit_transform(breast_cancer.data)

[3.38221957 3.23965117 3.32935066 2.16920086 3.94784519 2.60600528
 2.08058378 2.43136908 3.79605368 2.70379314 1.06345123 1.89324042
 0.99376112 0.62635794 1.81119044 1.74438511 0.80539688 2.23454008
 1.78143451 1.00192912 2.96662747 3.63998486 2.83137669 1.70906196
 4.04137846 2.20212323 2.17402942 3.93835819 2.63306862 1.89596073]
<class 'numpy.ndarray'>


In [0]:
# Normalization - Example
# Normalization rescales each row (sample) on its own:
#   'l1' makes the absolute values of a row sum to 1,
#   'l2' makes the squares of a row sum to 1.
# The 'l1' norm is less sensitive to outliers than 'l2'.
# 'l2' is the default, so removing outliers is recommended before applying it.
# In this example the norm parameter selects 'l1' explicitly.
normalizer = preprocessing.Normalizer(norm='l1')
breast_cancer_normalized = normalizer.fit_transform(breast_cancer.data)

In [25]:
# Binarization - Example
# Values greater than the threshold are transformed to 1; values less than or
# equal to the threshold are transformed to 0.
# The default threshold is 0; this example uses 3.0.
binarizer = preprocessing.Binarizer(threshold=3.0)
breast_cancer_binarized = binarizer.fit_transform(breast_cancer.data)

# Show the first five rows and first five columns of the binarized data.
top_left_corner = breast_cancer_binarized[:5, :5]
print(top_left_corner)

[[1. 1. 1. 1. 0.]
 [1. 1. 1. 1. 0.]
 [1. 1. 1. 1. 0.]
 [1. 1. 1. 1. 0.]
 [1. 1. 1. 1. 0.]]


In [26]:
# OneHotEncoder
# OneHotEncoder converts categorical integer values into one-hot vectors. In a one-hot
# vector, every category is transformed into a binary attribute holding only 0 or 1.
# The example below creates two binary attributes for the categorical integers 1 and 2.
training_categories = [[1], [1], [1], [2], [2], [1]]
onehotencoder = preprocessing.OneHotEncoder().fit(training_categories)

# Transforming category values 1 and 2 to one-hot vectors
for category in (1, 2):
    print(onehotencoder.transform([[category]]).toarray())
# Further reading:
# https://medium.com/@contactsunny/label-encoder-vs-one-hot-encoder-in-machine-learning-3fc273365621


[[1. 0.]]
[[0. 1.]]


In [0]:
# Imputation
# Imputation replaces missing values with the mean, the median, or the most common
# value of the column in which the missing values exist.
# The example below replaces missing values, represented by np.nan, with the mean
# of the respective column.
import numpy as np
from sklearn.impute import SimpleImputer

imp = SimpleImputer(strategy='mean', missing_values=np.nan)
breast_cancer_imputed = imp.fit_transform(breast_cancer.data)
# fit() returns the estimator itself, so `imputer` is simply the fitted `imp`.
imputer = imp

In [33]:
# Label Encoding
# Label encoding is a step in which categorical values are represented as categorical
# integers. The example below transforms the categorical values
# ["benign", "malignant"] into [0, 1].
labels = ['malignant', 'benign', 'malignant', 'benign']
labelencoder = preprocessing.LabelEncoder().fit(labels)

# Encode the dataset's class names with the fitted encoder and display the result.
bc_labelencoded = labelencoder.transform(breast_cancer.target_names)
bc_labelencoded

array([1, 0])

# Exercise

In [46]:
from sklearn import preprocessing
from sklearn import datasets

# Load iris, rescale every sample (row) with the 'l2' norm, and print the
# per-feature mean of the normalized data.
iris = datasets.load_iris()
normalizer = preprocessing.Normalizer(norm='l2')
iris_normalized = normalizer.fit_transform(iris.data)
print(iris_normalized.mean(axis=0))

[0.75140029 0.40517418 0.45478362 0.14107142]


In [78]:
# Reload iris and inspect what the dataset object provides.
iris = datasets.load_iris()
# Note: an earlier attempt passed [iris.target] to OneHotEncoder, which presents all
# targets as a single row instead of one value per sample; the target has to be
# reshaped into a column before encoding.
print(iris.keys())
print(iris.target_names, iris.feature_names)

# Wrap the raw feature matrix in a DataFrame (default integer column labels)
# and preview the first rows.
df = pd.DataFrame(data=iris.data)
df.head()

dict_keys(['data', 'target', 'target_names', 'DESCR', 'feature_names', 'filename'])
['setosa' 'versicolor' 'virginica'] ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']


Unnamed: 0,0,1,2,3
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [87]:
from sklearn.preprocessing import OneHotEncoder

# OneHotEncoder expects 2-D input, so reshape the 1-D target vector into a single column.
iris_data = iris.target.reshape(-1, 1)
onehotencoder = OneHotEncoder(categories='auto').fit(iris_data)
iris_target_onehot = onehotencoder.transform(iris_data)

# Print the encoded rows at positions 0, 50 and 100.
sample_rows = [0, 50, 100]
print(iris_target_onehot.toarray()[sample_rows])

[[1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]]


In [113]:
# Blank out the first 50 iris samples, fill them back in with the column means
# using SimpleImputer, and print the per-feature mean of the imputed data.

import numpy
from sklearn.impute import SimpleImputer

iris = datasets.load_iris()

# Work on a copy so the loaded dataset object itself is not modified in place.
data = iris.data.copy()

# Mark every feature of the first 50 rows as missing in one vectorized assignment.
# numpy.nan is used because the numpy.NaN alias was removed in NumPy 2.0.
data[:50, :] = numpy.nan

# Replace each missing entry with the mean of the remaining values in its column.
imp = SimpleImputer(missing_values=numpy.nan, strategy='mean')
imputer = imp.fit(data)
iris_imputed = imputer.transform(data)
print(iris_imputed.mean(axis=0))

[6.262 2.872 4.906 1.676]
