### Filling Missing Values With Imputation

In [29]:
from numpy import isnan
import pandas as pd
from sklearn.impute import SimpleImputer


In [49]:
# Read the horse colic data; the file has no header row and marks gaps with '?'
df = pd.read_csv('horse-colic.csv', header=None, na_values='?')

# Column 23 is the output; every other column is an input
data = df.values
ix = [col for col in range(data.shape[1]) if col != 23]
x = data[:, ix]
y = data[:, 23]

# Count the NaN cells in the inputs before imputing
print('Missing: %d' % isnan(x).sum())

# Mean imputation: learn each column's mean and fill that column's NaNs with it
imputer = SimpleImputer(strategy='mean')
xtrans = imputer.fit_transform(x)

# Count the NaN cells again after imputing
print('Missing: %d' % isnan(xtrans).sum())

Missing: 1605
Missing: 0


### Feature Selection with RFE

In [50]:
from sklearn.datasets import make_classification
from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeClassifier

In [51]:
# Define dataset: 1320 samples, 21 features (10 informative + 11 redundant)
x, y = make_classification(n_samples=1320, n_features=21, n_informative=10, n_redundant=11, random_state=1)

# Define RFE.
# The tree is given a fixed random_state: scikit-learn randomly permutes the
# candidate features at every split, so an unseeded tree can break ties
# differently from run to run and change which columns RFE keeps.
# NOTE: output recorded before the seed was added may differ from a fresh run.
rfe = RFE(estimator=DecisionTreeClassifier(random_state=1), n_features_to_select=10)

# Fit RFE
rfe.fit(x, y)

# Summarize all features: column index, whether it was kept, and its rank
# (selected features are assigned rank 1)
for i, (selected, rank) in enumerate(zip(rfe.support_, rfe.ranking_)):
    print('Column: %d, Selected=%s, Rank: %d' % (i, selected, rank))

Column: 0, Selected=True, Rank: 1
Column: 1, Selected=True, Rank: 1
Column: 2, Selected=False, Rank: 5
Column: 3, Selected=False, Rank: 4
Column: 4, Selected=False, Rank: 10
Column: 5, Selected=True, Rank: 1
Column: 6, Selected=False, Rank: 8
Column: 7, Selected=False, Rank: 3
Column: 8, Selected=False, Rank: 7
Column: 9, Selected=True, Rank: 1
Column: 10, Selected=True, Rank: 1
Column: 11, Selected=False, Rank: 12
Column: 12, Selected=True, Rank: 1
Column: 13, Selected=True, Rank: 1
Column: 14, Selected=True, Rank: 1
Column: 15, Selected=True, Rank: 1
Column: 16, Selected=False, Rank: 9
Column: 17, Selected=False, Rank: 11
Column: 18, Selected=False, Rank: 6
Column: 19, Selected=False, Rank: 2
Column: 20, Selected=True, Rank: 1


### Data Scaling with Normalization

In [45]:
from sklearn.datasets import make_classification
from sklearn.preprocessing import MinMaxScaler

In [51]:
# Generate the synthetic classification data used for the scaling demo
x, y = make_classification(
    n_samples=1320,
    n_features=21,
    n_informative=10,
    n_redundant=11,
    random_state=1,
)

# Preview the first three rows before scaling
print(x[:3])

[[-1.42986543 -0.70112641 -3.88022659 -1.7387056   4.54086896  0.22325084
  -0.05693179 -0.68737561  1.12481055  3.03110157 -1.97962023  1.58773587
   2.14174319  1.10789093 -0.44510592  6.53815413 -0.15875236  0.03061438
   2.89607161  1.0205923  -2.97557394]
 [ 6.07171562  4.9107076   1.91753046 -9.10925988 -6.46697887  1.87525768
  -0.30463574  5.09646515 -0.70809416  2.13467119  2.42293306 -1.33809041
  -6.71698031  1.9384144  -2.53226687 -5.36747642 -0.69463424  1.28058558
  -0.28325715 -3.44856572  5.0639487 ]
 [ 0.55332344  1.22412386  3.26560962 -0.28425252 -3.90212539  0.08223332
   3.10949974 -2.21552738 -0.28227516  0.18809881 -0.53904089 -3.19871149
  -6.87169939 -0.86394712 -1.02995255  2.06670791  4.12201897  1.59369936
   1.48942138  2.1178143   1.74998875]]


In [52]:
# Fit a min-max scaler on the inputs (default settings)
trans = MinMaxScaler().fit(x)

# Rescale every column with the fitted scaler
x_norm = trans.transform(x)

# Preview the first three rows after scaling
print(x_norm[:3])

[[0.33762904 0.47103263 0.37698868 0.42397866 0.63610393 0.48068202
  0.52059761 0.35337844 0.45422869 0.79450538 0.31633423 0.63049909
  0.59236802 0.57235989 0.55542084 0.78616386 0.47667119 0.49202468
  0.80595147 0.56982904 0.26948131]
 [0.83516892 0.8878633  0.59336362 0.15857275 0.20623505 0.59990297
  0.5092616  0.58829536 0.30786491 0.71878218 0.62359037 0.43829109
  0.22876087 0.62171876 0.38079364 0.36117746 0.43776791 0.53707088
  0.54485741 0.34145298 0.62580063]
 [0.46916339 0.61403461 0.64367455 0.47635199 0.30639547 0.47050516
  0.66550729 0.29131091 0.34186803 0.55435145 0.4168729  0.31606024
  0.22241041 0.45517152 0.50648828 0.62654999 0.78744122 0.5483548
  0.690434   0.6258976  0.47892276]]


### Transform Categories with OneHot Encoding 

In [26]:
from sklearn.preprocessing import OneHotEncoder

In [34]:
# Load the rainfall dataset from a CSV file in the working directory
# (read_csv defaults apply, so the first row supplies the column names)
rainfall1 = pd.read_csv('Rainfall Data.csv')

In [35]:
# Pull the frame's contents out as a NumPy array for positional slicing.
# NOTE: this rebinds `data`, which the imputation section also assigns.
data = rainfall1.values

In [50]:
# Everything except the last column is input, the last column is the output;
# both pieces are cast to strings
x, y = (part.astype(str) for part in (data[:, :-1], data[:, -1]))

# Preview the first three input rows
print(x[:3])

[['12/21/2013' '74' '60' '45' '67' '49' '43' '93' '75' '57' '29.86'
  '29.68' '29.59' '10' '7' '2' '20' '4' '31' '0.46']
 ['12/22/2013' '56' '48' '39' '43' '36' '28' '93' '68' '43' '30.41'
  '30.13' '29.87' '10' '10' '5' '16' '6' '25' '0']
 ['12/23/2013' '58' '45' '32' '31' '27' '23' '76' '52' '27' '30.56'
  '30.49' '30.41' '10' '10' '10' '8' '3' '12' '0']]


In [52]:
# Define onehot encoding transform that returns a dense array.
# scikit-learn 1.2 renamed the `sparse` argument to `sparse_output` and 1.4
# removed the old name, so try the current spelling first and fall back to the
# legacy one on older releases (an unknown keyword raises TypeError).
try:
    encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(sparse=False)

# Fit and apply transform to input data
x_oe = encoder.fit_transform(x)

# Summarize data after transform
print(x_oe[:3, :])

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


### Transform Numbers to Categories with KBinsDiscretizer

In [58]:
from sklearn.datasets import make_classification
from sklearn.preprocessing import KBinsDiscretizer

In [60]:
# Rebuild the same synthetic dataset for the discretization demo
x, y = make_classification(
    n_samples=1320,
    n_features=21,
    n_informative=10,
    n_redundant=11,
    random_state=1,
)

# Preview the first three rows before binning
print(x[:3])

[[-1.42986543 -0.70112641 -3.88022659 -1.7387056   4.54086896  0.22325084
  -0.05693179 -0.68737561  1.12481055  3.03110157 -1.97962023  1.58773587
   2.14174319  1.10789093 -0.44510592  6.53815413 -0.15875236  0.03061438
   2.89607161  1.0205923  -2.97557394]
 [ 6.07171562  4.9107076   1.91753046 -9.10925988 -6.46697887  1.87525768
  -0.30463574  5.09646515 -0.70809416  2.13467119  2.42293306 -1.33809041
  -6.71698031  1.9384144  -2.53226687 -5.36747642 -0.69463424  1.28058558
  -0.28325715 -3.44856572  5.0639487 ]
 [ 0.55332344  1.22412386  3.26560962 -0.28425252 -3.90212539  0.08223332
   3.10949974 -2.21552738 -0.28227516  0.18809881 -0.53904089 -3.19871149
  -6.87169939 -0.86394712 -1.02995255  2.06670791  4.12201897  1.59369936
   1.48942138  2.1178143   1.74998875]]


In [61]:
# Set up an equal-width discretizer: ten bins per column, encoded as ordinals
trans = KBinsDiscretizer(
    n_bins=10,
    encode='ordinal',
    strategy='uniform',
)

# Learn the bin edges from the data, then map each value to its bin index
trans.fit(x)
x_discrete = trans.transform(x)

# Preview the first three rows after binning
print(x_discrete[:3])

[[3. 4. 3. 4. 6. 4. 5. 3. 4. 7. 3. 6. 5. 5. 5. 7. 4. 4. 8. 5. 2.]
 [8. 8. 5. 1. 2. 5. 5. 5. 3. 7. 6. 4. 2. 6. 3. 3. 4. 5. 5. 3. 6.]
 [4. 6. 6. 4. 3. 4. 6. 2. 3. 5. 4. 3. 2. 4. 5. 6. 7. 5. 6. 6. 4.]]


### Dimensionality Reduction with PCA

In [64]:
from sklearn.datasets import make_classification
from sklearn.decomposition import PCA

In [66]:
# Rebuild the same synthetic dataset for the PCA demo
x, y = make_classification(
    n_samples=1320,
    n_features=21,
    n_informative=10,
    n_redundant=11,
    random_state=1,
)

# Preview the first three rows before projection
print(x[:3])

[[-1.42986543 -0.70112641 -3.88022659 -1.7387056   4.54086896  0.22325084
  -0.05693179 -0.68737561  1.12481055  3.03110157 -1.97962023  1.58773587
   2.14174319  1.10789093 -0.44510592  6.53815413 -0.15875236  0.03061438
   2.89607161  1.0205923  -2.97557394]
 [ 6.07171562  4.9107076   1.91753046 -9.10925988 -6.46697887  1.87525768
  -0.30463574  5.09646515 -0.70809416  2.13467119  2.42293306 -1.33809041
  -6.71698031  1.9384144  -2.53226687 -5.36747642 -0.69463424  1.28058558
  -0.28325715 -3.44856572  5.0639487 ]
 [ 0.55332344  1.22412386  3.26560962 -0.28425252 -3.90212539  0.08223332
   3.10949974 -2.21552738 -0.28227516  0.18809881 -0.53904089 -3.19871149
  -6.87169939 -0.86394712 -1.02995255  2.06670791  4.12201897  1.59369936
   1.48942138  2.1178143   1.74998875]]


In [67]:
# Define transform: keep the first three principal components.
# random_state is pinned because, depending on the scikit-learn version, the
# default svd_solver='auto' may choose the randomized solver for data of this
# shape; the seed is only consulted by the randomized and arpack solvers.
trans = PCA(n_components=3, random_state=1)

# Transform data
x_dim = trans.fit_transform(x)

# Summarize data after transform
print(x_dim[:3, :])

[[-2.98553144 -6.67857017 -6.81159835]
 [ 9.24870997 13.48588257 -2.90328002]
 [10.60782909  0.29232708 -0.47172531]]


## Label Encoding

In [2]:
import pandas as pd
# Load the employee records from a CSV file in the working directory.
# NOTE: this rebinds `df`, which the imputation section also assigns.
df = pd.read_csv('Employee_data.csv')
# Show the first rows of the frame as the cell's output
df.head()

Unnamed: 0,Employee_ID,Gender,Remarks
0,56,Female,Great
1,75,Male,Nice
2,87,Male,Good
3,32,Female,Good
4,44,Female,Great


In [3]:
# List the distinct values found in the 'Remarks' column
df['Remarks'].unique()

array(['Great', 'Nice', 'Good'], dtype=object)

In [4]:
# Bring in scikit-learn's preprocessing module, which provides LabelEncoder
from sklearn import preprocessing

# LabelEncoder maps each distinct label to an integer code
label_encoder = preprocessing.LabelEncoder()

# Encode the labels in column 'Remarks', then overwrite the column with the codes
encoded_remarks = label_encoder.fit_transform(df['Remarks'])
df['Remarks'] = encoded_remarks

# Show the distinct codes that now appear in the column
df['Remarks'].unique()

array([1, 2, 0])