In [None]:
'''
In this notebook, different techniques have been covered for data transformation which can be applied during data pre-processing phase

A. Simple Imputer
B. Label Encoder
C. One Hot Encoder
D. Standard Scalar
E. How to save these transformation logic into pickle file
F. How to use these pickle files on Test/Validation data while validating/testing model
G. At the end, how to configure all these data preprocessing techniques into pipeline.


'''

In [84]:
import numpy as np
import pandas as pd
import pickle
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV

In [69]:
data = [['France',44.0,72000.0,'No'], ['Spain',27.0,48000.0,'Yes'], ['Germany',34.0,56000.0,'No'],
       ['India',24.0,61000.0,'Yes'], ['Spain',38,61000,'No'], ['Germany',40,'?','Yes'], ['France',35,58000,'Yes'],
       ['India','?',52000,'No'], ['France',48,79000,'Yes'], ['Germany',50,83000,'No'], ['France',37,67000,'Yes'], 
       ['India',44,0,'Yes']]

dataset = pd.DataFrame(data, columns=['Country', 'Age', 'Salary', 'Purchased'])
dataset = dataset.replace('?', np.NaN)
# or 
#dataset=dataset.applymap(lambda x: np.nan if x == '?' else x)

X = dataset.iloc[:,:-1].values
Y = dataset.iloc[:,3].values

print(dataset.head(12))

print("This prints the column # 2 i.e. 'Age'. Range starts from '0' here.")
print(X[:,1])

print("This prints the column #3 and #2 i.e. 'Age' and 'Salary'. Range starts from '0' here.")
print(X[:,1:3])

print("This prints the column #3 and #2 values only for Row # '2' and '3'.")
print(X[1:3,1:3])

print("This prints the row # 1. Range starts from '0' here.")
print(dataset[:1])

print("This prints the rows indexing from '4' to '8'. Range starts from '0' here.")
print(dataset[4:9])

print("This prints the column #3 and #2 i.e. 'Age' and 'Salary'. Range starts from '0' here.")
print(dataset.iloc[:,1:3])

print("This prints the last column i.e. 'Purchased'")
print(dataset.iloc[:,-1])

print("This prints the last column value of last row.")
print(dataset.iloc[-1,-1])

    Country   Age   Salary Purchased
0    France  44.0  72000.0        No
1     Spain  27.0  48000.0       Yes
2   Germany  34.0  56000.0        No
3     India  24.0  61000.0       Yes
4     Spain  38.0  61000.0        No
5   Germany  40.0      NaN       Yes
6    France  35.0  58000.0       Yes
7     India   NaN  52000.0        No
8    France  48.0  79000.0       Yes
9   Germany  50.0  83000.0        No
10   France  37.0  67000.0       Yes
11    India  44.0      0.0       Yes
This prints the column # 2 i.e. 'Age'. Range starts from '0' here.
[44.0 27.0 34.0 24.0 38.0 40.0 35.0 nan 48.0 50.0 37.0 44.0]
This prints the column #3 and #2 i.e. 'Age' and 'Salary'. Range starts from '0' here.
[[44.0 72000.0]
 [27.0 48000.0]
 [34.0 56000.0]
 [24.0 61000.0]
 [38.0 61000.0]
 [40.0 nan]
 [35.0 58000.0]
 [nan 52000.0]
 [48.0 79000.0]
 [50.0 83000.0]
 [37.0 67000.0]
 [44.0 0.0]]
This prints the column #3 and #2 values only for Row # '2' and '3'.
[[27.0 48000.0]
 [34.0 56000.0]]
This prints the row 

In [70]:
'''
                                    *** How to handle missing data [SimpleImputer] ****
A. There are different ways to handel missig data in dataset depends on whether column is continious or categorical such as
    a.  remove rows which are having missing data - here we can set some threshold as well like if a row contains missing value 
        in more than 3 columns then only drop that row
        for e.g.  X = X.dropna(subset = ['Age', 'Salary'])  or simply - X= X.dropna()
    
    b. remove columns which are having missing value - again we can configure some threshold here like if a column contains 20%
        or more missing values then drop that column
        for e.g.  X.drop(['Age', 'Salary'], axis = 1, inplace = True)
    
    c. or replace empty values with some logical values 
        1. In case of numeric values- we can replace empty values with 'mean'/'median' values of that column with fillna 
        method or we can use SimpleImputer api too
            X['Age'].fillna(X['Age'].mean, inplace=true)
        
        2. In case of categorical attribute, consider empty value as a unique value and treat them accordingly.
        
B. SimpleImputer is used to handle missing data for continious attributes i.e. Age and Salary
C. It needs to be saved as a file so that it can be used on Test data

'''

imputer = SimpleImputer(missing_values =np.nan,strategy = 'mean')
imputer = imputer.fit(X[:,1:3])
X[:,1:3] = imputer.transform(X[:,1:3])

pickle_label_out = open("SimpleImputer_dataprocessing.pickle","wb")
pickle.dump(imputer, pickle_label_out)
pickle_label_out.close()

In [71]:
'''
                                    *** ONE HOT ENCODING ****
A. Transform categorical data into Continious data via HotEncoder (or Label Encoder)
B. In case of Label encoder- For given categorical feature, all unique values are getting converted into unique 
   numeric correspoding value
C. As there is only one categorical attribute i.e. 'Country' at index '0' so we are applying encoding only for index '0' element
D. SAVE these encoders as a file so that it can be used further on test/validation data while validation/testing a model

'''                                    
 
labelencoder = LabelEncoder()
X[:, 0] = labelencoder.fit_transform(X[:, 0])
print('After Label Encoder')
print(X)

labelencoder_Y=LabelEncoder()
Y=labelencoder_Y.fit_transform(Y)


pickle_label_out = open("label_encoded_dataprocessing_1.pickle","wb")
pickle.dump(labelencoder, pickle_label_out)
pickle_label_out.close()

After Label Encoder
[[0 44.0 72000.0]
 [3 27.0 48000.0]
 [1 34.0 56000.0]
 [2 24.0 61000.0]
 [3 38.0 61000.0]
 [1 40.0 57909.09090909091]
 [0 35.0 58000.0]
 [2 38.27272727272727 52000.0]
 [0 48.0 79000.0]
 [1 50.0 83000.0]
 [0 37.0 67000.0]
 [2 44.0 0.0]]


In [72]:
'''
                                    *** ONE HOT ENCODING ****
A. In case of One hot encoding, For given categorical feature,new columns are being added into dataset correspoding to all unique values for that featur 
   and column contains either '0' or '1' based on the value present.
B. Before running One Hot encoding, categorical values need to be converrted into numeric values by Label Encoder
C. Here catagorical_features is paarameters and sparse used to make output easily readable
D. SAVE these encoders as a file so that it can be used further on test/validation data while validation/testing a model

'''

onehotencoder=OneHotEncoder(categorical_features=[0],sparse=False) 
X=onehotencoder.fit_transform(X) #.tolist()
print('After One Hot Encoder')
print(onehot_encoder_df)

pickle_out = open("onehot_encoded_dataprocessing.pickle","wb")
pickle.dump(onehotencoder, pickle_out)
pickle_out.close()

After One Hot Encoder
[[1.0, 0.0, 0.0, 0.0, 44.0, 72000.0], [0.0, 0.0, 0.0, 1.0, 27.0, 48000.0], [0.0, 1.0, 0.0, 0.0, 34.0, 56000.0], [0.0, 0.0, 1.0, 0.0, 24.0, 61000.0], [0.0, 0.0, 0.0, 1.0, 38.0, 61000.0], [0.0, 1.0, 0.0, 0.0, 40.0, 57909.09090909091], [1.0, 0.0, 0.0, 0.0, 35.0, 58000.0], [0.0, 0.0, 1.0, 0.0, 38.27272727272727, 52000.0], [1.0, 0.0, 0.0, 0.0, 48.0, 79000.0], [0.0, 1.0, 0.0, 0.0, 50.0, 83000.0], [1.0, 0.0, 0.0, 0.0, 37.0, 67000.0], [0.0, 0.0, 1.0, 0.0, 44.0, 0.0]]


In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [73]:
'''
            *** STANDARD SCALAR or MIN-MAX SCALAR *******
A. It is used to scale down any numeric values which are fall under high range such as Salary range is between 43 to 80K around.
B. In this example, as Age and Salary are in high range so we are scaling down 
C. It has also to be saved as a file so that it can be used on Test data 

'''

sc_X = StandardScaler()
X[:, 4:6] = sc_X.fit_transform(X[:, 4:6])

pickle_out = open("standard_scalar_dataprocessing.pickle","wb")
pickle.dump(sc_X, pickle_out)
pickle_out.close()

print(X)

'''
from sklearn.preprocessing import MinMaxScaler
sc_X = MinMaxScaler()
X[:, 4:6] = scaler.fit_transform(X[:, 4:6])
'''

[[ 1.          0.          0.          0.          0.77099198  0.69982954]
 [ 0.          0.          0.          1.         -1.51750803 -0.49213819]
 [ 0.          1.          0.          0.         -0.57518449 -0.09481562]
 [ 0.          0.          1.          0.         -1.92136097  0.153511  ]
 [ 0.          0.          0.          1.         -0.0367139   0.153511  ]
 [ 0.          1.          0.          0.          0.23252139  0.        ]
 [ 1.          0.          0.          0.         -0.44056685  0.00451503]
 [ 0.          0.          1.          0.          0.         -0.2934769 ]
 [ 1.          0.          0.          0.          1.30946257  1.0474868 ]
 [ 0.          1.          0.          0.          1.57869787  1.24614809]
 [ 1.          0.          0.          0.         -0.17133155  0.45150293]
 [ 0.          0.          1.          0.          0.77099198 -2.87607367]]


In [74]:
#Spliting test and train data
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size = 0.2, random_state=0)
print(X_test)
print(X_train)

[[ 1.          0.          0.          0.         -0.44056685  0.00451503]
 [ 0.          0.          1.          0.          0.77099198 -2.87607367]
 [ 0.          0.          0.          1.         -0.0367139   0.153511  ]]
[[ 1.          0.          0.          0.         -0.17133155  0.45150293]
 [ 0.          1.          0.          0.         -0.57518449 -0.09481562]
 [ 1.          0.          0.          0.          1.30946257  1.0474868 ]
 [ 0.          0.          0.          1.         -1.51750803 -0.49213819]
 [ 0.          0.          1.          0.          0.         -0.2934769 ]
 [ 0.          1.          0.          0.          1.57869787  1.24614809]
 [ 0.          0.          1.          0.         -1.92136097  0.153511  ]
 [ 1.          0.          0.          0.          0.77099198  0.69982954]
 [ 0.          1.          0.          0.          0.23252139  0.        ]]


In [75]:
#Fitting Simple Regression to the training set
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
lrmodel = regressor.fit(X_train, Y_train)

with open('model_pkl.pkl', 'wb') as fid:
    pickle.dump(lrmodel, fid)

In [76]:
#Predicting test set results
Y_pred = regressor.predict(X_train)
Y_pred

array([0.84763177, 0.51989443, 0.53643998, 1.        , 0.14120207,
       0.18916905, 0.85879793, 0.61592825, 0.29093652])

In [78]:
test_data = dataset.iloc[:,:-1].values
print(test_data)

simple_imputer_file = open("SimpleImputer_dataprocessing.pickle","rb")
simple_imputer_file_pickle = pickle.load(simple_imputer_file)
test_data[:,1:3] = simple_imputer_file_pickle.transform(test_data[:,1:3])
print('After SimpleImputer')
print(test_data)

pickle_label_in = open("label_encoded_dataprocessing_1.pickle","rb")
label_encoded_pickle = pickle.load(pickle_label_in)
test_data[:, 0] = label_encoded_pickle.transform(test_data[:, 0])

print('After Label Encoding')
print(test_data)


pickle_in = open("onehot_encoded_dataprocessing.pickle","rb")
onehot_encoded_pickle = pickle.load(pickle_in)
data_tranform = onehot_encoded_pickle.transform(test_data) #.tolist()

print('After one hot Encoding')
print(data_tranform)

standard_scalar_in = open("standard_scalar_dataprocessing.pickle","rb")
standard_scalar_in_pickle = pickle.load(standard_scalar_in)
data_tranform[:, 4:6] = standard_scalar_in_pickle.transform(data_tranform[:, 4:6])

print('After Standard Scalar')
print(data_tranform)

with open('model_pkl.pkl', 'rb') as fid:
    sv = pickle.load(fid)

print(sv.predict(data_tranform))

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 34.0 56000.0]
 ['India' 24.0 61000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 nan]
 ['France' 35.0 58000.0]
 ['India' nan 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]
 ['India' 44.0 0.0]]
After SimpleImputer
[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 34.0 56000.0]
 ['India' 24.0 61000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 57909.09090909091]
 ['France' 35.0 58000.0]
 ['India' 38.27272727272727 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]
 ['India' 44.0 0.0]]
After Label Encoding
[[0 44.0 72000.0]
 [3 27.0 48000.0]
 [1 34.0 56000.0]
 [2 24.0 61000.0]
 [3 38.0 61000.0]
 [1 40.0 57909.09090909091]
 [0 35.0 58000.0]
 [2 38.27272727272727 52000.0]
 [0 48.0 79000.0]
 [1 50.0 83000.0]
 [0 37.0 67000.0]
 [2 44.0 0.0]]
After one hot Encoding
[[1.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
  4.40000000e+01 7.20000000e+0

In [88]:
'''
A. We can create a pipeline to combine all pre-processing logic in single line and run it to transform the data. 
B. We can save this pipeline as a file which can be used further for test data pre-processing.
C. This is a very neat and clean recommended approach.

'''
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Imputer
from sklearn.compose import ColumnTransformer

titanic_url = ('https://raw.githubusercontent.com/amueller/'
               'scipy-2017-sklearn/091d371/notebooks/datasets/titanic3.csv')
data = pd.read_csv(titanic_url)

X = data.drop('survived', axis=1)
y = data['survived']

numerical_attribs = ['age', 'fare']
categorical_attribs = ['embarked', 'sex', 'pclass']

num_pipeline = Pipeline([
    ('imputer', Imputer(strategy = 'mean')),
    ('std_scaler', StandardScaler())
])

cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehotencoder', OneHotEncoder(handle_unknown='ignore'))
])


data_preprocessor = ColumnTransformer(transformers = [
    ('numeric', num_pipeline, numerical_attribs),
    ('category', cat_pipeline, categorical_attribs)
])


'''
if you want to see whether data preprocessor is working correctly or not- so you can try to run 
below method [data_preprocessor.fit_transform(X)] to see the transformed data before going for model building.
'''
transformed_dataset = data_preprocessor.fit_transform(X)
print(transformed_dataset)


model_pipeline = Pipeline(steps=[('preprocessor', data_preprocessor),
                      ('classifier', LogisticRegression(solver='lbfgs'))])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
model_pipeline.fit(X_train, y_train)
print("model score: %.3f" % model_pipeline.score(X_test, y_test))



[[-0.06842022  3.4424802   0.         ...  1.          0.
   0.        ]
 [-2.24909235  2.28647622  0.         ...  1.          0.
   0.        ]
 [-2.16497396  2.28647622  0.         ...  1.          0.
   0.        ]
 ...
 [-0.26254556 -0.50407824  1.         ...  0.          0.
   1.        ]
 [-0.22372049 -0.50407824  1.         ...  0.          0.
   1.        ]
 [-0.06842022 -0.49151035  0.         ...  0.          0.
   1.        ]]
model score: 0.748
