<a href="https://colab.research.google.com/github/bhatiaparteek/ml_with_python/blob/main/chapter_4_preprocessing/pre-processing-of-data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

1. Importing the Libraries

In [1]:
import numpy as np
import pandas as pd

# style 1
import matplotlib.pyplot
from sklearn.preprocessing import StandardScaler
# Used to perform scaling of data under pre-processing phase.

from sklearn.model_selection import train_test_split
# Used to split the dataset into training and testing.

from sklearn.linear_model import LinearRegression
# Used to perform Linear regression.

from sklearn.metrics import confusion_matrix
# Used to perform performance analysis of classifier by making a confusion matrix.


2. Data Acquisition

This will open a file selection dialog. Choose your salary.csv file (get this from GitHub repo) and upload it.

In [3]:
from google.colab import files

# Upload the file from your local system
uploaded = files.upload()

Saving salary.csv to salary.csv


In [5]:
#----------------------------------Reading the dataset-----------------------
dataset = pd.read_csv('salary.csv')
# pd is the alias of the Pandas library imported.
X = dataset.iloc[:, :-1].values
Y = dataset.iloc[:,3].values
print(X)
print(Y)

[['Hyderabad' 44.0 72000.0]
 ['Mumbai' 27.0 48000.0]
 ['Delhi' 30.0 54000.0]
 ['Mumbai' 38.0 61000.0]
 ['Delhi' 40.0 nan]
 ['Hyderabad' 35.0 58000.0]
 ['Mumbai' nan 52000.0]
 ['Hyderabad' 48.0 79000.0]
 ['Delhi' 50.0 83000.0]
 ['Hyderabad' 37.0 67000.0]]
['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']



3. Taking care of missing data



In [6]:
# handling of missing data
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values = np.nan, strategy = 'mean')
imputer = imputer.fit(X[:,1:3])
X[:,1:3] = imputer.transform(X[:,1:3])
print(X)

[['Hyderabad' 44.0 72000.0]
 ['Mumbai' 27.0 48000.0]
 ['Delhi' 30.0 54000.0]
 ['Mumbai' 38.0 61000.0]
 ['Delhi' 40.0 63777.77777777778]
 ['Hyderabad' 35.0 58000.0]
 ['Mumbai' 38.77777777777778 52000.0]
 ['Hyderabad' 48.0 79000.0]
 ['Delhi' 50.0 83000.0]
 ['Hyderabad' 37.0 67000.0]]


4. Encoding categorical data


In [7]:
#label encoding categorical attribute city
from sklearn.preprocessing import LabelEncoder
labelEncoder_X = LabelEncoder()
X[:, 0] = labelEncoder_X.fit_transform(X[:, 0])
print(X[:, 0])

[1 2 0 2 0 1 2 1 0 1]


5. One-hot encoding on categorical data

In [8]:
#One-hot encoding categorical attribute city
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [0])], remainder='passthrough')
X = np.array(ct.fit_transform(X))
print(X)
#Alternate way to do One-hot encoding is mentioned in the end of this file

[[0.0 1.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [1.0 0.0 0.0 30.0 54000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [1.0 0.0 0.0 40.0 63777.77777777778]
 [0.0 1.0 0.0 35.0 58000.0]
 [0.0 0.0 1.0 38.77777777777778 52000.0]
 [0.0 1.0 0.0 48.0 79000.0]
 [1.0 0.0 0.0 50.0 83000.0]
 [0.0 1.0 0.0 37.0 67000.0]]


6. Lable encoding on purchased attribute

In [9]:
#label encoding purchased attribute
labelEncoder_Y = LabelEncoder()
Y = labelEncoder_Y.fit_transform(Y)
print(Y)

[0 1 0 0 1 1 0 1 0 1]


7. Splitting of Dataset into training and testing


In [10]:
#Splitting of Dataset into training and testing
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 0)
print(X_train)
print(X_test)
print(Y_train)
print(Y_test)


[[1.0 0.0 0.0 40.0 63777.77777777778]
 [0.0 1.0 0.0 37.0 67000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [0.0 0.0 1.0 38.77777777777778 52000.0]
 [0.0 1.0 0.0 48.0 79000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 1.0 0.0 44.0 72000.0]
 [0.0 1.0 0.0 35.0 58000.0]]
[[1.0 0.0 0.0 30.0 54000.0]
 [1.0 0.0 0.0 50.0 83000.0]]
[1 1 1 0 1 0 0 1]
[0 0]


#Feature Scaling

In [11]:
#Feature Scaling
from sklearn import preprocessing
X_train = preprocessing.normalize(X_train, norm='l1')
X_test = preprocessing.normalize(X_test, norm='l1')
print(X_train)
print(X_test)

[[1.56693693e-05 0.00000000e+00 0.00000000e+00 6.26774774e-04
  9.99357556e-01]
 [0.00000000e+00 1.49169128e-05 0.00000000e+00 5.51925773e-04
  9.99433157e-01]
 [0.00000000e+00 0.00000000e+00 2.08211876e-05 5.62172066e-04
  9.99417007e-01]
 [0.00000000e+00 0.00000000e+00 1.92160698e-05 7.45156483e-04
  9.99235627e-01]
 [0.00000000e+00 1.26503814e-05 0.00000000e+00 6.07218308e-04
  9.99380131e-01]
 [0.00000000e+00 0.00000000e+00 1.63829683e-05 6.22552794e-04
  9.99361064e-01]
 [0.00000000e+00 1.38802138e-05 0.00000000e+00 6.10729405e-04
  9.99375390e-01]
 [0.00000000e+00 1.72306844e-05 0.00000000e+00 6.03073954e-04
  9.99379695e-01]]
[[1.85078936e-05 0.00000000e+00 0.00000000e+00 5.55236808e-04
  9.99426255e-01]
 [1.20407942e-05 0.00000000e+00 0.00000000e+00 6.02039711e-04
  9.99385919e-01]]


Standardizing the data

In [12]:
#Standardizing the data
#lets do split again as we have already applied normalization on X_train and X_test
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 0)
#apply standarization
from sklearn.preprocessing import StandardScaler
scale_X = StandardScaler()
X_train = scale_X.fit_transform(X_train)
X_test = scale_X.transform(X_test)
print(X_train)
print(X_test)

[[ 2.64575131 -1.         -0.77459667  0.26306757  0.12381479]
 [-0.37796447  1.         -0.77459667 -0.25350148  0.46175632]
 [-0.37796447 -1.          1.29099445 -1.97539832 -1.53093341]
 [-0.37796447 -1.          1.29099445  0.05261351 -1.11141978]
 [-0.37796447  1.         -0.77459667  1.64058505  1.7202972 ]
 [-0.37796447 -1.          1.29099445 -0.0813118  -0.16751412]
 [-0.37796447  1.         -0.77459667  0.95182631  0.98614835]
 [-0.37796447  1.         -0.77459667 -0.59788085 -0.48214934]]
[[ 2.64575131 -1.         -0.77459667 -1.45882927 -0.90166297]
 [ 2.64575131 -1.         -0.77459667  1.98496442  2.13981082]]


Alternate way to do One-hot encoding

In [13]:
#Alternate way to do One-hot encoding
#Importing of desired libraries
import numpy as np
import pandas as pd
#---------------------------Reading the dataset----------------------
dataset = pd.read_csv('salary.csv')
# pd is the alias of the Pandas library imported.
dataset = pd.get_dummies(dataset, columns=["City"], drop_first=True)
print(dataset)

    Age   Salary Purchased  City_Hyderabad  City_Mumbai
0  44.0  72000.0        No            True        False
1  27.0  48000.0       Yes           False         True
2  30.0  54000.0        No           False        False
3  38.0  61000.0        No           False         True
4  40.0      NaN       Yes           False        False
5  35.0  58000.0       Yes            True        False
6   NaN  52000.0        No           False         True
7  48.0  79000.0       Yes            True        False
8  50.0  83000.0        No           False        False
9  37.0  67000.0       Yes            True        False
