# Lab 2: Data Preprocessing Tools

1. input variables and output variables
2. find misssing values
3. convert categorical into numerical (encoding )
4. split into training and testing daTa
5. feature scaling

### Import Libraries

In [1]:
# !pip install scikit-learn

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

### Import Dataset

In [3]:
dataset = pd.read_csv("Data.csv")            

### EDA Steps

In [4]:
dataset.head()

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes


In [5]:
dataset.shape

(10, 4)

In [6]:
dataset.columns

Index(['Country', 'Age', 'Salary', 'Purchased'], dtype='object')

In [7]:
dataset.info()     # for missing values

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Country    10 non-null     object 
 1   Age        9 non-null      float64
 2   Salary     9 non-null      float64
 3   Purchased  10 non-null     object 
dtypes: float64(2), object(2)
memory usage: 448.0+ bytes


In [8]:
# descriptive stastics of numerical columns
dataset.describe()

Unnamed: 0,Age,Salary
count,9.0,9.0
mean,38.777778,63777.777778
std,7.693793,12265.579662
min,27.0,48000.0
25%,35.0,54000.0
50%,38.0,61000.0
75%,44.0,72000.0
max,50.0,83000.0


In [9]:
# for categorical
dataset.describe(include='object')

Unnamed: 0,Country,Purchased
count,10,10
unique,3,2
top,France,No
freq,4,5


## Preprocessing steps

### Step 1: Divide dataframe into independent variable/ input and dependent / output features

In [10]:
X= dataset.iloc[:,:-1]       # remove last column
Y= dataset.iloc[:,-1]       # only include last columns

In [11]:
print(X)

   Country   Age   Salary
0   France  44.0  72000.0
1    Spain  27.0  48000.0
2  Germany  30.0  54000.0
3    Spain  38.0  61000.0
4  Germany  40.0      NaN
5   France  35.0  58000.0
6    Spain   NaN  52000.0
7   France  48.0  79000.0
8  Germany  50.0  83000.0
9   France  37.0  67000.0


In [12]:
print(Y)

0     No
1    Yes
2     No
3     No
4    Yes
5    Yes
6     No
7    Yes
8     No
9    Yes
Name: Purchased, dtype: object


### step 2: Handle the missing values in Dataset

In [13]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values = np.nan, strategy = 'mean')
imputer.fit(X.iloc[:,1:3])
X.iloc[:,1:3] = imputer.transform(X.iloc[:,1:3])

In [14]:
print(X)

   Country        Age        Salary
0   France  44.000000  72000.000000
1    Spain  27.000000  48000.000000
2  Germany  30.000000  54000.000000
3    Spain  38.000000  61000.000000
4  Germany  40.000000  63777.777778
5   France  35.000000  58000.000000
6    Spain  38.777778  52000.000000
7   France  48.000000  79000.000000
8  Germany  50.000000  83000.000000
9   France  37.000000  67000.000000


### Step 3: Encoding Categorical Data

In [15]:
dataset['Country'].value_counts()

France     4
Spain      3
Germany    3
Name: Country, dtype: int64

In [16]:
dataset['Purchased'].value_counts()

No     5
Yes    5
Name: Purchased, dtype: int64

**Two Encoding Technique**
1. OneHotEncoder = Use when you have more than 2 categories
2. LabelEncoder = Use when you have exactly 2 categories

In [17]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

ct= ColumnTransformer(transformers = [('encoder', OneHotEncoder(),[0])],remainder='passthrough')
X=np.array(ct.fit_transform(X))

In [18]:
print(X)

[[1.00000000e+00 0.00000000e+00 0.00000000e+00 4.40000000e+01
  7.20000000e+04]
 [0.00000000e+00 0.00000000e+00 1.00000000e+00 2.70000000e+01
  4.80000000e+04]
 [0.00000000e+00 1.00000000e+00 0.00000000e+00 3.00000000e+01
  5.40000000e+04]
 [0.00000000e+00 0.00000000e+00 1.00000000e+00 3.80000000e+01
  6.10000000e+04]
 [0.00000000e+00 1.00000000e+00 0.00000000e+00 4.00000000e+01
  6.37777778e+04]
 [1.00000000e+00 0.00000000e+00 0.00000000e+00 3.50000000e+01
  5.80000000e+04]
 [0.00000000e+00 0.00000000e+00 1.00000000e+00 3.87777778e+01
  5.20000000e+04]
 [1.00000000e+00 0.00000000e+00 0.00000000e+00 4.80000000e+01
  7.90000000e+04]
 [0.00000000e+00 1.00000000e+00 0.00000000e+00 5.00000000e+01
  8.30000000e+04]
 [1.00000000e+00 0.00000000e+00 0.00000000e+00 3.70000000e+01
  6.70000000e+04]]


In [19]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
Y=le.fit_transform(Y)

In [20]:
print(Y)        

[0 1 0 0 1 1 0 1 0 1]


## Step 4: Spliting Data into Training and Testing 

In [21]:
from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.3,random_state=4)

In [22]:
print(X_train.shape)
print(Y_train.shape)

(7, 5)
(7,)


In [23]:
X_test

array([[0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 3.80000000e+01,
        6.10000000e+04],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 5.00000000e+01,
        8.30000000e+04],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 4.00000000e+01,
        6.37777778e+04]])

## step 5: Feature Scaling

min-max scaler  == -1 to 1

standard scaler == -2 to 2

In [24]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train[:,3:5] = sc.fit_transform(X_train[:,3:5])
X_test[:,3:5] = sc.fit_transform(X_test[:,3:5])

In [25]:
print(X_test)

[[ 0.          0.          1.         -0.88900089 -0.84432151]
 [ 0.          1.          0.          1.3970014   1.40467838]
 [ 0.          1.          0.         -0.50800051 -0.56035688]]


#### Test Your Knowledge

#### Q1. When to use the Label Encoder and When to use OneHotEncoder

###### 1. Depending upon the data encoding technique is selected. For example, we have encoded different state names into numerical data in the above example. This categorical data is having no relation, of any kind, between the rows. Then we can use Lable encoding.
###### 2. When the order does not matter in categorical features and Categories in a feature are fewer.

**Q2. Write a code to Feature Scaling our dataset numerical variable using MinMaxScaler**

In [26]:
from sklearn.preprocessing import MinMaxScaler

mn = MinMaxScaler()

X_train[:,3:5] = mn.fit_transform(X_train[:,3:5])

X_test[:,3:5] = mn.fit_transform(X_test[:,3:5])

**Q3. For the below student dataset remove missing values from column ‘gender’ and ‘marks’**

In [27]:
students = [[85, 'M', 'verygood'],
[95, 'F', 'excellent'],
[75, None,'good'],
[np.NaN, 'M', 'average'],
[70, 'M', 'good'],
[np.NaN, None, 'verygood'],
[92, 'F', 'verygood'],
[98, 'M', 'excellent']]

In [28]:
stud_df = pd.DataFrame(students)
stud_df.columns = ['marks', 'gender', 'result']
stud_df

Unnamed: 0,marks,gender,result
0,85.0,M,verygood
1,95.0,F,excellent
2,75.0,,good
3,,M,average
4,70.0,M,good
5,,,verygood
6,92.0,F,verygood
7,98.0,M,excellent


**Replacing missing categorical values with most frequent values**

In [29]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=None, strategy='most_frequent')
imputer.fit(stud_df[["gender"]])
stud_df[["gender"]] = imputer.transform(stud_df[["gender"]])
stud_df

Unnamed: 0,marks,gender,result
0,85.0,M,verygood
1,95.0,F,excellent
2,75.0,M,good
3,,M,average
4,70.0,M,good
5,,M,verygood
6,92.0,F,verygood
7,98.0,M,excellent


**Replacing missing numerical values with mean values**

In [30]:
numericalImputer = SimpleImputer(missing_values=np.NaN,strategy="mean")
numericalImputer.fit(stud_df[["marks"]])
stud_df[["marks"]] = numericalImputer.transform(stud_df[["marks"]])
stud_df

Unnamed: 0,marks,gender,result
0,85.0,M,verygood
1,95.0,F,excellent
2,75.0,M,good
3,85.833333,M,average
4,70.0,M,good
5,85.833333,M,verygood
6,92.0,F,verygood
7,98.0,M,excellent


**Q4. For dataset given in above Apply the proper Categorical Encoder to encode column ‘gender’ and ‘result’ (Attach separate sheet for answer)**

In [31]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
stud_df["gender"]=le.fit_transform(stud_df["gender"])
stud_df

Unnamed: 0,marks,gender,result
0,85.0,1,verygood
1,95.0,0,excellent
2,75.0,1,good
3,85.833333,1,average
4,70.0,1,good
5,85.833333,1,verygood
6,92.0,0,verygood
7,98.0,1,excellent


In [32]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

stud_df['result'] = stud_df['result'].astype('category')
stud_df['result_new'] = stud_df['result'].cat.codes
df_encoder = OneHotEncoder()
df_enc_data = pd.DataFrame(df_encoder.fit_transform( stud_df[['result_new']] ).toarray())
encoder_df = stud_df.join(df_enc_data)

In [33]:
encoder_df

Unnamed: 0,marks,gender,result,result_new,0,1,2,3
0,85.0,1,verygood,3,0.0,0.0,0.0,1.0
1,95.0,0,excellent,1,0.0,1.0,0.0,0.0
2,75.0,1,good,2,0.0,0.0,1.0,0.0
3,85.833333,1,average,0,1.0,0.0,0.0,0.0
4,70.0,1,good,2,0.0,0.0,1.0,0.0
5,85.833333,1,verygood,3,0.0,0.0,0.0,1.0
6,92.0,0,verygood,3,0.0,0.0,0.0,1.0
7,98.0,1,excellent,1,0.0,1.0,0.0,0.0
