# Lab2: Data Preprocessing Tools

### Import Libraries

In [1]:
# !pip install scikit-learn

In [44]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

### Import Dataset

In [3]:
dataset = pd.read_csv("Data.csv")          

## Preprocessing steps

### Step 1: Divide dataframe into independent variable/ input and dependent / output features

In [10]:
X= dataset.iloc[:,:-1]       # remove last column
Y= dataset.iloc[:,-1]       # only include last columns

In [11]:
print(X)

   Country   Age   Salary
0   France  44.0  72000.0
1    Spain  27.0  48000.0
2  Germany  30.0  54000.0
3    Spain  38.0  61000.0
4  Germany  40.0      NaN
5   France  35.0  58000.0
6    Spain   NaN  52000.0
7   France  48.0  79000.0
8  Germany  50.0  83000.0
9   France  37.0  67000.0


In [12]:
print(Y)

0     No
1    Yes
2     No
3     No
4    Yes
5    Yes
6     No
7    Yes
8     No
9    Yes
Name: Purchased, dtype: object


### step 2: Handle the missing values in Dataset

In [14]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values = np.nan, strategy = 'mean')         # replace mean value for numerical value and replace null for catagorical value
imputer.fit(X.iloc[:,1:3])
X.iloc[:,1:3] = imputer.transform(X.iloc[:,1:3])

In [15]:
print(X)

   Country        Age        Salary
0   France  44.000000  72000.000000
1    Spain  27.000000  48000.000000
2  Germany  30.000000  54000.000000
3    Spain  38.000000  61000.000000
4  Germany  40.000000  63777.777778
5   France  35.000000  58000.000000
6    Spain  38.777778  52000.000000
7   France  48.000000  79000.000000
8  Germany  50.000000  83000.000000
9   France  37.000000  67000.000000


### Step 3: Encoding Categorical Data

In [16]:
dataset['Country'].value_counts()

France     4
Spain      3
Germany    3
Name: Country, dtype: int64

In [17]:
dataset['Purchased'].value_counts()

No     5
Yes    5
Name: Purchased, dtype: int64

**Two Encoding Technique**
1. OneHotEncoder = Use when you have more than 2 categories
2. LabelEncoder = Use when you have exactly 2 categories

In [18]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

ct= ColumnTransformer(transformers = [('encoder', OneHotEncoder(),[0])],remainder='passthrough')
X=np.array(ct.fit_transform(X))

In [19]:
print(X)

[[1.00000000e+00 0.00000000e+00 0.00000000e+00 4.40000000e+01
  7.20000000e+04]
 [0.00000000e+00 0.00000000e+00 1.00000000e+00 2.70000000e+01
  4.80000000e+04]
 [0.00000000e+00 1.00000000e+00 0.00000000e+00 3.00000000e+01
  5.40000000e+04]
 [0.00000000e+00 0.00000000e+00 1.00000000e+00 3.80000000e+01
  6.10000000e+04]
 [0.00000000e+00 1.00000000e+00 0.00000000e+00 4.00000000e+01
  6.37777778e+04]
 [1.00000000e+00 0.00000000e+00 0.00000000e+00 3.50000000e+01
  5.80000000e+04]
 [0.00000000e+00 0.00000000e+00 1.00000000e+00 3.87777778e+01
  5.20000000e+04]
 [1.00000000e+00 0.00000000e+00 0.00000000e+00 4.80000000e+01
  7.90000000e+04]
 [0.00000000e+00 1.00000000e+00 0.00000000e+00 5.00000000e+01
  8.30000000e+04]
 [1.00000000e+00 0.00000000e+00 0.00000000e+00 3.70000000e+01
  6.70000000e+04]]


In [20]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
Y=le.fit_transform(Y)

In [21]:
print(Y)        

[0 1 0 0 1 1 0 1 0 1]


## Step 4: Spliting Data into Training and Testing 

In [22]:
from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.3,random_state=1)

In [23]:
print(X_train.shape)
print(Y_train.shape)

(7, 5)
(7,)


In [24]:
X_test

array([[0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 3.00000000e+01,
        5.40000000e+04],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 3.70000000e+01,
        6.70000000e+04],
       [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 3.87777778e+01,
        5.20000000e+04]])

## step 5: Feature Scaling

min-max scaler  == -1 to 1

standard scaler == -2 to 2

In [25]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
#from sklearn.preprocessing import MinMaxScaler
#mn = MinMaxScaler()
X_train[:,3:5] = sc.fit_transform(X_train[:,3:5])
X_test[:,3:5] = sc.fit_transform(X_test[:,3:5])

In [26]:
print(X_test)

[[ 0.          1.          0.         -1.38802721 -0.55138018]
 [ 1.          0.          0.          0.45941746  1.40351318]
 [ 0.          0.          1.          0.92860975 -0.852133  ]]


**Q] For the below student dataset remove missing values from column ‘gender’ and ‘marks’**

In [67]:
dfstd3 = pd.DataFrame(students)
dfstd3.columns = ['marks', 'gender', 'result']
dfstd3

Unnamed: 0,marks,gender,result
0,85.0,M,verygood
1,95.0,F,excellent
2,75.0,,good
3,,M,average
4,70.0,M,good
5,,,verygood
6,92.0,F,verygood
7,98.0,M,excellent


**Replacing missing categorical values with most frequent values**

In [68]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=None, strategy='most_frequent')
imputer.fit(dfstd3[["gender"]])
dfstd3[["gender"]] = imputer.transform(dfstd3[["gender"]])
dfstd3

Unnamed: 0,marks,gender,result
0,85.0,M,verygood
1,95.0,F,excellent
2,75.0,M,good
3,,M,average
4,70.0,M,good
5,,M,verygood
6,92.0,F,verygood
7,98.0,M,excellent


**Replacing missing numerical values with mean values**

In [69]:
numericalImputer = SimpleImputer(missing_values=np.NaN,strategy="mean")
numericalImputer.fit(dfstd3[["marks"]])
dfstd3[["marks"]] = numericalImputer.transform(dfstd3[["marks"]])
dfstd3

Unnamed: 0,marks,gender,result
0,85.0,M,verygood
1,95.0,F,excellent
2,75.0,M,good
3,85.833333,M,average
4,70.0,M,good
5,85.833333,M,verygood
6,92.0,F,verygood
7,98.0,M,excellent


**Q] For dataset given in above Apply the proper Categorical Encoder to encode column ‘gender’ and ‘result’ (Attach separate sheet for answer)**

In [70]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
dfstd3["gender"]=le.fit_transform(dfstd3["gender"])
dfstd3

Unnamed: 0,marks,gender,result
0,85.0,1,verygood
1,95.0,0,excellent
2,75.0,1,good
3,85.833333,1,average
4,70.0,1,good
5,85.833333,1,verygood
6,92.0,0,verygood
7,98.0,1,excellent


In [74]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

dfstd3['result'] = dfstd3['result'].astype('category')
dfstd3['result_new'] = dfstd3['result'].cat.codes
enc = OneHotEncoder()
enc_data = pd.DataFrame(enc.fit_transform( dfstd3[['result_new']] ).toarray())
New_df = dfstd3.join(enc_data)

In [77]:
New_df

Unnamed: 0,marks,gender,result,result_new,0,1,2,3
0,85.0,1,verygood,3,0.0,0.0,0.0,1.0
1,95.0,0,excellent,1,0.0,1.0,0.0,0.0
2,75.0,1,good,2,0.0,0.0,1.0,0.0
3,85.833333,1,average,0,1.0,0.0,0.0,0.0
4,70.0,1,good,2,0.0,0.0,1.0,0.0
5,85.833333,1,verygood,3,0.0,0.0,0.0,1.0
6,92.0,0,verygood,3,0.0,0.0,0.0,1.0
7,98.0,1,excellent,1,0.0,1.0,0.0,0.0


 we can do like this as well it will give an array
- ct= ColumnTransformer(transformers = [('encoder', OneHotEncoder(),[2])],remainder='passthrough')
- dfstd3 = ct.fit_transform(dfstd3)
