<a href="https://colab.research.google.com/github/atef-ataya/data_preprocessing_tool/blob/main/Copy_of_data_preprocessing_tools.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Preprocessing Tools

## 1.0 Importing the libraries

In [11]:
 import numpy as np
 import pandas as pd
 import matplotlib.pyplot as plt

## 2.0 Importing the dataset

In [12]:
# Read the raw table; the path is relative to the notebook's working directory.
dataset = pd.read_csv('Data.csv')

# Every column except the last is treated as a feature; the last column is the target.
# The later cells address X by integer position, so the frames are unwrapped via .values.
feature_frame = dataset.iloc[:, :-1]
target_series = dataset.iloc[:, -1]
X = feature_frame.values
y = target_series.values

In [13]:
# Inspect the features as loaded; missing entries show up as nan and are
# filled in by the imputation step in section 3.0.
print(X)

[['United States' 33.0 65000.0]
 ['Canada' 47.0 35000.0]
 ['United Kingdom' 50.0 25000.0]
 ['Canada' 56.0 55000.0]
 ['United Kingdom' 25.0 nan]
 ['United States' 15.0 20000.0]
 ['United Kingdom' nan 15000.0]
 ['United States' 47.0 10000.0]
 ['United Kingdom' 32.0 90000.0]
 ['United States' 62.0 48000.0]]


In [14]:
# Inspect the target as loaded; the labels are still strings here and are
# converted to integers in section 4.2.
print(y)

['No' 'No' 'Yes' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes']


## 3.0 Taking care of missing data

In [15]:
from sklearn.impute import SimpleImputer

# Columns 1 and 2 are the ones imputed; each nan is replaced by the mean of its column.
# NOTE(review): the means are computed over every row of X, before the train/test
# split performed in section 5.0, so rows that later land in the test set also
# contribute to the fill values.
numeric_cols = slice(1, 3)
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
X[:, numeric_cols] = imputer.fit_transform(X[:, numeric_cols])

In [16]:
# Verify the imputation: columns 1 and 2 should no longer contain nan.
print(X)

[['United States' 33.0 65000.0]
 ['Canada' 47.0 35000.0]
 ['United Kingdom' 50.0 25000.0]
 ['Canada' 56.0 55000.0]
 ['United Kingdom' 25.0 40333.333333333336]
 ['United States' 15.0 20000.0]
 ['United Kingdom' 40.77777777777778 15000.0]
 ['United States' 47.0 10000.0]
 ['United Kingdom' 32.0 90000.0]
 ['United States' 62.0 48000.0]]


## 4.0 Encoding categorical data

### 4.1 Encoding the Independent Variable

In [17]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the categorical column 0 and pass the remaining columns through unchanged.
# OneHotEncoder emits a SciPy sparse matrix by default, and ColumnTransformer keeps the
# stacked result sparse whenever its overall density is below sparse_threshold (0.3 by
# default). np.array() does not densify a sparse matrix (it wraps it in a 0-d object
# array), so sparse_threshold=0 is set to guarantee a dense 2-D array for the positional
# slicing done in the later cells, whatever the number of categories.
# NOTE: this cell overwrites X, so re-running it without re-running the cells above
# would encode the already-encoded first column again.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [0])],
    remainder='passthrough',
    sparse_threshold=0,
)
X = np.array(ct.fit_transform(X))

In [18]:
# Inspect the encoded features: the one-hot columns produced from column 0 come
# first, followed by the two passthrough columns.
print(X)

[[0.0 0.0 1.0 33.0 65000.0]
 [1.0 0.0 0.0 47.0 35000.0]
 [0.0 1.0 0.0 50.0 25000.0]
 [1.0 0.0 0.0 56.0 55000.0]
 [0.0 1.0 0.0 25.0 40333.333333333336]
 [0.0 0.0 1.0 15.0 20000.0]
 [0.0 1.0 0.0 40.77777777777778 15000.0]
 [0.0 0.0 1.0 47.0 10000.0]
 [0.0 1.0 0.0 32.0 90000.0]
 [0.0 0.0 1.0 62.0 48000.0]]


### 4.2 Encoding the Dependent Variable

In [21]:
from sklearn.preprocessing import LabelEncoder

# Replace the string class labels in y with integer codes.
# The fitted encoder stays available as `le` after this cell.
le = LabelEncoder()
le.fit(y)
y = le.transform(y)

In [22]:
# Inspect the encoded target: in the saved output 'No' became 0 and 'Yes' became 1.
print(y)

[0 0 1 1 0 0 1 1 0 1]


## 5.0 Splitting the dataset into the Training set and Test set

In [23]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set. The fixed random_state makes the
# shuffle, and therefore the split, repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [24]:
# Training features: with test_size=0.2 this is 8 of the 10 rows in the saved output.
print(X_train)

[[0.0 1.0 0.0 40.77777777777778 15000.0]
 [0.0 1.0 0.0 25.0 40333.333333333336]
 [0.0 0.0 1.0 33.0 65000.0]
 [1.0 0.0 0.0 56.0 55000.0]
 [1.0 0.0 0.0 47.0 35000.0]
 [0.0 0.0 1.0 47.0 10000.0]
 [0.0 1.0 0.0 32.0 90000.0]
 [0.0 0.0 1.0 15.0 20000.0]]


In [25]:
# Held-out test features: the rows not selected for training.
print(X_test)

[[0.0 1.0 0.0 50.0 25000.0]
 [0.0 0.0 1.0 62.0 48000.0]]


In [26]:
# Training labels, returned by the same train_test_split call as X_train.
print(y_train)

[1 0 0 1 0 1 0 0]


In [27]:
# Test labels.
# NOTE(review): in the saved output both test labels are 1, so this split holds
# no class-0 example; consider stratify=y if a model is evaluated on it.
print(y_test)

[1 1]


## 6.0 Feature Scaling

In [28]:
from sklearn.preprocessing import StandardScaler

# Standardise only the columns from index 3 onward; columns 0-2 are left untouched.
# The scaler learns its statistics from the training rows alone and the same
# fitted scaler is then applied to the test rows.
# NOTE: both arrays are updated in place, so re-running this cell on its own
# would scale the already-scaled values again.
sc = StandardScaler()
sc.fit(X_train[:, 3:])
X_train[:, 3:] = sc.transform(X_train[:, 3:])
X_test[:, 3:] = sc.transform(X_test[:, 3:])

In [29]:
# Inspect the scaled training features: only the columns from index 3 onward changed.
print(X_train)

[[0.0 1.0 0.0 0.3051451994200677 -1.0227468570839182]
 [0.0 1.0 0.0 -0.9599823427010884 -0.03727920398245669]
 [0.0 0.0 1.0 -0.31850922275233323 0.9222550898268609]
 [1.0 0.0 0.0 1.5257259971003378 0.533254700444705]
 [1.0 0.0 0.0 0.8040687371579883 -0.24474607831960654]
 [0.0 0.0 1.0 0.8040687371579883 -1.217247051774996]
 [0.0 1.0 0.0 -0.39869336274592765 1.8947560632822502]
 [0.0 0.0 1.0 -1.7618237426370325 -0.8282466623928403]]


In [30]:
# Inspect the scaled test features; they were transformed with the scaler fitted
# on X_train, not refit on the test rows.
print(X_test)

[[0.0 1.0 0.0 1.0446211571387716 -0.6337464677017624]
 [0.0 0.0 1.0 2.006830837061904 0.26095442787719597]]
