In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.impute import SimpleImputer 
from sklearn.compose import ColumnTransformer 
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler 
from sklearn.model_selection import train_test_split


In [2]:
# Load the data set, then separate the feature matrix from the target vector.
dataset = pd.read_csv('Desktop/Datasets/Data.csv')
feature_frame = dataset.iloc[:, :-1]  # every row, every column except the last
target_column = dataset.iloc[:, -1]   # every row, last column only
X = feature_frame.values
y = target_column.values

In [3]:
# Fill missing entries with the mean of their column.
# Only columns 1:3 (the ones carrying numerical values) are imputed.
impute_cols = slice(1, 3)
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
# fit_transform learns the per-column means and substitutes them for the missing values in one step
X[:, impute_cols] = imputer.fit_transform(X[:, impute_cols])

In [4]:
# Encoding categorical data
# Encoding the independent variables: one-hot encode column 0 and pass every
# other column through unchanged.
# transformers=[(name_of_step, transformer_instance, indexes_of_the_columns)]
#
# sparse_threshold=0 makes ColumnTransformer always return a dense array. With
# the default threshold the result can be a SciPy sparse matrix when the
# one-hot block is wide enough, and np.array() on a sparse matrix yields a 0-d
# object array that cannot be sliced by column later on.
#
# NOTE: this cell rebinds X, so re-running it without re-running the cells
# above would one-hot encode column 0 of the already-encoded matrix.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [0])],
    remainder='passthrough',
    sparse_threshold=0,
)
X = np.array(ct.fit_transform(X))  # keep X as a NumPy array for the later cells

# Encoding the dependent variable
le = LabelEncoder()
y = le.fit_transform(y)

In [5]:
# Split the dataset into a training set and a test set.
# 20% of the rows are held out for testing; the fixed random_state makes the
# shuffle (and therefore the split) reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,               # matrix of features
    y,               # dependent variable vector
    test_size=0.2,   # share of rows that goes to the test set
    random_state=1,  # arbitrary seed
)

In [6]:
# Feature Scaling
# Standardisation is used here as the general-purpose choice of scaler.
# Only columns from index 3 onwards are scaled.
# NOTE(review): this assumes the one-hot encoder produced exactly three dummy
# columns at the front of X - confirm against the data.
scaled_cols = slice(3, None)
sc = StandardScaler()
# Learn the mean and standard deviation from the training rows only ...
sc.fit(X_train[:, scaled_cols])
# ... then apply those same statistics to both splits, so the test set is
# never standardised with its own statistics.
X_train[:, scaled_cols] = sc.transform(X_train[:, scaled_cols])
X_test[:, scaled_cols] = sc.transform(X_test[:, scaled_cols])