# IMPORT LIBRARIES

In [4]:
# One-time environment setup, left commented out so a full re-run does not reinstall.
# If it is ever needed again, prefer the kernel-aware magic with a pinned version:
#   %pip install -q scikit-learn==1.3.0
#!pip install scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (10.8 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 10.8/10.8 MB 56.1 MB/s eta 0:00:00
Collecting joblib>=1.1.1
  Downloading joblib-1.3.2-py3-none-any.whl (302 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 302.2/302.2 KB 14.7 MB/s eta 0:00:00
Collecting threadpoolctl>=2.0.0
  Downloading threadpoolctl-3.2.0-py3-none-any.whl (15 kB)
Collecting scipy>=1.5.0
  Downloading scipy-1.11.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (36.3 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 36.3/36.3 MB 21.0 MB/s eta 0:00:00
Installing collected packages: threadpoolctl, scipy, joblib, scikit-learn
Successfully installed joblib-1.3.2 scikit-learn-1.3.0 scipy-1.11.1 threadpoolctl-3.2.0


In [5]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer

# IMPORT DATASET

In [35]:
# Read the raw table and split it by position: the first three columns form
# the feature matrix, the fourth column is the target vector.
dataset = pd.read_csv('Data.csv')
X = dataset.iloc[:, :3].values   # independent variables (columns 0-2)
y = dataset.iloc[:, 3].values    # dependent variable (column 3)

In [36]:
# Inspect the raw feature matrix; rows 4 and 6 (zero-based) each hold a NaN.
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, nan],
       ['France', 35.0, 58000.0],
       ['Spain', nan, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

In [37]:
# Inspect the target vector; the labels are the strings 'Yes' and 'No'.
y

array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes'],
      dtype=object)

# HANDLE MISSING DATA (REPLACE WITH MEAN)

In [38]:
# Fill the gaps in the two numeric columns (indices 1 and 2) with each
# column's mean, writing the result back into X in place.
# NOTE(review): the means are learned from every row, i.e. before the
# train/test split performed later in this notebook -- confirm that is intended.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
X[:, 1:3] = imputer.fit_transform(X[:, 1:3])

In [39]:
# Check the imputation result: the former NaN cells now hold the column means.
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, 63777.77777777778],
       ['France', 35.0, 58000.0],
       ['Spain', 38.77777777777778, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

# ENCODE CATEGORICAL VARIABLES

In [29]:
# One-hot encode column 0 and pass the remaining columns through unchanged.
ct = ColumnTransformer(
    transformers=[
        ('one_hot_encoder', OneHotEncoder(categories='auto'), [0]),
    ],
    remainder='passthrough',
)

In [30]:
# Display the transformer's repr to double-check its configuration.
ct

In [41]:
# Fit the column transformer on X and replace X with the encoded matrix.
# X is rebound here, so re-running this cell alone would encode the
# already-encoded matrix again; re-run from the data-loading cell instead.
X = ct.fit_transform(X)

In [42]:
# Column 0 is now three indicator columns, followed by the two passthrough columns.
X

array([[1.0, 0.0, 0.0, 44.0, 72000.0],
       [0.0, 0.0, 1.0, 27.0, 48000.0],
       [0.0, 1.0, 0.0, 30.0, 54000.0],
       [0.0, 0.0, 1.0, 38.0, 61000.0],
       [0.0, 1.0, 0.0, 40.0, 63777.77777777778],
       [1.0, 0.0, 0.0, 35.0, 58000.0],
       [0.0, 0.0, 1.0, 38.77777777777778, 52000.0],
       [1.0, 0.0, 0.0, 48.0, 79000.0],
       [0.0, 1.0, 0.0, 50.0, 83000.0],
       [1.0, 0.0, 0.0, 37.0, 67000.0]], dtype=object)

In [43]:
# Convert the encoded matrix from object dtype to 32-bit floats so that
# numeric routines can operate on it directly.
X = X.astype(np.float32)

In [44]:
# Same values as before the cast, now stored as float32 instead of object.
X

array([[1.0000000e+00, 0.0000000e+00, 0.0000000e+00, 4.4000000e+01,
        7.2000000e+04],
       [0.0000000e+00, 0.0000000e+00, 1.0000000e+00, 2.7000000e+01,
        4.8000000e+04],
       [0.0000000e+00, 1.0000000e+00, 0.0000000e+00, 3.0000000e+01,
        5.4000000e+04],
       [0.0000000e+00, 0.0000000e+00, 1.0000000e+00, 3.8000000e+01,
        6.1000000e+04],
       [0.0000000e+00, 1.0000000e+00, 0.0000000e+00, 4.0000000e+01,
        6.3777777e+04],
       [1.0000000e+00, 0.0000000e+00, 0.0000000e+00, 3.5000000e+01,
        5.8000000e+04],
       [0.0000000e+00, 0.0000000e+00, 1.0000000e+00, 3.8777779e+01,
        5.2000000e+04],
       [1.0000000e+00, 0.0000000e+00, 0.0000000e+00, 4.8000000e+01,
        7.9000000e+04],
       [0.0000000e+00, 1.0000000e+00, 0.0000000e+00, 5.0000000e+01,
        8.3000000e+04],
       [1.0000000e+00, 0.0000000e+00, 0.0000000e+00, 3.7000000e+01,
        6.7000000e+04]], dtype=float32)

# PREVENT DUMMY VARIABLE TRAP

In [45]:
# Remove the first indicator column so the remaining dummies are not
# perfectly collinear (the "dummy variable trap").
# NOTE(review): this cell is not idempotent -- each extra run removes one
# more column. Configuring the encoder to drop a category would avoid that.
X = np.delete(X, 0, axis=1)

In [46]:
# Four columns remain: two indicator columns plus the two numeric columns.
X

array([[0.0000000e+00, 0.0000000e+00, 4.4000000e+01, 7.2000000e+04],
       [0.0000000e+00, 1.0000000e+00, 2.7000000e+01, 4.8000000e+04],
       [1.0000000e+00, 0.0000000e+00, 3.0000000e+01, 5.4000000e+04],
       [0.0000000e+00, 1.0000000e+00, 3.8000000e+01, 6.1000000e+04],
       [1.0000000e+00, 0.0000000e+00, 4.0000000e+01, 6.3777777e+04],
       [0.0000000e+00, 0.0000000e+00, 3.5000000e+01, 5.8000000e+04],
       [0.0000000e+00, 1.0000000e+00, 3.8777779e+01, 5.2000000e+04],
       [0.0000000e+00, 0.0000000e+00, 4.8000000e+01, 7.9000000e+04],
       [1.0000000e+00, 0.0000000e+00, 5.0000000e+01, 8.3000000e+04],
       [0.0000000e+00, 0.0000000e+00, 3.7000000e+01, 6.7000000e+04]],
      dtype=float32)

# TRAINING/TEST SET SPLIT

In [47]:
# Hold out 20% of the samples for testing; the fixed seed keeps the
# shuffle, and therefore the split, repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

# FEATURE SCALING

In [48]:
# Peek at the held-out rows before scaling (2 of the 10 samples).
X_test

array([[1.0e+00, 0.0e+00, 3.0e+01, 5.4e+04],
       [1.0e+00, 0.0e+00, 5.0e+01, 8.3e+04]], dtype=float32)

In [50]:
# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both splits so the test set never influences them.
# NOTE(review): every column is standardized, including the 0/1 indicator
# columns -- confirm that scaling the dummies is intended.
sc_X = StandardScaler().fit(X_train)
X_train = sc_X.transform(X_train)
X_test = sc_X.transform(X_test)

In [51]:
# Standardized training features.
X_train

array([[ 2.6457512 , -0.7745967 ,  0.26306754,  0.12381478],
       [-0.37796447, -0.7745967 , -0.2535015 ,  0.46175635],
       [-0.37796447,  1.2909944 , -1.9753983 , -1.5309335 ],
       [-0.37796447,  1.2909944 ,  0.05261363, -1.1114198 ],
       [-0.37796447, -0.7745967 ,  1.6405851 ,  1.7202971 ],
       [-0.37796447,  1.2909944 , -0.08131182, -0.16751409],
       [-0.37796447, -0.7745967 ,  0.9518263 ,  0.98614836],
       [-0.37796447, -0.7745967 , -0.59788084, -0.4821493 ]],
      dtype=float32)

In [52]:
# Test features scaled with the statistics learned from the training set.
X_test

array([[ 2.6457512 , -0.7745967 , -1.4588293 , -0.90166295],
       [ 2.6457512 , -0.7745967 ,  1.9849644 ,  2.1398108 ]],
      dtype=float32)