In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline


In [2]:
# Load the Social Network Ads dataset; the path is relative to this notebook.
ads_csv = "../archive/Social_Network_Ads.csv"
df = pd.read_csv(ads_csv)

In [3]:
# Preview the first five rows of the raw data.
df.head(n=5)

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19.0,19000.0,0
1,15810944,Male,35.0,20000.0,0
2,15668575,Female,26.0,43000.0,0
3,15603246,Female,27.0,57000.0,0
4,15804002,Male,19.0,76000.0,0


In [4]:
# Show each column's dtype to see which columns are numeric and which are object.
df.dtypes

User ID              int64
Gender              object
Age                float64
EstimatedSalary    float64
Purchased            int64
dtype: object

### Lecture 13: Missing Data

In [5]:
# Impute missing values in every numeric column with that column's mean.
# Non-numeric columns (e.g. Gender) are left untouched.
#
# The result is assigned back to df[col] instead of calling
# fillna(inplace=True) on df.loc[:, col]: that selection can be a temporary
# copy, in which case the in-place fill never reaches df (and it never does
# when pandas Copy-on-Write is enabled).
for col in df.select_dtypes(include="number").columns:
    df[col] = df[col].fillna(df[col].mean())


In [6]:
# Fixing the decimals in the df
# Round every numeric value to two decimal places (the mean imputation above
# can introduce long fractional parts).
df = df.round(decimals=2)

In [7]:
# Feature matrix: every column except the target, 'Purchased'.
# Use the `columns=` keyword: passing the axis positionally
# (`df.drop('Purchased', 1)`) was deprecated in pandas 1.3 and raises a
# TypeError since pandas 2.0.
Xs = df.drop(columns='Purchased')
Xs.head()

Unnamed: 0,User ID,Gender,Age,EstimatedSalary
0,15624510,Male,19.0,19000.0
1,15810944,Male,35.0,20000.0
2,15668575,Female,26.0,43000.0
3,15603246,Female,27.0,57000.0
4,15804002,Male,19.0,76000.0


In [8]:
# Target vector: the 'Purchased' column as a Series.
y = df.loc[:, 'Purchased']
y.head(n=5)

0    0
1    0
2    0
3    0
4    0
Name: Purchased, dtype: int64

### Lecture 14: Categorical Data
- Setting up dummy variables

In [18]:
# Gender is a nominal category, so it is one-hot encoded (dummy variables)
# rather than mapped to integer labels: integer codes would suggest an ordering
# (one category being "larger" than another) that a model could mistake for signal.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Kept for compatibility with any later cell that uses labelencoder_X; it is
# fitted on Gender exactly as before but no longer rewrites the column, because
# OneHotEncoder accepts string categories directly.
labelencoder_X = LabelEncoder().fit(Xs["Gender"])

# OneHotEncoder(categorical_features=[1]) was removed in scikit-learn 0.22;
# ColumnTransformer is the supported way to encode a subset of columns.
# - The encoded Gender columns come first and the remaining columns follow in
#   their original order (remainder="passthrough"), the same layout as before.
# - sparse_threshold=0 always returns a dense ndarray, replacing .toarray().
# NOTE(review): any identifier column still present in Xs (e.g. User ID) is
# passed through as a feature - confirm that is intended.
onehotencoder = ColumnTransformer(
    transformers=[("gender", OneHotEncoder(), ["Gender"])],
    remainder="passthrough",
    sparse_threshold=0,
)
Xs = onehotencoder.fit_transform(Xs)

# Encode the target as integer class labels in a NumPy array.
labelencoder_y = LabelEncoder()
y = labelencoder_y.fit_transform(y)

### Lecture 15: Splitting the Dataset into the Training Set and Test Set

In [19]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set; random_state=0 makes the shuffle
# (and therefore the split) reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    Xs,
    y,
    random_state=0,
    test_size=0.2,
)
(X_train, X_test, y_train, y_test)

(array([[0.00e+00, 1.00e+00, 1.44e+02, 5.80e+01, 1.44e+05],
        [1.00e+00, 0.00e+00, 6.70e+01, 5.90e+01, 8.30e+04],
        [1.00e+00, 0.00e+00, 1.24e+02, 2.40e+01, 5.50e+04],
        ...,
        [0.00e+00, 1.00e+00, 3.90e+01, 3.60e+01, 5.20e+04],
        [1.00e+00, 0.00e+00, 3.39e+02, 2.70e+01, 5.40e+04],
        [1.00e+00, 0.00e+00, 3.69e+02, 2.60e+01, 1.18e+05]]),
 array([[0.00e+00, 1.00e+00, 2.57e+02, 3.00e+01, 8.70e+04],
        [1.00e+00, 0.00e+00, 1.28e+02, 3.80e+01, 5.00e+04],
        [0.00e+00, 1.00e+00, 3.41e+02, 3.50e+01, 7.50e+04],
        [1.00e+00, 0.00e+00, 2.79e+02, 3.00e+01, 7.90e+04],
        [1.00e+00, 0.00e+00, 1.13e+02, 3.50e+01, 5.00e+04],
        [0.00e+00, 1.00e+00, 3.96e+02, 2.70e+01, 2.00e+04],
        [1.00e+00, 0.00e+00, 7.40e+01, 3.10e+01, 1.50e+04],
        [0.00e+00, 1.00e+00, 1.80e+02, 3.60e+01, 1.44e+05],
        [1.00e+00, 0.00e+00, 1.47e+02, 1.80e+01, 6.80e+04],
        [0.00e+00, 1.00e+00, 9.30e+01, 4.70e+01, 4.30e+04],
        [0.00e+00, 1.00e+

### Lecture 16: Feature Scaling
- There are two types of feature scaling:
    - Standardisation: $x_{stand} = \frac{x - \text{mean}(x)}{\text{std}(x)}$
    - Normalisation: $x_{norm} = \frac{x - \min(x)}{\max(x) - \min(x)}$
- A great question was: what should we do about the values that need feature scaling?
- The answer: decide for each one on its own!

In [20]:
from sklearn.preprocessing import StandardScaler

# Learn each column's mean and standard deviation from the training set only,
# then standardise the training set with them.
sc_X = StandardScaler()
sc_X.fit(X_train)
X_train = sc_X.transform(X_train)

# The test set is only transformed, never fitted: it is scaled with the
# statistics already learned from the training set.
X_test = sc_X.transform(X_test)

X_train

array([[-1.02532046,  1.02532046, -0.48255451,  1.92295008,  2.14601566],
       [ 0.97530483, -0.97530483, -1.15951553,  2.02016082,  0.3787193 ],
       [ 0.97530483, -0.97530483, -0.65838854, -1.3822153 , -0.4324987 ],
       ...,
       [-1.02532046,  1.02532046, -1.40568317, -0.21568634, -0.51941492],
       [ 0.97530483, -0.97530483,  1.23182727, -1.09058306, -0.46147078],
       [ 0.97530483, -0.97530483,  1.49557832, -1.18779381,  1.3927418 ]])