# Part 2: Transformation

Let's first reload the dataset we obtained after pre-processing:

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Reload the output of the pre-processing part; the first CSV column is used as the row index.
PRE_PROCESSED_CSV = 'CS_pre_processed_data.csv'
dataset = pd.read_csv(PRE_PROCESSED_CSV, index_col=0)
print(dataset.head())

   Occupation  Marital_Status  Purchase_Sum  annual_income  \
0          10               0         38891    42521.93013   
1          16               0         37417    59199.36954   
2          15               0         49947    40056.02938   
3           7               1         66607    79474.66782   
4          20               1         50684    85567.55715   

   number_of_children  proximity_town    sum  Age_18-25  Age_26-35  Age_36-45  \
0                   2        2.677101  38891          0          0          0   
1                   0        3.589760  37417          0          0          0   
2                   1        3.944390  49947          0          1          0   
3                   0        2.702605  66607          0          0          0   
4                   1        2.841509  50684          0          1          0   

   Age_46-50  Age_51-55  Age_55+  Gender_M  City_Category_B  City_Category_C  \
0          0          0        0         0                0 

## Missing values

You might have noticed that there are missing values in the dataset:

In [2]:
# Count the NaN entries of every column in one pass, then report them column by column.
missing_counts = dataset.isna().sum()
for var, n_missing in missing_counts.items():
    print(var, 'has \t\t\t', n_missing, ' missing values')

Occupation has 			 0  missing values
Marital_Status has 			 0  missing values
Purchase_Sum has 			 0  missing values
annual_income has 			 0  missing values
number_of_children has 			 0  missing values
proximity_town has 			 158  missing values
sum has 			 0  missing values
Age_18-25 has 			 0  missing values
Age_26-35 has 			 0  missing values
Age_36-45 has 			 0  missing values
Age_46-50 has 			 0  missing values
Age_51-55 has 			 0  missing values
Age_55+ has 			 0  missing values
Gender_M has 			 0  missing values
City_Category_B has 			 0  missing values
City_Category_C has 			 0  missing values
Stay_In_Current_City_Years_1 has 			 0  missing values
Stay_In_Current_City_Years_2 has 			 0  missing values
Stay_In_Current_City_Years_3 has 			 0  missing values
Stay_In_Current_City_Years_4+ has 			 0  missing values


Write a function that replaces missing values with the mean value:

In [3]:
def replace_missing_values(dataset, variable):
    """Return a copy of `dataset` with the NaNs of one column mean-imputed.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Input frame; it is copied and left unmodified.
    variable : column label
        The column whose missing entries are filled.

    Returns
    -------
    pandas.DataFrame
        Copy of `dataset` in which every NaN of `variable` is replaced by the
        mean of that column's non-missing values.
    """
    transformed_data = dataset.copy()

    ### BEGIN SOLUTION
    # NaN-aware average: missing entries are left out of the mean
    column = transformed_data[variable]
    column_mean = np.nanmean(column)

    # Swap the column for its imputed version
    transformed_data[variable] = column.fillna(column_mean)
    ### END SOLUTION

    return transformed_data

Your answer will be verified below (no need for you to do anything).

In [4]:
### BEGIN HIDDEN TESTS
from pandas.testing import assert_frame_equal

# Reference result: impute 'proximity_town' with its NaN-aware mean
transformed_data = dataset.copy()
mean = np.nanmean(transformed_data['proximity_town'])
transformed_data['proximity_town'] = transformed_data['proximity_town'].fillna(mean)

# assert_frame_equal reports which column/values differ and tolerates float
# rounding noise, unlike a bare `assert a.equals(b)`
assert_frame_equal(replace_missing_values(dataset, 'proximity_town'), transformed_data)
### END HIDDEN TESTS

In [5]:
# Build the imputed frame through the graded function: `transformed_data` is otherwise
# only assigned inside the hidden-test region, which is stripped from the student release.
transformed_data = replace_missing_values(dataset, 'proximity_town')
print(transformed_data.head())

   Occupation  Marital_Status  Purchase_Sum  annual_income  \
0          10               0         38891    42521.93013   
1          16               0         37417    59199.36954   
2          15               0         49947    40056.02938   
3           7               1         66607    79474.66782   
4          20               1         50684    85567.55715   

   number_of_children  proximity_town    sum  Age_18-25  Age_26-35  Age_36-45  \
0                   2        2.677101  38891          0          0          0   
1                   0        3.589760  37417          0          0          0   
2                   1        3.944390  49947          0          1          0   
3                   0        2.702605  66607          0          0          0   
4                   1        2.841509  50684          0          1          0   

   Age_46-50  Age_51-55  Age_55+  Gender_M  City_Category_B  City_Category_C  \
0          0          0        0         0                0 

## Find outliers

Create a function that stores whether a variable is an outlier or not, in a separate column called 'outlier':

In [6]:
def mark_outliers(dataset, n_neighbours, contam):
    """Flag every row of `dataset` as inlier or outlier with Local Outlier Factor.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Data to score; all of its columns are passed to the detector.
        It is copied and left unmodified.
    n_neighbours
        Forwarded to ``LocalOutlierFactor`` as ``n_neighbors``.
    contam
        Forwarded to ``LocalOutlierFactor`` as ``contamination``.

    Returns
    -------
    pandas.DataFrame
        Copy of `dataset` with an extra 'outlier' column holding the labels
        returned by ``fit_predict`` (1 for inliers, -1 for outliers).
    """
    marked_outliers = dataset.copy()

    ### BEGIN SOLUTION
    # First we import the appropriate code
    from sklearn.neighbors import LocalOutlierFactor

    # As before, we apply LOF and add a new column to the dataset
    lof = LocalOutlierFactor(n_neighbors=n_neighbours, contamination=contam)
    outlier_labels = lof.fit_predict(marked_outliers)

    # Assign the label array positionally. Wrapping it in pd.DataFrame first would
    # align on index labels (0..n-1) and yield NaN / misplaced labels whenever the
    # dataset's index is not exactly that range.
    marked_outliers['outlier'] = outlier_labels
    ### END SOLUTION

    return marked_outliers

Your answer will be verified below (no need for you to do anything).

In [7]:
### BEGIN HIDDEN TESTS
from pandas.testing import assert_frame_equal
from sklearn.neighbors import LocalOutlierFactor

n_neighbours = 20
contam = 0.1

# Reference result: LOF labels (1 = inlier, -1 = outlier) for every row
marked_outliers = transformed_data.copy()
loc = LocalOutlierFactor(n_neighbors=n_neighbours, contamination=contam)
outliers_loc = loc.fit_predict(marked_outliers)
# Positional assignment of the label array; going through pd.DataFrame would
# align on index labels instead of row position
marked_outliers['outlier'] = outliers_loc

# assert_frame_equal pinpoints the differing column/rows on failure
assert_frame_equal(mark_outliers(transformed_data, n_neighbours, contam), marked_outliers)
### END HIDDEN TESTS

In [8]:
# Compute the labelled frame through the graded function: `marked_outliers` is otherwise
# only assigned inside the hidden-test region, which is stripped from the student release.
marked_outliers = mark_outliers(transformed_data, n_neighbours=20, contam=0.1)
print(marked_outliers['outlier'].value_counts())

 1    4881
-1     543
Name: outlier, dtype: int64


## Normalisation

In our final step, we are going to standard normalise the numeric variables. Note that only particular variables need this treatment, as the binary ones are already within range. Write a function to transform the other variables:

In [9]:
def standardise_variable(dataset, variables):
    """Return a copy of `dataset` with the given columns standard-scaled.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Input frame; it is copied and left unmodified.
    variables : list of column labels
        Columns to rescale with ``StandardScaler``; all other columns are
        carried over unchanged.

    Returns
    -------
    pandas.DataFrame
        Copy of `dataset` in which the `variables` columns are replaced by
        their ``StandardScaler.fit_transform`` output.
    """
    normalised_data = dataset.copy()

    ### BEGIN SOLUTION
    # We import the appropriate code
    from sklearn.preprocessing import StandardScaler

    # We apply the standard scaler to values of the variables that are inputted
    ss = StandardScaler()
    norm_data = ss.fit_transform(normalised_data[variables].values)

    # Write back to the columns named by the `variables` argument; relying on a
    # notebook-level name here breaks on a fresh kernel or with a different list.
    normalised_data[variables] = norm_data
    ### END SOLUTION

    return normalised_data

Your answer will be verified below (no need for you to do anything).

In [10]:
### BEGIN HIDDEN TESTS
from pandas.testing import assert_frame_equal
from sklearn.preprocessing import StandardScaler

# Reference result: standard-scale the non-binary columns only
normalised_data = marked_outliers.copy()

to_normalise = ['annual_income', 'proximity_town', 'Occupation', 'Purchase_Sum', 'number_of_children']
ss = StandardScaler()
norm_data = ss.fit_transform(normalised_data[to_normalise].values)
normalised_data[to_normalise] = norm_data

# The compared values are floats: assert_frame_equal tolerates rounding noise and
# reports the offending column, unlike a bare `assert a.equals(b)`
assert_frame_equal(standardise_variable(marked_outliers, to_normalise), normalised_data)
### END HIDDEN TESTS

Our data now looks like this:

In [11]:
# Compute the standardised frame through the graded function: `normalised_data` and
# `to_normalise` are otherwise only assigned inside the hidden-test region, which is
# stripped from the student release.
to_normalise = ['annual_income', 'proximity_town', 'Occupation', 'Purchase_Sum', 'number_of_children']
normalised_data = standardise_variable(marked_outliers, to_normalise)
print(normalised_data.head())

   Occupation  Marital_Status  Purchase_Sum  annual_income  \
0    0.288734               0     -0.452944      -0.434067   
1    1.233680               0     -0.467259      -0.275620   
2    1.076189               0     -0.345577      -0.457494   
3   -0.183740               1     -0.183788      -0.082992   
4    1.863644               1     -0.338420      -0.025106   

   number_of_children  proximity_town    sum  Age_18-25  Age_26-35  Age_36-45  \
0            1.050259       -0.797397  38891          0          0          0   
1           -0.982428       -0.750248  37417          0          0          0   
2            0.033916       -0.731928  49947          0          1          0   
3           -0.982428       -0.796079  66607          0          0          0   
4            0.033916       -0.788903  50684          0          1          0   

   ...  Age_51-55  Age_55+  Gender_M  City_Category_B  City_Category_C  \
0  ...          0        0         0                0             

Again, you can save your final output for Part 3: Modelling, which is the final step in the process:

In [12]:
# Persist the transformed frame, row index included, as the CSV handed over to Part 3.
normalised_data.to_csv('CS_transformed_data.csv', index=True)