# Preprocessing data

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the e-commerce dataset from CSV and preview its first five rows
df = pd.read_csv('ecommerce_data.csv')
df.head(5)

Unnamed: 0,is_mobile,n_products_viewed,visit_duration,is_returning_visitor,time_of_day,user_action
0,1,0,0.65751,0,3,0
1,1,1,0.568571,0,2,1
2,1,0,0.042246,1,1,0
3,1,1,1.659793,1,1,2
4,0,1,2.014745,1,1,2


In [3]:
# Convert the DataFrame into a NumPy matrix and preview its first five rows
data = df.values
data[:5]

array([[1.        , 0.        , 0.65750995, 0.        , 3.        ,
        0.        ],
       [1.        , 1.        , 0.56857123, 0.        , 2.        ,
        1.        ],
       [1.        , 0.        , 0.042246  , 1.        , 1.        ,
        0.        ],
       [1.        , 1.        , 1.65979338, 1.        , 1.        ,
        2.        ],
       [0.        , 1.        , 2.01474485, 1.        , 1.        ,
        2.        ]])

In [4]:
# Compare the shape of the DataFrame with the shape of the NumPy matrix
(df.shape, data.shape)

((500, 6), (500, 6))

In [5]:
# Split the matrix into independent variables (X: every column but the last)
# and the dependent variable (Y: the last column).
# Basic slicing returns a *view* of `data`, and X is modified in place by the
# normalisation cell further down; take explicit copies so that `data` keeps
# the raw values and this cell can be re-run safely.
X = data[:, :-1].copy()
Y = data[:, -1].copy()

X.shape, Y.shape

((500, 5), (500,))

In [6]:
# Inspect the first row of X together with its entry in Y
(X[0], Y[0])

(array([1.        , 0.        , 0.65750995, 0.        , 3.        ]), 0.0)

In [7]:
# Standardise the numerical columns (subtract the mean, divide by the std):
# column 1 = n_products_viewed, column 2 = visit_duration
for col in (1, 2):
    column = X[:, col]
    X[:, col] = (column - column.mean()) / column.std()

In [8]:
# One-hot encode the categorical column (time_of_day, the last column of X)

# Number of indicator columns. The category code is used directly as a column
# index below, so valid codes are the integers 0 .. N_TIME_CATEGORIES - 1.
N_TIME_CATEGORIES = 4

N, D = X.shape

# Validate the codes before indexing with them: astype() silently truncates
# fractional values and a negative index would silently wrap around to the
# wrong indicator column.
time_codes = X[:, D - 1].astype(np.int32)
if not np.array_equal(time_codes, X[:, D - 1]):
    raise ValueError('time_of_day must contain integer category codes')
if np.any((time_codes < 0) | (time_codes >= N_TIME_CATEGORIES)):
    raise ValueError(
        'time_of_day codes must lie in [0, %d)' % N_TIME_CATEGORIES
    )

# New matrix: the single categorical column is replaced by N_TIME_CATEGORIES
# indicator columns, hence (D - 1) + N_TIME_CATEGORIES columns in total.
X2 = np.zeros((N, D - 1 + N_TIME_CATEGORIES))
X2[:, :D - 1] = X[:, :D - 1]  # copy every column except the categorical one

# Z[i, k] = 1 exactly when row i has category code k
Z = np.zeros((N, N_TIME_CATEGORIES))
Z[np.arange(N), time_codes] = 1
X2[:, -N_TIME_CATEGORIES:] = Z  # the indicator columns fill the trailing columns

In [9]:
# Build the binary subset used for logistic regression:
# keep only the rows whose user_action is 0 or 1 (bounce, add_to_cart)
is_binary = Y <= 1
Xr, Yr = X2[is_binary], Y[is_binary]

# Logistic Prediction

In [10]:
# Initialise the parameters of the logistic-regression model

N, D = Xr.shape
SEED = 42  # fixed seed so the random weights (and the accuracy below) are reproducible
W = np.random.RandomState(SEED).randn(D)  # one standard-normal weight per feature
b = 0  # bias term

In [11]:
def sigmoid(a):
    """Logistic function 1 / (1 + exp(-a)), applied element-wise.

    Evaluated as exp(-log(1 + exp(-a))) through np.logaddexp, so large
    negative inputs return 0.0 without overflowing inside np.exp(-a)
    (the direct formula emits a RuntimeWarning in that case).

    Parameters
    ----------
    a : scalar or numpy array
        Input value(s).

    Returns
    -------
    numpy float or array of the same shape as `a`, with values in [0, 1].
    """
    return np.exp(-np.logaddexp(0, -a))

def forward(X, W, b):
    """Return sigmoid(X.dot(W) + b).

    X is the data, W the weights and b the bias; in this notebook X is a
    2-D array and W a 1-D array with one entry per column of X.
    """
    activation = X.dot(W) + b
    return sigmoid(activation)

In [12]:
# Forward pass: sigmoid of the linear combination of the weights and the data
P_Y_given_X = forward(X=Xr, W=W, b=b)
# Round to the nearest integer: outputs above 0.5 become 1, the rest 0
predictions = P_Y_given_X.round()

In [13]:
# Accuracy: fraction of predictions that are equal to the true labels
np.mean(predictions == Yr)

0.5829145728643216