In [55]:
import numpy as np
import pandas as pd
import dask.dataframe as ddf
import seaborn as sbn
import matplotlib.pyplot as plt
from sklearn.preprocessing import PolynomialFeatures
from sklearn.decomposition import PCA, FactorAnalysis
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans, MiniBatchKMeans, DBSCAN
from sklearn.linear_model import LinearRegression, BayesianRidge
import helper_fns
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.callbacks import EarlyStopping

# import plotly.express as px
%matplotlib inline

# Import data

In [2]:
df = ddf.read_csv('session_data_v2_cleaned.csv')
df.head()

Unnamed: 0,start_hour,loyal,conv_rate,loyalty,pg_count,hit_evnt_cnt,hit_evnt_clicks,hit_evnt_forms,hit_evnt_ajax,"('BR',)",...,"('social',)","('ask.com',)","('baidu.com',)","('bing.com',)","('duckduckgo.com',)","('facebook.com',)","('google.com',)","('instagram.com',)","('pinterest.com',)","('yahoo.com',)"
0,23,0,-0.262246,-0.240792,-1.328991,-0.673618,-1.28356,-1.31504,-1.315113,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,23,0,-0.262246,-0.240792,-1.328991,-0.949226,-1.28356,-1.31504,-1.099848,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,23,0,-0.262246,-0.240792,-1.328991,-1.159924,-1.28356,-1.31504,-1.315113,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,23,0,-0.262246,-0.240792,-1.328991,-1.159924,-1.28356,-1.31504,-1.559321,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,23,0,-0.262246,-0.240792,-1.328991,-1.276792,-1.28356,-1.31504,-1.315113,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:
df.compute().shape

(1435733, 73)

# Dimensionality Reduction

In [33]:
# data prep
X = df.drop(columns=['conv_rate']).compute()
X.reset_index(inplace=True, drop=True)
y = df[['conv_rate']].compute()
y.reset_index(inplace=True, drop=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

## Autoencoder

In [36]:
### Encoder
encoder = Sequential()
encoder.add(Dense(400,input_shape=[72],activation='relu'))
encoder.add(Dense(200,activation="relu"))
encoder.add(Dense(100,activation="relu"))
encoder.add(Dense(50,activation="relu"))
encoder.add(Dense(6,activation="relu"))
 
 
### Decoder
decoder = Sequential()
decoder.add(Dense(50,input_shape=[6],activation='relu'))
decoder.add(Dense(100,activation='relu'))
decoder.add(Dense(200,activation='relu'))
decoder.add(Dense(400,activation='relu'))
decoder.add(Dense(72, activation="relu"))




In [37]:
### Autoencoder
callback = EarlyStopping(monitor='loss', patience=3)

autoencoder = Sequential([encoder,decoder])
autoencoder.compile(loss="mse")
autoencoder.fit(X_train,X_train,epochs=50, use_multiprocessing=True, callbacks=[callback])

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50


<tensorflow.python.keras.callbacks.History at 0x1a9ecb538e0>

In [38]:
# encoding features with autoencoder
encoded_matrix = encoder.predict(df.drop(columns=['conv_rate']).compute())
encoded_matrix = pd.DataFrame(encoded_matrix, columns=['X1', 'X2', 'X3', 'X4', 'X5', 'X6'])
encoded_matrix.reset_index(inplace=True, drop=True)

# joining encoded values with conversion value column
enc_data_df = df[['conv_rate']].compute()
enc_data_df.reset_index(inplace=True, drop=True)

encoded_df = enc_data_df.join(encoded_matrix)
encoded_df.head()


Unnamed: 0,conv_rate,X1,X2,X3,X4,X5,X6
0,-0.262246,5.317648,4.004731,10.584099,0.0,6.977969,1.327806
1,-0.262246,4.539227,2.056666,8.602338,0.0,9.573707,0.0
2,-0.262246,4.450998,7.168342,10.712657,0.0,5.742918,0.288699
3,-0.262246,2.827416,2.882666,10.306307,0.0,9.101112,1.314992
4,-0.262246,5.165205,4.030619,10.597143,0.0,6.965371,1.306237


## Principal Component Analysis (PCA)

In [39]:
pca = PCA(n_components=6)
pca_reduced_data = pca.fit_transform(df.drop(columns=['conv_rate']))

print(pca.explained_variance_ratio_)
print('sum total explained variance: {}'.format(sum(pca.explained_variance_ratio_)))
print(pca.singular_values_)

conv_df = df[['conv_rate']]
reduced_df = pd.DataFrame(pca_reduced_data, columns=['pc1', 'pc2', 'pc3', 'pc4', 'pc5', 'pc6'])
pca_df = conv_df.join(reduced_df)
pca_df.head()

[0.85335986 0.08492612 0.01830819 0.01234757 0.00765395 0.00331715]
sum total explained variance: 0.9799128397850125
[8226.04131315 2595.04820692 1204.89011971  989.49930283  779.05402657
  512.86998687]


Unnamed: 0,conv_rate,pc1,pc2,pc3,pc4,pc5,pc6
0,-0.262246,-10.944264,-2.47025,0.02318,-0.545362,0.813634,0.331955
1,-0.262246,-10.944993,-2.495109,0.019695,0.857042,-0.572363,0.102365
2,-0.262246,-10.948251,-2.689478,0.064897,-0.883097,0.231646,-0.465039
3,-0.262246,-10.94997,-2.793037,0.084172,0.817486,0.815207,0.061924
4,-0.262246,-10.949264,-2.739417,0.07364,-0.548368,0.814916,0.341104


# Data Modeling

## Data Prep

In [42]:
pca_X_train, pca_X_test, pca_y_train, pca_y_test = train_test_split(pca_df.drop(columns=['conv_rate']).compute(), pca_df[['conv_rate']].compute(), test_size=0.33, random_state=42)

In [45]:
enc_X_train, enc_X_test, enc_y_train, enc_y_test = train_test_split(encoded_df.drop(columns=['conv_rate']), encoded_df[['conv_rate']], test_size=0.33, random_state=42)

## Linear Regression

In [46]:
# pca Linear regression
pca_lin_reg = LinearRegression().fit(pca_X_train, pca_y_train)

# autoencoder linear regression
enc_lin_reg = LinearRegression().fit(enc_X_train, enc_y_train)


In [53]:
print('\npca score: {}, \npca coefs: {}, \npca intercept: {}'.format(pca_lin_reg.score(pca_X_test,pca_y_test), pca_lin_reg.coef_, pca_lin_reg.intercept_))

print('\nenc score: {}, \nenc coefs: {}, \nenc intercept: {}'.format(enc_lin_reg.score(enc_X_test,enc_y_test), enc_lin_reg.coef_, enc_lin_reg.intercept_))


pca score: 0.0016645523640990145, 
pca coefs: [[ 0.00133939  0.01689613  0.01305023  0.00150301 -0.00047918  0.00163531]], 
pca intercept: [0.00498393]

enc score: 0.10076421655780077, 
enc coefs: [[ 0.04239449 -0.02940937  0.02208099  0.12088491 -0.01153626  0.14453928]], 
enc intercept: [-0.5059542]


# Non-Linear Regression

## PCA Non-Linear Regression

In [60]:
# generate non linar features for PCA
poly = PolynomialFeatures(2)
pca_poly_matrix = poly.fit_transform(pca_df.drop(columns=['conv_rate']))
pca_poly_df = pd.DataFrame(pca_poly_matrix)
pca_poly_df.head()
# pca_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,18,19,20,21,22,23,24,25,26,27
0,1.0,-10.944264,-2.47025,0.02318,-0.545362,0.813634,0.331955,119.77692,27.035068,-0.253688,...,0.000537,-0.012642,0.01886,0.007695,0.29742,-0.443725,-0.181036,0.662001,0.27009,0.110194
1,1.0,-10.944993,-2.495109,0.019695,0.857042,-0.572363,0.102365,119.792868,27.308954,-0.215557,...,0.000388,0.016879,-0.011272,0.002016,0.73452,-0.490539,0.087731,0.3276,-0.05859,0.010479
2,1.0,-10.948251,-2.689478,0.064897,-0.883097,0.231646,-0.465039,119.864198,29.44508,-0.710509,...,0.004212,-0.05731,0.015033,-0.03018,0.779861,-0.204566,0.410675,0.05366,-0.107725,0.216261
3,1.0,-10.94997,-2.793037,0.084172,0.817486,0.815207,0.061924,119.901836,30.583666,-0.921681,...,0.007085,0.068809,0.068618,0.005212,0.668283,0.666421,0.050622,0.664563,0.050481,0.003835
4,1.0,-10.949264,-2.739417,0.07364,-0.548368,0.814916,0.341104,119.886376,29.9946,-0.806309,...,0.005423,-0.040382,0.060011,0.025119,0.300707,-0.446874,-0.18705,0.664088,0.277971,0.116352


In [61]:
pca_poly_df.shape

(1435733, 28)

In [62]:
pca_poly_X_train, pca_poly_X_test, pca_poly_y_train, pca_poly_y_test = train_test_split(pca_poly_df, pca_df[['conv_rate']].compute(), test_size=0.33, random_state=42)

In [65]:
pca_lin_reg = LinearRegression().fit(pca_poly_X_train, pca_poly_y_train)

In [66]:
print('\npca score: {}, \npca coefs: {}, \npca intercept: {}'.format(pca_lin_reg.score(pca_poly_X_test,pca_poly_y_test), pca_lin_reg.coef_, pca_lin_reg.intercept_))


pca score: 0.002778226991056565, 
pca coefs: [[-9.70678855e-17  1.44423035e-03  6.46628937e-02  3.56245035e-01
   3.56843682e-03 -1.23566745e-03 -1.55562202e-02 -1.27920092e-05
   5.03959644e-06  8.39683142e-04 -3.60024067e-04 -4.06939039e-04
   1.11612267e-03  6.26908656e-03 -8.87625418e-03  2.87967917e-04
  -1.08633630e-03  1.45536436e-03 -9.27501068e-02 -3.89574494e-04
   5.34311160e-04 -1.41225016e-03 -1.04233989e-02 -1.88456345e-03
  -1.76619246e-02  1.11490520e-03 -5.69731582e-03  6.72320027e-03]], 
pca intercept: [0.07634688]
