In [2]:
import numpy as np
import pandas as pd
import dask.dataframe as ddf
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import SGD

# Import data

In [3]:
# Open the cleaned session export as a dask DataFrame and preview its first rows.
session_csv_path = 'session_data_v2_cleaned.csv'
df = ddf.read_csv(session_csv_path)
df.head(n=5)

Unnamed: 0,start_hour,loyal,conv_rate,loyalty,pg_count,hit_evnt_cnt,hit_evnt_clicks,hit_evnt_forms,hit_evnt_ajax,"('BR',)",...,"('social',)","('ask.com',)","('baidu.com',)","('bing.com',)","('duckduckgo.com',)","('facebook.com',)","('google.com',)","('instagram.com',)","('pinterest.com',)","('yahoo.com',)"
0,23,0,-0.262246,-0.240792,-1.328991,-0.673618,-1.28356,-1.31504,-1.315113,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,23,0,-0.262246,-0.240792,-1.328991,-0.949226,-1.28356,-1.31504,-1.099848,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,23,0,-0.262246,-0.240792,-1.328991,-1.159924,-1.28356,-1.31504,-1.315113,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,23,0,-0.262246,-0.240792,-1.328991,-1.159924,-1.28356,-1.31504,-1.559321,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,23,0,-0.262246,-0.240792,-1.328991,-1.276792,-1.28356,-1.31504,-1.315113,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
# Show (rows, columns) without concatenating the whole dataset into a single
# in-memory pandas frame: len() on a dask DataFrame only needs the row count
# of each partition, and the column list is known without computing anything.
(len(df), len(df.columns))

(1435733, 73)

# Dimensionality Reduction and save to file

In [33]:
# data prep
# Materialise the dask frame a single time (the original parsed the CSV twice,
# once for X and once for y) and give it a clean 0..n-1 index so that the
# feature and target frames are guaranteed to be row-aligned.
session_pdf = df.compute().reset_index(drop=True)

# Features are every column except the target; the target stays a one-column frame.
X = session_pdf.drop(columns=['conv_rate'])
y = session_pdf[['conv_rate']]

# Fixed random_state keeps the 67/33 split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

## Autoencoder

In [10]:
# Layer sizes shared by both halves so the decoder mirrors the encoder.
N_FEATURES = 72                      # 73 columns in the dataset minus 'conv_rate'
ENCODING_DIM = 6                     # width of the bottleneck (the reduced representation)
HIDDEN_UNITS = [400, 200, 100, 50]   # encoder hidden widths, widest first

### Encoder
# Funnels N_FEATURES inputs down to ENCODING_DIM values.
encoder = Sequential()
encoder.add(Dense(HIDDEN_UNITS[0], input_shape=[N_FEATURES], activation='relu'))
for units in HIDDEN_UNITS[1:]:
    encoder.add(Dense(units, activation='relu'))
# NOTE(review): a ReLU bottleneck can leave some encoded dimensions stuck at 0;
# kept as-is here so the encoded feature range is unchanged - consider 'linear'.
encoder.add(Dense(ENCODING_DIM, activation='relu'))

### Decoder
# Expands the ENCODING_DIM code back to N_FEATURES reconstructed values.
decoder = Sequential()
decoder.add(Dense(HIDDEN_UNITS[-1], input_shape=[ENCODING_DIM], activation='relu'))
for units in reversed(HIDDEN_UNITS[:-1]):
    decoder.add(Dense(units, activation='relu'))
# The reconstruction layer must be linear: the input features are standardized
# and contain negative values, which a ReLU output (>= 0) can never reproduce.
decoder.add(Dense(N_FEATURES, activation='linear'))




In [13]:
# autoencoder data prep
# Reuse the X / y frames built in the data-prep cell rather than re-reading and
# materialising the CSV two more times. Same split parameters, so the same rows
# end up in the train and test sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [14]:
### Autoencoder
# Stop training once the training loss has failed to improve for 3 epochs in a row.
callback = EarlyStopping(monitor='loss', patience=3)

# Chain encoder and decoder so the network learns to reproduce its own input.
autoencoder = Sequential([encoder, decoder])
autoencoder.compile(loss="mse")

# Input and target are both X_train (reconstruction objective).
# `use_multiprocessing` was dropped: it only applies to generator / Sequence
# inputs, has no effect on in-memory data, and is rejected by Keras 3.
autoencoder.fit(X_train, X_train, epochs=50, callbacks=[callback])

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50


<tensorflow.python.keras.callbacks.History at 0x1c36ec292b0>

In [15]:
# encoding features with autoencoder
# Materialise the dataset once (the original computed the dask graph twice) with
# a clean 0..n-1 index so the encoded rows line up with their conv_rate values.
session_pdf = df.compute().reset_index(drop=True)

encoded_values = encoder.predict(session_pdf.drop(columns=['conv_rate']))
# Column names X1..Xk follow the width of the encoder output (X1..X6 here).
encoded_matrix = pd.DataFrame(
    encoded_values,
    columns=['X{}'.format(i) for i in range(1, encoded_values.shape[1] + 1)],
)

# joining encoded values with conversion value column
enc_data_df = session_pdf[['conv_rate']]
assert len(enc_data_df) == len(encoded_matrix), "encoded rows do not match target rows"

encoded_df = enc_data_df.join(encoded_matrix)
encoded_df.head()


Unnamed: 0,conv_rate,X1,X2,X3,X4,X5,X6
0,-0.262246,0.0,8.757374,0.0,13.29649,11.232584,0.71449
1,-0.262246,0.0,5.009789,0.0,14.405546,3.484223,1.16033
2,-0.262246,0.0,10.031455,0.0,12.960207,9.013577,0.552835
3,-0.262246,0.0,5.331458,0.0,13.109734,12.018019,0.537757
4,-0.262246,0.0,8.857171,0.0,13.320195,11.315697,0.759686


In [17]:
# Persist the autoencoder-reduced dataset (target + encoded columns) without the index.
encoder_csv_path = 'session_data_v2_dimReduced_encoder.csv'
encoded_df.to_csv(encoder_csv_path, index=False)

## Principal Component Analysis (PCA)

In [5]:
# Materialise the dataset once with a clean 0..n-1 index. The index produced by
# dask's read_csv restarts at 0 in every partition, so joining the lazy dask
# frame to the pandas frame of components (as the original did) misaligns rows
# as soon as the CSV is split into more than one partition.
session_pdf = df.compute().reset_index(drop=True)

# Keep the 6 leading principal components; random_state makes the result
# reproducible when scikit-learn selects a randomized SVD solver.
pca = PCA(n_components=6, random_state=42)
pca_reduced_data = pca.fit_transform(session_pdf.drop(columns=['conv_rate']))

print(pca.explained_variance_ratio_)
print('sum total explained variance: {}'.format(sum(pca.explained_variance_ratio_)))
print(pca.singular_values_)

# Target column and components now share the same RangeIndex, so the join is 1:1.
conv_df = session_pdf[['conv_rate']]
reduced_df = pd.DataFrame(pca_reduced_data, columns=['pc1', 'pc2', 'pc3', 'pc4', 'pc5', 'pc6'])
assert len(conv_df) == len(reduced_df), "component rows do not match target rows"

# Wrapped back into a dask DataFrame so existing `pca_df.compute()` callers keep working.
pca_df = ddf.from_pandas(conv_df.join(reduced_df), npartitions=1)
pca_df.head()

[0.85335986 0.08492612 0.01830819 0.01234757 0.00765395 0.00331715]
sum total explained variance: 0.979912839784961
[8226.04131315 2595.04820692 1204.89011971  989.49930283  779.05402657
  512.86998687]


Unnamed: 0,conv_rate,pc1,pc2,pc3,pc4,pc5,pc6
0,-0.262246,-10.944264,-2.47025,0.02318,-0.545362,0.813634,0.331955
1,-0.262246,-10.944993,-2.495109,0.019695,0.857042,-0.572363,0.102365
2,-0.262246,-10.948251,-2.689478,0.064897,-0.883097,0.231646,-0.465039
3,-0.262246,-10.94997,-2.793037,0.084172,0.817486,0.815207,0.061924
4,-0.262246,-10.949264,-2.739417,0.07364,-0.548368,0.814916,0.341104


In [7]:
# Persist the PCA-reduced dataset (target + components) without the index.
pca_csv_path = 'session_data_v2_dimReduced_pca.csv'
pca_result = pca_df.compute()
pca_result.to_csv(pca_csv_path, index=False)

## Truncated SVD

In [8]:
# Materialise the dataset once with a clean 0..n-1 index. The index produced by
# dask's read_csv restarts at 0 in every partition, so joining the lazy dask
# frame to the pandas frame of components (as the original did) misaligns rows
# as soon as the CSV is split into more than one partition.
session_pdf = df.compute().reset_index(drop=True)

# Keep 6 components; TruncatedSVD uses a randomized algorithm by default, so
# random_state is pinned to make the decomposition reproducible.
tsvd = TruncatedSVD(n_components=6, random_state=42)
tsvd_reduced_data = tsvd.fit_transform(session_pdf.drop(columns=['conv_rate']))

print(tsvd.explained_variance_ratio_)
print('sum total explained variance: {}'.format(sum(tsvd.explained_variance_ratio_)))
print(tsvd.singular_values_)

# Target column and components now share the same RangeIndex, so the join is 1:1.
conv_df = session_pdf[['conv_rate']]
tsvd_reduced_df = pd.DataFrame(tsvd_reduced_data, columns=['sv1', 'sv2', 'sv3', 'sv4', 'sv5', 'sv6'])
assert len(conv_df) == len(tsvd_reduced_df), "component rows do not match target rows"

# Wrapped back into a dask DataFrame so existing `tsvd_df.compute()` callers keep working.
tsvd_df = ddf.from_pandas(conv_df.join(tsvd_reduced_df), npartitions=1)
tsvd_df.head()

[0.84346215 0.08507563 0.01830838 0.01122738 0.01190403 0.00713819]
sum total explained variance: 0.9771157468386472
[16764.0576891   2598.14018948  1204.94457438  1048.93916029
   982.29921819   757.69539531]


Unnamed: 0,conv_rate,sv1,sv2,sv3,sv4,sv5,sv6
0,-0.262246,23.053715,-2.585653,0.040374,-0.683031,-0.815175,0.643175
1,-0.262246,23.068722,-2.607612,0.034357,-0.764778,0.678262,-0.644648
2,-0.262246,23.057267,-2.804253,0.080387,-0.475019,-1.086974,0.042169
3,-0.262246,23.054243,-2.908769,0.103166,-1.097773,0.48356,0.69688
4,-0.262246,23.055073,-2.854825,0.090708,-0.67702,-0.81661,0.645025


In [9]:
# Persist the TruncatedSVD-reduced dataset (target + components) without the index.
tsvd_csv_path = 'session_data_v2_dimReduced_tsvd.csv'
tsvd_result = tsvd_df.compute()
tsvd_result.to_csv(tsvd_csv_path, index=False)