### Loading the Dataset

In [63]:
from pathlib import Path

import numpy as np
import pandas as pd
import seaborn as sns

# Locations of the raw, comma-separated source files (relative to this notebook).
demo, webdata1, webdata2, experiment = (
    "../data/raw/df_final_demo.txt",
    "../data/raw/df_final_web_data_pt_1.txt",
    "../data/raw/df_final_web_data_pt_2.txt",
    "../data/raw/df_final_experiment_clients.txt",
)

# Client Profile Data (1)
df_demo = pd.read_csv(demo)

# Digital Footprint Data (2): delivered in two parts, so each part is read
# separately and the two are stacked into one frame with a fresh 0..n-1 index.
df_webdata1, df_webdata2 = (pd.read_csv(part) for part in (webdata1, webdata2))
df_webdata = pd.concat([df_webdata1, df_webdata2], ignore_index=True)

# Experiment Roster (3)
df_exp = pd.read_csv(experiment)

### First Look at the Dataset

In [64]:
# Preview the first five rows of the client profile table.
df_demo.head(n=5)

Unnamed: 0,client_id,clnt_tenure_yr,clnt_tenure_mnth,clnt_age,gendr,num_accts,bal,calls_6_mnth,logons_6_mnth
0,836976,6.0,73.0,60.5,U,2.0,45105.3,6.0,9.0
1,2304905,7.0,94.0,58.0,U,2.0,110860.3,6.0,9.0
2,1439522,5.0,64.0,32.0,U,2.0,52467.79,6.0,9.0
3,1562045,16.0,198.0,49.0,M,2.0,67454.65,3.0,6.0
4,5126305,12.0,145.0,33.0,F,2.0,103671.75,0.0,3.0


In [65]:
# Preview the first five rows of the combined digital-footprint log.
df_webdata.head(n=5)

Unnamed: 0,client_id,visitor_id,visit_id,process_step,date_time
0,9988021,580560515_7732621733,781255054_21935453173_531117,step_3,2017-04-17 15:27:07
1,9988021,580560515_7732621733,781255054_21935453173_531117,step_2,2017-04-17 15:26:51
2,9988021,580560515_7732621733,781255054_21935453173_531117,step_3,2017-04-17 15:19:22
3,9988021,580560515_7732621733,781255054_21935453173_531117,step_2,2017-04-17 15:19:13
4,9988021,580560515_7732621733,781255054_21935453173_531117,step_3,2017-04-17 15:18:04


In [66]:
# Preview the first five rows of the experiment roster (client -> Variation).
df_exp.head(n=5)

Unnamed: 0,client_id,Variation
0,9988021,Test
1,8320017,Test
2,4033851,Control
3,1982004,Test
4,9294070,Control


### Cleaning Digital Footprint Data

In [67]:
# Check the column dtypes of the web log; date_time comes out of read_csv as
# `object` (plain strings), so it has to be converted before any time-based sorting.
df_webdata.dtypes

client_id        int64
visitor_id      object
visit_id        object
process_step    object
date_time       object
dtype: object

In [68]:
# Parse the timestamp strings into datetime values so that events can be
# ordered chronologically rather than lexically.
df_webdata['date_time'] = df_webdata['date_time'].pipe(pd.to_datetime)

### Merge Data (Digital Footprint with Experiment Roster)

In [69]:
# Merge Digital Footprint with Experiment Roster Data.
# - how='inner' (the pandas default, spelled out): web events from clients that
#   are not on the roster are dropped.
# - validate='many_to_one': many web events map to exactly one roster row.
#   If the roster ever contains a duplicated client_id, pandas raises MergeError
#   here instead of silently duplicating events and inflating the A/B counts.
web_exp = pd.merge(
    df_webdata,
    df_exp,
    on='client_id',
    how='inner',
    validate='many_to_one',
)

# Split the data into Test & Control and order every cohort the same way:
# by client, then visit, then chronologically within the visit (all ascending).
# Rows whose Variation is neither "Test" nor "Control" (e.g. missing) end up
# in neither frame. reset_index(drop=True) gives each cohort a clean 0..n-1 index.
sort_keys = ['client_id', 'visit_id', 'date_time']
test, control = (
    web_exp[web_exp["Variation"] == variation]
    .sort_values(by=sort_keys)
    .reset_index(drop=True)
    for variation in ("Test", "Control")
)

In [73]:
# Preview the first five rows of the sorted Test cohort.
test.head(n=5)

Unnamed: 0,client_id,visitor_id,visit_id,process_step,date_time,Variation
0,555,402506806_56087378777,637149525_38041617439_716659,start,2017-04-15 12:57:56,Test
1,555,402506806_56087378777,637149525_38041617439_716659,step_1,2017-04-15 12:58:03,Test
2,555,402506806_56087378777,637149525_38041617439_716659,step_2,2017-04-15 12:58:35,Test
3,555,402506806_56087378777,637149525_38041617439_716659,step_3,2017-04-15 13:00:14,Test
4,555,402506806_56087378777,637149525_38041617439_716659,confirm,2017-04-15 13:00:34,Test


In [71]:
# Preview the first five rows of the sorted Control cohort.
control.head(n=5)

Unnamed: 0,client_id,visitor_id,visit_id,process_step,date_time,Variation
0,1028,42237450_62128060588,557292053_87239438319_391157,start,2017-04-08 18:51:28,Control
1,1028,42237450_62128060588,557292053_87239438319_391157,step_1,2017-04-08 18:52:17,Control
2,1028,42237450_62128060588,557292053_87239438319_391157,step_1,2017-04-08 18:53:20,Control
3,1028,42237450_62128060588,557292053_87239438319_391157,step_2,2017-04-08 18:53:29,Control
4,1028,42237450_62128060588,557292053_87239438319_391157,step_3,2017-04-08 18:58:04,Control


### Export Cleaned Data

In [72]:
# Persist the client profile table and the two per-cohort web logs as CSV
# (index=False: the positional index is not written as a column).
# to_csv does not create missing folders, so make sure the output directory
# exists first; otherwise the export fails on a fresh checkout.
# NOTE(review): df_demo is written exactly as loaded in the visible cells —
# confirm that no cleaning of the profile data is expected before export.
cleaned_dir = Path('../data/cleaned')
cleaned_dir.mkdir(parents=True, exist_ok=True)

df_demo.to_csv(cleaned_dir / 'demo.csv', index=False)
test.to_csv(cleaned_dir / 'test.csv', index=False)
control.to_csv(cleaned_dir / 'control.csv', index=False)