In [1]:
import pandas as pd

In [2]:
# --- Source files (raw text exports hosted in the project repository) ---

url_1 = "https://raw.githubusercontent.com/cgveradi/house_martell_project/main/data/raw/df_final_demo.txt"  # Demographics Dataset
url_2 = "https://raw.githubusercontent.com/cgveradi/house_martell_project/main/data/raw/df_final_experiment_clients.txt"  # Experiment Dataset
url_3_1 = "https://raw.githubusercontent.com/cgveradi/house_martell_project/main/data/raw/df_final_web_data_pt_1.txt"  # Web_data part 1 Dataset
url_3_2 = "https://raw.githubusercontent.com/cgveradi/house_martell_project/main/data/raw/df_final_web_data_pt_2.txt"  # Web_data part 2 Dataset

# --- Load every source into its own DataFrame (read in the order listed) ---

df_demo, df_experiment, df_web_pt1, df_web_pt2 = (
    pd.read_csv(source_url) for source_url in (url_1, url_2, url_3_1, url_3_2)
)

# Sanity check: report (rows, columns) for each freshly loaded frame.
loaded_frames = [
    ("df_demo", df_demo),
    ("df_experiment", df_experiment),
    ("df_web_pt1", df_web_pt1),
    ("df_web_pt2", df_web_pt2),
]
for frame_name, frame in loaded_frames:
    print(f"{frame_name} shape: {frame.shape}")

df_demo shape: (70609, 9)
df_experiment shape: (70609, 2)
df_web_pt1 shape: (343141, 5)
df_web_pt2 shape: (412264, 5)


In [None]:
# Summarise column dtypes and non-null counts of the demographics frame.
df_demo.info()

In [None]:
# Preview the demographics frame as loaded.
df_demo

In [3]:
# --- Rename Columns in df_demo ---
# The abbreviated source column names are swapped for shorter, readable ones.
demo_column_names = {
    'clnt_age': 'age',
    'gendr': 'gender',
    'clnt_tenure_yr': 'tenure_yrs',
}
df_demo = df_demo.rename(columns=demo_column_names)

In [None]:
# Summarise column dtypes and non-null counts of the experiment frame.
df_experiment.info()

In [None]:
# Preview the experiment frame as loaded.
df_experiment

In [None]:
# Summarise column dtypes and non-null counts of web-data part 1.
df_web_pt1.info()

In [None]:
# Preview web-data part 1 as loaded.
df_web_pt1

In [None]:
# Summarise column dtypes and non-null counts of web-data part 2.
df_web_pt2.info()

In [None]:
# Preview web-data part 2 as loaded.
df_web_pt2

In [4]:
# --- Combine and Clean Web Data ---
# Stack both web-data parts into a single frame; ignore_index gives the
# combined rows a fresh 0..n-1 index instead of two overlapping ones.
web_parts = [df_web_pt1, df_web_pt2]
df_web = pd.concat(web_parts, ignore_index=True)

In [5]:
# Parse the raw date_time column into pandas datetime values so that rows
# can be ordered chronologically.
parsed_timestamps = pd.to_datetime(df_web['date_time'])
df_web['date_time'] = parsed_timestamps

In [None]:
# Preview the combined web-data frame.
df_web

In [None]:
# Check for fully duplicated web rows: first whether any exist, then how many.
web_dup_mask = df_web.duplicated()
print(web_dup_mask.any())
print(web_dup_mask.sum())

In [None]:
# Drop duplicate web interaction records and report how many were removed.
# The row count is captured first so the size of the change is visible in
# the cell output; the frame is rebound rather than mutated in place.
initial_rows = len(df_web)
df_web = df_web.drop_duplicates()
print(f"Dropped {initial_rows - len(df_web)} duplicate rows; {len(df_web)} remain")

In [None]:
# Preview df_web after the duplicate drop.
df_web

In [None]:
# Re-check for fully duplicated web rows: whether any remain, then the count.
web_dup_mask = df_web.duplicated()
print(web_dup_mask.any())
print(web_dup_mask.sum())

In [None]:
# Order the web events per client, oldest first.
web_sort_keys = ['client_id', 'date_time']
df_web = df_web.sort_values(by=web_sort_keys)

In [6]:
# --- Merge Datasets ---

# Left join keeps every web event; events whose client_id has no match in
# df_experiment end up with missing values in the experiment columns.
df_merged = df_web.merge(df_experiment, how='left', on='client_id')

In [7]:
# Attach the demographic columns to every web event (left join on client_id).
df_final = df_merged.merge(df_demo, how='left', on='client_id')

In [None]:
# Preview the fully merged frame (web events + experiment + demographics).
df_final

In [8]:
# List the distinct experiment labels. A nan entry marks rows without a
# Variation value — NOTE(review): this could come from client_ids unmatched
# in the left merge or from missing values in df_experiment itself; confirm.
df_final['Variation'].unique()

array(['Test', 'Control', nan], dtype=object)

In [9]:
# Check the merged frame for fully duplicated rows: any at all, then the count.
final_dup_mask = df_final.duplicated()
print(final_dup_mask.any())
print(final_dup_mask.sum())

True
10764


In [10]:
# Build the analysis frame in one pass: keep only rows that carry a Variation
# label, remove exact duplicate rows, then order each client's events
# chronologically. Every step returns a new frame, so df_final is untouched.
df_clean = (
    df_final
    .dropna(subset=['Variation'])
    .drop_duplicates()
    .sort_values(by=['client_id', 'date_time'])
)

In [11]:
# Preview the cleaned frame after filtering, de-duplication and sorting.
df_clean

Unnamed: 0,client_id,visitor_id,visit_id,process_step,date_time,Variation,tenure_yrs,clnt_tenure_mnth,age,gender,num_accts,bal,calls_6_mnth,logons_6_mnth
72018,555,402506806_56087378777,637149525_38041617439_716659,start,2017-04-15 12:57:56,Test,3.0,46.0,29.5,U,2.0,25454.66,2.0,6.0
72017,555,402506806_56087378777,637149525_38041617439_716659,step_1,2017-04-15 12:58:03,Test,3.0,46.0,29.5,U,2.0,25454.66,2.0,6.0
72016,555,402506806_56087378777,637149525_38041617439_716659,step_2,2017-04-15 12:58:35,Test,3.0,46.0,29.5,U,2.0,25454.66,2.0,6.0
72015,555,402506806_56087378777,637149525_38041617439_716659,step_3,2017-04-15 13:00:14,Test,3.0,46.0,29.5,U,2.0,25454.66,2.0,6.0
72014,555,402506806_56087378777,637149525_38041617439_716659,confirm,2017-04-15 13:00:34,Test,3.0,46.0,29.5,U,2.0,25454.66,2.0,6.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
471738,9999729,834634258_21862004160,870243567_56915814033_814203,step_2,2017-05-08 16:08:40,Test,10.0,124.0,31.0,F,3.0,107059.74,6.0,9.0
471737,9999729,834634258_21862004160,870243567_56915814033_814203,step_3,2017-05-08 16:09:19,Test,10.0,124.0,31.0,F,3.0,107059.74,6.0,9.0
471736,9999729,834634258_21862004160,870243567_56915814033_814203,confirm,2017-05-08 16:09:40,Test,10.0,124.0,31.0,F,3.0,107059.74,6.0,9.0
356799,9999832,145538019_54444341400,472154369_16714624241_585315,start,2017-05-16 16:46:03,Test,23.0,281.0,49.0,F,2.0,431887.61,1.0,4.0


In [12]:
# Confirm that the cleaned frame has no fully duplicated rows left.
clean_dup_mask = df_clean.duplicated()
print(clean_dup_mask.any())
print(clean_dup_mask.sum())

False
0


In [None]:
# NOTE(review): df_clean is already built with drop_duplicates(), and the
# recorded duplicate check reports 0 duplicates, so this looks like a no-op —
# confirm before relying on it.
df_clean = df_clean.drop_duplicates()

In [None]:
# Re-check the cleaned frame for fully duplicated rows: any, then the count.
clean_dup_mask = df_clean.duplicated()
print(clean_dup_mask.any())
print(clean_dup_mask.sum())

In [13]:
# Expand the single-letter gender codes into readable labels. Values that are
# not keys of the mapping (including missing values) pass through unchanged.
# NOTE(review): in the recorded run the label counts sum to 317,135 of
# 317,235 rows, so about 100 rows appear to have no gender at all — confirm
# whether they should be folded into 'Unknown'.
gender_labels = {
    'M': 'Male',
    'F': 'Female',
    'U': 'Unknown',
    'X': 'Other',
}
df_clean['gender'] = df_clean['gender'].replace(gender_labels)

In [15]:
# Count the distinct gender labels (missing values are not counted by default).
df_clean['gender'].nunique()

4

In [16]:
# Rows per gender label (missing values are excluded by default).
df_clean['gender'].value_counts()

gender
Unknown    107614
Male       106703
Female     102810
Other           8
Name: count, dtype: int64

In [18]:
# Display the relabelled gender column.
df_clean['gender']

72018     Unknown
72017     Unknown
72016     Unknown
72015     Unknown
72014     Unknown
           ...   
471738     Female
471737     Female
471736     Female
356799     Female
356798     Female
Name: gender, Length: 317235, dtype: object

In [19]:
# Preview df_clean with the relabelled gender column.
df_clean

Unnamed: 0,client_id,visitor_id,visit_id,process_step,date_time,Variation,tenure_yrs,clnt_tenure_mnth,age,gender,num_accts,bal,calls_6_mnth,logons_6_mnth
72018,555,402506806_56087378777,637149525_38041617439_716659,start,2017-04-15 12:57:56,Test,3.0,46.0,29.5,Unknown,2.0,25454.66,2.0,6.0
72017,555,402506806_56087378777,637149525_38041617439_716659,step_1,2017-04-15 12:58:03,Test,3.0,46.0,29.5,Unknown,2.0,25454.66,2.0,6.0
72016,555,402506806_56087378777,637149525_38041617439_716659,step_2,2017-04-15 12:58:35,Test,3.0,46.0,29.5,Unknown,2.0,25454.66,2.0,6.0
72015,555,402506806_56087378777,637149525_38041617439_716659,step_3,2017-04-15 13:00:14,Test,3.0,46.0,29.5,Unknown,2.0,25454.66,2.0,6.0
72014,555,402506806_56087378777,637149525_38041617439_716659,confirm,2017-04-15 13:00:34,Test,3.0,46.0,29.5,Unknown,2.0,25454.66,2.0,6.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
471738,9999729,834634258_21862004160,870243567_56915814033_814203,step_2,2017-05-08 16:08:40,Test,10.0,124.0,31.0,Female,3.0,107059.74,6.0,9.0
471737,9999729,834634258_21862004160,870243567_56915814033_814203,step_3,2017-05-08 16:09:19,Test,10.0,124.0,31.0,Female,3.0,107059.74,6.0,9.0
471736,9999729,834634258_21862004160,870243567_56915814033_814203,confirm,2017-05-08 16:09:40,Test,10.0,124.0,31.0,Female,3.0,107059.74,6.0,9.0
356799,9999832,145538019_54444341400,472154369_16714624241_585315,start,2017-05-16 16:46:03,Test,23.0,281.0,49.0,Female,2.0,431887.61,1.0,4.0


In [20]:
# Persist the cleaned dataset as UTF-8 CSV next to the notebook; the row index
# is left out of the file.
clean_output_path = 'df_clean.csv'
df_clean.to_csv(clean_output_path, encoding='utf-8', index=False)