In [None]:
from pathlib import Path

import pandas as pd

# Relative location of the raw input; the CSV on disk must carry exactly this name.
raw_data_path = '../data/00_raw/raw_ab_data.csv'

# Read the raw file into a DataFrame using pandas' default parsing options.
df = pd.read_csv(filepath_or_buffer=raw_data_path)

# *INITIAL INSPECTION*

In [None]:
# Preview the first and last rows to eyeball the column layout and values.
print("--- DataFrame Header ---")
print(df.head())
print("\n--- DataFrame Tail ---")
print(df.tail())

# Column dtypes and non-null counts, used to spot wrong types and missing values.
# DataFrame.info() writes its report straight to stdout and returns None, so it
# is called directly; wrapping it in print() appended a stray "None" line.
print("\n--- DataFrame Info ---")
df.info()

# Summary statistics for every column. The numeric columns are an identifier
# (user_id) and a binary flag (converted), so mean/std carry little meaning here.
print("\n--- Descriptive Statistics ---")
print(df.describe(include='all'))

# Frequency of each distinct value per categorical column, to reveal
# inconsistent entries such as unexpected labels.
categorical_columns = ['group', 'landing_page', 'converted']
for col in categorical_columns:
    print(f"\n--- Value Counts for '{col}' ---")
    print(df[col].value_counts())

# Count rows that repeat an earlier row across all columns.
print("\n--- Duplicate Rows ---")
duplicates = df.duplicated()
print(f"Number of duplicate rows: {duplicates.sum()}")

--- DataFrame Header ---
   user_id timestamp      group landing_page  converted
0   851104   11:48.6    control     old_page          0
1   804228   01:45.2    control     old_page          0
2   661590   55:06.2  treatment     new_page          0
3   853541   28:03.1  treatment     new_page          0
4   864975   52:26.2    control     old_page          1

--- DataFrame Tail ---
        user_id timestamp      group landing_page  converted
294475   734608   45:03.4    control     old_page          0
294476   697314   20:29.0    control     old_page          0
294477   715931   40:24.5  treatment     new_page          0
294478   759899   20:29.0  treatment     new_page          0
294479   643532   40:24.5  treatment     new_page          0

--- DataFrame Info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 294480 entries, 0 to 294479
Data columns (total 5 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   user_id       294480 non

- No errors in formatting and data entry
- timestamp should be formatted to datetime, other columns have correct data types
- No missing values
- Descriptive statistics can be ignored: there are no truly numerical columns, only `user_id` (an identifier) and `converted` (binary)
- No inconsistent data entries in the categorical columns
- No duplicate rows

# *Data Cleaning*

- Convert the `timestamp` column to the datetime data type

In [None]:
# Build the cleaned frame as a copy of the raw one with `timestamp` re-typed;
# the raw frame `df` itself is left unchanged.
# Raw values look like MM:SS.s (minutes:seconds.fraction), so a zero hour is
# prepended to give the parser a complete %H:%M:%S.%f time string.
cleaned_df = df.assign(
    timestamp=lambda frame: pd.to_datetime(
        '00:' + frame['timestamp'],
        format='%H:%M:%S.%f',
    )
)

# Report the resulting dtypes and a preview of the rows so that any
# conversion problem is visible right away.
print(
    "--- Cleaned DataFrame Data Types ---",
    cleaned_df.dtypes,
    "\n--- Cleaned DataFrame Head ---",
    cleaned_df.head(),
    sep="\n",
)


In [None]:
# Write the cleaned DataFrame to a new CSV file, leaving out the index column.
cleaned_data_path = '../data/01_cleaned/cleaned_ab_data.csv'

# to_csv does not create missing folders, so make sure the target directory
# exists first; otherwise the save fails when the folder is absent.
Path(cleaned_data_path).parent.mkdir(parents=True, exist_ok=True)
cleaned_df.to_csv(cleaned_data_path, index=False)