# 02 – Data Cleaning

In this notebook we identify and handle missing values, drop duplicate rows, and convert timestamps to proper datetime objects. Only the first 100,000 rows of each raw file are read. The cleaned datasets are saved to `data/processed` for subsequent notebooks.

In [1]:
import os
import pandas as pd

# Read raw data
# Input and output folders are resolved relative to the current working
# directory: one level up, under data/.
raw_dir = os.path.join(os.path.pardir, 'data', 'raw')
processed_dir = os.path.join(os.path.pardir, 'data', 'processed')
os.makedirs(processed_dir, exist_ok=True)

# Row cap applied to every raw file.  Only the first N_ROWS rows of each CSV
# are read, so the files written later are cleaned *samples*, not the full
# raw tables.  Set to None to read each file in full.
N_ROWS = 100_000


def load_raw(filename, nrows=N_ROWS):
    """Read one CSV from ``raw_dir`` into a DataFrame.

    Parameters
    ----------
    filename : str
        File name inside ``raw_dir``.
    nrows : int or None
        Maximum number of data rows to read; ``None`` reads the whole file.

    Returns
    -------
    pandas.DataFrame
    """
    return pd.read_csv(os.path.join(raw_dir, filename), engine='python', nrows=nrows)


user_df = load_raw('user_profile.csv')
ad_df = load_raw('ad_feature.csv')
click_df = load_raw('raw_sample.csv')
behaviour_df = load_raw('behavior_log.csv')

# Drop duplicate rows
# Each frame is rebound to a de-duplicated copy (fully identical rows are
# collapsed, keeping the first occurrence).
user_df, ad_df, click_df, behaviour_df = (
    frame.drop_duplicates()
    for frame in (user_df, ad_df, click_df, behaviour_df)
)

# Handle missing values
def fill_missing(frame):
    """Fill missing values in ``frame`` in place, column by column.

    Numeric columns receive the column median; every other column receives
    its most frequent value (the first one returned by ``Series.mode`` when
    several values tie).  Columns with no missing values are left untouched,
    and a non-numeric column whose entries are all missing is left as is,
    because it has no mode to fill with.

    NOTE(review): integer-coded categorical columns count as numeric here
    and therefore receive the median -- confirm that this is intended.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame to clean; it is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``frame`` object, for convenience.
    """
    for col in frame.columns:
        series = frame[col]
        if not series.isna().any():
            continue  # nothing to fill in this column

        # Use pandas' dtype predicate rather than a hard-coded list of dtype
        # names, so int32/float32/nullable numeric columns also get the
        # median.  Booleans are excluded: a median is not meaningful for them.
        is_numeric = (
            pd.api.types.is_numeric_dtype(series)
            and not pd.api.types.is_bool_dtype(series)
        )
        if is_numeric:
            fill_value = series.median()
        else:
            modes = series.mode()  # empty when the column has no non-null values
            if modes.empty:
                continue
            fill_value = modes.iloc[0]

        frame[col] = series.fillna(fill_value)
    return frame


# For numerical columns, fill missing values with median; for categorical, fill with mode
for df in [user_df, ad_df, click_df, behaviour_df]:
    fill_missing(df)

# Convert timestamp columns to datetime for click and behaviour logs
# unit='s' reads the values as seconds since the Unix epoch; errors='coerce'
# turns anything that cannot be converted into NaT instead of raising.
for log_df in (click_df, behaviour_df):
    log_df['time_stamp'] = pd.to_datetime(log_df['time_stamp'], unit='s', errors='coerce')

# Save cleaned data
# Each cleaned frame is written to processed_dir under its own file name;
# index=False keeps the pandas row index out of the CSV.
cleaned_outputs = (
    ('user_profile_clean.csv', user_df),
    ('ad_feature_clean.csv', ad_df),
    ('raw_sample_clean.csv', click_df),
    ('behavior_log_clean.csv', behaviour_df),
)
for out_name, frame in cleaned_outputs:
    frame.to_csv(os.path.join(processed_dir, out_name), index=False)

print('Cleaning complete.  Processed files saved to data/processed')


Cleaning complete.  Processed files saved to data/processed
