In [24]:
import pandas as pd
from datetime import datetime


In [27]:
# Load the raw activity records and the postal lookup table from disk.
df = pd.read_csv('activity.csv')
postal = pd.read_csv('SG_postal.csv')

# Left join: every activity row is kept; rows whose 'postalCode' has no match
# in the lookup's 'postal_code' column get NaN in the postal columns.
activity_df = pd.merge(
    df,
    postal,
    how='left',
    left_on='postalCode',
    right_on='postal_code',
)

In [28]:
# Parse 'startTime' once, write the parsed values back onto the frame, and
# derive the calendar year and month that the monthly summary groups by.
start_times = pd.to_datetime(activity_df['startTime'])
activity_df['startTime'] = start_times
activity_df['year'] = start_times.dt.year
activity_df['month'] = start_times.dt.month

# One indicator column per distinct token obtained by splitting the 'tags'
# string on commas. No other normalisation is applied to the tokens.
tags_dummies = activity_df['tags'].str.get_dummies(sep=',')

# Cast the flag columns to integers so they can be summed per month.
for flag_col in ('drivingLicence', 'pwdTrained'):
    activity_df[flag_col] = activity_df[flag_col].astype(int)

# One-hot encode each categorical column; the prefix keeps the generated
# column names distinguishable once everything is concatenated.
citizenship_dummies, employment_dummies, gender_dummies = (
    pd.get_dummies(activity_df[source_col], prefix=prefix)
    for source_col, prefix in (
        ('citizenshipType', 'citizenship'),
        ('employmentStatus', 'employment'),
        ('gender', 'gender'),
    )
)

# No longer used for the age calculation below; still defined in case later
# cells reference it.
current_year = pd.Timestamp('now').year

activity_df['dateOfBirth'] = pd.to_datetime(activity_df['dateOfBirth'])

# Age is measured against the year of each activity ('year', derived from
# 'startTime'), not the year the notebook is executed in. Using the wall-clock
# year would bucket historical activities by the person's age today and would
# make the age-group counts change whenever the notebook is re-run in a later
# year. Granularity is still whole calendar years (birthdays are not
# considered).
activity_df['age'] = activity_df['year'] - activity_df['dateOfBirth'].dt.year

# Bucket ages into labelled groups. With right=False every bin is closed on
# the left and open on the right: [0, 12), [12, 21), [21, 50), [50, 65),
# [65, inf). Ages that fall in no bin (negative or missing) become NaN and so
# get no indicator set below.
bins = [0, 12, 21, 50, 65, float('inf')]
labels = ['under12', 'under21', 'under50', 'under65', 'over65']
activity_df['ageGroup'] = pd.cut(
    x=activity_df['age'],
    bins=bins,
    right=False,
    labels=labels,
)

# One indicator column per age-group label.
age_group_dummies = pd.get_dummies(activity_df['ageGroup'])

# Columns that have been replaced by indicator columns (plus the intermediate
# 'age'); they are left out of the combined frame.
encoded_source_cols = ['tags', 'citizenshipType', 'employmentStatus', 'gender', 'age', 'ageGroup']

# Place the remaining original columns and every indicator frame side by side.
final_df = pd.concat(
    [
        activity_df.drop(columns=encoded_source_cols),
        tags_dummies,
        citizenship_dummies,
        employment_dummies,
        gender_dummies,
        age_group_dummies,
    ],
    axis='columns',
)

# Columns to total per (year, month), in output order: hours first, then the
# tag indicators, the two integer flags, and the remaining indicator groups.
summed_columns = [
    'numHours',
    *tags_dummies.columns,
    'drivingLicence',
    'pwdTrained',
    *citizenship_dummies.columns,
    *employment_dummies.columns,
    *gender_dummies.columns,
    *age_group_dummies.columns,
]

# Sum every listed column within each year/month group, then turn the group
# keys back into ordinary columns.
monthly_summary = (
    final_df
    .groupby(['year', 'month'])
    .agg(dict.fromkeys(summed_columns, 'sum'))
    .reset_index()
)

In [29]:
# Persist the monthly aggregate; index=False leaves the row index out of the file.
monthly_summary.to_csv(path_or_buf='monthly_summary.csv', index=False)