In [4]:
import pandas as pd
import numpy as np
from sdv.single_table import GaussianCopulaSynthesizer
from sdv.metadata import Metadata

### Importing data

In [5]:
data = pd.read_csv('clean_data.csv')
data

Unnamed: 0,timestamp,role,frequency_of_travel,primary_purpose,travel_days,travel_hours,ISB_Service_trip_1,bus_stop_board_trip_1,bus_stop_alight_trip_1,day_of_the_week_trip_1,...,additional_features_seats,additional_features_cleanliness,additional_features_comfortable,additional_features_route_coverage,additional_features_updates,issues_with_quality_of_info,special_events,seasonal_changes,seasonal_changes_specific,further_comments
0,10/1/2024 22:17:11,Undergraduate student,Daily,Commute to classes,"Monday, Tuesday, Wednesday, Thursday, Friday","0900 - 1000, 1000 - 1100, 1100 - 1200, 1200 - ...",A2,Opp NUSS,Ventus,Monday,...,4th,6th,5th,2nd,3rd,Yes,3,"No, service is consistent",,More frequent buses during exam periods
1,10/2/2024 1:03:02,Undergraduate student,3 - 4 days a week,Commute to classes,"Monday, Wednesday, Thursday","0900 - 1000, 1100 - 1200, 1300 - 1400, 1500 - ...",D2,KR MRT,UTown,Monday,...,1st,4th,5th,6th,2nd,No,4,"No, service is consistent",,Bigger bus size
2,10/2/2024 9:18:23,Undergraduate student,3 - 4 days a week,Commute to classes,"Monday, Wednesday, Thursday","1700 - 1800, 1800 - 1900, 2100 - 2200, 2200 - ...",A2,KR Bus Terminal,S17,Thursday,...,5th,6th,4th,3rd,2nd,No,3,"Yes, service improves/worsens (please specify ...",more in exams,na
3,10/2/2024 13:27:16,Undergraduate student,3 - 4 days a week,Commute to classes,"Monday, Thursday, Friday, Saturday, Sunday","1100 - 1200, 1200 - 1300, 1800 - 1900, 2100 - ...",A1,PGP Terminal,CLB,Friday,...,2nd,6th,5th,3rd,4th,No,3,"No, service is consistent",,More frequent buses to avoid overcrowding
4,10/2/2024 13:58:50,Undergraduate student,1 - 2 days a week,Commute to classes,"Tuesday, Thursday, Friday","0700 - 0800, 1000 - 1100, 1100 - 1200, 1200 - ...",A1,KR MRT,LT27,Tuesday,...,2nd,6th,5th,3rd,4th,No,1,"Yes, service improves/worsens (please specify ...",worsens during semester break,better capacity management
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72,10/18/2024 19:53:24,Undergraduate student,3 - 4 days a week,Commute to classes,"Monday, Wednesday, Friday","0900 - 1000, 1100 - 1200, 1200 - 1300, 1400 - ...",D1,UTown,BIZ2,Friday,...,4th,5th,6th,2nd,3rd,No,4,"No, service is consistent",,
73,10/18/2024 22:12:59,Undergraduate student,1 - 2 days a week,Commute to classes,"Monday, Tuesday","0700 - 0800, 1100 - 1200, 1300 - 1400, 1700 - ...",A2,Opp KR MRT,Opp NUSS,Tuesday,...,4th,5th,6th,2nd,3rd,No,3,"Yes, service improves/worsens (please specify ...",lesser buses during break,increase the frequency of buses
74,10/18/2024 22:20:22,Undergraduate student,3 - 4 days a week,Commute to classes,"Monday, Tuesday, Thursday, Friday","0800 - 0900, 0900 - 1000, 1100 - 1200, 1500 - ...",A1,KR MRT,CLB,Monday,...,4th,6th,5th,2nd,3rd,No,4,"No, service is consistent",,
75,10/18/2024 22:30:15,Undergraduate student,3 - 4 days a week,Commute to classes,"Monday, Tuesday, Thursday, Friday","0800 - 0900, 1100 - 1200, 1300 - 1400, 1500 - ...",A2,Opp KR MRT,Ventus,Thursday,...,4th,6th,5th,1st,3rd,No,4,"No, service is consistent",,Have a bus that goes from kr to closer to sde


### Preparing dataset

In [6]:
data_1 = pd.concat([data.loc[:, 'timestamp':'crowdedness_trip_1'], data.loc[:, 'usage_influence_convenience':]], axis=1)
data_2 = pd.concat([data.loc[:, 'timestamp':'travel_hours'], data.loc[:, 'ISB_Service_trip_2':'crowdedness_trip_2'], data.loc[:, 'usage_influence_convenience':]], axis=1)
data_3 = pd.concat([data.loc[:, 'timestamp':'travel_hours'], data.loc[:, 'ISB_Service_trip_3':]], axis=1)

data_1.rename(columns={'ISB_Service_trip_1': 'ISB_Service',
                            'bus_stop_board_trip_1': 'bus_stop_board',
                            'bus_stop_alight_trip_1': 'bus_stop_alight',
                            'day_of_the_week_trip_1': 'day_of_the_week',
                            'time_start_trip_1': 'time_start',
                            'travel_duration_trip_1': 'travel_duration',
                            'frequency_trip_1': 'frequency',
                            'punctuality_trip_1': 'punctuality',
                            'cleanliness_trip_1': 'cleanliness',
                            'safety_trip_1': 'safety',
                            'coverage_trip_1': 'coverage',
                            'crowdedness_trip_1': 'crowdedness'}, inplace=True)

data_2.rename(columns={'ISB_Service_trip_2': 'ISB_Service',
                            f'bus_stop_board_trip_2': 'bus_stop_board',
                            f'bus_stop_alight_trip_2': 'bus_stop_alight',
                            f'day_of_the_week_trip_2': 'day_of_the_week',
                            f'time_start_trip_2': 'time_start',
                            f'travel_duration_trip_2': 'travel_duration',
                            f'frequency_trip_2': 'frequency',
                            f'punctuality_trip_2': 'punctuality',
                            f'cleanliness_trip_2': 'cleanliness',
                            f'safety_trip_2': 'safety',
                            f'coverage_trip_2': 'coverage',
                            f'crowdedness_trip_2': 'crowdedness'}, inplace=True)

data_3.rename(columns={'ISB_Service_trip_3': 'ISB_Service',
                            f'bus_stop_board_trip_3': 'bus_stop_board',
                            f'bus_stop_alight_trip_3': 'bus_stop_alight',
                            f'day_of_the_week_trip_3': 'day_of_the_week',
                            f'time_start_trip_3': 'time_start',
                            f'travel_duration_trip_3': 'travel_duration',
                            f'frequency_trip_3': 'frequency',
                            f'punctuality_trip_3': 'punctuality',
                            f'cleanliness_trip_3': 'cleanliness',
                            f'safety_trip_3': 'safety',
                            f'coverage_trip_3': 'coverage',
                            f'crowdedness_trip_3': 'crowdedness'}, inplace=True)

data_combined = pd.concat([data_1, data_2, data_3], axis=0)
data_combined = data_combined[data_combined['ISB_Service'] != 'No trip']
data_combined['day_of_the_week'] = data_combined['day_of_the_week'].str.split(', ')
data_combined = data_combined.explode('day_of_the_week').reset_index(drop=True)

# Remove columns that need not be generated
data_combined.drop(columns=['timestamp', 'top_3_frustrations', 'seasonal_changes_specific', 'further_comments'], inplace=True)
data_combined

Unnamed: 0,role,frequency_of_travel,primary_purpose,travel_days,travel_hours,ISB_Service,bus_stop_board,bus_stop_alight,day_of_the_week,time_start,...,not_able_to_get_on,additional_features_frequency,additional_features_seats,additional_features_cleanliness,additional_features_comfortable,additional_features_route_coverage,additional_features_updates,issues_with_quality_of_info,special_events,seasonal_changes
0,Undergraduate student,Daily,Commute to classes,"Monday, Tuesday, Wednesday, Thursday, Friday","0900 - 1000, 1000 - 1100, 1100 - 1200, 1200 - ...",A2,Opp NUSS,Ventus,Monday,09:30:00,...,Occasionally,1st,4th,6th,5th,2nd,3rd,Yes,3,"No, service is consistent"
1,Undergraduate student,3 - 4 days a week,Commute to classes,"Monday, Wednesday, Thursday","0900 - 1000, 1100 - 1200, 1300 - 1400, 1500 - ...",D2,KR MRT,UTown,Monday,09:30:00,...,Frequently,3rd,1st,4th,5th,6th,2nd,No,4,"No, service is consistent"
2,Undergraduate student,3 - 4 days a week,Commute to classes,"Monday, Wednesday, Thursday","1700 - 1800, 1800 - 1900, 2100 - 2200, 2200 - ...",A2,KR Bus Terminal,S17,Thursday,17:50:00,...,Occasionally,1st,5th,6th,4th,3rd,2nd,No,3,"Yes, service improves/worsens (please specify ..."
3,Undergraduate student,3 - 4 days a week,Commute to classes,"Monday, Thursday, Friday, Saturday, Sunday","1100 - 1200, 1200 - 1300, 1800 - 1900, 2100 - ...",A1,PGP Terminal,CLB,Friday,11:30:00,...,Occasionally,1st,2nd,6th,5th,3rd,4th,No,3,"No, service is consistent"
4,Undergraduate student,1 - 2 days a week,Commute to classes,"Tuesday, Thursday, Friday","0700 - 0800, 1000 - 1100, 1100 - 1200, 1200 - ...",A1,KR MRT,LT27,Tuesday,07:50:00,...,Frequently,1st,2nd,6th,5th,3rd,4th,No,1,"Yes, service improves/worsens (please specify ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
235,Visitor,Less than once a week,Travelling to food establishments,"Monday, Tuesday, Wednesday, Thursday, Friday","1100 - 1200, 1200 - 1300",K,S17,Opp KR MRT,Wednesday,12:45:00,...,Frequently,1st,2nd,4th,5th,6th,3rd,No,3,"Yes, service improves/worsens (please specify ..."
236,Visitor,Less than once a week,Travelling to food establishments,"Monday, Tuesday, Wednesday, Thursday, Friday","1100 - 1200, 1200 - 1300",K,S17,Opp KR MRT,Thursday,12:45:00,...,Frequently,1st,2nd,4th,5th,6th,3rd,No,3,"Yes, service improves/worsens (please specify ..."
237,Visitor,Less than once a week,Travelling to food establishments,"Monday, Tuesday, Wednesday, Thursday, Friday","1100 - 1200, 1200 - 1300",K,S17,Opp KR MRT,Friday,12:45:00,...,Frequently,1st,2nd,4th,5th,6th,3rd,No,3,"Yes, service improves/worsens (please specify ..."
238,Undergraduate student,3 - 4 days a week,Commute to classes,"Monday, Tuesday, Thursday","1100 - 1200, 1300 - 1400, 1400 - 1500, 1800 - ...",BTC (Bukit Timah Campus),LT27,UTown,Tuesday,19:00:00,...,Rarely,1st,5th,2nd,6th,3rd,4th,No,5,"No, service is consistent"


## Preparing data for generation

In [7]:
metadata = Metadata.detect_from_dataframe(
    data = data_combined,
    table_name='transport')

metadata.update_columns_metadata(
    column_metadata = {
        'travel_days' : {'sdtype': 'categorical'},
        'travel_hours' : {'sdtype': 'categorical'},
        'time_start' : {'sdtype': 'datetime', 'datetime_format': '%H:%M:%S'}
        }
)

metadata.validate()

metadata

{
    "tables": {
        "transport": {
            "columns": {
                "role": {
                    "sdtype": "categorical"
                },
                "frequency_of_travel": {
                    "sdtype": "categorical"
                },
                "primary_purpose": {
                    "sdtype": "categorical"
                },
                "travel_days": {
                    "sdtype": "categorical"
                },
                "travel_hours": {
                    "sdtype": "categorical"
                },
                "ISB_Service": {
                    "sdtype": "categorical"
                },
                "bus_stop_board": {
                    "sdtype": "categorical"
                },
                "bus_stop_alight": {
                    "sdtype": "categorical"
                },
                "day_of_the_week": {
                    "sdtype": "categorical"
                },
                "time_start": {
                    "

In [8]:
# Set a random seed for reproducibility
np.random.seed(42)

synthesizer = GaussianCopulaSynthesizer(metadata)
synthesizer.load_custom_constraint_classes(filepath='custom_constraints.py', class_names=['BusStopsCheck', 'TimeCheck'])

route_constraint = {
    'constraint_class': 'BusStopsCheck',
    'constraint_parameters': {
        'column_names':['ISB_Service', 'bus_stop_board', 'bus_stop_alight']
    }
}
time_constraint = {
    'constraint_class': 'TimeCheck',
    'constraint_parameters': {
        'column_names':['time_start']
    }
}

synthesizer.add_constraints(constraints=[route_constraint, time_constraint])

synthesizer.fit(data_combined)

synthetic_data = synthesizer.sample(num_rows=60000)

synthetic_data

Sampling rows: 100%|██████████| 60000/60000 [01:34<00:00, 637.99it/s]


Unnamed: 0,role,frequency_of_travel,primary_purpose,travel_days,travel_hours,ISB_Service,bus_stop_board,bus_stop_alight,day_of_the_week,time_start,...,not_able_to_get_on,additional_features_frequency,additional_features_seats,additional_features_cleanliness,additional_features_comfortable,additional_features_route_coverage,additional_features_updates,issues_with_quality_of_info,special_events,seasonal_changes
0,Undergraduate student,Daily,Commute to classes,"Tuesday, Thursday, Friday","1600 - 1700, 1700 - 1800",D1,IT,UTown,Thursday,11:25:53,...,Frequently,1st,4th,6th,5th,3rd,3rd,Yes,2,"Yes, service improves/worsens (please specify ..."
1,Undergraduate student,Daily,Commute to classes,Thursday,"1100 - 1200, 1200 - 1300, 1800 - 1900, 2100 - ...",A2,IT,Opp NUSS,Tuesday,13:37:23,...,Frequently,1st,3rd,5th,6th,3rd,2nd,Yes,3,"Yes, service improves/worsens (please specify ..."
2,Visitor,3 - 4 days a week,Commute to classes,"Monday, Thursday, Friday","1100 - 1200, 1200 - 1300",K,KR MRT,CLB,Friday,11:31:55,...,Occasionally,1st,3rd,5th,6th,6th,3rd,Yes,2,"No, service is consistent"
3,Undergraduate student,1 - 2 days a week,Commute to classes,"Monday, Tuesday, Thursday",0900 - 1000,D2,COM3,LT27,Sunday,15:22:10,...,Frequently,1st,2nd,5th,5th,4th,2nd,Yes,1,"Yes, service improves/worsens (please specify ..."
4,Undergraduate student,3 - 4 days a week,Commute to classes,Friday,"1000 - 1100, 1100 - 1200, 1200 - 1300, 1300 - ...",A1,BIZ2,CLB,Wednesday,14:48:32,...,Occasionally,2nd,4th,6th,6th,3rd,2nd,Yes,2,"No, service is consistent"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59995,Undergraduate student,Less than once a week,Commute to classes,"Monday, Tuesday, Wednesday, Thursday","1200 - 1300, 1300 - 1400, 1400 - 1500",A1,KR Bus Terminal,LT27,Wednesday,11:08:08,...,Frequently,1st,4th,4th,5th,3rd,5th,No,3,"Yes, service improves/worsens (please specify ..."
59996,Undergraduate student,3 - 4 days a week,"Accessing campus facilities (gym, library etc.)","Monday, Tuesday, Wednesday, Thursday, Friday","0800 - 0900, 1100 - 1200, 1300 - 1400, 1500 - ...",A1,KR MRT,CLB,Thursday,13:34:00,...,Occasionally,2nd,4th,4th,6th,6th,6th,Yes,4,"No, service is consistent"
59997,Undergraduate student,1 - 2 days a week,Travelling to extracurricular activities,"Monday, Tuesday, Wednesday, Thursday",0900 - 1000,D2,PGP Terminal,UTown,Tuesday,09:38:07,...,Occasionally,2nd,1st,6th,5th,6th,3rd,No,3,"No, service is consistent"
59998,Undergraduate student,3 - 4 days a week,Commute to classes,"Monday, Tuesday, Wednesday, Thursday, Friday","0900 - 1000, 1100 - 1200, 1300 - 1400, 1500 - ...",E,UTown,UTown,Sunday,14:33:16,...,Occasionally,1st,3rd,6th,5th,3rd,2nd,Yes,4,"No, service is consistent"


# Cleaning synthetic data

Changing to Ordinal Values

In [124]:
created_data = synthetic_data.copy()
all_data = pd.concat([data_combined, created_data], axis=0)

# Map ordinal features
rank_mapping = {'1st': 1,
                '2nd': 2,
                '3rd': 3,
                '4th': 4,
                '5th': 5,
                '6th': 6}

satisfaction_mapping = {
    'Very dissatisfied': 1,
    'Dissatisfied': 2,
    'Neutral': 3,
    'Satisfied': 4,
    'Very Satisfied': 5
}

to_map_rank = ['usage_influence_convenience', 'usage_influence_cost', 'usage_influence_lack_of_options', 'usage_influence_availability_of_parking', 'usage_influence_environmental',
               'prioritize_frequency', 'prioritize_punctuality', 'prioritize_cleanliness', 'prioritize_safety', 'prioritize_bus_route_coverage', 'prioritize_crowdedness',
               'additional_features_frequency', 'additional_features_seats', 'additional_features_cleanliness', 'additional_features_comfortable',
               'additional_features_route_coverage', 'additional_features_updates']

to_map_satisfaction = ['frequency', 'punctuality', 'cleanliness', 'safety', 'coverage']

for c in to_map_rank:
    all_data[c] = synthetic_data[c].map(rank_mapping)

for c in to_map_satisfaction:
    all_data[c] = synthetic_data[c].map(satisfaction_mapping)

# Convert crowdedness column to integer
all_data['crowdedness'] = pd.to_numeric(synthetic_data['crowdedness']).astype(int)

all_data = all_data.reset_index(drop=True)
all_data

Unnamed: 0,role,frequency_of_travel,primary_purpose,travel_days,travel_hours,ISB_Service,bus_stop_board,bus_stop_alight,day_of_the_week,time_start,travel_duration,frequency,punctuality,cleanliness,safety,coverage,crowdedness,usage_influence_convenience,usage_influence_cost,usage_influence_lack_of_options,usage_influence_availability_of_parking,usage_influence_environmental,prioritize_frequency,prioritize_punctuality,prioritize_cleanliness,prioritize_safety,prioritize_bus_route_coverage,prioritize_crowdedness,not_able_to_get_on,additional_features_frequency,additional_features_seats,additional_features_cleanliness,additional_features_comfortable,additional_features_route_coverage,additional_features_updates,issues_with_quality_of_info,special_events,seasonal_changes
0,Undergraduate student,Daily,Commute to classes,"Monday, Tuesday, Wednesday, Thursday, Friday","0900 - 1000, 1000 - 1100, 1100 - 1200, 1200 - ...",A2,Opp NUSS,Ventus,Monday,09:30:00,15 - 20 minutes,1,3,2,3,1,2,1,1,3,5,4,1,3,5,1,1,1,Occasionally,1,4,6,5,3,3,Yes,3,"No, service is consistent"
1,Undergraduate student,3 - 4 days a week,Commute to classes,"Monday, Wednesday, Thursday","0900 - 1000, 1100 - 1200, 1300 - 1400, 1500 - ...",D2,KR MRT,UTown,Monday,09:30:00,10 - 15 minutes,3,4,3,4,4,4,2,3,3,4,5,1,3,6,5,2,6,Frequently,1,3,5,6,3,2,No,4,"No, service is consistent"
2,Undergraduate student,3 - 4 days a week,Commute to classes,"Monday, Wednesday, Thursday","1700 - 1800, 1800 - 1900, 2100 - 2200, 2200 - ...",A2,KR Bus Terminal,S17,Thursday,17:50:00,< 5 minutes,4,2,5,5,4,2,2,1,3,5,5,3,3,5,4,6,2,Occasionally,1,3,5,6,6,3,No,3,"Yes, service improves/worsens (please specify ..."
3,Undergraduate student,3 - 4 days a week,Commute to classes,"Monday, Thursday, Friday, Saturday, Sunday","1100 - 1200, 1200 - 1300, 1800 - 1900, 2100 - ...",A1,PGP Terminal,CLB,Friday,11:30:00,10 - 15 minutes,2,3,5,4,4,4,1,3,3,4,4,3,5,6,4,6,4,Occasionally,1,2,5,5,4,2,No,3,"No, service is consistent"
4,Undergraduate student,1 - 2 days a week,Commute to classes,"Tuesday, Thursday, Friday","0700 - 0800, 1000 - 1100, 1100 - 1200, 1200 - ...",A1,KR MRT,LT27,Tuesday,07:50:00,< 5 minutes,2,5,4,4,3,5,4,5,3,1,3,2,1,6,4,2,1,Frequently,2,4,6,6,3,2,No,1,"Yes, service improves/worsens (please specify ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60235,Undergraduate student,Less than once a week,Commute to classes,"Monday, Tuesday, Wednesday, Thursday","1200 - 1300, 1300 - 1400, 1400 - 1500",A1,KR Bus Terminal,LT27,Wednesday,11:08:08,5 - 10 minutes,2,4,4,3,3,5,3,1,1,5,4,1,3,6,5,3,2,Frequently,1,4,4,5,3,5,No,3,"Yes, service improves/worsens (please specify ..."
60236,Undergraduate student,3 - 4 days a week,"Accessing campus facilities (gym, library etc.)","Monday, Tuesday, Wednesday, Thursday, Friday","0800 - 0900, 1100 - 1200, 1300 - 1400, 1500 - ...",A1,KR MRT,CLB,Thursday,13:34:00,10 - 15 minutes,4,2,5,1,4,2,2,3,1,3,2,2,2,4,4,1,4,Occasionally,2,4,4,6,6,6,Yes,4,"No, service is consistent"
60237,Undergraduate student,1 - 2 days a week,Travelling to extracurricular activities,"Monday, Tuesday, Wednesday, Thursday",0900 - 1000,D2,PGP Terminal,UTown,Tuesday,09:38:07,15 - 20 minutes,3,4,5,3,4,2,1,2,2,4,4,1,2,6,5,1,2,Occasionally,2,1,6,5,6,3,No,3,"No, service is consistent"
60238,Undergraduate student,3 - 4 days a week,Commute to classes,"Monday, Tuesday, Wednesday, Thursday, Friday","0900 - 1000, 1100 - 1200, 1300 - 1400, 1500 - ...",E,UTown,UTown,Sunday,14:33:16,5 - 10 minutes,3,3,4,3,4,5,2,2,1,4,5,3,3,5,5,6,1,Occasionally,1,3,6,5,3,2,Yes,4,"No, service is consistent"


Checking that day_of_the_week is within travel days

In [125]:
import random
import re
# Convert 'travel_days' to string representation if they are lists
all_data['travel_days'] = all_data['travel_days'].apply(lambda x: ', '.join(x) if isinstance(x, list) else x)

# Now convert to string type
all_data['travel_days'] = all_data['travel_days'].astype(str)
all_data['travel_days'] = all_data['travel_days'].apply(lambda x: [day.strip() for day in re.split(', |/ ', x)])

def filter_days(day_of_trip, travel_days):
  random.seed(2020)
  if isinstance(day_of_trip, float) and np.isnan(day_of_trip):
  # if day_of_trip is None or day_of_trip.count(None) >0:
    return None
  # Ensure travel_days is a valid list
  if isinstance(travel_days, list) and len(travel_days) > 0:
      # Keep only valid days from day_of_trip
      # print(day_of_trip)
      valid_days = [day for day in day_of_trip if day in travel_days]

      # Ensure at least one valid day is retained
      if not valid_days:
          # If no valid days left, choose one day from travel_days
          valid_days = [random.choice(travel_days)]
      return valid_days
  else:
      # If travel_days is not a list or invalid, return None
      return None

all_data['day_of_the_week'] = all_data.apply(lambda df: filter_days(df['day_of_the_week'], df['travel_days']), axis=1)
for i in ['travel_days', 'day_of_the_week']:
    all_data[i] = all_data[i].apply(lambda x: x if not isinstance(x, list) and pd.isnull(x) else ', '.join(map(str, x)))

In [126]:
# check no more invalid days
def check_invalid_days(data: pd.DataFrame) -> pd.DataFrame:
    # Function to check if day_of_week is in travel_days
    def is_valid_day(row):
        travel_days = row['travel_days'].split(', ') if isinstance(row['travel_days'], str) else []
        day_of_week = row['day_of_the_week'].split(', ') if isinstance(row['day_of_the_week'], str) else []

        # Check if all days in day_of_week are in travel_days
        return all(day in travel_days for day in day_of_week)

    data['is_valid'] = data.apply(is_valid_day, axis=1)

    invalid_rows = data[~data['is_valid']]

    # Drop the helper column
    invalid_rows = invalid_rows.drop(columns='is_valid')

    return invalid_rows

invalid_days = check_invalid_days(all_data)
invalid_days

Unnamed: 0,role,frequency_of_travel,primary_purpose,travel_days,travel_hours,ISB_Service,bus_stop_board,bus_stop_alight,day_of_the_week,time_start,travel_duration,frequency,punctuality,cleanliness,safety,coverage,crowdedness,usage_influence_convenience,usage_influence_cost,usage_influence_lack_of_options,usage_influence_availability_of_parking,usage_influence_environmental,prioritize_frequency,prioritize_punctuality,prioritize_cleanliness,prioritize_safety,prioritize_bus_route_coverage,prioritize_crowdedness,not_able_to_get_on,additional_features_frequency,additional_features_seats,additional_features_cleanliness,additional_features_comfortable,additional_features_route_coverage,additional_features_updates,issues_with_quality_of_info,special_events,seasonal_changes


Check time_start is within travel_hours

In [127]:
# Cleaning timings out of range

start_time = pd.to_datetime('07:00:00 AM', format='%I:%M:%S %p').time()
end_time = pd.to_datetime('11:00:00 PM', format='%I:%M:%S %p').time()

def adjust_time_in_range(time):
    if pd.isnull(time):
        return None
    elif not (start_time <= time.time() <= end_time):
        # Swap AM/PM to bring time in the desired range
        adjusted_time = (time + pd.Timedelta(hours=12)) if time.time() < start_time else (time - pd.Timedelta(hours=12))
        return adjusted_time.time()
    else:
        return time.time()

# Convert strings into datetime objects

all_data['time_start'] = pd.to_datetime(all_data['time_start'], format='%H:%M:%S')
all_data['time_start'] = all_data['time_start'].apply(adjust_time_in_range)


In [128]:
# Making sure the timings stated in specific trips are within the hours they come to school
from datetime import datetime, timedelta

def check_time_in_range(travel_hours, actual_time):
    random.seed(2020)
    # Check if time_start is NaN
    if pd.isnull(actual_time):
        return None

    # Split travel_hours into individual time ranges
    time_ranges = travel_hours.split(', ')
    valid_ranges = []
    time_ranges = travel_hours.split(', ')  # Split string into individual time ranges

    for time_range in time_ranges:
        start_str, end_str = time_range.split(' - ')  # Split each range into start and end times
        start_time = datetime.strptime(start_str, '%H%M').time()  # Convert to time object
        end_time = datetime.strptime(end_str, '%H%M').time()  # Convert to time object
        # Add to valid ranges list
        valid_ranges.append((start_time, end_time))

        # Check if the actual_time falls within the current range
        if start_time <= actual_time <= end_time:
            return actual_time

    # If the actual_time is not in any valid range, pick a random time from travel_hours
    random_range = random.choice(valid_ranges)  # Select a random valid range
    random_start, random_end = random_range

    # Generate a random time within the selected range
    delta = (datetime.combine(datetime.today(), random_end) - datetime.combine(datetime.today(), random_start)).seconds
    random_seconds = random.randint(0, delta)
    random_time = (datetime.combine(datetime.today(), random_start) + timedelta(seconds=random_seconds)).time()
    return random_time

all_data['time_start'] = all_data.apply(lambda row: check_time_in_range(row['travel_hours'], row['time_start']), axis=1)

In [129]:
def check_invalid_time_starts(data: pd.DataFrame) -> pd.DataFrame:
    """Check for invalid time_start entries based on travel_hours."""
    invalid_rows = []

    for index, row in data.iterrows():
        actual_time = row['time_start']
        travel_hours = row['travel_hours']

        # Check if the actual_time is in the specified travel_hours
        if pd.isnull(actual_time):
            continue

        # Split travel_hours into individual time ranges
        time_ranges = travel_hours.split(', ')

        # Check if actual_time is in any of the valid ranges
        valid = False
        for time_range in time_ranges:
            start_str, end_str = time_range.split(' - ')  # Split each range
            start_time = datetime.strptime(start_str, '%H%M').time()
            end_time = datetime.strptime(end_str, '%H%M').time()

            if start_time <= actual_time <= end_time:
                valid = True
                break  # Break if a valid range is found

        if not valid:
            invalid_rows.append(index)

    return data.loc[invalid_rows]

# Call the function and print invalid rows
invalid_time_starts = check_invalid_time_starts(all_data)
invalid_time_starts

Unnamed: 0,role,frequency_of_travel,primary_purpose,travel_days,travel_hours,ISB_Service,bus_stop_board,bus_stop_alight,day_of_the_week,time_start,travel_duration,frequency,punctuality,cleanliness,safety,coverage,crowdedness,usage_influence_convenience,usage_influence_cost,usage_influence_lack_of_options,usage_influence_availability_of_parking,usage_influence_environmental,prioritize_frequency,prioritize_punctuality,prioritize_cleanliness,prioritize_safety,prioritize_bus_route_coverage,prioritize_crowdedness,not_able_to_get_on,additional_features_frequency,additional_features_seats,additional_features_cleanliness,additional_features_comfortable,additional_features_route_coverage,additional_features_updates,issues_with_quality_of_info,special_events,seasonal_changes,is_valid


In [130]:
all_data

Unnamed: 0,role,frequency_of_travel,primary_purpose,travel_days,travel_hours,ISB_Service,bus_stop_board,bus_stop_alight,day_of_the_week,time_start,travel_duration,frequency,punctuality,cleanliness,safety,coverage,crowdedness,usage_influence_convenience,usage_influence_cost,usage_influence_lack_of_options,usage_influence_availability_of_parking,usage_influence_environmental,prioritize_frequency,prioritize_punctuality,prioritize_cleanliness,prioritize_safety,prioritize_bus_route_coverage,prioritize_crowdedness,not_able_to_get_on,additional_features_frequency,additional_features_seats,additional_features_cleanliness,additional_features_comfortable,additional_features_route_coverage,additional_features_updates,issues_with_quality_of_info,special_events,seasonal_changes,is_valid
0,Undergraduate student,Daily,Commute to classes,"Monday, Tuesday, Wednesday, Thursday, Friday","0900 - 1000, 1000 - 1100, 1100 - 1200, 1200 - ...",A2,Opp NUSS,Ventus,Friday,09:30:00,15 - 20 minutes,1,3,2,3,1,2,1,1,3,5,4,1,3,5,1,1,1,Occasionally,1,4,6,5,3,3,Yes,3,"No, service is consistent",True
1,Undergraduate student,3 - 4 days a week,Commute to classes,"Monday, Wednesday, Thursday","0900 - 1000, 1100 - 1200, 1300 - 1400, 1500 - ...",D2,KR MRT,UTown,Thursday,09:30:00,10 - 15 minutes,3,4,3,4,4,4,2,3,3,4,5,1,3,6,5,2,6,Frequently,1,3,5,6,3,2,No,4,"No, service is consistent",True
2,Undergraduate student,3 - 4 days a week,Commute to classes,"Monday, Wednesday, Thursday","1700 - 1800, 1800 - 1900, 2100 - 2200, 2200 - ...",A2,KR Bus Terminal,S17,Thursday,17:50:00,< 5 minutes,4,2,5,5,4,2,2,1,3,5,5,3,3,5,4,6,2,Occasionally,1,3,5,6,6,3,No,3,"Yes, service improves/worsens (please specify ...",True
3,Undergraduate student,3 - 4 days a week,Commute to classes,"Monday, Thursday, Friday, Saturday, Sunday","1100 - 1200, 1200 - 1300, 1800 - 1900, 2100 - ...",A1,PGP Terminal,CLB,Sunday,11:30:00,10 - 15 minutes,2,3,5,4,4,4,1,3,3,4,4,3,5,6,4,6,4,Occasionally,1,2,5,5,4,2,No,3,"No, service is consistent",True
4,Undergraduate student,1 - 2 days a week,Commute to classes,"Tuesday, Thursday, Friday","0700 - 0800, 1000 - 1100, 1100 - 1200, 1200 - ...",A1,KR MRT,LT27,Friday,07:50:00,< 5 minutes,2,5,4,4,3,5,4,5,3,1,3,2,1,6,4,2,1,Frequently,2,4,6,6,3,2,No,1,"Yes, service improves/worsens (please specify ...",True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60235,Undergraduate student,Less than once a week,Commute to classes,"Monday, Tuesday, Wednesday, Thursday","1200 - 1300, 1300 - 1400, 1400 - 1500",A1,KR Bus Terminal,LT27,Tuesday,14:42:17,5 - 10 minutes,2,4,4,3,3,5,3,1,1,5,4,1,3,6,5,3,2,Frequently,1,4,4,5,3,5,No,3,"Yes, service improves/worsens (please specify ...",True
60236,Undergraduate student,3 - 4 days a week,"Accessing campus facilities (gym, library etc.)","Monday, Tuesday, Wednesday, Thursday, Friday","0800 - 0900, 1100 - 1200, 1300 - 1400, 1500 - ...",A1,KR MRT,CLB,Friday,13:34:00,10 - 15 minutes,4,2,5,1,4,2,2,3,1,3,2,2,2,4,4,1,4,Occasionally,2,4,4,6,6,6,Yes,4,"No, service is consistent",True
60237,Undergraduate student,1 - 2 days a week,Travelling to extracurricular activities,"Monday, Tuesday, Wednesday, Thursday",0900 - 1000,D2,PGP Terminal,UTown,Tuesday,09:38:07,15 - 20 minutes,3,4,5,3,4,2,1,2,2,4,4,1,2,6,5,1,2,Occasionally,2,1,6,5,6,3,No,3,"No, service is consistent",True
60238,Undergraduate student,3 - 4 days a week,Commute to classes,"Monday, Tuesday, Wednesday, Thursday, Friday","0900 - 1000, 1100 - 1200, 1300 - 1400, 1500 - ...",E,UTown,UTown,Friday,16:42:17,5 - 10 minutes,3,3,4,3,4,5,2,2,1,4,5,3,3,5,5,6,1,Occasionally,1,3,6,5,3,2,Yes,4,"No, service is consistent",True


### Writing synthetic data to CSV

In [131]:
#all_data.to_csv('synthetic_data.csv', index=False)