## Importing packages and data

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb ##download xgboost using "pip3 install xgboost"

data = pd.read_csv('form_responses.csv')

# pip install sdv to download sdv package

## Seperating data

In [4]:
# Cleaning column names

# General Data
data.rename(columns={'Timestamp' : 'timestamp',
                     'What is your role at the school?': 'role', 
                     'How frequently do you use the public transport system (ISB) on campus? ': 'frequency_of_travel',
                     'What is your primary purpose for using the ISB on campus?' : 'primary_purpose',
                     'Which days of the week do you use the ISB?' : 'travel_days',
                     'At what times of the day do you travel using the ISB? (Please only choose the hours you would use the ISB)': 'travel_hours'
                     }, inplace=True)

# Trip 1
data.rename(columns={'ISB Service used': 'ISB_Service', 
                     'Where do you board the bus?': 'bus_stop_board',
                     'Where do you alight?' : 'bus_stop_alight',
                     'What day of the week was this trip made?' : 'day_of_the_week',
                     'What time do you typically start your journey?': 'time_start',
                     'What is your typical travel duration using the ISB?': 'travel_duration',
                     'Choose the column that best describes your satisfaction for each of the following.  [Frequency of buses]': 'frequency',
                     'Choose the column that best describes your satisfaction for each of the following.  [Punctuality of buses]': 'punctuality',
                     'Choose the column that best describes your satisfaction for each of the following.  [Cleanliness of buses]': 'cleanliness',
                     'Choose the column that best describes your satisfaction for each of the following.  [Safety on the buses]': 'safety',
                     'Choose the column that best describes your satisfaction for each of the following.  [Bus route coverage]': 'coverage',
                     'How crowded are the buses usually at this timing?': 'crowdedness'
                     }, inplace=True)

columns_to_suffix = range(6,18)
data = data.rename(columns={data.columns[i]: data.columns[i] + '_trip_1' for i in columns_to_suffix})

# Trip 2
data.rename(columns={'ISB Service used.1': 'ISB_Service', 
                     'Where do you board the bus?.1': 'bus_stop_board',
                     'Where do you alight?.1' : 'bus_stop_alight',
                     'What day of the week was this trip made?.1' : 'day_of_the_week',
                     'What time do you typically start your journey?.1': 'time_start',
                     'What is your typical travel duration using the ISB?.1': 'travel_duration',
                     'Choose the column that best describes your satisfaction for each of the following.  [Frequency of buses].1': 'frequency',
                     'Choose the column that best describes your satisfaction for each of the following.  [Punctuality of buses].1': 'punctuality',
                     'Choose the column that best describes your satisfaction for each of the following.  [Cleanliness of buses].1': 'cleanliness',
                     'Choose the column that best describes your satisfaction for each of the following.  [Safety on the buses].1': 'safety',
                     'Choose the column that best describes your satisfaction for each of the following.  [Bus route coverage].1': 'coverage',
                     'How crowded are the buses usually at this timing?.1': 'crowdedness'
                     }, inplace=True)

columns_to_suffix = range(18,30)
data = data.rename(columns={data.columns[i]: data.columns[i] + '_trip_2' for i in columns_to_suffix})

# Trip 3
data.rename(columns={'ISB Service used.2': 'ISB_Service', 
                     'Where do you board the bus?.2': 'bus_stop_board',
                     'Where do you alight?.2' : 'bus_stop_alight',
                     'What day of the week was this trip made?.2' : 'day_of_the_week',
                     'What time do you typically start your journey?.2': 'time_start',
                     'What is your typical travel duration using the ISB?.2': 'travel_duration',
                     'Choose the column that best describes your satisfaction for each of the following.  [Frequency of buses].2': 'frequency',
                     'Choose the column that best describes your satisfaction for each of the following.  [Punctuality of buses].2': 'punctuality',
                     'Choose the column that best describes your satisfaction for each of the following.  [Cleanliness of buses].2': 'cleanliness',
                     'Choose the column that best describes your satisfaction for each of the following.  [Safety on the buses].2': 'safety',
                     'Choose the column that best describes your satisfaction for each of the following.  [Bus route coverage].2': 'coverage',
                     'How crowded are the buses usually at this timing?.2': 'crowdedness'
                     }, inplace=True)

columns_to_suffix = range(30,42)
data = data.rename(columns={data.columns[i]: data.columns[i] + '_trip_3' for i in columns_to_suffix})

# Preferences & Pain Points
data.rename(columns={'What influences your usage of the ISB over other forms of transportation? Rank each factor from 1st to 5th, 1st being the most important and 5th being the least important. (Please only choose one option for each column) [Convenience]': 'usage_influence_convenience', 
                     'What influences your usage of the ISB over other forms of transportation? Rank each factor from 1st to 5th, 1st being the most important and 5th being the least important. (Please only choose one option for each column) [Cost]': 'usage_influence_cost',
                     'What influences your usage of the ISB over other forms of transportation? Rank each factor from 1st to 5th, 1st being the most important and 5th being the least important. (Please only choose one option for each column) [Lack of other transportation options]' : 'usage_influence_lack_of_options',
                     'What influences your usage of the ISB over other forms of transportation? Rank each factor from 1st to 5th, 1st being the most important and 5th being the least important. (Please only choose one option for each column) [Availability of parking]' : 'usage_influence_availability_of_parking',
                     'What influences your usage of the ISB over other forms of transportation? Rank each factor from 1st to 5th, 1st being the most important and 5th being the least important. (Please only choose one option for each column) [Environmental Concerns]' : 'usage_influence_environmental',
                     'Rank the factors you prioritize the most when choosing a bus route from 1st to 6th, 1st being the most important and 6th being the least important. (Please only choose one option for each column) [Frequency of buses]' : 'prioritize_frequency',
                     'Rank the factors you prioritize the most when choosing a bus route from 1st to 6th, 1st being the most important and 6th being the least important. (Please only choose one option for each column) [Punctuality of buses]' : 'prioritize_punctuality',
                     'Rank the factors you prioritize the most when choosing a bus route from 1st to 6th, 1st being the most important and 6th being the least important. (Please only choose one option for each column) [Cleanliness of the buses]' : 'prioritize_cleanliness',
                     'Rank the factors you prioritize the most when choosing a bus route from 1st to 6th, 1st being the most important and 6th being the least important. (Please only choose one option for each column) [Safety of the buses]' : 'prioritize_safety',
                     'Rank the factors you prioritize the most when choosing a bus route from 1st to 6th, 1st being the most important and 6th being the least important. (Please only choose one option for each column) [Bus route coverage]' : 'prioritize_bus_route_coverage',
                     'Rank the factors you prioritize the most when choosing a bus route from 1st to 6th, 1st being the most important and 6th being the least important. (Please only choose one option for each column) [Crowdedness of the bus]': 'prioritize_crowdedness',
                     'What are your top 3 frustrations with the ISB service?' : 'top_3_frustrations',
                     'How often are you not able to get on the bus due to overcrowding?' : 'not_able_to_get_on',
                     'What additional features would make the ISB more appealing to you?  Rank each factor from 1st to 6th, 1st being the most appealing and 6th being the least appealing. (Please only choose one option for each column) [More frequent bus services]': 'additional_features_frequency',
                     'What additional features would make the ISB more appealing to you?  Rank each factor from 1st to 6th, 1st being the most appealing and 6th being the least appealing. (Please only choose one option for each column) [More Seats]' : 'additional_features_seats',
                     'What additional features would make the ISB more appealing to you?  Rank each factor from 1st to 6th, 1st being the most appealing and 6th being the least appealing. (Please only choose one option for each column) [Improved cleanliness]' : 'additional_features_cleanliness',
                     'What additional features would make the ISB more appealing to you?  Rank each factor from 1st to 6th, 1st being the most appealing and 6th being the least appealing. (Please only choose one option for each column) [More comfortable seating]' : 'additional_features_comfortable',
                     'What additional features would make the ISB more appealing to you?  Rank each factor from 1st to 6th, 1st being the most appealing and 6th being the least appealing. (Please only choose one option for each column) [Better route coverage]' : 'additional_features_route_coverage',
                     'What additional features would make the ISB more appealing to you?  Rank each factor from 1st to 6th, 1st being the most appealing and 6th being the least appealing. (Please only choose one option for each column) [Real-time tracking and updates]' : 'additional_features_updates',
                     'Have you faced issues with the quality of information provided about bus services (eg. timing accuracy, route changes)?' : 'issues_with_quality_of_info',
                     'How well does the ISB accommodate special events (eg. Open House, exam season)?': 'special_events',
                     'Do you notice any seasonal changes in ISB quality and capacity?' : 'seasonal_changes',
                     'Specify the seasonal changes in service identified from the previous question. ' : 'seasonal_changes_specific',
                     'What changes would you like to see regarding the ISB system? (Enter NA if you do not wish to see any changes)': 'further_comments'
                    }, inplace=True)

pd.set_option('display.max_columns', None)
data

Unnamed: 0,timestamp,role,frequency_of_travel,primary_purpose,travel_days,travel_hours,ISB_Service_trip_1,bus_stop_board_trip_1,bus_stop_alight_trip_1,day_of_the_week_trip_1,time_start_trip_1,travel_duration_trip_1,frequency_trip_1,punctuality_trip_1,cleanliness_trip_1,safety_trip_1,coverage_trip_1,crowdedness_trip_1,ISB_Service_trip_2,bus_stop_board_trip_2,bus_stop_alight_trip_2,day_of_the_week_trip_2,time_start_trip_2,travel_duration_trip_2,frequency_trip_2,punctuality_trip_2,cleanliness_trip_2,safety_trip_2,coverage_trip_2,crowdedness_trip_2,ISB_Service_trip_3,bus_stop_board_trip_3,bus_stop_alight_trip_3,day_of_the_week_trip_3,time_start_trip_3,travel_duration_trip_3,frequency_trip_3,punctuality_trip_3,cleanliness_trip_3,safety_trip_3,coverage_trip_3,crowdedness_trip_3,usage_influence_convenience,usage_influence_cost,usage_influence_lack_of_options,usage_influence_availability_of_parking,usage_influence_environmental,prioritize_frequency,prioritize_punctuality,prioritize_cleanliness,prioritize_safety,prioritize_bus_route_coverage,prioritize_crowdedness,top_3_frustrations,not_able_to_get_on,additional_features_frequency,additional_features_seats,additional_features_cleanliness,additional_features_comfortable,additional_features_route_coverage,additional_features_updates,issues_with_quality_of_info,special_events,seasonal_changes,seasonal_changes_specific,further_comments
0,10/1/2024 22:17:11,Undergraduate student,Daily,Commute to classes,"Monday, Tuesday, Wednesday, Thursday, Friday","0900 - 1000, 1000 - 1100, 1100 - 1200, 1200 - ...",A2,KR MRT,Ventus,Monday,9:30:00 AM,15 - 20 minutes,Neutral,Neutral,Very Satisfied,Satisfied,Satisfied,3,D2,KR MRT,COM3,Friday,10:30:00 AM,5 - 10 minutes,Neutral,Neutral,Very Satisfied,Satisfied,Satisfied,4.0,D2,COM3,LT27,Tuesday,2:30:00 PM,10 - 15 minutes,Dissatisfied,Dissatisfied,Very Satisfied,Very Satisfied,Satisfied,3.0,1st,2nd,3rd,4th,5th,3rd,2nd,6th,5th,4th,1st,"Frequency of bus services, Capacity of the bus...",Occasionally,1st,4th,6th,5th,2nd,3rd,Yes,3,"No, service is consistent",,More frequent buses during exam periods
1,10/2/2024 1:03:02,Undergraduate student,3 - 4 days a week,Commute to classes,"Monday, Wednesday, Thursday","0900 - 1000, 1100 - 1200, 1300 - 1400, 1500 - ...",D2,KR MRT,UTown,Monday,9:30:00 AM,10 - 15 minutes,Satisfied,Satisfied,Satisfied,Satisfied,Satisfied,4,A2,IT,Opp KR MRT,Thursday,5:30:00 PM,15 - 20 minutes,Satisfied,Satisfied,Satisfied,Satisfied,Satisfied,5.0,D2,UTown,Opp KR MRT,Wednesday,11:30:00 AM,10 - 15 minutes,Satisfied,Satisfied,Satisfied,Satisfied,Satisfied,5.0,2nd,1st,3rd,5th,4th,2nd,6th,4th,3rd,5th,1st,"Travel time too long, Frequency of bus service...",Frequently,3rd,1st,4th,5th,6th,2nd,No,4,"No, service is consistent",,Bigger bus size
2,10/2/2024 9:18:23,Undergraduate student,3 - 4 days a week,Commute to classes,"Monday, Wednesday, Thursday","1700 - 1800, 1800 - 1900, 2100 - 2200, 2200 - ...",A2,KR MRT,S17,Thursday,5:50:00 AM,< 5 minutes,Very Satisfied,Satisfied,Satisfied,Satisfied,Very Satisfied,4,A1,S17,Opp KR MRT,Wednesday,8:55:00 AM,< 5 minutes,Satisfied,Satisfied,Satisfied,Satisfied,Satisfied,3.0,A1,KR MRT,S17,Wednesday,6:55:00 PM,< 5 minutes,Very Satisfied,Very Satisfied,Very Satisfied,Satisfied,Satisfied,4.0,2nd,1st,3rd,4th,5th,1st,6th,5th,4th,3rd,2nd,"Capacity of the bus (Overcrowding), Proximity ...",Occasionally,1st,3rd,6th,4th,3rd,2nd,No,3,"Yes, service improves/worsens (please specify ...",more in exams,na
3,10/2/2024 13:27:16,Undergraduate student,3 - 4 days a week,Commute to classes,"Monday, Thursday, Friday, Saturday / Sunday","1100 - 1200, 1200 - 1300, 1800 - 1900, 2100 - ...",A1,PGP Terminal,S17,Friday,11:30:00 AM,10 - 15 minutes,Very Satisfied,Very Satisfied,Very Satisfied,Neutral,Very Satisfied,4,D2,PGP Foyer,COM3,Tuesday,2:00:00 PM,15 - 20 minutes,Neutral,Very Satisfied,Very Satisfied,Neutral,Satisfied,3.0,A2,PGP Foyer,Ventus,Monday,11:30:00 AM,15 - 20 minutes,Very dissatisfied,Very Satisfied,Very Satisfied,Neutral,Neutral,5.0,1st,3rd,2nd,5th,4th,1st,4th,5th,6th,3rd,2nd,"Travel time too long, Frequency of bus service...",Occasionally,1st,2nd,6th,5th,3rd,4th,No,3,"No, service is consistent",,More frequent buses to avoid overcrowding
4,10/2/2024 13:58:50,Undergraduate student,1 - 2 days a week,Commute to classes,"Tuesday, Thursday, Friday","0700 - 0800, 1000 - 1100, 1100 - 1200, 1200 - ...",A1,KR MRT,LT27,Tuesday,7:50:00 AM,< 5 minutes,Neutral,Satisfied,Satisfied,Satisfied,Satisfied,5,A2,S17,Opp KR MRT,Thursday,1:05:00 PM,< 5 minutes,Neutral,Satisfied,Satisfied,Satisfied,Satisfied,3.0,D2,KR MRT,LT27,Friday,7:50:00 AM,< 5 minutes,Neutral,Satisfied,Satisfied,Satisfied,Satisfied,5.0,2nd,3rd,1st,5th,4th,2nd,3rd,6th,5th,1st,4th,Capacity of the bus (Overcrowding),Frequently,1st,2nd,6th,5th,3rd,4th,No,1,"Yes, service improves/worsens (please specify ...",worsens during semester break,better capacity management
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72,10/18/2024 19:53:24,Undergraduate student,3 - 4 days a week,Commute to classes,"Monday, Wednesday, Friday","0900 - 1000, 1100 - 1200, 1200 - 1300, 1400 - ...",D1,UTown,BIZ2,Friday,11:40:00 AM,5 - 10 minutes,Satisfied,Very Satisfied,Very Satisfied,Very Satisfied,Very Satisfied,5,A2,Opp KR MRT,Opp HSSML,Monday,9:40:00 AM,10 - 15 minutes,Satisfied,Very Satisfied,Very Satisfied,Very Satisfied,Very Satisfied,3.0,A1,BIZ2,LT27,Monday,11:40:00 AM,10 - 15 minutes,Neutral,Satisfied,Very Satisfied,Very Satisfied,Very Satisfied,5.0,1st,3rd,2nd,5th,4th,1st,2nd,4th,3rd,5th,6th,"Frequency of bus services, Capacity of the bus...",Occasionally,1st,4th,5th,6th,2nd,3rd,No,4,"No, service is consistent",,
73,10/18/2024 22:12:59,Undergraduate student,1 - 2 days a week,Commute to classes,"Monday, Tuesday","0700 - 0800, 1100 - 1200, 1300 - 1400, 1700 - ...",A2,Opp KR MRT,Opp NUSS,Tuesday,11:45:00 AM,10 - 15 minutes,Dissatisfied,Satisfied,Satisfied,Dissatisfied,Neutral,4,,,,,,,,,,,,,,,,,,,,,,,,,1st,2nd,3rd,4th,5th,1st,3rd,6th,5th,2nd,4th,"Availability of seats, Frequency of bus servic...",Occasionally,1st,4th,5th,6th,2nd,3rd,No,3,"Yes, service improves/worsens (please specify ...",lesser buses during break,increase the frequency of buses
74,10/18/2024 22:20:22,Undergraduate student,3 - 4 days a week,Commute to classes,"Monday, Tuesday, Thursday, Friday","0800 - 0900, 0900 - 1000, 1100 - 1200, 1500 - ...",A1,KR MRT,CLB,Monday,8:45:00 AM,10 - 15 minutes,Satisfied,Satisfied,Satisfied,Neutral,Dissatisfied,2,K,KR MRT,CLB,Tuesday,8:45:00 AM,10 - 15 minutes,Dissatisfied,Satisfied,Satisfied,Neutral,Dissatisfied,2.0,,,,,,,,,,,,,2nd,3rd,1st,5th,4th,1st,4th,6th,5th,3rd,2nd,"Frequency of bus services, Capacity of the bus...",Rarely,1st,4th,6th,5th,2nd,3rd,No,4,"No, service is consistent",,
75,10/18/2024 22:30:15,Undergraduate student,3 - 4 days a week,Commute to classes,"Monday, Tuesday, Thursday, Friday","0800 - 0900, 1100 - 1200, 1300 - 1400, 1500 - ...",A2,Opp KR MRT,Ventus,Thursday,11:30:00 AM,10 - 15 minutes,Dissatisfied,Satisfied,Satisfied,Satisfied,Dissatisfied,5,,,,,,,,,,,,,,,,,,,,,,,,,2nd,3rd,1st,5th,4th,2nd,4th,6th,5th,3rd,1st,"Availability of seats, Capacity of the bus (Ov...",Rarely,2nd,4th,6th,5th,1st,3rd,No,4,"No, service is consistent",,Have a bus that goes from kr to closer to sde


In [5]:
# Cleaning timings out of range 

#start_time = pd.to_datetime('07:00:00 AM', format='%I:%M:%S %p').time()
#end_time = pd.to_datetime('11:00:00 PM', format='%I:%M:%S %p').time()

#def adjust_time_in_range(time):
#    if not (start_time <= time.time() <= end_time):
#        # Swap AM/PM to bring time in the desired range
#        adjusted_time = (time + pd.Timedelta(hours=12)) if time.time() < start_time else (time - pd.Timedelta(hours=12))
#        return adjusted_time
#    return time

# Convert strings into datetime objects
# for i in ['time_start_trip_1', 'time_start_trip_2', 'time_start_trip_3']:, need to handle NaT objects first
#for i in ['time_start_trip_1']:
#    data[i] = pd.to_datetime(data[i], format='%I:%M:%S %p')
#    data[i] = data[i].apply(adjust_time_in_range)


#data


## Sorting out routes

In [9]:
trip1 = data[['ISB_Service_trip_1',
       'bus_stop_board_trip_1', 'bus_stop_alight_trip_1',
       'time_start_trip_1', 'travel_duration_trip_1',
       'frequency_trip_1', 'punctuality_trip_1', 'cleanliness_trip_1',
       'safety_trip_1', 'coverage_trip_1', 'crowdedness_trip_1']]

trip2 = data[['ISB_Service_trip_2', 'bus_stop_board_trip_2', 'bus_stop_alight_trip_2',
       'time_start_trip_2', 'travel_duration_trip_2',
       'frequency_trip_2', 'punctuality_trip_2', 'cleanliness_trip_2',
       'safety_trip_2', 'coverage_trip_2', 'crowdedness_trip_2']]

trip3 = data[['ISB_Service_trip_3', 'bus_stop_board_trip_3', 'bus_stop_alight_trip_3',
       'time_start_trip_3', 'travel_duration_trip_3',
       'frequency_trip_3', 'punctuality_trip_3', 'cleanliness_trip_3',
       'safety_trip_3', 'coverage_trip_3', 'crowdedness_trip_3']]

trip1.rename(columns={'ISB_Service_trip_1': 'ISB_Service', 
                     'bus_stop_board_trip_1': 'bus_stop_board',
                     'bus_stop_alight_trip_1' : 'bus_stop_alight',
                     'time_start_trip_1': 'time_start',
                     'travel_duration_trip_1': 'travel_duration',
                     'frequency_trip_1': 'frequency',
                     'punctuality_trip_1': 'punctuality',
                     'cleanliness_trip_1': 'cleanliness',
                     'safety_trip_1': 'safety',
                     'coverage_trip_1': 'coverage',
                     'crowdedness_trip_1': 'crowdedness'
                     }, inplace=True)
trip2.rename(columns={'ISB_Service_trip_2': 'ISB_Service', 
                     'bus_stop_board_trip_2': 'bus_stop_board',
                     'bus_stop_alight_trip_2' : 'bus_stop_alight',
                     'time_start_trip_2': 'time_start',
                     'travel_duration_trip_2': 'travel_duration',
                     'frequency_trip_2': 'frequency',
                     'punctuality_trip_2': 'punctuality',
                     'cleanliness_trip_2': 'cleanliness',
                     'safety_trip_2': 'safety',
                     'coverage_trip_2': 'coverage',
                     'crowdedness_trip_2': 'crowdedness'
                     }, inplace=True)
trip3.rename(columns={'ISB_Service_trip_3': 'ISB_Service', 
                     'bus_stop_board_trip_3': 'bus_stop_board',
                     'bus_stop_alight_trip_3' : 'bus_stop_alight',
                     'time_start_trip_3': 'time_start',
                     'travel_duration_trip_3': 'travel_duration',
                     'frequency_trip_3': 'frequency',
                     'punctuality_trip_3': 'punctuality',
                     'cleanliness_trip_3': 'cleanliness',
                     'safety_trip_3': 'safety',
                     'coverage_trip_3': 'coverage',
                     'crowdedness_trip_3': 'crowdedness'
                     }, inplace=True)

routedata = pd.concat([trip1, trip2, trip3], axis=0)

routedata = routedata.dropna()

routedata


Unnamed: 0,ISB_Service,bus_stop_board,bus_stop_alight,time_start,travel_duration,frequency,punctuality,cleanliness,safety,coverage,crowdedness
0,A2,KR MRT,Ventus,9:30:00 AM,15 - 20 minutes,Neutral,Neutral,Very Satisfied,Satisfied,Satisfied,3.0
1,D2,KR MRT,UTown,9:30:00 AM,10 - 15 minutes,Satisfied,Satisfied,Satisfied,Satisfied,Satisfied,4.0
2,A2,KR MRT,S17,5:50:00 AM,< 5 minutes,Very Satisfied,Satisfied,Satisfied,Satisfied,Very Satisfied,4.0
3,A1,PGP Terminal,S17,11:30:00 AM,10 - 15 minutes,Very Satisfied,Very Satisfied,Very Satisfied,Neutral,Very Satisfied,4.0
4,A1,KR MRT,LT27,7:50:00 AM,< 5 minutes,Neutral,Satisfied,Satisfied,Satisfied,Satisfied,5.0
...,...,...,...,...,...,...,...,...,...,...,...
62,D1,UTown,CLB,11:50:00 AM,5 - 10 minutes,Dissatisfied,Satisfied,Satisfied,Satisfied,Neutral,5.0
64,D1,KR MRT,LT27,5:00:00 PM,5 - 10 minutes,Neutral,Neutral,Neutral,Neutral,Neutral,4.0
67,A1,S17,Opp KR MRT,12:45:00 PM,< 5 minutes,Neutral,Satisfied,Very Satisfied,Very Satisfied,Very Satisfied,3.0
71,A2,LT27,UTown,7:00:00 PM,5 - 10 minutes,Very Satisfied,Very Satisfied,Very Satisfied,Very Satisfied,Very Satisfied,4.0


## Cleaning routedata

In [10]:
A1_bus = ['KR Bus Terminal', 'LT13', 'AS5', 'BIZ2', 'Opp TCOMS', 'PGP Terminal', 'KR MRT', 'LT27', 'University Hall', 'Opp UHC', 'YIH', 'CLB', 'KR Bus Terminal']
A2_bus = ['KR Bus Terminal', 'IT', 'Opp YIH', 'Museum', 'UHC', 'Opp University Hall', 'S17', 'Opp KR MRT', 'PGP Foyer', 'TCOMS', 'Opp HSSML', 'Opp NUSS', 'Ventus', 'KR Bus Terminal']
D1_bus = ['COM3', 'Opp HSSML', 'Opp NUSS', 'Ventus', 'IT', 'Opp YIH', 'Museum', 'UTown', 'YIH', 'CLB', 'LT13', 'AS5', 'BIZ2', 'COM3']
D2_bus = ['COM3', 'Opp TCOMS', 'PGP Terminal', 'KR MRT', 'LT27', 'University Hall', 'Opp UHC', 'Museum', 'UTown', 'UHC', 'Opp University Hall', 'S17', 'Opp KR MRT', 'PGP Foyer', 'TCOMS', 'COM3']
BTC_bus = ['Oei Tiong Ham Building (BTC)', 'Botanic Gardens MRT (BTC)', 'KR MRT', 'LT27', 'University Hall', 'Opp UHC', 'UTown', 'Raffles Hall', 'Kent Vale', 'Museum', 'YIH', 'CLB', 'LT13', 'AS5', 'BIZ2', 'PGP Terminal', 'College Green (BTC)', 'Oei Tiong Ham Building (BTC)']
E_bus = ['UTown', 'Raffles Hall', 'Kent Vale', 'EA', 'SDE3', 'IT', 'Opp YIH', 'UTown']
K_bus = ['PGP Terminal', 'KR MRT', 'LT27', 'University Hall', 'Opp UHC', 'YIH', 'CLB', 'Opp SDE3', 'The Japanese Primary School', 'Kent Vale', 'Museum', 'University Health Centre', 'Opp University Hall', 'S17', 'Opp KR MRT', 'PGP Foyer']
L_bus = ['Oei Tiong Ham Building (BTC)', 'Botanic Gardents MRT (BTC)', 'College Green (BTC)', 'Oei Tiong Ham Building (BTC)']

bus_routes = {'A1':A1_bus, 'A2':A2_bus, 'D1':D1_bus, 'D2':D2_bus, 'BTC (Bukit Timah Campus)':BTC_bus, 'E':E_bus, 'K':K_bus, 'L':L_bus}

def is_valid(column_names, data):
    def check_route(row):
        bus = row[column_names[0]]
        start = row[column_names[1]]
        end = row[column_names[2]]

        route = bus_routes[bus]

        if start in route and end in route:
            start_index = route.index(start)
            end_index = route.index(end)
            if end_index == 0: # for routes that loop back
                end_index = len(route)
            
            return start_index < end_index
        
        return False
    
    return data.apply(check_route, axis=1)

route_data_valid = is_valid(column_names=['ISB_Service', 'bus_stop_board', 'bus_stop_alight'], data=routedata)
routedata['is_valid'] = route_data_valid
valid_routedata = routedata[routedata['is_valid']]
valid_routedata.drop(columns=['is_valid'], inplace=True)

## Creating metadata

### for routes only

In [7]:
from sdv.metadata import Metadata

metadata = Metadata.detect_from_dataframe(
    data = valid_routedata,
    table_name='transport')

metadata.update_column(
    column_name='time_start',
    sdtype='datetime',
    datetime_format= '%I:%M:%S %p' )

metadata.validate()

metadata


{
    "tables": {
        "transport": {
            "columns": {
                "ISB_Service": {
                    "sdtype": "categorical"
                },
                "bus_stop_board": {
                    "sdtype": "categorical"
                },
                "bus_stop_alight": {
                    "sdtype": "categorical"
                },
                "time_start": {
                    "sdtype": "datetime",
                    "datetime_format": "%I:%M:%S %p"
                },
                "travel_duration": {
                    "sdtype": "categorical"
                },
                "frequency": {
                    "sdtype": "categorical"
                },
                "punctuality": {
                    "sdtype": "categorical"
                },
                "cleanliness": {
                    "sdtype": "categorical"
                },
                "safety": {
                    "sdtype": "categorical"
                },
                

## Utilising GaussianCopulaSynthesizer

In [11]:
from sdv.single_table import GaussianCopulaSynthesizer

synthesizer = GaussianCopulaSynthesizer(metadata)
synthesizer.load_custom_constraint_classes(filepath='custom_constraints.py', class_names=['BusStopsCheck', 'TimeCheck'])

route_constraint = {
    'constraint_class': 'BusStopsCheck',
    'constraint_parameters': {
        'column_names':['ISB_Service', 'bus_stop_board', 'bus_stop_alight']
    }
}
time_constraint = {
    'constraint_class': 'TimeCheck',
    'constraint_parameters': {
        'column_names':['time_start']
    }
} # time constraint not working for now, trips that start at 1am are still being generated

synthesizer.add_constraints(constraints=[route_constraint, time_constraint])

synthesizer.fit(valid_routedata)

synthetic_data = synthesizer.sample(num_rows=50)

synthetic_data

Sampling rows: 100%|██████████| 50/50 [00:00<00:00, 646.24it/s]


Unnamed: 0,ISB_Service,bus_stop_board,bus_stop_alight,time_start,travel_duration,frequency,punctuality,cleanliness,safety,coverage,crowdedness
0,A1,KR Bus Terminal,CLB,06:15:15 PM,5 - 10 minutes,Dissatisfied,Dissatisfied,Neutral,Satisfied,Satisfied,2.0
1,D2,UTown,COM3,12:14:25 PM,10 - 15 minutes,Satisfied,Neutral,Satisfied,Satisfied,Satisfied,2.0
2,D2,KR MRT,Opp KR MRT,01:40:07 PM,10 - 15 minutes,Neutral,Satisfied,Satisfied,Satisfied,Dissatisfied,5.0
3,A2,Opp KR MRT,Ventus,02:59:39 PM,5 - 10 minutes,Neutral,Satisfied,Satisfied,Very Satisfied,Neutral,3.0
4,D2,Opp KR MRT,COM3,04:15:51 PM,10 - 15 minutes,Satisfied,Satisfied,Satisfied,Satisfied,Satisfied,1.0
5,D1,CLB,COM3,01:55:27 AM,5 - 10 minutes,Very dissatisfied,Satisfied,Satisfied,Very Satisfied,Very dissatisfied,1.0
6,K,University Hall,CLB,08:53:11 PM,10 - 15 minutes,Satisfied,Satisfied,Satisfied,Satisfied,Satisfied,3.0
7,D2,KR MRT,Opp KR MRT,01:13:21 PM,10 - 15 minutes,Neutral,Dissatisfied,Very Satisfied,Satisfied,Satisfied,5.0
8,D2,KR MRT,Opp KR MRT,12:59:57 PM,10 - 15 minutes,Satisfied,Satisfied,Neutral,Neutral,Satisfied,5.0
9,D2,KR MRT,UTown,09:42:16 AM,10 - 15 minutes,Neutral,Satisfied,Satisfied,Neutral,Neutral,4.0


### For everything

In [24]:
from sdv.metadata import Metadata
from sdv.single_table import GaussianCopulaSynthesizer

allmetadata = Metadata.detect_from_dataframe(
    data = data,
    table_name='alltransport')


allmetadata.update_column(
    column_name='time_start_trip_1',
    sdtype='datetime',
    datetime_format= '%I:%M:%S %p' )


allmetadata.update_column(
    column_name='time_start_trip_2',
    sdtype='datetime',
    datetime_format= '%I:%M:%S %p' )


allmetadata.update_column(
    column_name='time_start_trip_2',
    sdtype='datetime',
    datetime_format= '%I:%M:%S %p' )



allmetadata.validate()


allmetadata

synthesizer1 = GaussianCopulaSynthesizer(allmetadata)
synthesizer1.fit(data)

synthetic_data = synthesizer1.sample(num_rows=50)

synthetic_data



Unnamed: 0,timestamp,role,frequency_of_travel,primary_purpose,travel_days,travel_hours,ISB_Service_trip_1,bus_stop_board_trip_1,bus_stop_alight_trip_1,day_of_the_week_trip_1,time_start_trip_1,travel_duration_trip_1,frequency_trip_1,punctuality_trip_1,cleanliness_trip_1,safety_trip_1,coverage_trip_1,crowdedness_trip_1,ISB_Service_trip_2,bus_stop_board_trip_2,bus_stop_alight_trip_2,day_of_the_week_trip_2,time_start_trip_2,travel_duration_trip_2,frequency_trip_2,punctuality_trip_2,cleanliness_trip_2,safety_trip_2,coverage_trip_2,crowdedness_trip_2,ISB_Service_trip_3,bus_stop_board_trip_3,bus_stop_alight_trip_3,day_of_the_week_trip_3,time_start_trip_3,travel_duration_trip_3,frequency_trip_3,punctuality_trip_3,cleanliness_trip_3,safety_trip_3,coverage_trip_3,crowdedness_trip_3,usage_influence_convenience,usage_influence_cost,usage_influence_lack_of_options,usage_influence_availability_of_parking,usage_influence_environmental,prioritize_frequency,prioritize_punctuality,prioritize_cleanliness,prioritize_safety,prioritize_bus_route_coverage,prioritize_crowdedness,top_3_frustrations,not_able_to_get_on,additional_features_frequency,additional_features_seats,additional_features_cleanliness,additional_features_comfortable,additional_features_route_coverage,additional_features_updates,issues_with_quality_of_info,special_events,seasonal_changes,seasonal_changes_specific,further_comments
0,10/04/2024 04:49:04,Undergraduate student,3 - 4 days a week,Commute to classes,sdv-pii-i6eeb,sdv-pii-mhmvk,A1,KR MRT,Ventus,,09:11:45 AM,10 - 15 minutes,Satisfied,Satisfied,Neutral,Dissatisfied,Satisfied,2,A2,sdv-pii-5n2x5,Opp KR MRT,,,15 - 20 minutes,Satisfied,Satisfied,,,Very Satisfied,1.0,D1,Botanic Gardens MRT (BTC),,,,10 - 15 minutes,Neutral,Neutral,Neutral,,,,2nd,3rd,1st,5th,4th,3rd,4th,4th,6th,5th,2nd,sdv-pii-yp0mw,Rarely,1st,3rd,4th,5th,5th,2nd,No,4,"Yes, service improves/worsens (please specify ...",,
1,10/02/2024 00:41:55,Staff,3 - 4 days a week,Commute to classes,sdv-pii-bwr4t,sdv-pii-eslhy,D1,KR MRT,LT27,,10:50:51 AM,10 - 15 minutes,Neutral,Satisfied,Satisfied,Neutral,Satisfied,4,D2,sdv-pii-pve05,COM3,,01:57:44 PM,15 - 20 minutes,Neutral,Satisfied,Very Satisfied,Neutral,Satisfied,4.0,D2,KR MRT,Opp KR MRT,Wednesday,,10 - 15 minutes,Dissatisfied,Satisfied,,Satisfied,,4.0,2nd,5th,1st,5th,4th,3rd,4th,6th,4th,5th,1st,sdv-pii-qvtae,Frequently,1st,4th,6th,5th,2nd,4th,Yes,4,"No, service is consistent",sdv-pii-xojqa,
2,10/01/2024 22:25:53,Undergraduate student,3 - 4 days a week,Commute to classes,sdv-pii-4nbco,sdv-pii-rdu87,D2,KR MRT,UTown,,07:43:41 AM,15 - 20 minutes,Neutral,Neutral,Very Satisfied,Neutral,Satisfied,5,A2,sdv-pii-sz7il,PGP Foyer,,11:37:05 AM,10 - 15 minutes,Dissatisfied,Satisfied,Very Satisfied,Neutral,Satisfied,,D1,,,,sdv-pii-ag5m7,,Dissatisfied,Dissatisfied,Very Satisfied,Neutral,Satisfied,,2nd,1st,1st,5th,5th,3rd,3rd,6th,5th,4th,1st,sdv-pii-85bfx,Frequently,2nd,4th,6th,5th,2nd,2nd,No,4,"Yes, service improves/worsens (please specify ...",sdv-pii-tonzs,sdv-pii-oclba
3,10/17/2024 03:43:29,Undergraduate student,1 - 2 days a week,Commute to classes,sdv-pii-rj915,sdv-pii-ket94,A2,UTown,CLB,"Tuesday, Thursday",10:23:07 AM,15 - 20 minutes,Dissatisfied,Neutral,Very Satisfied,Dissatisfied,Dissatisfied,5,,sdv-pii-3ijdw,CLB,,10:56:48 AM,20 - 30 minutes,Very dissatisfied,,Satisfied,Very dissatisfied,Satisfied,4.0,,KR MRT,S17,,,< 5 minutes,,,,,,,4th,3rd,3rd,5th,5th,1st,4th,6th,5th,1st,4th,sdv-pii-ur41g,Occasionally,1st,4th,6th,6th,3rd,3rd,No,3,"No, service is consistent",sdv-pii-cvr5x,
4,10/06/2024 13:33:06,Undergraduate student,1 - 2 days a week,Commute to classes,sdv-pii-ieb2f,sdv-pii-k5fu5,A1,KR MRT,LT27,,03:47:59 PM,10 - 15 minutes,Dissatisfied,Satisfied,Satisfied,Neutral,Satisfied,4,,sdv-pii-9xn05,CLB,,03:07:52 PM,15 - 20 minutes,Very dissatisfied,Satisfied,Satisfied,Satisfied,Satisfied,5.0,E,UTown,,,,,,,,,Very Satisfied,,2nd,3rd,2nd,5th,4th,1st,3rd,5th,6th,6th,2nd,sdv-pii-ye4ud,Occasionally,1st,3rd,5th,5th,3rd,3rd,Yes,2,"No, service is consistent",sdv-pii-yx39f,
5,10/02/2024 03:13:10,Undergraduate student,3 - 4 days a week,Commute to classes,sdv-pii-9ixy3,sdv-pii-d9d1g,D2,UTown,COM3,,11:01:35 AM,5 - 10 minutes,Dissatisfied,Neutral,Neutral,Very Satisfied,Dissatisfied,5,A2,sdv-pii-94gvb,LT27,,10:40:07 AM,,Dissatisfied,Satisfied,Neutral,Dissatisfied,Neutral,5.0,D2,,Opp KR MRT,,sdv-pii-c6s2v,,Dissatisfied,,,,Satisfied,5.0,2nd,3rd,3rd,5th,4th,2nd,3rd,4th,5th,4th,4th,sdv-pii-negh3,Occasionally,2nd,1st,6th,4th,3rd,5th,Yes,3,"No, service is consistent",,
6,10/16/2024 06:19:25,Undergraduate student,Less than once a week,Commute to classes,sdv-pii-jibht,sdv-pii-hklmf,A1,KR Bus Terminal,CLB,,08:47:43 AM,10 - 15 minutes,Dissatisfied,Satisfied,Satisfied,Neutral,Satisfied,5,D1,sdv-pii-7lxl8,CLB,,12:38:04 PM,10 - 15 minutes,Very dissatisfied,,Very Satisfied,Neutral,Satisfied,4.0,A2,,,Wednesday,sdv-pii-vdqrw,5 - 10 minutes,Satisfied,Neutral,,Neutral,,,3rd,3rd,1st,1st,4th,1st,3rd,6th,6th,6th,2nd,sdv-pii-n0584,Frequently,1st,1st,6th,6th,2nd,3rd,No,2,"No, service is consistent",sdv-pii-2bldk,sdv-pii-znyjd
7,10/02/2024 01:21:02,Undergraduate student,3 - 4 days a week,Commute to classes,sdv-pii-atbsb,sdv-pii-gmkas,A1,PGP Terminal,UTown,,11:25:13 AM,10 - 15 minutes,Satisfied,Satisfied,Neutral,Dissatisfied,Satisfied,4,D2,,CLB,"Thursday, Friday",03:08:01 PM,,Dissatisfied,Satisfied,Satisfied,Satisfied,Satisfied,4.0,D2,S17,Opp KR MRT,,,20 - 30 minutes,Very dissatisfied,Very Satisfied,Dissatisfied,,Neutral,3.0,2nd,5th,1st,5th,5th,1st,2nd,6th,6th,1st,3rd,sdv-pii-qgsky,Occasionally,3rd,4th,6th,5th,2nd,5th,Yes,4,"No, service is consistent",,sdv-pii-r48lc
8,10/11/2024 23:09:48,Undergraduate student,3 - 4 days a week,Commute to classes,sdv-pii-sy0rs,sdv-pii-ighwq,A1,Opp KR MRT,LT27,Monday,12:25:21 PM,10 - 15 minutes,Dissatisfied,Satisfied,Very dissatisfied,Neutral,Satisfied,5,,sdv-pii-1keb7,Opp KR MRT,,,,Dissatisfied,,,,Very dissatisfied,,D2,LT27,,Wednesday,sdv-pii-blzfl,15 - 20 minutes,Dissatisfied,,Neutral,,Satisfied,,1st,2nd,2nd,5th,5th,1st,1st,6th,3rd,1st,2nd,sdv-pii-473e6,Frequently,1st,4th,5th,1st,3rd,5th,No,1,"Yes, service improves/worsens (please specify ...",,
9,10/08/2024 04:34:12,Undergraduate student,1 - 2 days a week,Commute to classes,sdv-pii-ak0wb,sdv-pii-z2wn0,A1,Opp KR MRT,CLB,,11:34:12 AM,10 - 15 minutes,Very Satisfied,Very Satisfied,Satisfied,Dissatisfied,Neutral,5,A1,sdv-pii-mxf0n,KR MRT,,01:01:42 PM,15 - 20 minutes,Satisfied,,Satisfied,Neutral,Neutral,,,,UTown,,sdv-pii-250qb,,,Neutral,Satisfied,Neutral,Very dissatisfied,,2nd,2nd,4th,5th,5th,3rd,2nd,6th,5th,1st,2nd,sdv-pii-cddk4,Frequently,1st,4th,6th,5th,2nd,2nd,No,3,"Yes, service improves/worsens (please specify ...",,sdv-pii-0k85p
