## Importing packages and data

In [16]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb ##download xgboost using "pip3 install xgboost"
from datetime import datetime

#data = pd.read_csv('form_responses.csv')

# pip install sdv to download sdv package

## Loading data

In [17]:
data = pd.read_csv('cleaned_routes.csv')
data

Unnamed: 0,ISB_Service,bus_stop_board,bus_stop_alight,day_of_the_week,time_start,travel_duration,frequency,punctuality,cleanliness,safety,coverage,crowdedness
0,A2,PGP Foyer,Ventus,Monday,09:30:00,15 - 20 minutes,Neutral,Neutral,Very Satisfied,Satisfied,Satisfied,3.0
1,D2,KR MRT,UTown,Monday,09:30:00,10 - 15 minutes,Satisfied,Satisfied,Satisfied,Satisfied,Satisfied,4.0
2,A2,KR Bus Terminal,S17,Thursday,17:50:00,< 5 minutes,Very Satisfied,Satisfied,Satisfied,Satisfied,Very Satisfied,4.0
3,A1,PGP Terminal,University Hall,Friday,11:30:00,10 - 15 minutes,Very Satisfied,Very Satisfied,Very Satisfied,Neutral,Very Satisfied,4.0
4,A1,KR MRT,LT27,Tuesday,07:50:00,< 5 minutes,Neutral,Satisfied,Satisfied,Satisfied,Satisfied,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...
285,A1,BIZ2,LT27,Monday,11:40:00,10 - 15 minutes,Neutral,Satisfied,Very Satisfied,Very Satisfied,Very Satisfied,5.0
286,D2,KR MRT,TCOMS,Wednesday,08:45:45,10 - 15 minutes,Dissatisfied,Satisfied,Very Satisfied,Neutral,Dissatisfied,4.0
287,D2,S17,COM3,Thursday,07:45:45,10 - 15 minutes,Neutral,Very Satisfied,Satisfied,Very Satisfied,Satisfied,4.0
288,A2,IT,Opp HSSML,Monday,10:45:45,< 5 minutes,Dissatisfied,Satisfied,Satisfied,Neutral,Satisfied,4.0


## Creating metadata

### for routes only

In [18]:
from sdv.metadata import Metadata

metadata = Metadata.detect_from_dataframe(
    data = data,
    table_name='transport')

metadata.update_column(
    column_name='time_start',
    sdtype='datetime',
    datetime_format= '%H:%M:%S')

metadata.validate()

metadata

{
    "tables": {
        "transport": {
            "columns": {
                "ISB_Service": {
                    "sdtype": "categorical"
                },
                "bus_stop_board": {
                    "sdtype": "categorical"
                },
                "bus_stop_alight": {
                    "sdtype": "categorical"
                },
                "day_of_the_week": {
                    "sdtype": "categorical"
                },
                "time_start": {
                    "sdtype": "datetime",
                    "datetime_format": "%H:%M:%S"
                },
                "travel_duration": {
                    "sdtype": "categorical"
                },
                "frequency": {
                    "sdtype": "categorical"
                },
                "punctuality": {
                    "sdtype": "categorical"
                },
                "cleanliness": {
                    "sdtype": "categorical"
                },
          

## Utilising GaussianCopulaSynthesizer

In [19]:
from sdv.single_table import GaussianCopulaSynthesizer

synthesizer = GaussianCopulaSynthesizer(metadata)
synthesizer.load_custom_constraint_classes(filepath='custom_constraints.py', class_names=['BusStopsCheck', 'TimeCheck'])

route_constraint = {
    'constraint_class': 'BusStopsCheck',
    'constraint_parameters': {
        'column_names':['ISB_Service', 'bus_stop_board', 'bus_stop_alight']
    }
}
time_constraint = {
    'constraint_class': 'TimeCheck',
    'constraint_parameters': {
        'column_names':['time_start']
    }
}

synthesizer.add_constraints(constraints=[route_constraint, time_constraint])

synthesizer.fit(data)

synthetic_data = synthesizer.sample(num_rows=10000)

synthetic_data

Sampling rows: 100%|██████████| 10000/10000 [00:49<00:00, 201.19it/s]


Unnamed: 0,ISB_Service,bus_stop_board,bus_stop_alight,day_of_the_week,time_start,travel_duration,frequency,punctuality,cleanliness,safety,coverage,crowdedness
0,D2,KR MRT,UTown,Monday,08:17:43,5 - 10 minutes,Satisfied,Satisfied,Satisfied,Neutral,Satisfied,4.0
1,D2,KR MRT,LT27,Thursday,17:15:53,< 5 minutes,Neutral,Satisfied,Satisfied,Satisfied,Satisfied,5.0
2,D2,Opp KR MRT,PGP Foyer,Thursday,08:09:21,10 - 15 minutes,Neutral,Satisfied,Satisfied,Neutral,Very Satisfied,5.0
3,D1,YIH,COM3,Thursday,08:09:02,5 - 10 minutes,Dissatisfied,Satisfied,Satisfied,Very dissatisfied,Very dissatisfied,5.0
4,A1,KR MRT,CLB,Wednesday,10:28:37,5 - 10 minutes,Dissatisfied,Satisfied,Satisfied,Dissatisfied,Satisfied,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...
9995,D1,Museum,CLB,Monday,15:12:48,5 - 10 minutes,Dissatisfied,Dissatisfied,Satisfied,Very dissatisfied,Neutral,2.0
9996,A1,BIZ2,CLB,Tuesday,20:45:45,20 - 30 minutes,Neutral,Very dissatisfied,Very Satisfied,Neutral,Satisfied,2.0
9997,D2,KR MRT,UTown,Wednesday,20:45:45,5 - 10 minutes,Neutral,Very Satisfied,Very Satisfied,Neutral,Satisfied,4.0
9998,A1,KR MRT,KR Bus Terminal,Thursday,09:28:01,5 - 10 minutes,Dissatisfied,Satisfied,Satisfied,Dissatisfied,Satisfied,5.0


In [20]:
synthesizer.get_learned_distributions()

{'ISB_Service': {'distribution': 'beta',
  'learned_parameters': {'loc': 0.0017874543315022971,
   'scale': 0.9974191713480585,
   'a': 0.9017274626243743,
   'b': 0.9755642462209718}},
 'bus_stop_board': {'distribution': 'beta',
  'learned_parameters': {'loc': 0.005620606284709666,
   'scale': 0.9916506796214529,
   'a': 1.021624314365857,
   'b': 0.9819670145534691}},
 'bus_stop_alight': {'distribution': 'beta',
  'learned_parameters': {'loc': 0.001462034630041732,
   'scale': 0.9980722941023545,
   'a': 0.9536277283228882,
   'b': 0.961806198352781}},
 'day_of_the_week': {'distribution': 'beta',
  'learned_parameters': {'loc': 0.004986229126805837,
   'scale': 0.9922618618090253,
   'a': 0.9584505612531116,
   'b': 0.9812253433299487}},
 'time_start': {'distribution': 'beta',
  'learned_parameters': {'loc': -2.2089609000000003e+18,
   'scale': 48015535358267.91,
   'a': 0.8244999308269605,
   'b': 1.2826962631378755}},
 'travel_duration': {'distribution': 'beta',
  'learned_paramete

## Utilising CTGAN

In [21]:
from sdv.single_table import CTGANSynthesizer

ctgan = CTGANSynthesizer(metadata)

ctgan.load_custom_constraint_classes(filepath='custom_constraints.py', class_names=['BusStopsCheck', 'TimeCheck'])

route_constraint = {
    'constraint_class': 'BusStopsCheck',
    'constraint_parameters': {
        'column_names':['ISB_Service', 'bus_stop_board', 'bus_stop_alight']
    }
}
time_constraint = {
    'constraint_class': 'TimeCheck',
    'constraint_parameters': {
        'column_names':['time_start']
    }
}

ctgan.add_constraints(constraints=[route_constraint, time_constraint])

ctgan.fit(data)

ctgan_synthetic_data = ctgan.sample(num_rows=10000)

ctgan_synthetic_data

Sampling rows: 100%|██████████| 10000/10000 [01:00<00:00, 165.23it/s]


Unnamed: 0,ISB_Service,bus_stop_board,bus_stop_alight,day_of_the_week,time_start,travel_duration,frequency,punctuality,cleanliness,safety,coverage,crowdedness
0,A1,LT13,KR MRT,Thursday,14:22:22,< 5 minutes,Very dissatisfied,Neutral,Neutral,Neutral,Satisfied,3.0
1,A1,KR MRT,CLB,Tuesday,07:45:00,15 - 20 minutes,Neutral,Satisfied,Neutral,Neutral,Very Satisfied,5.0
2,D2,KR MRT,PGP Foyer,Monday,12:05:32,5 - 10 minutes,Very dissatisfied,Satisfied,Dissatisfied,Very Satisfied,Very dissatisfied,5.0
3,D2,KR MRT,LT27,Wednesday,12:23:27,10 - 15 minutes,Neutral,Satisfied,Neutral,Satisfied,Very dissatisfied,5.0
4,D1,COM3,UTown,Wednesday,07:45:00,10 - 15 minutes,Neutral,Neutral,Very dissatisfied,Satisfied,Satisfied,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...
9995,D2,KR MRT,LT27,Friday,09:46:29,10 - 15 minutes,Dissatisfied,Satisfied,Satisfied,Dissatisfied,Satisfied,3.0
9996,K,KR MRT,CLB,Monday,11:53:21,10 - 15 minutes,Neutral,Satisfied,Very Satisfied,Dissatisfied,Neutral,5.0
9997,D2,KR MRT,LT27,Wednesday,12:18:56,10 - 15 minutes,Neutral,Satisfied,Satisfied,Very dissatisfied,Neutral,5.0
9998,D2,KR MRT,UTown,Friday,09:16:55,10 - 15 minutes,Very dissatisfied,Satisfied,Satisfied,Very Satisfied,Satisfied,5.0


In [23]:
fig = ctgan.get_loss_values_plot()
fig.show()

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

## Validate Synthetic Data

In [24]:
A1_bus = ['KR Bus Terminal', 'LT13', 'AS5', 'BIZ2', 'Opp TCOMS', 'PGP Terminal', 'KR MRT', 'LT27', 'University Hall', 'Opp UHC', 'YIH', 'CLB', 'KR Bus Terminal']
A2_bus = ['KR Bus Terminal', 'IT', 'Opp YIH', 'Museum', 'UHC', 'Opp University Hall', 'S17', 'Opp KR MRT', 'PGP Foyer', 'TCOMS', 'Opp HSSML', 'Opp NUSS', 'Ventus', 'KR Bus Terminal']
D1_bus = ['COM3', 'Opp HSSML', 'Opp NUSS', 'Ventus', 'IT', 'Opp YIH', 'Museum', 'UTown', 'YIH', 'CLB', 'LT13', 'AS5', 'BIZ2', 'COM3']
D2_bus = ['COM3', 'Opp TCOMS', 'PGP Terminal', 'KR MRT', 'LT27', 'University Hall', 'Opp UHC', 'Museum', 'UTown', 'UHC', 'Opp University Hall', 'S17', 'Opp KR MRT', 'PGP Foyer', 'TCOMS', 'COM3']
BTC_bus = ['Oei Tiong Ham Building (BTC)', 'Botanic Gardens MRT (BTC)', 'KR MRT', 'LT27', 'University Hall', 'Opp UHC', 'UTown', 'Raffles Hall', 'Kent Vale', 'Museum', 'YIH', 'CLB', 'LT13', 'AS5', 'BIZ2', 'PGP Terminal', 'College Green (BTC)', 'Oei Tiong Ham Building (BTC)']
E_bus = ['UTown', 'Raffles Hall', 'Kent Vale', 'EA', 'SDE3', 'IT', 'Opp YIH', 'UTown']
K_bus = ['PGP Terminal', 'KR MRT', 'LT27', 'University Hall', 'Opp UHC', 'YIH', 'CLB', 'Opp SDE3', 'The Japanese Primary School', 'Kent Vale', 'Museum', 'UHC', 'Opp University Hall', 'S17', 'Opp KR MRT', 'PGP Foyer']
L_bus = ['Oei Tiong Ham Building (BTC)', 'Botanic Gardents MRT (BTC)', 'College Green (BTC)', 'Oei Tiong Ham Building (BTC)']

bus_routes = {'A1':A1_bus, 'A2':A2_bus, 'D1':D1_bus, 'D2':D2_bus, 'BTC (Bukit Timah Campus)':BTC_bus, 'E':E_bus, 'K':K_bus, 'L':L_bus}

def validate_bus_stops(data, bus_routes):
    service_col = 'ISB_Service'
    board_col = 'bus_stop_board'
    alight_col = 'bus_stop_alight'

    def check_stops(row):
        bus = row[service_col]
        start = row[board_col]
        end = row[alight_col]

        # Get the route for the bus
        route = bus_routes.get(bus, [])

        if start in route and end in route:
            start_index = route.index(start)
            end_index = route.index(end) if end != route[0] else len(route)

            if start_index < end_index:
                return bus, start, end  # No change if valid

        # If only start is valid, randomly select an end stop from the route after start
        if start in route:
            start_index = route.index(start)
            if start_index < len(route) - 1:  # Ensure there are stops after start
                possible_ends = route[start_index + 1:]
                new_end = np.random.choice(possible_ends)
                return bus, start, new_end

        # If only end is valid, select a new start stop before end
        if end in route:
            end_index = route.index(end)
            if end_index == 0:
                end_index = len(route) - 1  # If end is the first stop, assume it is the last stop (loop)
            if end_index > 0:
                possible_starts = route[:end_index]
                new_start = np.random.choice(possible_starts)
                return bus, new_start, end

        # Check for valid buses that have both start and end in the route
        valid_buses = [
            key for key, route in bus_routes.items()
            if start in route and end in route and
            (route.index(start) < route.index(end) or
                (route.index(end) == 0 and route.index(start) < len(route) - 1))
        ]
        if valid_buses:
            new_bus = np.random.choice(valid_buses)  # Randomly choose one valid bus
            return new_bus, start, end

        # else, select two random stops in the correct order from the current bus route
        random_start_index = np.random.randint(0, len(route) - 1)
        random_end_index = np.random.randint(random_start_index + 1, len(route))
        new_start = route[random_start_index]
        new_end = route[random_end_index]
        return bus, new_start, new_end

    data[[service_col, board_col, alight_col]] = data.apply(lambda row: check_stops(row), axis=1, result_type="expand")

    return data

def check_validate_bus_stops(data, bus_routes):
    service_col = 'ISB_Service'
    board_col = 'bus_stop_board'
    alight_col = 'bus_stop_alight'

    def check_stops(row):
        bus = row[service_col]
        start = row[board_col]
        end = row[alight_col]

        # Check if bus route exists in the dictionary
        if bus in bus_routes:
            route = bus_routes[bus]

            # Check both stops are in the route and start is before end
            if start in route and end in route:
                start_index = route.index(start)
                # Handle looping
                end_index = route.index(end) if end != route[0] else len(route)

                if start_index < end_index:
                    return row[board_col], row[alight_col]  # No change if valid
        return "error", "error"
    data[[board_col, alight_col]] = data.apply(lambda row: check_stops(row), axis=1, result_type="expand")

    return data

synthetic_data = validate_bus_stops(synthetic_data, bus_routes)

check_data = check_validate_bus_stops(synthetic_data, bus_routes)
error_rows = check_data[check_data.isin(['error']).any(axis=1)]
error_rows
len(error_rows)

0

In [25]:
ctgan_synthetic_data = validate_bus_stops(ctgan_synthetic_data, bus_routes)

check_data = check_validate_bus_stops(ctgan_synthetic_data, bus_routes)
error_rows = check_data[check_data.isin(['error']).any(axis=1)]
error_rows
len(error_rows)

0

### Write synthetic data to CSV

In [None]:
# synthetic_data.to_csv('synthetic_data.csv', index=False)

## Combining datasets

In [10]:
# Combining datasets
alldata = pd.concat([routedata, synthetic_data], axis=0)


# Creating a demand column
alldata['demand'] = 1


# Label encoding for ISB_service column
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
alldata['bus_number'] = label_encoder.fit_transform(alldata['ISB_Service'])
alldata = alldata.drop(labels="ISB_Service", axis=1)


alldata

Unnamed: 0,bus_stop_board,bus_stop_alight,day_of_the_week,time_start,travel_duration,frequency,punctuality,cleanliness,safety,coverage,crowdedness,demand,bus_number
0,KR MRT,Ventus,Monday,09:30:00,15 - 20 minutes,Neutral,Neutral,Very Satisfied,Satisfied,Satisfied,3.0,1,1
1,KR MRT,UTown,Monday,09:30:00,10 - 15 minutes,Satisfied,Satisfied,Satisfied,Satisfied,Satisfied,4.0,1,4
2,KR MRT,S17,Thursday,17:50:00,< 5 minutes,Very Satisfied,Satisfied,Satisfied,Satisfied,Very Satisfied,4.0,1,1
3,PGP Terminal,S17,Friday,11:30:00,10 - 15 minutes,Very Satisfied,Very Satisfied,Very Satisfied,Neutral,Very Satisfied,4.0,1,0
4,KR MRT,LT27,Tuesday,07:50:00,< 5 minutes,Neutral,Satisfied,Satisfied,Satisfied,Satisfied,5.0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,KR MRT,COM3,Friday,15:11:21,< 5 minutes,Neutral,Satisfied,Very Satisfied,Neutral,Satisfied,5.0,1,1
96,Opp KR MRT,UTown,"Tuesday, Friday",13:45:22,10 - 15 minutes,Dissatisfied,Neutral,Satisfied,Satisfied,Satisfied,5.0,1,4
97,Oei Tiong Ham Building (BTC),UTown,Wednesday,12:28:00,10 - 15 minutes,Very Satisfied,Satisfied,Satisfied,Satisfied,Satisfied,3.0,1,1
98,BIZ2,Opp KR MRT,"Tuesday, Thursday",15:49:24,5 - 10 minutes,Satisfied,Neutral,Satisfied,Satisfied,Satisfied,5.0,1,0


In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Example dataset (transportation data with multiple predictors)
# Assuming df is your DataFrame with demand data
X = alldata[['bus_number', 'bus_stop_board', 'bus_stop_alight', 'day_of_the_week', 'time_start',
              'travel_duration', 'frequency', 'punctuality', 'cleanliness', 'safety', 'coverage', 'crowdedness']]  # Independent variables
y = alldata['demand']  # Dependent variable (demand)??

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the linear regression model
model = LinearRegression()

# Fit the model on training data
model.fit(X_train, y_train)

# Predict demand on the test set
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)  # Mean Squared Error
r2 = r2_score(y_test, y_pred)  # R-squared score

#print(f"Mean Squared Error: {mse}")
#print(f"R-squared: {r2}")


ValueError: could not convert string to float: 'Opp KR MRT'