## Getting Started

### Import Libraries

In [1]:
import json
import os

import pandas as pd
import requests

### Import First Dataset (Volume Per Train Station)

In [2]:
# Paths to the monthly station-volume CSV files (August to October 2023)
augData, septData, octData = (
    './datasets/Aug_2023.csv',
    './datasets/Sept_2023.csv',
    './datasets/Oct_2023.csv',
)

# Read every month into its own DataFrame; all three files are loaded
# before anything is printed
dfAug, dfSept, dfOct = (
    pd.read_csv(csvPath) for csvPath in (augData, septData, octData)
)

# Show the three frames one after another, in month order
print(dfAug, dfSept, dfOct, sep='\n')

     YEAR_MONTH          DAY_TYPE  TIME_PER_HOUR PT_TYPE        PT_CODE  \
0       2023-08           WEEKDAY             22   TRAIN           NS28   
1       2023-08  WEEKENDS/HOLIDAY             22   TRAIN           NS28   
2       2023-08  WEEKENDS/HOLIDAY              0   TRAIN      DT10/TE11   
3       2023-08           WEEKDAY              0   TRAIN      DT10/TE11   
4       2023-08           WEEKDAY             10   TRAIN  EW16/NE3/TE17   
...         ...               ...            ...     ...            ...   
6815    2023-08  WEEKENDS/HOLIDAY              6   TRAIN           DT23   
6816    2023-08  WEEKENDS/HOLIDAY              7   TRAIN  NS27/CE2/TE20   
6817    2023-08           WEEKDAY              7   TRAIN  NS27/CE2/TE20   
6818    2023-08  WEEKENDS/HOLIDAY             12   TRAIN            SE5   
6819    2023-08           WEEKDAY             12   TRAIN            SE5   

      TOTAL_TAP_IN_VOLUME  TOTAL_TAP_OUT_VOLUME  
0                     752                   311  

### Combine and verify all files for first dataset

In [3]:
# Combine All DataFrames
# ignore_index=True discards the per-file row labels and renumbers from 0
dfCombinedFirst = pd.concat([dfAug, dfSept, dfOct], ignore_index=True)
print(dfCombinedFirst)

# To Verify Combination
totalRows = dfAug.shape[0] + dfSept.shape[0] + dfOct.shape[0]
print("Total Rows should be:", totalRows)
# Compare the expectation with what concat actually produced, instead of
# leaving the reader to check the printed frame by eye
print("Total Rows in combined DataFrame:", dfCombinedFirst.shape[0])
if dfCombinedFirst.shape[0] != totalRows:
    print("Row count of the combined DataFrame does not match the expected total")

if dfAug.shape[1] == dfSept.shape[1] == dfOct.shape[1]:
    print("Number of columns is the same for all three DataFrames:", dfAug.shape[1])
else:
    print("Column size is not the same for the DataFrames")

# Equal column counts do not guarantee equal column labels. concat aligns on
# labels, so a differently named column would show up as extra NaN-filled
# columns in the combined frame rather than as an error.
if not (set(dfAug.columns) == set(dfSept.columns) == set(dfOct.columns)):
    print("Column names are not the same for the DataFrames")


      YEAR_MONTH          DAY_TYPE  TIME_PER_HOUR PT_TYPE        PT_CODE  \
0        2023-08           WEEKDAY             22   TRAIN           NS28   
1        2023-08  WEEKENDS/HOLIDAY             22   TRAIN           NS28   
2        2023-08  WEEKENDS/HOLIDAY              0   TRAIN      DT10/TE11   
3        2023-08           WEEKDAY              0   TRAIN      DT10/TE11   
4        2023-08           WEEKDAY             10   TRAIN  EW16/NE3/TE17   
...          ...               ...            ...     ...            ...   
20456    2023-10  WEEKENDS/HOLIDAY              6   TRAIN           DT23   
20457    2023-10  WEEKENDS/HOLIDAY              7   TRAIN  NS27/CE2/TE20   
20458    2023-10           WEEKDAY              7   TRAIN  NS27/CE2/TE20   
20459    2023-10           WEEKDAY             12   TRAIN            SE5   
20460    2023-10  WEEKENDS/HOLIDAY             12   TRAIN            SE5   

       TOTAL_TAP_IN_VOLUME  TOTAL_TAP_OUT_VOLUME  
0                      752          

### Import Second Dataset (Volume for Origin-Destination Train Station)

In [4]:
# Paths to the monthly origin-destination CSV files (August to October 2023)
augOriginData, septOriginData, octOriginData = (
    './datasets/AugOrigin_2023.csv',
    './datasets/septOrigin_2023.csv',
    './datasets/octOrigin_2023.csv',
)

# Read every month into its own DataFrame; all three files are loaded
# before anything is printed
dfAugOrigin, dfSeptOrigin, dfOctOrigin = (
    pd.read_csv(csvPath)
    for csvPath in (augOriginData, septOriginData, octOriginData)
)

# Show the three frames one after another, in month order
print(dfAugOrigin, dfSeptOrigin, dfOctOrigin, sep='\n')

       YEAR_MONTH          DAY_TYPE  TIME_PER_HOUR PT_TYPE ORIGIN_PT_CODE  \
0         2023-08           WEEKDAY             13   TRAIN           NE11   
1         2023-08  WEEKENDS/HOLIDAY             13   TRAIN           NS19   
2         2023-08           WEEKDAY             13   TRAIN           NS19   
3         2023-08  WEEKENDS/HOLIDAY             13   TRAIN           NE11   
4         2023-08  WEEKENDS/HOLIDAY             14   TRAIN       CC4/DT15   
...           ...               ...            ...     ...            ...   
806370    2023-08           WEEKDAY             10   TRAIN           TE12   
806371    2023-08  WEEKENDS/HOLIDAY             10   TRAIN           TE12   
806372    2023-08           WEEKDAY             10   TRAIN            NS2   
806373    2023-08  WEEKENDS/HOLIDAY             22   TRAIN           DT17   
806374    2023-08           WEEKDAY             22   TRAIN           DT17   

       DESTINATION_PT_CODE  TOTAL_TRIPS  
0                     NS19       

### Combine and verify all files for second dataset

In [5]:
# Combine All DataFrames
# ignore_index=True discards the per-file row labels and renumbers from 0
dfCombinedSecond = pd.concat([dfAugOrigin, dfSeptOrigin, dfOctOrigin], ignore_index=True)
print(dfCombinedSecond)

# To Verify Combination
totalRows = dfAugOrigin.shape[0] + dfSeptOrigin.shape[0] + dfOctOrigin.shape[0]
print("Total Rows should be:", totalRows)
# Compare the expectation with what concat actually produced, instead of
# leaving the reader to check the printed frame by eye
print("Total Rows in combined DataFrame:", dfCombinedSecond.shape[0])
if dfCombinedSecond.shape[0] != totalRows:
    print("Row count of the combined DataFrame does not match the expected total")

if dfAugOrigin.shape[1] == dfSeptOrigin.shape[1] == dfOctOrigin.shape[1]:
    print("Number of columns is the same for all three DataFrames:", dfAugOrigin.shape[1])
else:
    print("Column size is not the same for the DataFrames")

# Equal column counts do not guarantee equal column labels. concat aligns on
# labels, so a differently named column would show up as extra NaN-filled
# columns in the combined frame rather than as an error.
if not (set(dfAugOrigin.columns) == set(dfSeptOrigin.columns) == set(dfOctOrigin.columns)):
    print("Column names are not the same for the DataFrames")

        YEAR_MONTH          DAY_TYPE  TIME_PER_HOUR PT_TYPE ORIGIN_PT_CODE  \
0          2023-08           WEEKDAY             13   TRAIN           NE11   
1          2023-08  WEEKENDS/HOLIDAY             13   TRAIN           NS19   
2          2023-08           WEEKDAY             13   TRAIN           NS19   
3          2023-08  WEEKENDS/HOLIDAY             13   TRAIN           NE11   
4          2023-08  WEEKENDS/HOLIDAY             14   TRAIN       CC4/DT15   
...            ...               ...            ...     ...            ...   
2415928    2023-10  WEEKENDS/HOLIDAY             10   TRAIN            NS2   
2415929    2023-10           WEEKDAY             10   TRAIN           TE12   
2415930    2023-10  WEEKENDS/HOLIDAY             22   TRAIN           DT17   
2415931    2023-10           WEEKDAY             22   TRAIN      EW21/CC22   
2415932    2023-10           WEEKDAY             22   TRAIN           DT17   

        DESTINATION_PT_CODE  TOTAL_TRIPS  
0                   

## Preparing First Dataset for Analysis

### Renaming the repeated column (PT_TYPE → PT_NAME)

In [6]:
# Relabel the repeating PT_TYPE column as PT_NAME.
# rename returns a new frame, so the result is bound back to the same name.
dfCombinedFirst = dfCombinedFirst.rename({'PT_TYPE': 'PT_NAME'}, axis='columns')

### Mapping Station Codes with Station Names

In [7]:
# Map Station Names (PT_NAME) with Station Codes (PT_CODE)
csv_df = pd.read_csv('./datasets/TrainStationCodes.csv')  # Our Train Station Names file
# Lookup table: station code -> English station name
code_name_mapping = dict(zip(csv_df['stn_code'], csv_df['mrt_station_english']))

# Stations with multiple codes are written as "CODE1/CODE2/..."; only the first
# code is used for the lookup. It is kept as a standalone Series so no
# temporary column has to be added to, and then dropped from, the frame.
firstCode = dfCombinedFirst['PT_CODE'].str.split('/').str[0]
dfCombinedFirst = dfCombinedFirst.assign(PT_NAME=firstCode.map(code_name_mapping))

# Series.map yields NaN for any code missing from the lookup table. Report
# those codes so that gaps in the names file do not go unnoticed.
unmappedCodes = dfCombinedFirst.loc[dfCombinedFirst['PT_NAME'].isna(), 'PT_CODE'].unique()
if len(unmappedCodes) > 0:
    print("Station codes without a matching station name:", list(unmappedCodes))

print(dfCombinedFirst)

      YEAR_MONTH          DAY_TYPE  TIME_PER_HOUR            PT_NAME  \
0        2023-08           WEEKDAY             22  Marina South Pier   
1        2023-08  WEEKENDS/HOLIDAY             22  Marina South Pier   
2        2023-08  WEEKENDS/HOLIDAY              0            Stevens   
3        2023-08           WEEKDAY              0            Stevens   
4        2023-08           WEEKDAY             10        Outram Park   
...          ...               ...            ...                ...   
20456    2023-10  WEEKENDS/HOLIDAY              6          Bendemeer   
20457    2023-10  WEEKENDS/HOLIDAY              7         Marina Bay   
20458    2023-10           WEEKDAY              7         Marina Bay   
20459    2023-10           WEEKDAY             12           Ranggung   
20460    2023-10  WEEKENDS/HOLIDAY             12           Ranggung   

             PT_CODE  TOTAL_TAP_IN_VOLUME  TOTAL_TAP_OUT_VOLUME  
0               NS28                  752                   311  
1  

### Changing Time_Per_Hour column to proper time

## Saving Processed Data

In [8]:
# Output Combined DataFrames
output_path = './outputDatasets/combinedData.csv'
# to_csv does not create missing folders, so make sure the target directory
# exists first (exist_ok=True makes this a no-op on re-runs)
os.makedirs(os.path.dirname(output_path), exist_ok=True)
# index=False leaves the row index out of the written file
dfCombinedFirst.to_csv(output_path, index=False)

In [9]:
# Count how many rows exist for every (hour, station code) pair, then pivot the
# station codes into columns; pairs that never occur are filled with 0
station_count_per_hour = (
    dfCombinedFirst
    .groupby(['TIME_PER_HOUR', 'PT_CODE'])
    .size()
    .unstack(level='PT_CODE', fill_value=0)
)

# Display up to the first 100 rows of the result (one row per TIME_PER_HOUR value)
station_count_per_hour.head(100)

PT_CODE,BP10,BP11,BP12,BP13,BP2,BP3,BP4,BP5,BP6/DT1,BP7,...,TE16,TE18,TE19,TE22,TE3,TE4,TE5,TE6,TE7,TE8
TIME_PER_HOUR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1,1,0,2,1,0,0,1,6,1,...,6,6,6,6,6,6,6,6,6,6
5,6,6,6,6,6,6,6,6,6,6,...,6,6,6,6,6,6,6,6,6,6
6,6,6,6,6,6,6,6,6,6,6,...,6,6,6,6,6,6,6,6,6,6
7,6,6,6,6,6,6,6,6,6,6,...,6,6,6,6,6,6,6,6,6,6
8,6,6,6,6,6,6,6,6,6,6,...,6,6,6,6,6,6,6,6,6,6
9,6,6,6,6,6,6,6,6,6,6,...,6,6,6,6,6,6,6,6,6,6
10,6,6,6,6,6,6,6,6,6,6,...,6,6,6,6,6,6,6,6,6,6
11,6,6,6,6,6,6,6,6,6,6,...,6,6,6,6,6,6,6,6,6,6
12,6,6,6,6,6,6,6,6,6,6,...,6,6,6,6,6,6,6,6,6,6
13,6,6,6,6,6,6,6,6,6,6,...,6,6,6,6,6,6,6,6,6,6
