# Getting Started

## Import Libraries

In [1]:
import pandas as pd
import boto3

### Starting our AWS Session

In [2]:
session = boto3.Session(profile_name='personal-ltadatamall')
s3 = session.resource('s3')

## Import First Dataset (Volume Per Train Station)

### Using AWS S3 Bucket for our Dataset

In [3]:
bucket = s3.Bucket('ltadatamall')

dfCombinedFirst = pd.DataFrame()

for obj in bucket.objects.filter(Prefix='TrainVolume_Data/'):
    if not obj.key.endswith('/'):
        # Use the object's get method to retrieve the object
        response = obj.get()
        df_temp = pd.read_csv(response['Body'])  # Assuming the file is a CSV
        dfCombinedFirst = pd.concat([dfCombinedFirst, df_temp], ignore_index=True)

print(dfCombinedFirst)


      YEAR_MONTH          DAY_TYPE  TIME_PER_HOUR PT_TYPE        PT_CODE  \
0        2023-08           WEEKDAY             22   TRAIN           NS28   
1        2023-08  WEEKENDS/HOLIDAY             22   TRAIN           NS28   
2        2023-08  WEEKENDS/HOLIDAY              0   TRAIN      DT10/TE11   
3        2023-08           WEEKDAY              0   TRAIN      DT10/TE11   
4        2023-08           WEEKDAY             10   TRAIN  EW16/NE3/TE17   
...          ...               ...            ...     ...            ...   
34087    2023-12  WEEKENDS/HOLIDAY              6   TRAIN           DT23   
34088    2023-12           WEEKDAY              7   TRAIN  NS27/CE2/TE20   
34089    2023-12  WEEKENDS/HOLIDAY              7   TRAIN  NS27/CE2/TE20   
34090    2023-12           WEEKDAY             12   TRAIN            SE5   
34091    2023-12  WEEKENDS/HOLIDAY             12   TRAIN            SE5   

       TOTAL_TAP_IN_VOLUME  TOTAL_TAP_OUT_VOLUME  
0                      752          

## Preparing Our First Dataset for Analysis

### Finding Blank Columns or Rows

In [4]:
# Using isna() method to find blanks
blanks = dfCombinedFirst.isna()
print(blanks)

       YEAR_MONTH  DAY_TYPE  TIME_PER_HOUR  PT_TYPE  PT_CODE  \
0           False     False          False    False    False   
1           False     False          False    False    False   
2           False     False          False    False    False   
3           False     False          False    False    False   
4           False     False          False    False    False   
...           ...       ...            ...      ...      ...   
34087       False     False          False    False    False   
34088       False     False          False    False    False   
34089       False     False          False    False    False   
34090       False     False          False    False    False   
34091       False     False          False    False    False   

       TOTAL_TAP_IN_VOLUME  TOTAL_TAP_OUT_VOLUME  
0                    False                 False  
1                    False                 False  
2                    False                 False  
3                    False 

### Finding Duplicated Columns or Rows

In [5]:
duplicates = dfCombinedFirst.duplicated().count
print(duplicates)

<bound method Series.count of 0        False
1        False
2        False
3        False
4        False
         ...  
34087    False
34088    False
34089    False
34090    False
34091    False
Length: 34092, dtype: bool>


### Removing repeated columns

In [6]:
# Renaming a repeating column (PT_TYPE) to PT_NAME
dfCombinedFirst = dfCombinedFirst.rename(columns={'PT_TYPE': 'PT_NAME'})

### Mapping Station Codes with Station Names

In [7]:
# Map Station Names (PT_NAME) with Station Codes (PT_CODE)
dfCombinedFirst['PT_CODE_FirstPart'] = dfCombinedFirst['PT_CODE'].str.split('/').str[0] #Splitting Stations with multiple codes
csv_df = pd.read_csv(s3.Object('ltadatamall', 'TrainStationCodes.csv').get()['Body']) # Our Train Station Code File from AWS S3 Bucket
code_name_mapping = dict(zip(csv_df['stn_code'], csv_df['mrt_station_english'])) #Mapping
dfCombinedFirst['PT_NAME'] = dfCombinedFirst['PT_CODE_FirstPart'].map(code_name_mapping) # Mapping
dfCombinedFirst = dfCombinedFirst.drop('PT_CODE_FirstPart', axis=1) # Remove column used for mapping


print(dfCombinedFirst)

      YEAR_MONTH          DAY_TYPE  TIME_PER_HOUR            PT_NAME  \
0        2023-08           WEEKDAY             22  Marina South Pier   
1        2023-08  WEEKENDS/HOLIDAY             22  Marina South Pier   
2        2023-08  WEEKENDS/HOLIDAY              0            Stevens   
3        2023-08           WEEKDAY              0            Stevens   
4        2023-08           WEEKDAY             10        Outram Park   
...          ...               ...            ...                ...   
34087    2023-12  WEEKENDS/HOLIDAY              6          Bendemeer   
34088    2023-12           WEEKDAY              7         Marina Bay   
34089    2023-12  WEEKENDS/HOLIDAY              7         Marina Bay   
34090    2023-12           WEEKDAY             12           Ranggung   
34091    2023-12  WEEKENDS/HOLIDAY             12           Ranggung   

             PT_CODE  TOTAL_TAP_IN_VOLUME  TOTAL_TAP_OUT_VOLUME  
0               NS28                  752                   311  
1  

### Changing Month_Year & Time_Per_Year column to proper datetime format

In [8]:
# Convert YEAR_MONTH to the last day of the month
dfCombinedFirst['YEAR_MONTH'] = pd.to_datetime(dfCombinedFirst['YEAR_MONTH']).dt.to_period('M').dt.to_timestamp('M') + pd.offsets.MonthEnd(0)

# Combine with TIME_PER_HOUR to create a full datetime
dfCombinedFirst['DATETIME'] = dfCombinedFirst.apply(lambda row: pd.Timestamp(year=row['YEAR_MONTH'].year,
                                                   month=row['YEAR_MONTH'].month,
                                                   day=row['YEAR_MONTH'].day,
                                                   hour=row['TIME_PER_HOUR']), axis=1)

# Drop the original YEAR_MONTH column
dfCombinedFirst.drop('YEAR_MONTH', axis=1, inplace=True)

# Make DATETIME the first column by creating a new DataFrame with the desired column order
dfCombinedFirst = dfCombinedFirst[['DATETIME'] + [col for col in dfCombinedFirst.columns if col != 'DATETIME']]

# Drop the origin Time_Per_Hour columns
dfCombinedFirst.drop('TIME_PER_HOUR', axis=1, inplace=True)




### Changing (Weekdays to 0) & (Weekends/Holidays to  1)

In [9]:
dfCombinedFirst['DAY_TYPE'] = dfCombinedFirst['DAY_TYPE'].map({'WEEKDAY': 0, 'WEEKENDS/HOLIDAY': 1})

### Creating Train_Lines column to calculate number of train lines in the station

In [10]:
# Define a function that counts the number of train lines
def count_train_lines(pt_code):
    if pd.isna(pt_code):
        return 0
    return pt_code.count('/') + 1

# Apply the function to the PT_CODE column and create the TRAIN_LINES column
dfCombinedFirst['TRAIN_LINES'] = dfCombinedFirst['PT_CODE'].apply(count_train_lines)

# Insert TRAIN_LINES next to PT_CODE column
loc = dfCombinedFirst.columns.get_loc('PT_NAME') + 1
dfCombinedFirst.insert(loc, 'TRAIN_LINES', dfCombinedFirst.pop('TRAIN_LINES'))


### Mapping train lines to train codes

In [11]:
train_line_mapping = {
    'EW': 0, # East-West Line
    'CG': 0, # East-West Line to Changi Airport
    'NS': 1, # North-South Line
    'NE': 2, # North-East Line
    'CC': 3, # Circle Line
    'CE': 3, # Circle Line (Bayfront, Marina Bay)
    'DT': 4, # Downtown Line
    'TE': 5, # Thomson-East Coast Line
    'BP': 6, # Bukit Panjang LRT
    'SW': 7, # Sengkang LRT West
    'SE': 7, # Sengkang LRT East
    'PW': 8, # Punggol LRT West
    'PE': 8, # Punggol LRT East
}

def map_train_codes(pt_code):
    # Initialize an empty list to store the mapped train codes
    train_codes = []
    # Split the pt_code by '/' and iterate over each part
    for code in pt_code.split('/'):
        # Iterate over each key in the mapping to find a match
        for key in train_line_mapping:
            # If the key is found at the start of the code segment, append the mapped value
            if code.startswith(key):
                train_codes.append(train_line_mapping[key])
                break  # Break the loop once the match is found
    return train_codes

# Apply the revised function to the PT_CODE column
dfCombinedFirst['TRAIN_CODES'] = dfCombinedFirst['PT_CODE'].apply(map_train_codes)

# Convert the TRAIN_CODES column to a list
train_codes_list = dfCombinedFirst['TRAIN_CODES'].tolist()

# Insert TRAIN_CODES next to TRAIN_LINES
loc = dfCombinedFirst.columns.get_loc('TRAIN_LINES') + 1
dfCombinedFirst.insert(loc, 'TRAIN_CODES', dfCombinedFirst.pop('TRAIN_CODES'))

### Check number of train stations in Our Dataset 

In [12]:
numberOfTrainStations = dfCombinedFirst['PT_NAME'].nunique()
print(numberOfTrainStations)

171


### Check number of train station in our Location Dataset

In [13]:
dfLocation = pd.read_csv(s3.Object('ltadatamall', 'TrainStationLocation.csv').get()['Body']) # Our Train Station Location File from AWS S3 Bucket
unique_station_name_count = dfLocation['station_name'].nunique()
print(unique_station_name_count)

172


### Check that we have locations data for all of our train stations

In [14]:
# Find PT_NAMEs that are not present in the station_name and drop duplicates
missing_stations = dfCombinedFirst[~dfCombinedFirst['PT_NAME'].isin(dfLocation['station_name'])]
missing_stations_unique = missing_stations.drop_duplicates(subset=['PT_NAME'])
 
# Get the unique missing station names as a list
missing_station_names_unique = missing_stations_unique['PT_NAME'].tolist()

print(list(missing_station_names_unique))
print("This list shows what stations is not in our location dataset.")
print(len(missing_station_names_unique))
print("This number should be: 0")

[]
This list shows what stations is not in our location dataset.
0
This number should be: 0


### Finding out what is the hidden 1 station in Singapore..

In [15]:
dfLocation_set = set(dfLocation['station_name'])
dfCombinedFirst_set = set(dfCombinedFirst['PT_NAME'])


unique_station = dfLocation_set - dfCombinedFirst_set


print(unique_station)


{'Teck Lee'}


### Mapping location data to our station names

In [16]:
latitude_mapping = dict(zip(dfLocation['station_name'], dfLocation['lat'])) #Mapping
longitude_mappping = dict(zip(dfLocation['station_name'], dfLocation['lng'])) #Mapping
dfCombinedFirst['PT_LATITUDE'] = dfCombinedFirst['PT_NAME'].map(latitude_mapping) # Creating PT_LATITUDE column
dfCombinedFirst['PT_LONGITUDE'] = dfCombinedFirst['PT_NAME'].map(longitude_mappping) # Creating PT_LONGITUDE column

# Insert PT_LATITUDE & PT_LONGITUDE next to PT_CODE column
loc = dfCombinedFirst.columns.get_loc('PT_CODE') + 1
dfCombinedFirst.insert(loc, 'PT_LATITUDE', dfCombinedFirst.pop('PT_LATITUDE'))
loc1 = dfCombinedFirst.columns.get_loc('PT_CODE') + 2
dfCombinedFirst.insert(loc1, 'PT_LONGITUDE', dfCombinedFirst.pop('PT_LONGITUDE'))

## Saving our First Processed Dataset on AWS S3 Bucket

In [17]:
# Convert our Dataframe to CSV file format
df_to_csv = dfCombinedFirst.to_csv(index=False)

# Get the S3 object
s3_object = s3.Object('ltadatamall', 'TrainVolume_ProcessedData/Data.csv')

# Write our CSV data to AWS S3 
s3_object.put(Body=df_to_csv)

print("Data uploaded successfully to AWS S3.")

Data uploaded successfully to AWS S3.


## Import Second Dataset (Volume for Origin-Destination Train Station)

### Using AWS S3 Bucket for our Dataset

In [18]:
bucket = s3.Bucket('ltadatamall')

dfCombinedSecond = pd.DataFrame()

for obj in bucket.objects.filter(Prefix='TrainVolumeOrigin_Data/'):
    if not obj.key.endswith('/'):
        # Use the object's get method to retrieve the object
        response = obj.get()
        df_temp = pd.read_csv(response['Body'])  # Assuming the file is a CSV
        dfCombinedSecond = pd.concat([dfCombinedSecond, df_temp], ignore_index=True)

print(dfCombinedSecond)



        YEAR_MONTH          DAY_TYPE  TIME_PER_HOUR PT_TYPE ORIGIN_PT_CODE  \
0          2023-08           WEEKDAY             13   TRAIN           NE11   
1          2023-08  WEEKENDS/HOLIDAY             13   TRAIN           NS19   
2          2023-08           WEEKDAY             13   TRAIN           NS19   
3          2023-08  WEEKENDS/HOLIDAY             13   TRAIN           NE11   
4          2023-08  WEEKENDS/HOLIDAY             14   TRAIN       CC4/DT15   
...            ...               ...            ...     ...            ...   
4022728    2023-12  WEEKENDS/HOLIDAY             10   TRAIN            NS2   
4022729    2023-12  WEEKENDS/HOLIDAY             22   TRAIN      EW21/CC22   
4022730    2023-12           WEEKDAY             22   TRAIN           DT17   
4022731    2023-12           WEEKDAY             22   TRAIN      EW21/CC22   
4022732    2023-12  WEEKENDS/HOLIDAY             22   TRAIN           DT17   

        DESTINATION_PT_CODE  TOTAL_TRIPS  
0                   

## Preparing Our Second Dataset for Analysis

### Removing repeated columns

In [19]:
# Renaming a repeating column (PT_TYPE) to PT_NAME
dfCombinedSecond = dfCombinedSecond.rename(columns={'PT_TYPE': 'ORIGIN_PT_NAME'})

### Mapping Station Codes with Station Names

In [20]:
# Map Station Names (PT_NAME) with Station Codes (PT_CODE)
#dfCombinedFirst['PT_CODE_FirstPart'] = dfCombinedFirst['PT_CODE'].str.split('/').str[0] #Splitting Stations with multiple codes
dfCombinedSecond['ORIGIN_PT_CODE_FirstPart'] = dfCombinedSecond['ORIGIN_PT_CODE'].str.split('/').str[0]
code_name_mapping = dict(zip(csv_df['stn_code'], csv_df['mrt_station_english'])) #Mapping
dfCombinedSecond['ORIGIN_PT_NAME'] = dfCombinedSecond['ORIGIN_PT_CODE_FirstPart'].map(code_name_mapping) # Mapping
dfCombinedSecond = dfCombinedSecond.drop('ORIGIN_PT_CODE_FirstPart', axis=1) # Remove column used for mapping

dfCombinedSecond['DESTINATION_PT_CODE_FirstPart'] = dfCombinedSecond['DESTINATION_PT_CODE'].str.split('/').str[0]
code_name_mapping = dict(zip(csv_df['stn_code'], csv_df['mrt_station_english'])) #Mapping
dfCombinedSecond['DESTINATION_PT_NAME'] = dfCombinedSecond['DESTINATION_PT_CODE_FirstPart'].map(code_name_mapping) # Mapping
dfCombinedSecond = dfCombinedSecond.drop('DESTINATION_PT_CODE_FirstPart', axis=1) # Remove column used for mapping

# Insert DESTINATION_PT_NAME next to ORIGIN_PT_CODE column
loc = dfCombinedSecond.columns.get_loc('ORIGIN_PT_CODE') + 1
dfCombinedSecond.insert(loc, 'DESTINATION_PT_NAME', dfCombinedSecond.pop('DESTINATION_PT_NAME'))

print(dfCombinedSecond)

        YEAR_MONTH          DAY_TYPE  TIME_PER_HOUR ORIGIN_PT_NAME  \
0          2023-08           WEEKDAY             13      Woodleigh   
1          2023-08  WEEKENDS/HOLIDAY             13      Toa Payoh   
2          2023-08           WEEKDAY             13      Toa Payoh   
3          2023-08  WEEKENDS/HOLIDAY             13      Woodleigh   
4          2023-08  WEEKENDS/HOLIDAY             14      Promenade   
...            ...               ...            ...            ...   
4022728    2023-12  WEEKENDS/HOLIDAY             10    Bukit Batok   
4022729    2023-12  WEEKENDS/HOLIDAY             22    Buona Vista   
4022730    2023-12           WEEKDAY             22       Downtown   
4022731    2023-12           WEEKDAY             22    Buona Vista   
4022732    2023-12  WEEKENDS/HOLIDAY             22       Downtown   

        ORIGIN_PT_CODE DESTINATION_PT_NAME DESTINATION_PT_CODE  TOTAL_TRIPS  
0                 NE11           Toa Payoh                NS19           36  
1  

### Changing (Weekdays to 0) & (Weekends/Holidays to  1)

In [21]:
dfCombinedSecond['DAY_TYPE'] = dfCombinedSecond['DAY_TYPE'].map({'WEEKDAY': 0, 'WEEKENDS/HOLIDAY': 1})

### Creating Train_Lines column to calculate number of train lines in the station

In [22]:
# Apply the function from count_train_lines to create ORIGIN_TRAIN_LINES column
dfCombinedSecond['ORIGIN_TRAIN_LINES'] = dfCombinedSecond['ORIGIN_PT_CODE'].apply(count_train_lines)

# Insert ORIGN_TRAIN_LINES next to ORIGIN_PT_NAME column
loc = dfCombinedSecond.columns.get_loc('ORIGIN_PT_NAME') + 1
dfCombinedSecond.insert(loc, 'ORIGIN_TRAIN_LINES', dfCombinedSecond.pop('ORIGIN_TRAIN_LINES'))

# Apply the function from count_train_lines to create DESTINATION_TRAIN_LINES column
dfCombinedSecond['DESTINATION_TRAIN_LINES'] = dfCombinedSecond['DESTINATION_PT_CODE'].apply(count_train_lines)

# Insert DESTINATION_TRAIN_LINES next to DESTINATION_PT_NAME column
loc1 = dfCombinedSecond.columns.get_loc('DESTINATION_PT_NAME') + 1
dfCombinedSecond.insert(loc1, 'DESTINATION_TRAIN_LINES', dfCombinedSecond.pop('DESTINATION_TRAIN_LINES'))

### Changing Month_Year & Time_Per_Year column to proper datetime format

In [23]:
# Convert YEAR_MONTH to the last day of the month
dfCombinedSecond['YEAR_MONTH'] = pd.to_datetime(dfCombinedSecond['YEAR_MONTH']).dt.to_period('M').dt.to_timestamp('M') + pd.offsets.MonthEnd(0)

# Combine with TIME_PER_HOUR to create a full datetime
dfCombinedSecond['DATETIME'] = dfCombinedSecond.apply(lambda row: pd.Timestamp(year=row['YEAR_MONTH'].year,
                                                   month=row['YEAR_MONTH'].month,
                                                   day=row['YEAR_MONTH'].day,
                                                   hour=row['TIME_PER_HOUR']), axis=1)

# Drop the original YEAR_MONTH column
dfCombinedSecond.drop('YEAR_MONTH', axis=1, inplace=True)

# Make DATETIME the first column by creating a new DataFrame with the desired column order
dfCombinedSecond = dfCombinedSecond[['DATETIME'] + [col for col in dfCombinedSecond.columns if col != 'DATETIME']]

# Drop the origin Time_Per_Hour columns
dfCombinedSecond.drop('TIME_PER_HOUR', axis=1, inplace=True)



### Mapping location data to our station names

In [24]:
latitude_mapping = dict(zip(dfLocation['station_name'], dfLocation['lat'])) #Mapping
longitude_mappping = dict(zip(dfLocation['station_name'], dfLocation['lng'])) #Mapping

# Creating ORIGIN_PT_LATITUDE & ORIGIN_PT_LONGITUDE column
dfCombinedSecond['ORIGIN_PT_LATITUDE'] = dfCombinedSecond['ORIGIN_PT_NAME'].map(latitude_mapping) 
dfCombinedSecond['ORIGIN_PT_LONGITUDE'] = dfCombinedSecond['ORIGIN_PT_NAME'].map(longitude_mappping) 

# Insert ORIGIN LATITUDE & LATITUDE next to their respective columns
locOriginLatitude = dfCombinedSecond.columns.get_loc('ORIGIN_PT_CODE') + 1
locOriginLongitude = dfCombinedSecond.columns.get_loc('ORIGIN_PT_CODE') + 2
dfCombinedSecond.insert(locOriginLatitude, 'ORIGIN_PT_LATITUDE', dfCombinedSecond.pop('ORIGIN_PT_LATITUDE'))
dfCombinedSecond.insert(locOriginLongitude, 'ORIGIN_PT_LONGITUDE', dfCombinedSecond.pop('ORIGIN_PT_LONGITUDE'))



# Creating DESTINATION_PT_LATITUDE & DESTINATION_PT_LONGITUDE column
dfCombinedSecond['DESTINATION_PT_LATITUDE'] = dfCombinedSecond['ORIGIN_PT_NAME'].map(latitude_mapping) 
dfCombinedSecond['DESTINATION_PT_LONGITUDE'] = dfCombinedSecond['ORIGIN_PT_NAME'].map(longitude_mappping) 


# Insert DESTINATION LATITUDE & LATITUDE next to their respective columns
locDestLatitude = dfCombinedSecond.columns.get_loc('DESTINATION_PT_CODE') + 1
locDestLongitude = dfCombinedSecond.columns.get_loc('DESTINATION_PT_CODE') + 2
dfCombinedSecond.insert(locDestLatitude, 'DESTINATION_PT_LATITUDE', dfCombinedSecond.pop('DESTINATION_PT_LATITUDE'))
dfCombinedSecond.insert(locDestLongitude, 'DESTINATION_PT_LONGITUDE', dfCombinedSecond.pop('DESTINATION_PT_LONGITUDE'))

### Mapping train lines to train codes

In [25]:
train_line_mapping = {
    'EW': 0, # East-West Line
    'CG': 0, # East-West Line to Changi Airport
    'NS': 1, # North-South Line
    'NE': 2, # North-East Line
    'CC': 3, # Circle Line
    'CE': 3, # Circle Line (Bayfront, Marina Bay)
    'DT': 4, # Downtown Line
    'TE': 5, # Thomson-East Coast Line
    'BP': 6, # Bukit Panjang LRT
    'SW': 7, # Sengkang LRT West
    'SE': 7, # Sengkang LRT East
    'PW': 8, # Punggol LRT West
    'PE': 8, # Punggol LRT East
}

def map_train_codes(pt_code):
    # Initialize an empty list to store the mapped train codes
    train_codes = []
    # Split the pt_code by '/' and iterate over each part
    for code in pt_code.split('/'):
        # Iterate over each key in the mapping to find a match
        for key in train_line_mapping:
            # If the key is found at the start of the code segment, append the mapped value
            if code.startswith(key):
                train_codes.append(train_line_mapping[key])
                break  # Break the loop once the match is found
    return train_codes


# Apply the revised function to the ORIGIN_PT_CODE column
dfCombinedSecond['ORIGIN_TRAIN_CODES'] = dfCombinedSecond['ORIGIN_PT_CODE'].apply(map_train_codes)

# Convert the ORIGIN_PT_CODE column to a list
train_codes_list = dfCombinedSecond['ORIGIN_TRAIN_CODES'].tolist()

# Insert ORIGIN_TRAIN_CODES next to ORIGIN_TRAIN_LINES
loc = dfCombinedSecond.columns.get_loc('ORIGIN_TRAIN_LINES') + 1
dfCombinedSecond.insert(loc, 'ORIGIN_TRAIN_CODES', dfCombinedSecond.pop('ORIGIN_TRAIN_CODES'))



# Apply the revised function to the DESTINATION_PT_CODE column
dfCombinedSecond['DESTINATION_TRAIN_CODES'] = dfCombinedSecond['DESTINATION_PT_CODE'].apply(map_train_codes)

# Convert the DESTINATION_PT_CODE column to a list
train_codes_list = dfCombinedSecond['DESTINATION_TRAIN_CODES'].tolist()

# Insert DESTINATION_TRAIN_CODES next to DESTINATION_TRAIN_LINES
loc1 = dfCombinedSecond.columns.get_loc('DESTINATION_TRAIN_LINES') + 1
dfCombinedSecond.insert(loc1, 'DESTINATION_TRAIN_CODES', dfCombinedSecond.pop('DESTINATION_TRAIN_CODES'))


## Saving our Second Processed Dataset onto AWS S3 Bucket

In [26]:
# Convert our Dataframe to CSV file format
df_to_csv = dfCombinedSecond.to_csv(index=False) 

# Get our S3 object
s3_object = s3.Object('ltadatamall', 'TrainVolumeOrigin_ProcessedData/Data.csv')

# Write our CSV data to AWS S3 
s3_object.put(Body=df_to_csv)

print("Data uploaded successfully to AWS S3.")

Data uploaded successfully to AWS S3.


# The End of processing,

# Moving onto trainDataAnalysis!