In [11]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

# Load the raw traffic-count records from CSV.
# low_memory=False makes pandas parse the file in one pass instead of in
# chunks, which avoids mixed-dtype inference within a column.
# NOTE(review): 'Column_Name' looks like a placeholder -- confirm which
# column(s) actually need to be forced to string.
data = pd.read_csv('Data.csv', dtype={'Column_Name': str}, low_memory=False)

# Remove columns that are entirely null
data = data.dropna(axis=1, how='all')

# Parse 'Count Start Date' as day-first dates; values that cannot be parsed
# are coerced to NaT rather than raising.
data['Count Start Date'] = pd.to_datetime(data['Count Start Date'], dayfirst=True, errors='coerce')

# Average of the Saturday and Sunday volumes.
# A missing day is counted as 0, so a row with only one weekend day recorded
# gets half of that day's volume, and a row with neither gets 0.
# NOTE(review): confirm this zero-fill is intended rather than a NaN-skipping
# mean over the two columns.
data['Weekend Traffic ADT'] = (data['Saturday Volume'].fillna(0) + data['Sunday Volume'].fillna(0)) / 2

# Keep only records from '2018-02-09' onwards (Data with Location).
# Rows whose date was coerced to NaT compare False here and are dropped too.
filtered_data = data[data['Count Start Date'] >= '2018-02-09']

# DataFrame.info() writes its summary to stdout and returns None, so it is
# called directly; wrapping it in print() would emit a stray "None" line.
filtered_data.info()

# Traffic-volume columns that are standardized below
features = [
    '5 Day ADT', 'Weekend Traffic ADT',
    'AM Peak Volume', 'Mid Peak Volume', 'PM Peak Volume',
]

# Check that all features are present in the dataset before scaling
missing_features = [feature for feature in features if feature not in filtered_data.columns]
if missing_features:
    print(f"Missing features: {missing_features}")
else:
    # Extract the features for normalization
    traffic_data = filtered_data[features]

    # Standardize each feature column with scikit-learn's StandardScaler
    scaler = StandardScaler()
    normalized_data = scaler.fit_transform(traffic_data)

    # Convert the scaled array back to a DataFrame. fit_transform returns a
    # bare NumPy array, so the row labels of the filtered data are passed back
    # in explicitly; without them the frame would get a fresh 0..n-1 index and
    # any label-aligned join or assignment against filtered_data would
    # silently misalign.
    normalized_df = pd.DataFrame(
        normalized_data,
        columns=features,
        index=traffic_data.index,
    )

    # Display the first few rows and a summary of the normalized data
    print('\n-----------------------------------------------------------------------------------------\n')
    print(normalized_df.head())
    print('\n-----------------------------------------------------------------------------------------\n')
    # info() prints its own summary and returns None, so it is not wrapped in
    # print() (which would add a stray "None" line).
    normalized_df.info()


<class 'pandas.core.frame.DataFrame'>
Index: 13026 entries, 8746 to 21889
Data columns (total 24 columns):
 #   Column                                   Non-Null Count  Dtype         
---  ------                                   --------------  -----         
 0   Area                                     0 non-null      object        
 1   Road Name                                13026 non-null  object        
 2   Carriageway Start Name                   13006 non-null  object        
 3   Carriageway End Name                     13016 non-null  object        
 4   Description (location of traffic count)  13024 non-null  object        
 5   Direction                                13026 non-null  object        
 6   Count Start Date                         13026 non-null  datetime64[ns]
 7   5 Day ADT                                13025 non-null  float64       
 8   7 Day ADT                                13025 non-null  float64       
 9   Saturday Volume                          