In [4]:
import pandas as pd
import numpy as np
# Load the raw training data into a DataFrame.
# NOTE(review): this is a machine-specific absolute path, so the notebook only
# runs where this exact file exists; consider a configurable data directory.
df = pd.read_csv('/Users/allig/ads/507/train_data.csv')

# DataFrame.info() writes its report (column names, dtypes, non-null counts)
# to stdout itself and returns None, so it is called directly; wrapping it in
# print() would append a stray "None" line after the report.
df.info()
print(df.head())  # Displays the first few rows

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101017 entries, 0 to 101016
Data columns (total 22 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   Created               101017 non-null  object 
 1   CancelTime            15326 non-null   object 
 2   DepartureTime         101017 non-null  object 
 3   BillID                101017 non-null  int64  
 4   TicketID              101017 non-null  float64
 5   ReserveStatus         101017 non-null  int64  
 6   UserID                42543 non-null   float64
 7   Male                  101017 non-null  bool   
 8   Price                 101017 non-null  float64
 9   CouponDiscount        101017 non-null  float64
 10  From                  101017 non-null  object 
 11  To                    101017 non-null  object 
 12  Domestic              101017 non-null  int64  
 13  VehicleType           93473 non-null   object 
 14  VehicleClass          62567 non-null   object 
 15  

In [5]:
# --- Cleaning and dtype normalisation of the raw frame `df` -----------------
# Every step below is written so the cell can be re-run on the same kernel
# without raising.

# Convert date columns to datetime. errors='coerce' turns unparseable values
# into NaT instead of raising, so missing timestamps (e.g. CancelTime for
# bookings without a cancellation time) are already NaT after this step and
# need no separate fill.
date_columns = ['Created', 'CancelTime', 'DepartureTime']
df[date_columns] = df[date_columns].apply(pd.to_datetime, errors='coerce')

# Drop columns with excessive missing values.
# errors='ignore' keeps the cell idempotent: on a re-run the column is already
# gone, and the default errors='raise' would fail with a KeyError.
df = df.drop(columns=['HashPassportNumber_p'], errors='ignore')  # Too many missing values

# Fill missing values using assignment (avoiding inplace modification)
df['UserID'] = df['UserID'].fillna(-1)  # Use -1 to indicate missing UserID
df['VehicleType'] = df['VehicleType'].fillna('Unknown')  # Fill missing VehicleType with 'Unknown'
df['VehicleClass'] = df['VehicleClass'].fillna('Unknown')  # Fill missing VehicleClass with 'Unknown'

# Convert boolean to integer (True -> 1, False -> 0)
df['Male'] = df['Male'].astype(int)

# Convert float columns to integer if appropriate.
# The nullable 'Int64' dtype is used so the conversion would also tolerate
# missing values.
df['TicketID'] = df['TicketID'].astype('Int64')

# Convert categorical columns
categorical_columns = ['From', 'To', 'TripReason', 'VehicleType', 'VehicleClass', 'Vehicle']
df[categorical_columns] = df[categorical_columns].astype('category')

# Convert 'Cancel' to integer
df['Cancel'] = df['Cancel'].astype(int)

# Feature engineering: derive calendar components from the two timestamps.
# Each entry is (new column, source datetime column, .dt attribute); the list
# order is the order in which the columns are appended to `df`.
calendar_features = [
    ('Created_Year', 'Created', 'year'),
    ('Created_Month', 'Created', 'month'),
    ('Created_Day', 'Created', 'day'),
    ('Created_Hour', 'Created', 'hour'),
    ('Departure_Year', 'DepartureTime', 'year'),
    ('Departure_Month', 'DepartureTime', 'month'),
    ('Departure_Day', 'DepartureTime', 'day'),
    ('Departure_Hour', 'DepartureTime', 'hour'),
]
for feature_name, timestamp_col, component in calendar_features:
    df[feature_name] = getattr(df[timestamp_col].dt, component)

# Lead time between booking creation and departure, expressed in hours
# (timedelta -> seconds -> hours).
lead_time = df['DepartureTime'] - df['Created']
df['Booking_Departure_Diff'] = lead_time.dt.total_seconds() / 3600

# Save cleaned dataset (index=False: the row index is not written as a column).
# NOTE(review): CSV stores plain text, so the datetime / category / Int64
# dtypes set above are not preserved in the file and must be re-applied by
# whoever reads it back. The absolute path is machine-specific.
df.to_csv('/Users/allig/ads/507/train_data_cleaned.csv', index=False)

# Display summary.
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; print(df.info()) would emit an extra "None" line.
df.info()
print(df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101017 entries, 0 to 101016
Data columns (total 30 columns):
 #   Column                  Non-Null Count   Dtype         
---  ------                  --------------   -----         
 0   Created                 101017 non-null  datetime64[ns]
 1   CancelTime              15326 non-null   datetime64[ns]
 2   DepartureTime           101017 non-null  datetime64[ns]
 3   BillID                  101017 non-null  int64         
 4   TicketID                101017 non-null  Int64         
 5   ReserveStatus           101017 non-null  int64         
 6   UserID                  101017 non-null  float64       
 7   Male                    101017 non-null  int32         
 8   Price                   101017 non-null  float64       
 9   CouponDiscount          101017 non-null  float64       
 10  From                    101017 non-null  category      
 11  To                      101017 non-null  category      
 12  Domestic                101017