# Week 4 Notebook: Data Preprocessing
The goal of this week's assignment is to continue to preprocess our data by cleaning it, treating issues such as outliers and missing values, transforming variables, and making the data model-ready. 

In [1]:
### Import packages
import os
import zipfile
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
!pip install geopy
from geopy.geocoders import Nominatim
from sklearn.preprocessing import OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


### Load the dataframe

In [2]:
# Resolve the project layout relative to the directory the notebook runs from:
# <parent of cwd>/data/{raw, interim, processed}
current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)
data_folder = os.path.join(parent_dir, "data")

raw_data_folder, interim_data_folder, processed_data_folder = (
    os.path.join(data_folder, stage) for stage in ("raw", "interim", "processed")
)

# CSV read by the loading cell
raw_data_file = os.path.join(raw_data_folder, 'rawSampledData.csv')

In [3]:
# Read the CSV at raw_data_file into the working DataFrame used by the cells below
df = pd.read_csv(raw_data_file)

In [4]:
# Display the column names of the loaded frame
df.columns

Index(['datetime', 'timestamp', 'hour', 'day', 'month', 'timezone', 'source',
       'destination', 'cab_type', 'product_id', 'name', 'price', 'distance',
       'surge_multiplier', 'latitude', 'longitude', 'temperature',
       'apparentTemperature', 'short_summary', 'long_summary',
       'precipIntensity', 'precipProbability', 'humidity', 'windSpeed',
       'windGust', 'windGustTime', 'visibility', 'temperatureHigh',
       'temperatureHighTime', 'temperatureLow', 'temperatureLowTime',
       'apparentTemperatureHigh', 'apparentTemperatureHighTime',
       'apparentTemperatureLow', 'apparentTemperatureLowTime', 'icon',
       'dewPoint', 'pressure', 'windBearing', 'cloudCover', 'uvIndex',
       'visibility.1', 'ozone', 'sunriseTime', 'sunsetTime', 'moonPhase',
       'precipIntensityMax', 'uvIndexTime', 'temperatureMin',
       'temperatureMinTime', 'temperatureMax', 'temperatureMaxTime',
       'apparentTemperatureMin', 'apparentTemperatureMinTime',
       'apparentTemperatureMax

## Split Train Test Validation

In [4]:
def train_val_test_split(df, val_frac=0.2, test_frac=0.1, random_state=123):
    """Shuffle ``df`` and cut it into train, validation and test partitions.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split; it is not modified.
    val_frac : float, default 0.2
        Share of rows assigned to the validation set.
    test_frac : float, default 0.1
        Share of rows assigned to the test set.
    random_state : int, default 123
        Seed passed to ``DataFrame.sample`` so the shuffle is reproducible.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_df, val_df, test_df)``. The partitions are disjoint and
        together contain every row of ``df``; the original index labels are kept.

    Raises
    ------
    ValueError
        If a fraction is negative or the two fractions sum to more than 1.
    """
    if val_frac < 0 or test_frac < 0 or val_frac + test_frac > 1:
        raise ValueError(
            "val_frac and test_frac must be non-negative and sum to at most 1, "
            f"got val_frac={val_frac}, test_frac={test_frac}"
        )

    # Shuffle the dataset once so the positional slices below are random samples
    shuffled = df.sample(frac=1, random_state=random_state)

    val_size = int(len(shuffled) * val_frac)
    test_size = int(len(shuffled) * test_frac)
    holdout_end = val_size + test_size

    # Layout of the shuffled rows: [validation | test | train]
    val_df = shuffled.iloc[:val_size]
    test_df = shuffled.iloc[val_size:holdout_end]
    train_df = shuffled.iloc[holdout_end:]
    return train_df, val_df, test_df

# Split the full frame once; the rest of the notebook works on these three partitions
train_df, val_df, test_df = train_val_test_split(df)

## Missing Value Imputation
Uber Taxi Fee Breakdown
 - base fare: $2.60 for first 1/7 mile
 - per minute fare: $0.47
 - per mile: $2.8


In [5]:
def taxi_price_calculator(distance, time):
    """Estimate a taxi fare from the trip distance and duration.

    The fare is a flat base charge plus a per-mile charge on ``distance`` and a
    per-minute charge on ``time``.

    NOTE(review): the fee breakdown in the notebook says the base fare covers
    the first 1/7 mile, but this formula bills the per-mile rate on the whole
    distance. Confirm which is intended.
    """
    flat_charge = 2.60
    rate_per_minute = 0.47
    rate_per_mile = 2.8

    fare = flat_charge + distance * rate_per_mile
    fare = fare + time * rate_per_minute
    return fare

Since we only have a single timestamp rather than the duration of each ride, we need a way to estimate the time taken by each Taxi ride for a more accurate imputation. Here are the steps.

1. Get the unique locations as a list and save the unique source–destination combinations as a CSV
2. Get the longitude and latitude for these locations and save them as a dict with the location as the key and [lat, long] as the value
3. Read the CSV and create source latitude, source longitude, destination latitude, and destination longitude columns based on the location dict
4. Use the Mapbox Direction API to pull time estimated by driving
5. Save the csv with source, destination, time_estimated

Please refer to the code folder to get the code for how we get the estimated duration for the taxi rides.

In [6]:
# Every distinct (source, destination) pair present in the data, saved for the ETA lookup
route_columns = ["source", "destination"]
unique_combos = df.loc[:, route_columns].drop_duplicates()
unique_combos_dir = os.path.join(interim_data_folder, "unique_combo.csv")
unique_combos.to_csv(unique_combos_dir)

# Flatten both columns into one array and keep each place name once
locations = pd.unique(df.loc[:, route_columns].to_numpy().ravel())
print(locations)

['Theatre District' 'Fenway' 'Beacon Hill' 'Haymarket Square'
 'Northeastern University' 'North Station' 'Back Bay' 'Financial District'
 'South Station' 'Boston University' 'North End' 'West End']


In [7]:
def get_longtitude_latitude(location):
    """Geocode a Boston place name with the Nominatim service.

    Parameters
    ----------
    location : str
        Place name; ", Boston" is appended before the lookup.

    Returns
    -------
    tuple
        ``(latitude, longitude)`` of the match, or ``(None, None)`` when the
        geocoder finds nothing, so callers can skip unresolved places.
    """
    # Create the Nominatim client and query it for the place
    geocoder = Nominatim(user_agent="Geopy Library")
    match = geocoder.geocode(location + ", Boston")

    # geocode() yields None when there is no result; attribute access on it
    # would raise AttributeError
    if match is None:
        return None, None
    return match.latitude, match.longitude

We commented out the part below for memory management, but we left the code block to demonstrate how the longitude and latitude were added to the unique combinations of sources and destinations. The file was later used to get the estimated duration of each trip.

In [8]:
# Map each place name to its [latitude, longitude]; places that could not be
# geocoded are left out of the mapping
location_dict = {}
for place in locations:
    place_lat, place_long = get_longtitude_latitude(place)
    if place_lat is None or place_long is None:
        continue
    location_dict[place] = [place_lat, place_long]

# Attach the coordinates of both trip endpoints to every unique route
for endpoint in ("source", "destination"):
    unique_combos[endpoint + "_lat"] = unique_combos[endpoint].apply(lambda name: location_dict[name][0])
    unique_combos[endpoint + "_long"] = unique_combos[endpoint].apply(lambda name: location_dict[name][1])

# Persist the routes with coordinates (without the index column)
ride_locations_dir = os.path.join(interim_data_folder, "ride_locations.csv")
unique_combos.to_csv(ride_locations_dir, index=False)

The `get_eta` function requires an API key, so I saved the dataframe locally for reviewing convenience. Please refer to `time_calulator.py` under the codes folder for more details on how the time was imputed.

In [9]:
# Location of the saved trip-duration estimates inside the interim data folder
rides_with_eta = os.path.join(interim_data_folder, "rides_with_etas.csv")
# Load the estimates; the preview below shows source, destination and eta_minutes columns
time_df = pd.read_csv(rides_with_eta)
time_df.head()

Unnamed: 0,source,destination,eta_minutes
0,North End,West End,7.75105
1,Beacon Hill,South Station,10.896667
2,North Station,Fenway,13.93795
3,North End,Beacon Hill,12.600567
4,Boston University,North Station,15.7431


In [10]:
# Attach the estimated trip duration to every training ride by route.
# validate='many_to_one' makes pandas raise if a (source, destination) pair
# occurs more than once in time_df, which would otherwise silently duplicate
# training rows.
# NOTE(review): the preview of the merged frame shows eta_minutes of about 1789
# for South Station -> Theatre District, far above the other routes. Verify the
# value in rides_with_etas.csv before relying on fares derived from it.
train_df = pd.merge(
    train_df,
    time_df[['source', 'destination', 'eta_minutes']],
    on=['source', 'destination'],
    how='left',
    validate='many_to_one',
)

In [11]:
# Preview the merged training frame, including the new eta_minutes column
train_df.head()

Unnamed: 0,datetime,timestamp,hour,day,month,timezone,source,destination,cab_type,product_id,...,uvIndexTime,temperatureMin,temperatureMinTime,temperatureMax,temperatureMaxTime,apparentTemperatureMin,apparentTemperatureMinTime,apparentTemperatureMax,apparentTemperatureMaxTime,eta_minutes
0,2018-11-27 08:06:22,1543306000.0,8,27,11,America/New_York,Boston University,Financial District,Lyft,lyft,...,1543338000,36.13,1543377600,46.83,1543320000,32.05,1543377600,43.85,1543320000,20.181533
1,2018-12-14 14:30:06,1544798000.0,14,14,12,America/New_York,North End,Back Bay,Uber,6c84fd89-3f11-4782-9b50-97c468b19529,...,1544806800,27.05,1544781600,46.67,1544814000,24.47,1544785200,43.88,1544817600,19.5087
2,2018-12-01 02:58:02,1543633000.0,2,1,12,America/New_York,South Station,Theatre District,Lyft,lyft_premier,...,1543593600,28.64,1543575600,42.57,1543600800,27.2,1543568400,40.51,1543611600,1789.339967
3,2018-12-03 07:23:05,1543822000.0,7,3,12,America/New_York,North Station,North End,Uber,6c84fd89-3f11-4782-9b50-97c468b19529,...,1543852800,42.96,1543896000,57.87,1543852800,39.41,1543896000,57.2,1543852800,5.562867
4,2018-12-03 22:18:06,1543875000.0,22,3,12,America/New_York,North End,Back Bay,Lyft,lyft_premier,...,1543852800,42.89,1543896000,57.27,1543852800,39.54,1543896000,56.6,1543852800,19.5087


In [12]:
# Fill in the price of Taxi rides from distance and estimated duration.
# taxi_price_calculator is plain arithmetic, so it is applied to whole columns
# instead of row by row.
taxi_mask = train_df['name'] == 'Taxi'
train_df.loc[taxi_mask, 'price'] = taxi_price_calculator(
    train_df.loc[taxi_mask, 'distance'], train_df.loc[taxi_mask, 'eta_minutes'])


def _with_taxi_price(split_df):
    """Return a copy of ``split_df`` whose Taxi rows carry the estimated fare.

    The ETA is looked up from ``time_df`` on the side, so no helper column is
    added to the returned frame.
    """
    split_df = split_df.copy()
    is_taxi = split_df['name'] == 'Taxi'
    taxi_rides = split_df.loc[is_taxi, ['source', 'destination', 'distance']]
    # A left merge keeps the row order of taxi_rides, so the ETAs line up
    # positionally with the selected rows
    eta = taxi_rides.merge(
        time_df[['source', 'destination', 'eta_minutes']],
        on=['source', 'destination'], how='left')['eta_minutes'].to_numpy()
    split_df.loc[is_taxi, 'price'] = taxi_price_calculator(taxi_rides['distance'].to_numpy(), eta)
    return split_df


# The validation and test targets need the same imputation as the training
# set; otherwise their Taxi prices stay unimputed and end up in y_val / y_test
val_df = _with_taxi_price(val_df)
test_df = _with_taxi_price(test_df)

In [13]:
# Check that no Taxi ride in the training set still has missing values
is_taxi_ride = train_df["name"].eq("Taxi")
df_taxi = train_df.loc[is_taxi_ride]
df_taxi.isna().sum()

datetime                       0
timestamp                      0
hour                           0
day                            0
month                          0
timezone                       0
source                         0
destination                    0
cab_type                       0
product_id                     0
name                           0
price                          0
distance                       0
surge_multiplier               0
latitude                       0
longitude                      0
temperature                    0
apparentTemperature            0
short_summary                  0
long_summary                   0
precipIntensity                0
precipProbability              0
humidity                       0
windSpeed                      0
windGust                       0
windGustTime                   0
visibility                     0
temperatureHigh                0
temperatureHighTime            0
temperatureLow                 0
temperatur

In [14]:
# eta_minutes was only needed to estimate the Taxi fares, so remove it again
train_df = train_df.drop(columns=['eta_minutes'])

In [15]:
# Share of training rides that still lack a price after the imputation
price_is_missing = train_df['price'].isna()
df_na = train_df[price_is_missing]
missing_percentage_after_imputing = price_is_missing.sum() / len(train_df) * 100
print(f"Percentage of missing values in 'price' after imputing: {missing_percentage_after_imputing:.2f}%")

Percentage of missing values in 'price' after imputing: 0.00%


# Drop Unneeded Columns

There are some columns that are irrelevant to the modeling because they are just unique identifiers:
- `id`
- `product_id`

In addition, we also dropped `timezone` since all data is within the same timezone.

`datetime` and `timestamp` were dropped to reduce redundancy with other time features.

In [16]:
# Identifier and redundant time columns; errors='ignore' skips any that are absent
cols_to_drop = ['id', 'timestamp', 'timezone', 'product_id', 'datetime']

train_df, test_df, val_df = (
    split.drop(columns=cols_to_drop, errors='ignore')
    for split in (train_df, test_df, val_df)
)

In [17]:
# Display the columns that remain in the training frame after the drop
train_df.columns

Index(['hour', 'day', 'month', 'source', 'destination', 'cab_type', 'name',
       'price', 'distance', 'surge_multiplier', 'latitude', 'longitude',
       'temperature', 'apparentTemperature', 'short_summary', 'long_summary',
       'precipIntensity', 'precipProbability', 'humidity', 'windSpeed',
       'windGust', 'windGustTime', 'visibility', 'temperatureHigh',
       'temperatureHighTime', 'temperatureLow', 'temperatureLowTime',
       'apparentTemperatureHigh', 'apparentTemperatureHighTime',
       'apparentTemperatureLow', 'apparentTemperatureLowTime', 'icon',
       'dewPoint', 'pressure', 'windBearing', 'cloudCover', 'uvIndex',
       'visibility.1', 'ozone', 'sunriseTime', 'sunsetTime', 'moonPhase',
       'precipIntensityMax', 'uvIndexTime', 'temperatureMin',
       'temperatureMinTime', 'temperatureMax', 'temperatureMaxTime',
       'apparentTemperatureMin', 'apparentTemperatureMinTime',
       'apparentTemperatureMax', 'apparentTemperatureMaxTime'],
      dtype='object')

In [18]:
# Names of the columns stored as Python objects (the text columns that need encoding)
object_columns = list(train_df.select_dtypes(include='object').columns)

# Display the object columns
print("Object Columns:", object_columns, sep="\n")

Object Columns:
['source', 'destination', 'cab_type', 'name', 'short_summary', 'long_summary', 'icon']


## Categorical Variables Encoding
### Encoding the `short_summary` variable

In [19]:
# List the distinct short_summary labels in the full frame (note the padding spaces in the output)
df['short_summary'].unique()

array([' Mostly Cloudy ', ' Light Rain ', ' Overcast ', ' Clear ',
       ' Partly Cloudy ', ' Rain ', ' Foggy ', ' Drizzle ',
       ' Possible Drizzle '], dtype=object)


The short_summary variable will be one-hot encoded into the following binary categories:

- Mostly Cloudy
- Rain
- Clear
- Partly Cloudy
- Overcast
- Light Rain
- Foggy
- Possible Drizzle
- Drizzle

In [20]:
# Tidy the labels in every split: trim the padding, then use underscores for inner blanks
for split_df in (train_df, val_df, test_df):
    split_df['short_summary'] = split_df['short_summary'].str.strip().str.replace(' ', '_')

# Learn the category set from the training split only; with handle_unknown='ignore'
# a label unseen during fitting is encoded as an all-zero row
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
train_encoded_summary = encoder.fit_transform(train_df[['short_summary']])
val_encoded_summary, test_encoded_summary = (
    encoder.transform(split[['short_summary']]) for split in (val_df, test_df)
)

# Wrap the encoded arrays in DataFrames labelled with the generated dummy names
train_encoded_summary_df, val_encoded_summary_df, test_encoded_summary_df = (
    pd.DataFrame(encoded, columns=encoder.get_feature_names_out(['short_summary']))
    for encoded in (train_encoded_summary, val_encoded_summary, test_encoded_summary)
)

# Append the dummy columns; the index is reset so rows line up by position
train_df, val_df, test_df = (
    pd.concat([split.reset_index(drop=True), dummies], axis=1)
    for split, dummies in (
        (train_df, train_encoded_summary_df),
        (val_df, val_encoded_summary_df),
        (test_df, test_encoded_summary_df),
    )
)

# # Check the resulting dataframe
# print(train_df.head())
# print(val_df.head())
# print(test_df.head())

### Encoding the `long_summary` variable

In [21]:
# List the distinct long_summary labels in the full frame
df['long_summary'].unique()

array([' Partly cloudy throughout the day. ',
       ' Light rain until evening. ',
       ' Mostly cloudy throughout the day. ',
       ' Rain until morning, starting again in the evening. ',
       ' Light rain in the morning. ', ' Overcast throughout the day. ',
       ' Foggy in the morning. ',
       ' Light rain in the morning and overnight. ',
       ' Rain throughout the day. ', ' Possible drizzle in the morning. ',
       ' Rain in the morning and afternoon. '], dtype=object)

The long_summary variable will be divided into binary categories:

- Rain throughout the day
- Rain until morning, starting again in the evening
- Light rain in the morning
- Partly cloudy throughout the day
- Mostly cloudy throughout the day
- Light rain in the morning and overnight
- Light rain until evening
- Foggy in the morning
- Overcast throughout the day
- Possible drizzle in the morning
- Rain in the morning and afternoon

In [22]:
# Clean the long_summary text in each split (strip padding, blanks -> underscores)
for split_df in (train_df, val_df, test_df):
    split_df['long_summary'] = split_df['long_summary'].str.strip().str.replace(' ', '_')

# Fit the one-hot encoder on the training labels, then reuse it unchanged for the
# validation and test splits so all three share the same dummy columns
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
train_encoded_summary = encoder.fit_transform(train_df[['long_summary']])
val_encoded_summary, test_encoded_summary = (
    encoder.transform(split[['long_summary']]) for split in (val_df, test_df)
)

# Turn the encoded arrays into DataFrames with readable column names
train_encoded_summary_df, val_encoded_summary_df, test_encoded_summary_df = (
    pd.DataFrame(encoded, columns=encoder.get_feature_names_out(['long_summary']))
    for encoded in (train_encoded_summary, val_encoded_summary, test_encoded_summary)
)

# Attach the long_summary dummies to their split, aligning rows by position
train_df, val_df, test_df = (
    pd.concat([split.reset_index(drop=True), dummies], axis=1)
    for split, dummies in (
        (train_df, train_encoded_summary_df),
        (val_df, val_encoded_summary_df),
        (test_df, test_encoded_summary_df),
    )
)

# # Check the resulting dataframe
# print(train_df.head())
# print(val_df.head())
# print(test_df.head())

### Encoding the icon variable

In [23]:
# List the distinct icon labels in the full frame
df['icon'].unique()

array([' partly-cloudy-day ', ' rain ', ' cloudy ', ' clear-night ',
       ' clear-day ', ' partly-cloudy-night ', ' fog '], dtype=object)

The one hot-encoded categories for the icon feature will be:

- partly cloudy night
- rain
- clear night
- cloudy
- fog
- clear-day
- partly cloudy day

In [24]:
# Strip leading/trailing spaces and replace blank spaces with underscores in the icon column
# Strip leading/trailing spaces and replace blank spaces with underscores in the icon column
for split_df in (train_df, val_df, test_df):
    split_df['icon'] = split_df['icon'].str.strip().str.replace(' ', '_')

# Fit the one-hot encoder on the training 'icon' column only, then apply the
# learned categories to the validation and test splits
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
train_encoded_summary = encoder.fit_transform(train_df[['icon']])
val_encoded_summary, test_encoded_summary = (
    encoder.transform(split[['icon']]) for split in (val_df, test_df)
)

# Convert the encoded arrays to DataFrames named after the icon categories
train_encoded_summary_df, val_encoded_summary_df, test_encoded_summary_df = (
    pd.DataFrame(encoded, columns=encoder.get_feature_names_out(['icon']))
    for encoded in (train_encoded_summary, val_encoded_summary, test_encoded_summary)
)

# Concatenate the one-hot encoded 'icon' columns back onto the respective datasets
train_df, val_df, test_df = (
    pd.concat([split.reset_index(drop=True), dummies], axis=1)
    for split, dummies in (
        (train_df, train_encoded_summary_df),
        (val_df, val_encoded_summary_df),
        (test_df, test_encoded_summary_df),
    )
)

# # Check the resulting dataframe
# print(train_df.head())
# print(val_df.head())
# print(test_df.head())

### One-hot-encoding `source` and destination

In [25]:
# Show the distinct place names on each side of the trip
for heading, endpoint in (
    ("Unique values for source:", 'source'),
    ("Unique values for destination:", 'destination'),
):
    print(heading)
    print(df[endpoint].unique())

Unique values for source:
['Theatre District' 'Beacon Hill' 'Northeastern University' 'Fenway'
 'Back Bay' 'Haymarket Square' 'South Station' 'Financial District'
 'Boston University' 'North End' 'North Station' 'West End']
Unique values for destination:
['Fenway' 'Haymarket Square' 'North Station' 'Back Bay' 'Beacon Hill'
 'Financial District' 'Theatre District' 'South Station'
 'Boston University' 'West End' 'Northeastern University' 'North End']


The one hot-encoded categories for these two features will be:

- Haymarket Square
- Back Bay
- North End
- North Station
- Beacon Hill
- Boston University
- Fenway
- South Station
- Theatre District
- West End
- Financial District
- Northeastern University

In [26]:
# Normalise the pick-up names in every split (trim, blanks -> underscores)
for split_df in (train_df, val_df, test_df):
    split_df['source'] = split_df['source'].str.strip().str.replace(' ', '_')

# One-hot encode 'source': categories come from the training split and are
# reused as-is for validation and test
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
train_encoded_summary = encoder.fit_transform(train_df[['source']])
val_encoded_summary, test_encoded_summary = (
    encoder.transform(split[['source']]) for split in (val_df, test_df)
)

# Label the encoded arrays with the generated source_* column names
train_encoded_summary_df, val_encoded_summary_df, test_encoded_summary_df = (
    pd.DataFrame(encoded, columns=encoder.get_feature_names_out(['source']))
    for encoded in (train_encoded_summary, val_encoded_summary, test_encoded_summary)
)

# Add the source dummies to each split, matching rows by position
train_df, val_df, test_df = (
    pd.concat([split.reset_index(drop=True), dummies], axis=1)
    for split, dummies in (
        (train_df, train_encoded_summary_df),
        (val_df, val_encoded_summary_df),
        (test_df, test_encoded_summary_df),
    )
)

# # Check the resulting dataframe
# print(train_df.head())
# print(val_df.head())
# print(test_df.head())

In [27]:
# Keep the categories learned by the encoder fitted in the previous cell ('source')
source_categories = encoder.categories_[0]

# 'destination' gets its own encoder instance
destination_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# Normalise the drop-off names in every split (trim, blanks -> underscores)
for split_df in (train_df, val_df, test_df):
    split_df['destination'] = split_df['destination'].str.strip().str.replace(' ', '_')

# Fit on the training split, then encode validation and test with the same categories
train_encoded_destination = destination_encoder.fit_transform(train_df[['destination']])
val_encoded_destination, test_encoded_destination = (
    destination_encoder.transform(split[['destination']]) for split in (val_df, test_df)
)

# Wrap the encoded arrays as DataFrames with destination_* column names
train_encoded_destination_df, val_encoded_destination_df, test_encoded_destination_df = (
    pd.DataFrame(encoded, columns=destination_encoder.get_feature_names_out(['destination']))
    for encoded in (train_encoded_destination, val_encoded_destination, test_encoded_destination)
)

# Add the destination dummies to each split, matching rows by position
train_df, val_df, test_df = (
    pd.concat([split.reset_index(drop=True), dummies], axis=1)
    for split, dummies in (
        (train_df, train_encoded_destination_df),
        (val_df, val_encoded_destination_df),
        (test_df, test_encoded_destination_df),
    )
)

# # Check the resulting dataframe
# print("Train DataFrame:")
# print(train_df.head())
# print("\nValidation DataFrame:")
# print(val_df.head())
# print("\nTest DataFrame:")
# print(test_df.head())


### Encoding the `name` variable

In [28]:
# List the distinct ride product names in the full frame
df['name'].unique()

array(['Lyft XL', 'Taxi', 'Lyft', 'Lux Black XL', 'Black SUV', 'UberX',
       'UberPool', 'UberXL', 'Lux', 'WAV', 'Shared', 'Lux Black', 'Black'],
      dtype=object)

In [29]:
# Normalise the product names in every split (trim, blanks -> underscores)
for split_df in (train_df, val_df, test_df):
    split_df['name'] = split_df['name'].str.strip().str.replace(' ', '_')

# One-hot encode 'name' with categories learned from the training split only
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
train_encoded_summary = encoder.fit_transform(train_df[['name']])
val_encoded_summary, test_encoded_summary = (
    encoder.transform(split[['name']]) for split in (val_df, test_df)
)

# Label the encoded arrays with the generated name_* column names
train_encoded_summary_df, val_encoded_summary_df, test_encoded_summary_df = (
    pd.DataFrame(encoded, columns=encoder.get_feature_names_out(['name']))
    for encoded in (train_encoded_summary, val_encoded_summary, test_encoded_summary)
)

# Add the name dummies to each split, matching rows by position
train_df, val_df, test_df = (
    pd.concat([split.reset_index(drop=True), dummies], axis=1)
    for split, dummies in (
        (train_df, train_encoded_summary_df),
        (val_df, val_encoded_summary_df),
        (test_df, test_encoded_summary_df),
    )
)

# # Check the resulting dataframe
# print(train_df.head())
# print(val_df.head())
# print(test_df.head())

### Encoding the `cab_type` variable

In [30]:
# List the distinct cab_type values in the full frame
df['cab_type'].unique()

array(['Lyft', 'Uber'], dtype=object)

In [31]:
# Ensure there are no leading or trailing spaces in the 'cab_type' column
train_df['cab_type'] = train_df['cab_type'].str.strip()
val_df['cab_type'] = val_df['cab_type'].str.strip()
test_df['cab_type'] = test_df['cab_type'].str.strip()

# Pin the category set to what the training split contains. get_dummies derives
# its columns from the values present in each frame, so encoding the splits
# independently can yield different (or no) columns when a split lacks a
# category. A shared categorical dtype gives all three splits identical dummy
# columns; values unseen in training become all-zero rows.
cab_type_categories = sorted(train_df['cab_type'].dropna().unique())
train_df['cab_type'] = pd.Categorical(train_df['cab_type'], categories=cab_type_categories)
val_df['cab_type'] = pd.Categorical(val_df['cab_type'], categories=cab_type_categories)
test_df['cab_type'] = pd.Categorical(test_df['cab_type'], categories=cab_type_categories)

# Create dummy variables for the 'cab_type' column; drop_first removes the
# first (alphabetically smallest) category as the reference level
train_df = pd.get_dummies(train_df, columns=['cab_type'], drop_first=True)
val_df = pd.get_dummies(val_df, columns=['cab_type'], drop_first=True)
test_df = pd.get_dummies(test_df, columns=['cab_type'], drop_first=True)

# # Check the resulting dataframe
# print(train_df.head())
# print(val_df.head())
# print(test_df.head())

In [32]:
# Original text columns that have been replaced by their one-hot encoded versions
cat_columns_to_drop = [
    'source', 'destination', 'name',
    'short_summary', 'long_summary', 'icon',
]

In [33]:
# Remove the raw categorical columns from every split
train_df, val_df, test_df = (
    split.drop(columns=cat_columns_to_drop) for split in (train_df, val_df, test_df)
)

In [34]:
# Summarise the encoded training frame: row count, column count, dtypes and memory use
print("\nDataFrame Information using .info():")
train_df.info()


DataFrame Information using .info():
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70000 entries, 0 to 69999
Columns: 110 entries, hour to cab_type_Uber
dtypes: bool(1), float64(92), int64(17)
memory usage: 58.3 MB


## Standardization and PCA
The kernel kept dying at this point, so we cleared the memory and commented out the code we used to standardize the features. Instead, we imported the standardized files that those code blocks had generated, to keep memory usage manageable.

In [35]:
# Targets: the price column, kept as a one-column DataFrame
y_train, y_val, y_test = (split[['price']] for split in (train_df, val_df, test_df))

# Features: everything except the price
X_train, X_val, X_test = (
    split.drop(columns='price') for split in (train_df, val_df, test_df)
)

In [36]:
# Standardise the features. The scaler learns mean and standard deviation from
# the training split only; validation and test are transformed with those same
# statistics. Calling fit_transform on them would re-fit the scaler on each
# split, putting the splits on different scales and leaking evaluation data
# into the preprocessing.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# Restore the column names, which the scaler drops by returning plain arrays
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=X_train.columns)
X_val_scaled_df = pd.DataFrame(X_val_scaled, columns=X_val.columns)
X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=X_test.columns)

In [37]:
# Save the scaled DataFrames as Parquet files

# Write the scaled features and then the targets to the processed data folder
for file_name, frame in (
    ('X_train_scaled.parquet', X_train_scaled_df),
    ('X_val_scaled.parquet', X_val_scaled_df),
    ('X_test_scaled.parquet', X_test_scaled_df),
    ('y_train.parquet', y_train),
    ('y_val.parquet', y_val),
    ('y_test.parquet', y_test),
):
    frame.to_parquet(os.path.join(processed_data_folder, file_name), index=False)

In [39]:
# Keep as many principal components as are needed to explain 95% of the variance
# (adjust n_components as needed)
pca = PCA(n_components=0.95)

# Unwrap the scaled frames into plain arrays
X_train_scaled, X_val_scaled, X_test_scaled = (
    frame.to_numpy() for frame in (X_train_scaled_df, X_val_scaled_df, X_test_scaled_df)
)

# Learn the projection from the training split only, then reuse it for validation and test
X_train_pca = pca.fit_transform(X_train_scaled)
X_val_pca = pca.transform(X_val_scaled)
X_test_pca = pca.transform(X_test_scaled)

# Wrap the projections as DataFrames with columns named PC1, PC2, ...
X_train_pca_df, X_val_pca_df, X_test_pca_df = (
    pd.DataFrame(projected, columns=[f'PC{k}' for k in range(1, projected.shape[1] + 1)])
    for projected in (X_train_pca, X_val_pca, X_test_pca)
)

In [40]:
# Report (rows, components) for the train, validation and test projections
for pca_frame in (X_train_pca_df, X_val_pca_df, X_test_pca_df):
    print(pca_frame.shape)

(70000, 56)
(20000, 56)
(10000, 56)


PCA reduces the data to 56 components, which is very useful for models like linear regression that can suffer on high-dimensional datasets.

In [41]:
# Save PCA DataFrames as Parquet files
train_pca_path = os.path.join(processed_data_folder, 'X_train_pca.parquet')
val_pca_path = os.path.join(processed_data_folder, 'X_val_pca.parquet')
test_pca_path = os.path.join(processed_data_folder, 'X_test_pca.parquet')

X_train_pca_df.to_parquet(train_pca_path, index=False)
X_val_pca_df.to_parquet(val_pca_path, index=False)
X_test_pca_df.to_parquet(test_pca_path, index=False)