# Week 5 Notebook: Engineer Features
The goal of this week's assignment is to engineer new features and reduce the dimensionality of the dataset.

### Import packages

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import zipfile
from sklearn.preprocessing import OneHotEncoder


### Read data as dataframe

In [None]:
# NOTE: everything in this cell sits inside a bare triple-quoted string, so none of it is
# executed; names such as raw_data_folder and uber_file_path are NOT defined by this cell.
"""
current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)

data_folder = os.path.join(parent_dir,"data")
raw_data_folder = os.path.join(data_folder,"raw")

uber_file_path = os.path.join(raw_data_folder, "uber.csv.zip")
lyft_file_path = os.path.join(raw_data_folder, "lyft.csv.zip")
"""

In [None]:
# NOTE: this cell is a bare triple-quoted string, so the zip extraction below never runs.
"""
if os.path.exists(uber_file_path):
    with zipfile.ZipFile(uber_file_path, 'r') as zip_ref:
        zip_ref.extractall(raw_data_folder)
    print(f"Uber file extracted to: {raw_data_folder}")
else:
    print(f"Uber file not found: {uber_file_path}")

if os.path.exists(lyft_file_path):
    with zipfile.ZipFile(lyft_file_path, 'r') as zip_ref:
        zip_ref.extractall(raw_data_folder)
    print(f"Lyft file extracted to: {raw_data_folder}")
else:
    print(f"Lyft file not found: {lyft_file_path}")
"""

In [None]:
# NOTE: this cell is a bare triple-quoted string, so the Uber/Lyft CSVs are not read or
# concatenated here; `df` is not created by this cell.
"""
uber_csv_path = os.path.join(raw_data_folder, "uber.csv")
lyft_csv_path = os.path.join(raw_data_folder, "lyft.csv")

uber_df = pd.read_csv(uber_csv_path)
lyft_df = pd.read_csv(lyft_csv_path)
    
df = pd.concat([uber_df, lyft_df], ignore_index=True)
"""

In [2]:
# Load the sampled raw data as a pandas dataframe and list its columns.
# The path is relative, so the CSV is looked up in the kernel's current working directory.
df = pd.read_csv('rawSampledData.csv')
print(df.columns)

Index(['datetime', 'timestamp', 'hour', 'day', 'month', 'timezone', 'source',
       'destination', 'cab_type', 'product_id', 'name', 'price', 'distance',
       'surge_multiplier', 'latitude', 'longitude', 'temperature',
       'apparentTemperature', 'short_summary', 'long_summary',
       'precipIntensity', 'precipProbability', 'humidity', 'windSpeed',
       'windGust', 'windGustTime', 'visibility', 'temperatureHigh',
       'temperatureHighTime', 'temperatureLow', 'temperatureLowTime',
       'apparentTemperatureHigh', 'apparentTemperatureHighTime',
       'apparentTemperatureLow', 'apparentTemperatureLowTime', 'icon',
       'dewPoint', 'pressure', 'windBearing', 'cloudCover', 'uvIndex',
       'visibility.1', 'ozone', 'sunriseTime', 'sunsetTime', 'moonPhase',
       'precipIntensityMax', 'uvIndexTime', 'temperatureMin',
       'temperatureMinTime', 'temperatureMax', 'temperatureMaxTime',
       'apparentTemperatureMin', 'apparentTemperatureMinTime',
       'apparentTemperatureMax

### Split the dataset into training, testing, and validation sets
- training set is 70% of the dataframe
- validation set is 20% of the dataframe
- test set is 10% of the dataframe

In [3]:
def train_val_test_split(df, val_frac=0.2, test_frac=0.1, random_state=123):
    """Shuffle ``df`` and cut it into train, validation and test partitions.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split. It is not modified.
    val_frac : float, default 0.2
        Fraction of rows placed in the validation set.
    test_frac : float, default 0.1
        Fraction of rows placed in the test set.
    random_state : int, default 123
        Seed passed to ``DataFrame.sample`` so the shuffle is reproducible.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_df, val_df, test_df)``. Each keeps the original index labels
        of its rows. The training set receives every row not assigned to
        validation or test.

    Raises
    ------
    ValueError
        If a fraction is negative or the two fractions sum to more than 1.
    """
    if val_frac < 0 or test_frac < 0 or val_frac + test_frac > 1:
        raise ValueError(
            "val_frac and test_frac must be non-negative and sum to at most 1"
        )

    # Shuffle every row once; all three partitions are carved from this order.
    shuffled = df.sample(frac=1, random_state=random_state)

    # Sizes are truncated to whole rows; the remainder goes to the training set.
    val_size = int(len(shuffled) * val_frac)
    test_size = int(len(shuffled) * test_frac)
    holdout_size = val_size + test_size

    # Return copies rather than slices of the shuffled frame, so that later
    # column assignments on a partition do not raise SettingWithCopyWarning.
    val_df = shuffled.iloc[:val_size].copy()
    test_df = shuffled.iloc[val_size:holdout_size].copy()
    train_df = shuffled.iloc[holdout_size:].copy()
    return train_df, val_df, test_df

# Build the three partitions from the raw dataframe.
# NOTE(review): this runs before the "Missing Value Imputation" cells further down, which
# reassign and modify `df`. The frames created here are separate objects, so those later
# changes to `df` (e.g. the imputed Taxi prices) do not reach train_df / val_df / test_df.
train_df, val_df, test_df = train_val_test_split(df)


In [4]:
# Categorical columns are listed explicitly so the set does not depend on dtype inference.
cat_col = [
 'timezone',
 'source',
 'destination',
 'cab_type',
 'product_id',
 'name',
 'short_summary',
 'long_summary',
 'icon']

# Every remaining column is treated as numeric, except 'id' and 'datetime',
# which are excluded when present.
num_col = [
    col for col in df.columns
    if col not in cat_col and col not in ('id', 'datetime')
]

print(f"cat_col = {len(cat_col)}\nnum_col = {len(num_col)}\n")

cat_col = 9
num_col = 46



In [5]:
# Display the list of numeric column names for a visual check.
num_col

['timestamp',
 'hour',
 'day',
 'month',
 'price',
 'distance',
 'surge_multiplier',
 'latitude',
 'longitude',
 'temperature',
 'apparentTemperature',
 'precipIntensity',
 'precipProbability',
 'humidity',
 'windSpeed',
 'windGust',
 'windGustTime',
 'visibility',
 'temperatureHigh',
 'temperatureHighTime',
 'temperatureLow',
 'temperatureLowTime',
 'apparentTemperatureHigh',
 'apparentTemperatureHighTime',
 'apparentTemperatureLow',
 'apparentTemperatureLowTime',
 'dewPoint',
 'pressure',
 'windBearing',
 'cloudCover',
 'uvIndex',
 'visibility.1',
 'ozone',
 'sunriseTime',
 'sunsetTime',
 'moonPhase',
 'precipIntensityMax',
 'uvIndexTime',
 'temperatureMin',
 'temperatureMinTime',
 'temperatureMax',
 'temperatureMaxTime',
 'apparentTemperatureMin',
 'apparentTemperatureMinTime',
 'apparentTemperatureMax',
 'apparentTemperatureMaxTime']

### Missing Value Imputation 

In [6]:
def taxi_price_calculator(distance, time):
    """Estimate a taxi fare from trip distance and trip duration.

    The fare is a fixed base charge, plus a per-mile charge on ``distance``,
    plus a per-minute charge on ``time``.
    """
    flag_drop = 2.60     # fixed base fare
    mile_rate = 2.8      # charge per mile
    minute_rate = 0.47   # charge per minute

    distance_charge = distance * mile_rate
    time_charge = time * minute_rate
    return flag_drop + distance_charge + time_charge

In [7]:
# Distinct (source, destination) pairs, written to disk with their row index.
route_columns = ["source", "destination"]
unique_combos = df[route_columns].drop_duplicates()
unique_combos.to_csv("unique_combo.csv")

# Every location name that occurs as either a source or a destination.
locations = pd.unique(df[route_columns].to_numpy().ravel())
print(locations)

['Theatre District' 'Fenway' 'Beacon Hill' 'Haymarket Square'
 'Northeastern University' 'North Station' 'Back Bay' 'Financial District'
 'South Station' 'Boston University' 'North End' 'West End']


In [8]:
# Look up coordinates for each location name with the project helper.
# NOTE(review): the helper is named "longtitude_latitude" but its result is unpacked
# below as (lat, long) — confirm the order it actually returns.
from get_long_lat import get_longtitude_latitude

# Maps location name -> [lat, long]
location_dict = {}

for location in locations:
    lat, long = get_longtitude_latitude(location)
    # A location is skipped when either coordinate is None, so it will be missing from
    # location_dict; any later location_dict[name] lookup for it raises KeyError.
    if lat is not None and long is not None:
        location_dict[location] = [lat, long]

ModuleNotFoundError: No module named 'geopy'

In [None]:
# Attach the stored coordinates of both endpoints of every route. Columns are created in
# the order source_lat, source_long, destination_lat, destination_long.
for endpoint in ("source", "destination"):
    for suffix, position in (("lat", 0), ("long", 1)):
        unique_combos[f"{endpoint}_{suffix}"] = unique_combos[endpoint].apply(
            lambda place, position=position: location_dict[place][position]
        )

In [None]:
# Read the per-route ETA table and preview it.
# NOTE(review): `saved_data_folder` is not assigned in any cell shown above, so on a fresh
# kernel this cell raises NameError unless the name is defined elsewhere.
rides_with_eta = os.path.join(saved_data_folder, "rides_with_etas.csv")
time_df = pd.read_csv(rides_with_eta)
time_df.head()

In [None]:
# Attach each ride's ETA by matching on its (source, destination) pair. A left join keeps
# every ride, leaving eta_minutes as NaN where the pair has no entry in time_df.
# NOTE(review): if a pair occurs more than once in time_df, the matching rides are repeated
# once per occurrence — confirm the pairs are unique there.
eta_lookup = time_df[['source', 'destination', 'eta_minutes']]
df = df.merge(eta_lookup, on=['source', 'destination'], how='left')
df.head()

In [None]:
# Overwrite the price of every 'Taxi' row with the fare computed from its distance and ETA.
# The fare formula is plain arithmetic, so it is applied to whole columns at once instead of
# row by row. The mask is built a single time and reused for both the read and the write.
is_taxi = df['name'] == 'Taxi'
df.loc[is_taxi, 'price'] = taxi_price_calculator(
    df.loc[is_taxi, 'distance'],
    df.loc[is_taxi, 'eta_minutes'],
)

In [None]:
# Count the missing values per column among the Taxi rides only.
df_taxi = df.loc[df["name"].eq("Taxi")]
df_taxi.isna().sum()

In [None]:
# Remove the eta_minutes helper column from the dataframe.
df = df.drop(columns='eta_minutes')

In [None]:
# Rows whose price is still missing, and their share of the whole dataframe.
price_missing = df['price'].isna()
df_na = df[price_missing]
missing_percentage_after_imputing = price_missing.sum() / len(df) * 100
print(f"Percentage of missing values in 'price' after imputing: {missing_percentage_after_imputing:.2f}%")

In [None]:
# Spot-check the imputed prices: show distance next to price for the first few Taxi rides.
df_taxi = df.loc[df["name"].eq("Taxi")]
df_taxi[['distance', 'price']].head()

In [None]:
# Per-column count of values that are still missing after imputing.
df.isna().sum()

In [None]:
# Rows whose price is missing, and their share of the whole dataframe.
# NOTE(review): this repeats the identical check from a few cells earlier; no cell in
# between changes `df`, so it prints the same figure.
price_missing = df['price'].isna()
df_na = df[price_missing]
missing_percentage_after_imputing = price_missing.sum() / len(df) * 100
print(f"Percentage of missing values in 'price' after imputing: {missing_percentage_after_imputing:.2f}%")

# **One-hot Encoding Categorical Variables**

### **Encoding the short_summary variable**

In [None]:
# List the distinct short_summary values in the full dataframe `df` (not the split frames).
print(df['short_summary'].unique())


The short_summary variable will be divided into binary categories: 
* Mostly Cloudy
* Rain
* Clear
* Partly Cloudy
* Overcast
* Light Rain
* Foggy
* Possible Drizzle
* Drizzle


In [None]:
# Trim leading/trailing whitespace so padded labels collapse into a single category.
# .assign builds a new frame rather than assigning into the split frames in place, which
# can raise pandas' SettingWithCopyWarning when a frame is a slice of another.
train_df = train_df.assign(short_summary=train_df['short_summary'].str.strip())
val_df = val_df.assign(short_summary=val_df['short_summary'].str.strip())
test_df = test_df.assign(short_summary=test_df['short_summary'].str.strip())

# Dense output so the result can be wrapped in a DataFrame directly. Categories not seen
# while fitting are encoded as all-zero rows instead of raising an error.
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# Learn the category set from the training split only, then reuse it for validation and test.
train_encoded_summary = encoder.fit_transform(train_df[['short_summary']])
val_encoded_summary = encoder.transform(val_df[['short_summary']])
test_encoded_summary = encoder.transform(test_df[['short_summary']])

# Wrap the encoded arrays in DataFrames; the generated column names are shared by all splits.
encoded_columns = encoder.get_feature_names_out(['short_summary'])
train_encoded_summary_df = pd.DataFrame(train_encoded_summary, columns=encoded_columns)
val_encoded_summary_df = pd.DataFrame(val_encoded_summary, columns=encoded_columns)
test_encoded_summary_df = pd.DataFrame(test_encoded_summary, columns=encoded_columns)

# Replace the original 'short_summary' column with its one-hot columns. The index is reset
# so rows line up by position with the newly built encoded frames.
train_df = pd.concat([train_df.drop(columns='short_summary').reset_index(drop=True), train_encoded_summary_df], axis=1)
val_df = pd.concat([val_df.drop(columns='short_summary').reset_index(drop=True), val_encoded_summary_df], axis=1)
test_df = pd.concat([test_df.drop(columns='short_summary').reset_index(drop=True), test_encoded_summary_df], axis=1)

# Check the resulting dataframes
print(train_df.head())
print(val_df.head())
print(test_df.head())

### **Encoding the long_summary variable**

In [None]:
# List the distinct long_summary values in the full dataframe `df` (not the split frames).
print(df['long_summary'].unique())


The long_summary variable will be divided into binary categories:

* Rain throughout the day
* Rain until morning, starting again in the evening 
* Light rain in the morning
* Partly cloudy throughout the day
* Light rain in the morning and overnight
* Light rain until evening 
* Foggy in the morning
* Overcast throughout the day
* Possible drizzle in the morning
* Rain in the morning and afternoon

In [None]:
# Trim leading/trailing whitespace so padded labels collapse into a single category.
# .assign returns a new frame instead of writing into the existing one in place.
train_df = train_df.assign(long_summary=train_df['long_summary'].str.strip())
val_df = val_df.assign(long_summary=val_df['long_summary'].str.strip())
test_df = test_df.assign(long_summary=test_df['long_summary'].str.strip())

# Dense output so the result can be wrapped in a DataFrame directly. Categories not seen
# while fitting are encoded as all-zero rows instead of raising an error.
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# Learn the category set from the training split only, then reuse it for validation and test.
train_encoded_summary = encoder.fit_transform(train_df[['long_summary']])
val_encoded_summary = encoder.transform(val_df[['long_summary']])
test_encoded_summary = encoder.transform(test_df[['long_summary']])

# Wrap the encoded arrays in DataFrames; the generated column names are shared by all splits.
encoded_columns = encoder.get_feature_names_out(['long_summary'])
train_encoded_summary_df = pd.DataFrame(train_encoded_summary, columns=encoded_columns)
val_encoded_summary_df = pd.DataFrame(val_encoded_summary, columns=encoded_columns)
test_encoded_summary_df = pd.DataFrame(test_encoded_summary, columns=encoded_columns)

# Replace the original 'long_summary' column with its one-hot columns. The index is reset
# so rows line up by position with the newly built encoded frames.
train_df = pd.concat([train_df.drop(columns='long_summary').reset_index(drop=True), train_encoded_summary_df], axis=1)
val_df = pd.concat([val_df.drop(columns='long_summary').reset_index(drop=True), val_encoded_summary_df], axis=1)
test_df = pd.concat([test_df.drop(columns='long_summary').reset_index(drop=True), test_encoded_summary_df], axis=1)

# Check the resulting dataframes
print(train_df.head())
print(val_df.head())
print(test_df.head())

### **Encoding the icon variable**

In [None]:
# List the distinct icon values in the full dataframe `df` (not the split frames).
print(df['icon'].unique())


The one-hot encoded categories for the icon feature will be:
* partly cloudy night
* rain
* clear night
* cloudy
* fog
* clear-day
* partly cloudy day

In [None]:

# Trim leading/trailing whitespace from the 'icon' labels in every split.
train_df = train_df.assign(icon=train_df['icon'].str.strip())
val_df = val_df.assign(icon=val_df['icon'].str.strip())
test_df = test_df.assign(icon=test_df['icon'].str.strip())

# Dense one-hot encoder; categories not seen while fitting are ignored rather than raising.
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# Fit on the training split's 'icon' column, then apply the learned categories to the
# validation and test splits.
train_encoded_summary = encoder.fit_transform(train_df[['icon']])
val_encoded_summary = encoder.transform(val_df[['icon']])
test_encoded_summary = encoder.transform(test_df[['icon']])

# Turn the encoded arrays into DataFrames labelled with the generated 'icon' column names.
icon_dummy_names = encoder.get_feature_names_out(['icon'])
train_encoded_summary_df = pd.DataFrame(train_encoded_summary, columns=icon_dummy_names)
val_encoded_summary_df = pd.DataFrame(val_encoded_summary, columns=icon_dummy_names)
test_encoded_summary_df = pd.DataFrame(test_encoded_summary, columns=icon_dummy_names)

# Replace the original 'icon' column with its one-hot columns in each split. The index is
# reset so rows line up by position with the encoded frames.
train_df = pd.concat([train_df.drop(columns='icon').reset_index(drop=True), train_encoded_summary_df], axis=1)
val_df = pd.concat([val_df.drop(columns='icon').reset_index(drop=True), val_encoded_summary_df], axis=1)
test_df = pd.concat([test_df.drop(columns='icon').reset_index(drop=True), test_encoded_summary_df], axis=1)

# Check the resulting dataframes
for split_df in (train_df, val_df, test_df):
    print(split_df.head())

### **The timezone variable**
The timezone variable is America/New_York for every ride in the dataset, so we drop it: a constant column does not help distinguish between data points and adds no value to our analysis.


In [None]:
# List the distinct timezone values in the full dataframe `df` (not the split frames).
print(df['timezone'].unique())


In [33]:
# Remove the 'timezone' column from each of the three split frames.
train_df = train_df.drop(columns='timezone')
val_df = val_df.drop(columns='timezone')
test_df = test_df.drop(columns='timezone')

In [None]:
# List the distinct source values in the full dataframe `df` (not the split frames).
print(df['source'].unique())

The one-hot encoded categories for the source feature will be:
* Haymarket Square
* Back Bay
* North End
* North Station
* Beacon Hill 
* Boston University
* Fenway
* South Station
* Theatre District
* West End
* Financial District
* Northeastern University

In [None]:
# Trim leading/trailing whitespace from the 'source' labels in every split.
train_df = train_df.assign(source=train_df['source'].str.strip())
val_df = val_df.assign(source=val_df['source'].str.strip())
test_df = test_df.assign(source=test_df['source'].str.strip())

# Dense one-hot encoder; categories not seen while fitting are ignored rather than raising.
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# Fit on the training split's 'source' column, then apply the learned categories to the
# validation and test splits.
train_encoded_summary = encoder.fit_transform(train_df[['source']])
val_encoded_summary = encoder.transform(val_df[['source']])
test_encoded_summary = encoder.transform(test_df[['source']])

# Turn the encoded arrays into DataFrames labelled with the generated 'source' column names.
source_dummy_names = encoder.get_feature_names_out(['source'])
train_encoded_summary_df = pd.DataFrame(train_encoded_summary, columns=source_dummy_names)
val_encoded_summary_df = pd.DataFrame(val_encoded_summary, columns=source_dummy_names)
test_encoded_summary_df = pd.DataFrame(test_encoded_summary, columns=source_dummy_names)

# Replace the original 'source' column with its one-hot columns in each split. The index is
# reset so rows line up by position with the encoded frames.
train_df = pd.concat([train_df.drop(columns='source').reset_index(drop=True), train_encoded_summary_df], axis=1)
val_df = pd.concat([val_df.drop(columns='source').reset_index(drop=True), val_encoded_summary_df], axis=1)
test_df = pd.concat([test_df.drop(columns='source').reset_index(drop=True), test_encoded_summary_df], axis=1)

# Check the resulting dataframes
for split_df in (train_df, val_df, test_df):
    print(split_df.head())

In [25]:
### **The name variable**

In [None]:
# List the distinct name values in the full dataframe `df` (not the split frames).
print(df['name'].unique())

The one-hot encoded categories for the name feature will be:
* Shared
* Lux
* Lyft
* Lux Black XL
* Lyft XL
* Lux Black
* UberXL
* Black
* UberX
* WAV
* Black SUV
* UberPool
* Taxi

In [None]:
# Trim leading/trailing whitespace from the 'name' labels in every split.
train_df = train_df.assign(name=train_df['name'].str.strip())
val_df = val_df.assign(name=val_df['name'].str.strip())
test_df = test_df.assign(name=test_df['name'].str.strip())

# Dense one-hot encoder; categories not seen while fitting are ignored rather than raising.
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# Fit on the training split's 'name' column, then apply the learned categories to the
# validation and test splits.
train_encoded_summary = encoder.fit_transform(train_df[['name']])
val_encoded_summary = encoder.transform(val_df[['name']])
test_encoded_summary = encoder.transform(test_df[['name']])

# Turn the encoded arrays into DataFrames labelled with the generated 'name' column names.
name_dummy_names = encoder.get_feature_names_out(['name'])
train_encoded_summary_df = pd.DataFrame(train_encoded_summary, columns=name_dummy_names)
val_encoded_summary_df = pd.DataFrame(val_encoded_summary, columns=name_dummy_names)
test_encoded_summary_df = pd.DataFrame(test_encoded_summary, columns=name_dummy_names)

# Replace the original 'name' column with its one-hot columns in each split. The index is
# reset so rows line up by position with the encoded frames.
train_df = pd.concat([train_df.drop(columns='name').reset_index(drop=True), train_encoded_summary_df], axis=1)
val_df = pd.concat([val_df.drop(columns='name').reset_index(drop=True), val_encoded_summary_df], axis=1)
test_df = pd.concat([test_df.drop(columns='name').reset_index(drop=True), test_encoded_summary_df], axis=1)

# Check the resulting dataframes
for split_df in (train_df, val_df, test_df):
    print(split_df.head())