# Week 5 Notebook: Feature Engineering
The goal of this week's assignment is to engineer new features and reduce the dimensionality of the dataset. 

### Import packages

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import zipfile
from sklearn.preprocessing import OneHotEncoder


### Read data as dataframe

In [None]:
# Disabled cell: everything below sits inside a bare string literal, so none of it executes.
# If re-enabled, it would build paths to uber.csv.zip and lyft.csv.zip under
# <parent of the working directory>/data/raw.
"""
current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)

data_folder = os.path.join(parent_dir,"data")
raw_data_folder = os.path.join(data_folder,"raw")

uber_file_path = os.path.join(raw_data_folder, "uber.csv.zip")
lyft_file_path = os.path.join(raw_data_folder, "lyft.csv.zip")
"""

In [None]:
# Disabled cell: everything below sits inside a bare string literal, so none of it executes.
# If re-enabled, it would extract each zip archive into raw_data_folder when the archive exists
# and print a message either way. It reads uber_file_path, lyft_file_path and raw_data_folder,
# which are only assigned inside another string-wrapped cell.
"""
if os.path.exists(uber_file_path):
    with zipfile.ZipFile(uber_file_path, 'r') as zip_ref:
        zip_ref.extractall(raw_data_folder)
    print(f"Uber file extracted to: {raw_data_folder}")
else:
    print(f"Uber file not found: {uber_file_path}")

if os.path.exists(lyft_file_path):
    with zipfile.ZipFile(lyft_file_path, 'r') as zip_ref:
        zip_ref.extractall(raw_data_folder)
    print(f"Lyft file extracted to: {raw_data_folder}")
else:
    print(f"Lyft file not found: {lyft_file_path}")
"""

In [None]:
# Disabled cell: everything below sits inside a bare string literal, so none of it executes.
# If re-enabled, it would read uber.csv and lyft.csv from raw_data_folder and stack them into a
# single `df` with a fresh 0..n-1 index (ignore_index=True).
"""
uber_csv_path = os.path.join(raw_data_folder, "uber.csv")
lyft_csv_path = os.path.join(raw_data_folder, "lyft.csv")

uber_df = pd.read_csv(uber_csv_path)
lyft_df = pd.read_csv(lyft_csv_path)
    
df = pd.concat([uber_df, lyft_df], ignore_index=True)
"""

In [2]:
# Read the sampled ride data (looked up relative to the working directory) into `df`,
# then list its column names.
df = pd.read_csv(filepath_or_buffer='rawSampledData.csv')
print(df.columns)

Index(['datetime', 'timestamp', 'hour', 'day', 'month', 'timezone', 'source',
       'destination', 'cab_type', 'product_id', 'name', 'price', 'distance',
       'surge_multiplier', 'latitude', 'longitude', 'temperature',
       'apparentTemperature', 'short_summary', 'long_summary',
       'precipIntensity', 'precipProbability', 'humidity', 'windSpeed',
       'windGust', 'windGustTime', 'visibility', 'temperatureHigh',
       'temperatureHighTime', 'temperatureLow', 'temperatureLowTime',
       'apparentTemperatureHigh', 'apparentTemperatureHighTime',
       'apparentTemperatureLow', 'apparentTemperatureLowTime', 'icon',
       'dewPoint', 'pressure', 'windBearing', 'cloudCover', 'uvIndex',
       'visibility.1', 'ozone', 'sunriseTime', 'sunsetTime', 'moonPhase',
       'precipIntensityMax', 'uvIndexTime', 'temperatureMin',
       'temperatureMinTime', 'temperatureMax', 'temperatureMaxTime',
       'apparentTemperatureMin', 'apparentTemperatureMinTime',
       'apparentTemperatureMax

In [None]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

### Split the dataset into training, testing, and validation sets
- training set is 70% of the dataframe
- validation set is 20% of the dataframe
- test set is 10% of the dataframe

In [3]:
def train_val_test_split(df, val_frac=0.2, test_frac=0.1, random_state=123):
    """Shuffle ``df`` and cut it into train, validation and test frames.

    The rows are shuffled once with a fixed seed, then sliced positionally:
    the first block is the validation set, the next block the test set, and
    everything that remains is the training set. The three frames are
    disjoint and together contain every row of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split. It is not modified.
    val_frac : float, default 0.2
        Fraction of rows for the validation set (size is rounded down).
    test_frac : float, default 0.1
        Fraction of rows for the test set (size is rounded down).
    random_state : int, default 123
        Seed passed to ``DataFrame.sample`` so the shuffle is reproducible.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_df, val_df, test_df)``, in that order.

    Raises
    ------
    ValueError
        If either fraction is negative or the two together exceed 1.
    """
    if val_frac < 0 or test_frac < 0 or val_frac + test_frac > 1:
        raise ValueError(
            "val_frac and test_frac must be non-negative and sum to at most 1"
        )

    shuffled = df.sample(frac=1, random_state=random_state)

    # int() truncates, so any rounding remainder ends up in the training set.
    val_size = int(len(shuffled) * val_frac)
    test_size = int(len(shuffled) * test_frac)
    holdout_end = val_size + test_size

    val_df = shuffled.iloc[:val_size]
    test_df = shuffled.iloc[val_size:holdout_end]
    train_df = shuffled.iloc[holdout_end:]
    return train_df, val_df, test_df

# NOTE(review): this split is taken from `df` as loaded. The cells that follow drop columns from,
# and add engineered features to, `df` itself; train_val_test_split slices a shuffled copy made by
# df.sample, so those later changes do not reach train_df / val_df / test_df.
# TODO confirm the split is redone after feature engineering.
train_df, val_df, test_df = train_val_test_split(df)


## Feature Engineering

### Remove unnecessary features

In [4]:
# Drop columns that are not used as features.
# ('visibility.1' is presumably a duplicate 'visibility' column renamed by pandas on read;
# verify against the raw file.)
# Rebinding `df` with errors='ignore' keeps this cell re-runnable: the in-place version raised a
# KeyError on a second run because the columns were already gone.
df = df.drop(columns=['timezone', 'product_id', 'visibility.1'], errors='ignore')

### Create new variables

In [6]:
# All flags below are 0/1 integer columns added to `df`. They are computed with vectorized
# comparisons rather than row-wise apply; missing values compare as False and therefore yield 0.
ride_hour = df['hour']

# Rush Hour
# Hour of day 6 through 9, or 16 through 18 (bounds inclusive)
df['rush_hour'] = (ride_hour.between(6, 9) | ride_hour.between(16, 18)).astype(int)

# Weekend
# Saturday or Sunday (dayofweek: Monday=0 ... Sunday=6)
df['datetime'] = pd.to_datetime(df['datetime'])
ride_weekday = df['datetime'].dt.dayofweek
df['weekend'] = (ride_weekday >= 5).astype(int)

# Holiday
# Matched on the calendar date formatted as YYYY-MM-DD
holidays = ['2018-11-11', '2018-11-22','2018-12-24','2018-12-25','2018-12-31'] 
df['holiday'] = df['datetime'].dt.date.astype(str).isin(holidays).astype(int)

# Extreme Weather
# Apparent temperature below 20 AND wind gust above 25
df['extreme_weather'] = ((df['apparentTemperature'] < 20) & (df['windGust'] > 25)).astype(int)

# Snow
# Temperature below 32 AND precipitation intensity above 0
df['snow'] = ((df['temperature'] < 32) & (df['precipIntensity'] > 0)).astype(int)

# Reduced Visibility
# Visibility below 5
df['reduced_visibility'] = (df['visibility'] < 5).astype(int)

# Dinner Time
# Hour of day 17 through 20 (bounds inclusive)
df['dinner_time'] = ride_hour.between(17, 20).astype(int)

# Friday or Saturday night
# Hours 17-23 on Friday/Saturday, plus hours 0-2 of the following morning (Saturday/Sunday)
df['fri_sat_night'] = (
    (ride_hour.between(17, 23) & ride_weekday.isin([4, 5]))
    | (ride_hour.between(0, 2) & ride_weekday.isin([5, 6]))
).astype(int)

# Late Night
# Hours 22-23 or 0-4
df['late_night'] = (ride_hour.between(22, 23) | ride_hour.between(0, 4)).astype(int)

# Late Night Weekend
# Hour >= 22 on Friday/Saturday, or hour <= 4 on the following morning (Saturday/Sunday)
df['late_night_weekend'] = (
    ((ride_hour >= 22) & ride_weekday.isin([4, 5]))
    | ((ride_hour <= 4) & ride_weekday.isin([5, 6]))
).astype(int)

# Bruins or Celtics play in Boston
# Dates are zero-padded YYYY-MM-DD strings so they match the formatted ride date used below.
bruins = ['2018-11-05', '2018-11-08', '2018-11-10', '2018-11-11', '2018-11-23', '2018-11-29',
          '2018-12-01', '2018-12-08', '2018-12-11', '2018-12-16', '2018-12-20', '2018-12-22',
          '2018-12-27']
celtics = ['2018-11-01', '2018-11-14', '2018-11-16', '2018-11-17', '2018-11-21', '2018-11-30',
           '2018-12-06', '2018-12-10', '2018-12-14', '2018-12-19', '2018-12-21', '2018-12-23',
           '2018-12-25']
game_dates = bruins + celtics
# The previous check tested each Timestamp for membership in this list of strings, which never
# matches, so game_day was always 0. Compare calendar dates as strings instead, the same way the
# holiday flag does. pd.to_datetime is a no-op when the column is already datetime.
df['game_day'] = pd.to_datetime(df['datetime']).dt.date.astype(str).isin(game_dates).astype(int)
