# Week 6 Notebook: Model training, hyperparameter tuning, and model evaluation
The goal of this week's assignment is to use one modeling method with 3 different hyperparameter settings of the method. 

### Import packages

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import zipfile
from sklearn.preprocessing import OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler


### Read data as dataframe

In [2]:
current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)

data_folder = os.path.join(parent_dir,"data")
raw_data_folder = os.path.join(data_folder,"raw")
interim_data_folder = os.path.join(data_folder,"interim")
processed_data_folder = os.path.join(data_folder, "processed")

raw_data_file = os.path.join(raw_data_folder, 'rawSampledData.csv')

In [3]:
# Load the data as a pandas dataframe.
df = pd.read_csv(raw_data_file)
print(df.columns)

Index(['datetime', 'timestamp', 'hour', 'day', 'month', 'timezone', 'source',
       'destination', 'cab_type', 'product_id', 'name', 'price', 'distance',
       'surge_multiplier', 'latitude', 'longitude', 'temperature',
       'apparentTemperature', 'short_summary', 'long_summary',
       'precipIntensity', 'precipProbability', 'humidity', 'windSpeed',
       'windGust', 'windGustTime', 'visibility', 'temperatureHigh',
       'temperatureHighTime', 'temperatureLow', 'temperatureLowTime',
       'apparentTemperatureHigh', 'apparentTemperatureHighTime',
       'apparentTemperatureLow', 'apparentTemperatureLowTime', 'icon',
       'dewPoint', 'pressure', 'windBearing', 'cloudCover', 'uvIndex',
       'visibility.1', 'ozone', 'sunriseTime', 'sunsetTime', 'moonPhase',
       'precipIntensityMax', 'uvIndexTime', 'temperatureMin',
       'temperatureMinTime', 'temperatureMax', 'temperatureMaxTime',
       'apparentTemperatureMin', 'apparentTemperatureMinTime',
       'apparentTemperatureMax

In [4]:
df.head()

Unnamed: 0,datetime,timestamp,hour,day,month,timezone,source,destination,cab_type,product_id,...,precipIntensityMax,uvIndexTime,temperatureMin,temperatureMinTime,temperatureMax,temperatureMaxTime,apparentTemperatureMin,apparentTemperatureMinTime,apparentTemperatureMax,apparentTemperatureMaxTime
0,2018-12-13 20:40:15,1544734000.0,20,13,12,America/New_York,Theatre District,Fenway,Lyft,lyft_plus,...,0.0001,1544716800,18.29,1544688000,33.83,1544731200,13.79,1544688000,32.85,1544734800
1,2018-12-02 09:53:05,1543744000.0,9,2,12,America/New_York,Beacon Hill,Haymarket Square,Uber,8cf7e821-f0d3-49c6-8eba-e679c0ebcf6a,...,0.0894,1543770000,36.4,1543726800,50.94,1543788000,35.78,1543748400,50.27,1543788000
2,2018-11-28 21:53:08,1543442000.0,21,28,11,America/New_York,Northeastern University,North Station,Lyft,lyft,...,0.0,1543420800,33.7,1543399200,42.6,1543438800,29.88,1543399200,36.56,1543435200
3,2018-12-13 05:15:05,1544678000.0,5,13,12,America/New_York,Fenway,Back Bay,Lyft,lyft_luxsuv,...,0.0001,1544716800,17.98,1544688000,33.82,1544731200,13.52,1544688000,32.84,1544734800
4,2018-12-02 09:48:00,1543744000.0,9,2,12,America/New_York,Northeastern University,Beacon Hill,Uber,6d318bcc-22a3-4af6-bddd-b409bfce1546,...,0.0894,1543770000,36.4,1543726800,50.94,1543788000,35.78,1543748400,50.27,1543788000


In [5]:
df['datetime'] = pd.to_datetime(df['datetime'])

## Split the dataset into training, testing, and validation sets
- training set is 70% of the dataframe
- validation set is 20% of the dataframe
- test set is 10% of the dataframe

In [6]:
def train_val_test_split(df):
    # Shuffle the dataset and calculate the size of validation and test sets

    df = df.sample(frac=1, random_state=123)

    val_size = int(len(df) * 0.2)
    test_size = int(len(df) * 0.1)

    # Select rows based on the val_size and test_size to store as train set, val set, and test set
    train_df = df.iloc[val_size + test_size:]
    val_df = df.iloc[:val_size]
    test_df = df.iloc[val_size:val_size + test_size]
    return train_df, val_df, test_df

train_df, val_df, test_df = train_val_test_split(df)

## Feature Engineering

### Create new variables
#### Weather-related Variable
There are several variables, `icon`, `short_summary`, and `long_summary`, that are very similar in its context. We are thinking about combining them into one variable to reduce redundancy and dimensionality.


In [7]:
print('Unique classes of icon:')
print(df['icon'].unique())
print('')
print('Unique classes of short_summary:')
print(df['short_summary'].unique())
print('')
print('Unique classes of long_summary:')
print(df['long_summary'].unique())

Unique classes of icon:
[' partly-cloudy-day ' ' rain ' ' cloudy ' ' clear-night ' ' clear-day '
 ' partly-cloudy-night ' ' fog ']

Unique classes of short_summary:
[' Mostly Cloudy ' ' Light Rain ' ' Overcast ' ' Clear ' ' Partly Cloudy '
 ' Rain ' ' Foggy ' ' Drizzle ' ' Possible Drizzle ']

Unique classes of long_summary:
[' Partly cloudy throughout the day. ' ' Light rain until evening. '
 ' Mostly cloudy throughout the day. '
 ' Rain until morning, starting again in the evening. '
 ' Light rain in the morning. ' ' Overcast throughout the day. '
 ' Foggy in the morning. ' ' Light rain in the morning and overnight. '
 ' Rain throughout the day. ' ' Possible drizzle in the morning. '
 ' Rain in the morning and afternoon. ']


One way we can appraoch the problem is to create a custom weighting system by assigning scores to each type of weather event based on severity or relevance. We weight it by evaluating the weather conditions, considering all three variables and visibility as visibility plays a significant role in determining whether weather conditions will impact driving safety. 

In order to calculate the weight, we first need to understand how these variables reflect the weather on that specific date and assess how severe the conditions are for driving.

In [8]:
weather_df = df[['icon', 'short_summary', 'long_summary']].drop_duplicates()
weather_df.shape

(59, 3)

In [9]:
weather_df[weather_df['icon'] == ' rain ']

Unnamed: 0,icon,short_summary,long_summary
1,rain,Light Rain,Light rain until evening.
24,rain,Rain,"Rain until morning, starting again in the eve..."
33,rain,Drizzle,Light rain in the morning.
39,rain,Light Rain,Rain throughout the day.
43,rain,Drizzle,"Rain until morning, starting again in the eve..."
46,rain,Light Rain,"Rain until morning, starting again in the eve..."
85,rain,Light Rain,Light rain in the morning.
140,rain,Possible Drizzle,Light rain until evening.
147,rain,Light Rain,Light rain in the morning and overnight.
198,rain,Possible Drizzle,"Rain until morning, starting again in the eve..."


In [10]:
weather_df[weather_df['icon'] == ' cloudy ']

Unnamed: 0,icon,short_summary,long_summary
2,cloudy,Overcast,Mostly cloudy throughout the day.
5,cloudy,Overcast,"Rain until morning, starting again in the eve..."
6,cloudy,Overcast,Light rain in the morning.
8,cloudy,Overcast,Overcast throughout the day.
16,cloudy,Overcast,Partly cloudy throughout the day.
17,cloudy,Overcast,Rain throughout the day.
28,cloudy,Overcast,Light rain until evening.
36,cloudy,Overcast,Light rain in the morning and overnight.
89,cloudy,Overcast,Foggy in the morning.
933,cloudy,Overcast,Possible drizzle in the morning.


In [11]:
weather_df[weather_df['icon'] == ' fog ']

Unnamed: 0,icon,short_summary,long_summary
32,fog,Foggy,"Rain until morning, starting again in the eve..."
70,fog,Foggy,Foggy in the morning.
6997,fog,Foggy,Rain in the morning and afternoon.


Precipitation intensity between 0.1 to 0.4 in/h indicates steady rain, which may cause minor inconveniences but typically does not result in flooding. There is no indication of severe rain in our data. However, visibility below 1 mile can significantly impact driving conditions. As a result, we plan to incorporate visibility into our approach for weighting the effects of weather.

- By default, every row's weather severity condition is set to 1. 
- If the rain classified as anything other than "light", we assign it a severity level of 3 (indicating the highest severity).
- For all other conditions (cloudy, foggy, or light rain), we assign as severity level of 2. 
- Since there are no instances of severe rain in the dataset, we will adjust the severity score further based on visibility.

In [12]:
df[['precipIntensity', 'visibility']].describe()

Unnamed: 0,precipIntensity,visibility
count,100000.0,100000.0
mean,0.008998,8.463778
std,0.02704,2.603025
min,0.0,0.717
25%,0.0,8.432
50%,0.0,9.88
75%,0.0,9.996
max,0.1447,10.0


In [13]:
def weather_severity(row):
    # Initialize severity
    severity = 1

    if 'rain' in row['icon'].strip().lower():
        # If rain is present in the icon, check for light or drizzle
        if 'light' in row['short_summary'].lower() or 'drizzle' in row['short_summary'].lower() or \
           'light' in row['long_summary'].lower() or 'drizzle' in row['long_summary'].lower():
            severity = 2  # Moderate severity for light rain or drizzle
        else:
            severity = 3  # Highest severity for rain without light or drizzle
    elif 'cloudy' in row['icon'].lower() or 'fog' in row['icon'].lower():
        severity = 2  # Moderate severity for clouds and fog

    # Adjust severity based on visibility
    if row['visibility'] < 1:  # Low visibility (less than 1)
        severity += 1  # Increase severity by 1
    elif row['visibility'] >= 7:  # High visibility (7 or more)
        severity -= 1  # Decrease severity by 1
        severity = max(severity, 1)  # Ensure severity doesn't go below 1

    return severity


#### Time-related Variable

In [14]:
def add_time_features(df):
    # Create rush_hour feature
    df['rush_hour'] = df['hour'].apply(lambda x: 1 if (6 <= x <= 9 or 16 <= x <= 18) else 0)

    # Create weekend feature
    df['weekend'] = df['datetime'].dt.dayofweek.apply(lambda x: 1 if x >= 5 else 0)

    # Define game dates for Bruins and Celtics
    bruins = [
        '2018-11-05', '2018-11-08', '2018-11-10', '2018-11-11',
        '2018-11-23', '2018-11-29', '2018-12-01', '2018-12-08',
        '2018-12-11', '2018-12-16', '2018-12-20', '2018-12-22',
        '2018-12-27'
    ] 
    celtics = [
        '2018-11-01', '2018-11-14', '2018-11-16', '2018-11-17',
        '2018-11-21', '2018-11-30', '2018-12-06', '2018-12-10',
        '2018-12-14', '2018-12-19', '2018-12-21', '2018-12-23',
        '2018-12-25'
    ]
    game_dates = bruins + celtics

    # Create game_day feature
    df['game_day'] = df['datetime'].apply(lambda x: 1 if x.strftime('%Y-%m-%d') in game_dates else 0)

    return df

In [15]:
train_df = add_time_features(train_df)
val_df = add_time_features(val_df)
test_df = add_time_features(test_df)

## Data Preprocessing
Since we conducted feature engineering on the original data before preprocessing, we will run the pipeline again to get the prepare the data for modeling

### Missing Value Imputation
Uber Taxi Fee Breakdown
 - base fare: $2.60 for first 1/7 mile
 - per minute fare: $0.47
 - per mile: $2.8


In [16]:
def taxi_price_calculator(distance, time):
    base_fare = 2.60
    per_min_fare = 0.47
    per_mile_fare = 2.8
    price = base_fare + distance * per_mile_fare + time * per_min_fare
    return price

In [17]:
rides_with_eta = os.path.join(interim_data_folder, "rides_with_etas.csv")

time_df = pd.read_csv(rides_with_eta)
train_df = pd.merge(train_df, time_df[['source', 'destination', 'eta_minutes']], on=['source', 'destination'], how = 'left')

In [18]:
train_df.loc[train_df['name'] == 'Taxi', 'price'] = train_df.loc[train_df['name'] == 'Taxi'].apply(
    lambda row: taxi_price_calculator(row['distance'], row['eta_minutes']), axis=1)

train_df = train_df.drop('eta_minutes', axis = 1)

In [19]:
df_na = train_df[train_df['price'].isna()]
missing_percentage_after_imputing =train_df['price'].isna().sum() / len(train_df) * 100
print(f"Percentage of missing values in 'price' after imputing: {missing_percentage_after_imputing:.2f}%")

Percentage of missing values in 'price' after imputing: 0.00%


In [20]:
#Imputing missing price values in the validation set 

# Process the validation dataset
val_df = pd.merge(val_df, time_df[['source', 'destination', 'eta_minutes']], on=['source', 'destination'], how='left')
val_df.loc[val_df['name'] == 'Taxi', 'price'] = val_df.loc[val_df['name'] == 'Taxi'].apply(
    lambda row: taxi_price_calculator(row['distance'], row['eta_minutes']), axis=1)
val_df = val_df.drop('eta_minutes', axis=1)

# Check for missing values in the validation set
df_na_val = val_df[val_df['price'].isna()]
missing_percentage_val = val_df['price'].isna().sum() / len(val_df) * 100
print(f"Percentage of missing values in 'price' after imputing (validation): {missing_percentage_val:.2f}%")

Percentage of missing values in 'price' after imputing (validation): 0.00%


### Drop Uneeded Columns
Some columns are not relevant for modeling because they function solely as unique identifiers and do not provide meaningful information for preduction purposes:
- `id`
- `product_id`

Additionally, we also dropped `timezone` since all data is within the same timezone.

`datetime` and`timestamp` were dropped to reduce redundacy with other time features.

`visibility.1` is duplicated with  `visibility`

In [21]:
columns_to_drop = ['timezone', 'datetime', 'product_id', 'visibility.1', 'timestamp','short_summary', 'long_summary', 'icon']

train_df = train_df.drop(columns=columns_to_drop)
val_df = val_df.drop(columns=columns_to_drop)
test_df = test_df.drop(columns=columns_to_drop)

### Encoding Categorical Variables
#### Check dtypes

In [22]:
object_columns = train_df.select_dtypes(include=['object']).columns.tolist()

# Display the object columns
print("Object Columns:")
print(object_columns)

Object Columns:
['source', 'destination', 'cab_type', 'name']



#### Encoding `source` and `destination` variables

In [23]:
train_df['source'] = train_df['source'].str.strip().str.replace(' ', '_')
val_df['source'] = val_df['source'].str.strip().str.replace(' ', '_')
test_df['source'] = test_df['source'].str.strip().str.replace(' ', '_')

# Initialize the OneHotEncoder
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# Fit the encoder on the training data 'source' column
train_encoded_summary = encoder.fit_transform(train_df[['source']])

# Apply the encoder to validation and test sets using the trained categories from the training data
val_encoded_summary = encoder.transform(val_df[['source']])
test_encoded_summary = encoder.transform(test_df[['source']])

# Convert the encoded arrays back to pandas DataFrames with appropriate column names
train_encoded_summary_df = pd.DataFrame(train_encoded_summary, columns=encoder.get_feature_names_out(['source']))
val_encoded_summary_df = pd.DataFrame(val_encoded_summary, columns=encoder.get_feature_names_out(['source']))
test_encoded_summary_df = pd.DataFrame(test_encoded_summary, columns=encoder.get_feature_names_out(['source']))

# Concatenate the one-hot encoded 'source' columns back to the respective datasets
train_df = pd.concat([train_df.reset_index(drop=True), train_encoded_summary_df], axis=1)
val_df = pd.concat([val_df.reset_index(drop=True), val_encoded_summary_df], axis=1)
test_df = pd.concat([test_df.reset_index(drop=True), test_encoded_summary_df], axis=1)

In [24]:
# Step 1: Extract the categories from the fitted OneHotEncoder for 'source'
source_categories = encoder.categories_[0]

# Initialize a new OneHotEncoder for 'destination'
destination_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# Strip leading/trailing spaces and replace blank spaces with underscores in the destination column
train_df['destination'] = train_df['destination'].str.strip().str.replace(' ', '_')
val_df['destination'] = val_df['destination'].str.strip().str.replace(' ', '_')
test_df['destination'] = test_df['destination'].str.strip().str.replace(' ', '_')

# Fit the encoder on the training data 'destination' column
train_encoded_destination = destination_encoder.fit_transform(train_df[['destination']])

# Apply the encoder to validation and test sets using the trained categories from the training data
val_encoded_destination = destination_encoder.transform(val_df[['destination']])
test_encoded_destination = destination_encoder.transform(test_df[['destination']])

# Convert the encoded arrays back to pandas DataFrames with appropriate column names
train_encoded_destination_df = pd.DataFrame(train_encoded_destination, columns=destination_encoder.get_feature_names_out(['destination']))
val_encoded_destination_df = pd.DataFrame(val_encoded_destination, columns=destination_encoder.get_feature_names_out(['destination']))
test_encoded_destination_df = pd.DataFrame(test_encoded_destination, columns=destination_encoder.get_feature_names_out(['destination']))

# Concatenate the one-hot encoded 'destination' columns back to the respective datasets
train_df = pd.concat([train_df.reset_index(drop=True), train_encoded_destination_df], axis=1)
val_df = pd.concat([val_df.reset_index(drop=True), val_encoded_destination_df], axis=1)
test_df = pd.concat([test_df.reset_index(drop=True), test_encoded_destination_df], axis=1)

#### Encoding 'cab_type' and 'name' variables

In [25]:
train_df['name'] = train_df['name'].str.strip().str.replace(' ', '_')
val_df['name'] = val_df['name'].str.strip().str.replace(' ', '_')
test_df['name'] = test_df['name'].str.strip().str.replace(' ', '_')

# Initialize the OneHotEncoder
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# Fit the encoder on the training data 'name' column
train_encoded_summary = encoder.fit_transform(train_df[['name']])

# Apply the encoder to validation and test sets using the trained categories from the training data
val_encoded_summary = encoder.transform(val_df[['name']])
test_encoded_summary = encoder.transform(test_df[['name']])

# Convert the encoded arrays back to pandas DataFrames with appropriate column names
train_encoded_summary_df = pd.DataFrame(train_encoded_summary, columns=encoder.get_feature_names_out(['name']))
val_encoded_summary_df = pd.DataFrame(val_encoded_summary, columns=encoder.get_feature_names_out(['name']))
test_encoded_summary_df = pd.DataFrame(test_encoded_summary, columns=encoder.get_feature_names_out(['name']))

# Concatenate the one-hot encoded 'name' columns back to the respective datasets
train_df = pd.concat([train_df.reset_index(drop=True), train_encoded_summary_df], axis=1)
val_df = pd.concat([val_df.reset_index(drop=True), val_encoded_summary_df], axis=1)
test_df = pd.concat([test_df.reset_index(drop=True), test_encoded_summary_df], axis=1)

In [26]:
train_df['cab_type'] = train_df['cab_type'].str.strip()
val_df['cab_type'] = val_df['cab_type'].str.strip()
test_df['cab_type'] = test_df['cab_type'].str.strip()

# Ensure there are no leading or trailing spaces in the 'cab_type' column
train_df['cab_type'] = train_df['cab_type'].str.strip()
val_df['cab_type'] = val_df['cab_type'].str.strip()
test_df['cab_type'] = test_df['cab_type'].str.strip()

# Create dummy variables for the 'cab_type' column
train_df = pd.get_dummies(train_df, columns=['cab_type'], drop_first=True)
val_df = pd.get_dummies(val_df, columns=['cab_type'], drop_first=True)
test_df = pd.get_dummies(test_df, columns=['cab_type'], drop_first=True)

#### Drop encoded categorical columns

In [27]:
cat_columns_to_drop = ['source',
 'destination',
 'name']

train_df = train_df.drop(columns = cat_columns_to_drop)
val_df = val_df.drop(columns = cat_columns_to_drop)
test_df = test_df.drop(columns = cat_columns_to_drop)

### Standardization and PCA
#### First, let's standardize our features

In [28]:
y_train = train_df[['price']]
y_val = val_df[['price']]
y_test = test_df[['price']]

X_train = train_df.drop('price', axis = 1)
X_val = val_df.drop('price', axis = 1)
X_test = test_df.drop('price', axis = 1)

In [29]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.fit_transform(X_val)
X_test_scaled = scaler.fit_transform(X_test)

X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=X_train.columns)
X_val_scaled_df = pd.DataFrame(X_val_scaled, columns=X_val.columns)
X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=X_test.columns)

In [30]:
# Save the scaled DataFrames as Parquet files

X_train_scaled_df.to_parquet(os.path.join(processed_data_folder, 'X_train_scaled.parquet'), index=False)
X_val_scaled_df.to_parquet(os.path.join(processed_data_folder, 'X_val_scaled.parquet'), index=False)
X_test_scaled_df.to_parquet(os.path.join(processed_data_folder, 'X_test_scaled.parquet'), index=False)

# Save the target variables (if needed)
y_train.to_parquet(os.path.join(processed_data_folder, 'y_train.parquet'), index=False)
y_val.to_parquet(os.path.join(processed_data_folder, 'y_val.parquet'), index=False)
y_test.to_parquet(os.path.join(processed_data_folder, 'y_test.parquet'), index=False)

#### Principal component analysis

In [31]:
# Initialize PCA, you can adjust n_components as needed (for example, n_components=0.95 for 95% variance)
pca = PCA(n_components=0.95)


X_train_scaled = X_train_scaled_df.to_numpy()
X_val_scaled = X_val_scaled_df.to_numpy()
X_test_scaled = X_test_scaled_df.to_numpy()

# Fit PCA on the scaled training data
X_train_pca = pca.fit_transform(X_train_scaled)

# Transform the validation and test sets using the same PCA
X_val_pca = pca.transform(X_val_scaled)
X_test_pca = pca.transform(X_test_scaled)

# Convert the PCA results back to DataFrames for easier handling
X_train_pca_df = pd.DataFrame(X_train_pca)
X_val_pca_df = pd.DataFrame(X_val_pca)
X_test_pca_df = pd.DataFrame(X_test_pca)

# Optional: Rename the columns for clarity
X_train_pca_df.columns = [f'PC{i+1}' for i in range(X_train_pca_df.shape[1])]
X_val_pca_df.columns = [f'PC{i+1}' for i in range(X_val_pca_df.shape[1])]
X_test_pca_df.columns = [f'PC{i+1}' for i in range(X_test_pca_df.shape[1])]

In [32]:
print(X_train_pca_df.shape)
print(X_val_pca_df.shape)
print(X_test_pca_df.shape)

(70000, 45)
(20000, 45)
(10000, 45)


In [33]:
# Save PCA DataFrames as Parquet files
train_pca_path = os.path.join(processed_data_folder, 'X_train_pca.parquet')
val_pca_path = os.path.join(processed_data_folder, 'X_val_pca.parquet')
test_pca_path = os.path.join(processed_data_folder, 'X_test_pca.parquet')

X_train_pca_df.to_parquet(train_pca_path, index=False)
X_val_pca_df.to_parquet(val_pca_path, index=False)
X_test_pca_df.to_parquet(test_pca_path, index=False)

### Linear Regression

#### The first modeling method we will use is linear regression. We will use ridge regression (L2) to vary the regularization strength. 

In [34]:
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, r2_score

In [35]:
# Define three models with varying regularization strengths
ridge_model_1 = Ridge(alpha=0.001)  # low regularization
ridge_model_2 = Ridge(alpha=0.01)  # medium regularization
ridge_model_3 = Ridge(alpha=1) # high regularization

In [36]:
# Train each Ridge model
ridge_model_1.fit(X_train_scaled, y_train)
ridge_model_2.fit(X_train_scaled, y_train)
ridge_model_3.fit(X_train_scaled, y_train)

In [37]:
# Predict on the training and validation datasets
y_train_pred_1 = ridge_model_1.predict(X_train_scaled)
y_val_pred_1 = ridge_model_1.predict(X_val_scaled)

y_train_pred_2 = ridge_model_2.predict(X_train_scaled)
y_val_pred_2 = ridge_model_2.predict(X_val_scaled)

y_train_pred_3 = ridge_model_3.predict(X_train_scaled)
y_val_pred_3 = ridge_model_3.predict(X_val_scaled)

In [38]:
# Calculate evaluation metrics (MSE and R-squared) for training and validation datasets
def evaluate_model(y_true, y_pred):
    mse = mean_squared_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    return mse, r2

In [39]:
# Check for NaN values in the training and validation sets
print("NaN values in X_train_scaled:", np.isnan(X_train_scaled).sum())
print("NaN values in X_val_scaled:", np.isnan(X_val_scaled).sum())
print("NaN values in y_train:", np.isnan(y_train).sum())
print("NaN values in y_val:", np.isnan(y_val).sum())


NaN values in X_train_scaled: 0
NaN values in X_val_scaled: 0
NaN values in y_train: price    0
dtype: int64
NaN values in y_val: price    0
dtype: int64


In [40]:
# Evaluate Ridge model 1
train_mse_1, train_r2_1 = evaluate_model(y_train, y_train_pred_1)
val_mse_1, val_r2_1 = evaluate_model(y_val, y_val_pred_1)

# Evaluate Ridge model 2
train_mse_2, train_r2_2 = evaluate_model(y_train, y_train_pred_2)
val_mse_2, val_r2_2 = evaluate_model(y_val, y_val_pred_2)

# Evaluate Ridge model 3
train_mse_3, train_r2_3 = evaluate_model(y_train, y_train_pred_3)
val_mse_3, val_r2_3 = evaluate_model(y_val, y_val_pred_3)

In [41]:
# Print results
print("Model 1 (alpha=0.1) - Training MSE: {:.4f}, R2: {:.4f}".format(train_mse_1, train_r2_1))
print("Validation MSE: {:.4f}, R2: {:.4f}".format(val_mse_1, val_r2_1))

print("Model 2 (alpha=1.0) - Training MSE: {:.4f}, R2: {:.4f}".format(train_mse_2, train_r2_2))
print("Validation MSE: {:.4f}, R2: {:.4f}".format(val_mse_2, val_r2_2))

print("Model 3 (alpha=10.0) - Training MSE: {:.4f}, R2: {:.4f}".format(train_mse_3, train_r2_3))
print("Validation MSE: {:.4f}, R2: {:.4f}".format(val_mse_3, val_r2_3))

Model 1 (alpha=0.1) - Training MSE: 6868.4819, R2: 0.2146
Validation MSE: 6475.0053, R2: 0.2055
Model 2 (alpha=1.0) - Training MSE: 6868.4965, R2: 0.2146
Validation MSE: 6475.1198, R2: 0.2055
Model 3 (alpha=10.0) - Training MSE: 6868.8433, R2: 0.2145
Validation MSE: 6475.6536, R2: 0.2054


In [42]:
#Investigating the similar outputs
print(ridge_model_1.coef_)
print(ridge_model_2.coef_)
print(ridge_model_3.coef_)
#Based on the similar coefficients for the Ridge models with different alpha values it seems like Ridge regularization is not playing a major role in altering the model's predictions. This might be because the features are already well scaled. 


[[-8.88054871e-01 -6.48325275e+00 -8.96070865e+00  1.59479198e+00
   1.30806395e+00  1.40753263e-01  6.04357959e-02  1.88058422e+01
  -4.63258217e+00 -2.22116946e-01 -1.15274108e-01  1.03061618e+01
   8.25073696e-02 -1.22082874e+00  2.21943328e+01  6.55676712e-01
  -1.18436423e+02  8.42498081e+01  3.31387514e+00 -1.20633913e+02
   6.25660407e+01 -4.11222992e+02 -2.48571953e+00  8.69638276e+01
  -2.01279114e+01 -2.19894965e+00 -8.92567905e-01  3.18794515e-01
  -1.06505890e-01 -1.65180168e+00 -2.74156315e+02  3.29602016e+02
  -4.87967910e-01 -5.03045270e-01 -1.77606906e+02  4.30379457e+00
   5.76577664e+01  1.07437961e+02 -7.60960688e-02 -5.03901127e+00
  -3.05633853e+01 -5.37219128e+01  4.36925197e+02 -5.30218049e-01
  -8.01163085e-02  3.68394959e-01 -2.88799115e+00 -3.23628554e+00
   1.96754770e-01  3.85922586e-01 -2.63877668e+00 -6.07111159e-01
  -4.26911098e-02 -2.84148153e+00  5.52724588e-01 -1.65070858e-01
   1.45600442e+01 -3.14517784e+00 -3.03899909e+00 -2.91868051e+00
   2.00246

In [43]:
#Continuation of ridge regression exploration
#Testing with higher alpha values (alpha=100)
ridge_model_4 = Ridge(alpha=100) 
ridge_model_4.fit(X_train_scaled, y_train)
y_train_pred_4 = ridge_model_4.predict(X_train_scaled)
y_val_pred_4 = ridge_model_4.predict(X_val_scaled)
# Evaluate Ridge model 3
train_mse_4, train_r2_4 = evaluate_model(y_train, y_train_pred_4)
print("Model 4 (alpha=100.0) - Training MSE: {:.4f}, R2: {:.4f}".format(train_mse_4, train_r2_4))


#Testing with higher alpha values (alpha=1000)
ridge_model_5 = Ridge(alpha=1000) 
ridge_model_5.fit(X_train_scaled, y_train)
y_train_pred_5 = ridge_model_5.predict(X_train_scaled)
y_val_pred_5 = ridge_model_5.predict(X_val_scaled)
# Evaluate Ridge model 3
train_mse_5, train_r2_5 = evaluate_model(y_train, y_train_pred_5)
print("Model 5 (alpha=1000.0) - Training MSE: {:.4f}, R2: {:.4f}".format(train_mse_5, train_r2_5))


Model 4 (alpha=100.0) - Training MSE: 6869.4575, R2: 0.2145
Model 5 (alpha=1000.0) - Training MSE: 6870.2419, R2: 0.2144


In [44]:
#3 different Lasso regularization terms
from sklearn.linear_model import Lasso

# Define three models with varying regularization strengths for Lasso
lasso_model_1 = Lasso(alpha=0.001)  # low regularization
lasso_model_2 = Lasso(alpha=0.01)   # medium regularization
lasso_model_3 = Lasso(alpha=1)      # high regularization

# Train each Lasso model
lasso_model_1.fit(X_train_scaled, y_train)
lasso_model_2.fit(X_train_scaled, y_train)
lasso_model_3.fit(X_train_scaled, y_train)

# Predict on the training and validation datasets
y_train_pred_lasso_1 = lasso_model_1.predict(X_train_scaled)
y_val_pred_lasso_1 = lasso_model_1.predict(X_val_scaled)

y_train_pred_lasso_2 = lasso_model_2.predict(X_train_scaled)
y_val_pred_lasso_2 = lasso_model_2.predict(X_val_scaled)

y_train_pred_lasso_3 = lasso_model_3.predict(X_train_scaled)
y_val_pred_lasso_3 = lasso_model_3.predict(X_val_scaled)

# Calculate evaluation metrics (MSE and R-squared) for training and validation datasets
train_mse_lasso_1, train_r2_lasso_1 = evaluate_model(y_train, y_train_pred_lasso_1)
val_mse_lasso_1, val_r2_lasso_1 = evaluate_model(y_val, y_val_pred_lasso_1)

train_mse_lasso_2, train_r2_lasso_2 = evaluate_model(y_train, y_train_pred_lasso_2)
val_mse_lasso_2, val_r2_lasso_2 = evaluate_model(y_val, y_val_pred_lasso_2)

train_mse_lasso_3, train_r2_lasso_3 = evaluate_model(y_train, y_train_pred_lasso_3)
val_mse_lasso_3, val_r2_lasso_3 = evaluate_model(y_val, y_val_pred_lasso_3)

# Print results
print("Lasso Model 1 (alpha=0.001) - Training MSE: {:.4f}, R2: {:.4f}".format(train_mse_lasso_1, train_r2_lasso_1))
print("Validation MSE: {:.4f}, R2: {:.4f}".format(val_mse_lasso_1, val_r2_lasso_1))

print("Lasso Model 2 (alpha=0.01) - Training MSE: {:.4f}, R2: {:.4f}".format(train_mse_lasso_2, train_r2_lasso_2))
print("Validation MSE: {:.4f}, R2: {:.4f}".format(val_mse_lasso_2, val_r2_lasso_2))

print("Lasso Model 3 (alpha=1.0) - Training MSE: {:.4f}, R2: {:.4f}".format(train_mse_lasso_3, train_r2_lasso_3))
print("Validation MSE: {:.4f}, R2: {:.4f}".format(val_mse_lasso_3, val_r2_lasso_3))

  model = cd_fast.enet_coordinate_descent(


Lasso Model 1 (alpha=0.001) - Training MSE: 6869.2573, R2: 0.2145
Validation MSE: 6475.0757, R2: 0.2055
Lasso Model 2 (alpha=0.01) - Training MSE: 6869.7284, R2: 0.2144
Validation MSE: 6474.2080, R2: 0.2056
Lasso Model 3 (alpha=1.0) - Training MSE: 6887.8521, R2: 0.2124
Validation MSE: 6476.0189, R2: 0.2054


In [45]:
#Generating polynomial features with NO regularization
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error, r2_score

# Step 1: Generate polynomial features
poly = PolynomialFeatures(degree=2)  # Adjust the degree as needed
X_train_poly = poly.fit_transform(X_train_scaled)
X_val_poly = poly.transform(X_val_scaled)

# Step 2: Fit the Linear Regression model
poly_model = LinearRegression()
poly_model.fit(X_train_poly, y_train)

# Step 3: Make predictions
y_train_pred_poly = poly_model.predict(X_train_poly)
y_val_pred_poly = poly_model.predict(X_val_poly)

# Step 4: Calculate evaluation metrics (MSE and R-squared)
train_mse_poly, train_r2_poly = evaluate_model(y_train, y_train_pred_poly)
val_mse_poly, val_r2_poly = evaluate_model(y_val, y_val_pred_poly)

# Print results
print("Polynomial Regression Model - Training MSE: {:.4f}, R2: {:.4f}".format(train_mse_poly, train_r2_poly))
print("Validation MSE: {:.4f}, R2: {:.4f}".format(val_mse_poly, val_r2_poly))

Polynomial Regression Model - Training MSE: 2.8405, R2: 0.9997
Validation MSE: 59460830145836358828032.0000, R2: -7295900558419439616.0000


In [None]:
#Polynomial feature with lasso regression
from sklearn.linear_model import Lasso

X_train_poly = poly.fit_transform(X_train_scaled)
X_val_poly = poly.transform(X_val_scaled)

# Step 2: Fit the Lasso regression model
lasso_model = Lasso(alpha=1.0)  # Adjust alpha for different regularization strengths
lasso_model.fit(X_train_poly, y_train)

# Step 3: Make predictions
y_train_pred_lasso = lasso_model.predict(X_train_poly)
y_val_pred_lasso = lasso_model.predict(X_val_poly)

# Step 4: Calculate evaluation metrics (MSE and R-squared)
train_mse_lasso, train_r2_lasso = evaluate_model(y_train, y_train_pred_lasso)
val_mse_lasso, val_r2_lasso = evaluate_model(y_val, y_val_pred_lasso)

# Print results
print("Lasso Regression Model - Training MSE: {:.4f}, R2: {:.4f}".format(train_mse_lasso, train_r2_lasso))
print("Validation MSE: {:.4f}, R2: {:.4f}".format(val_mse_lasso, val_r2_lasso))

In [47]:
#Polynomial feature with 3 different ridge regressions 
from sklearn.linear_model import Ridge

# Step 1: Generate polynomial features
poly = PolynomialFeatures(degree=2)  # Adjust the degree as needed
X_train_poly = poly.fit_transform(X_train_scaled)
X_val_poly = poly.transform(X_val_scaled)

# Define the alpha values you want to test
alpha_values = [1.0, 10.0, 100.0]  # You can adjust these values as needed

# Loop through the different alpha values
for alpha in alpha_values:
    # Step 2: Fit the Ridge regression model with the current alpha
    ridge_model = Ridge(alpha=alpha)
    ridge_model.fit(X_train_poly, y_train)

    # Step 3: Make predictions
    y_train_pred_ridge = ridge_model.predict(X_train_poly)
    y_val_pred_ridge = ridge_model.predict(X_val_poly)

    # Step 4: Calculate evaluation metrics (MSE and R-squared)
    train_mse_ridge, train_r2_ridge = evaluate_model(y_train, y_train_pred_ridge)
    val_mse_ridge, val_r2_ridge = evaluate_model(y_val, y_val_pred_ridge)

    # Print results for the current alpha
    print("Ridge Regression Model (alpha={}) - Training MSE: {:.4f}, R2: {:.4f}".format(alpha, train_mse_ridge, train_r2_ridge))
    print("Validation MSE: {:.4f}, R2: {:.4f}".format(val_mse_ridge, val_r2_ridge))

Ridge Regression Model (alpha=1.0) - Training MSE: 2.8320, R2: 0.9997
Validation MSE: 9.7312, R2: 0.9988
Ridge Regression Model (alpha=10.0) - Training MSE: 2.8392, R2: 0.9997
Validation MSE: 9.6947, R2: 0.9988
Ridge Regression Model (alpha=100.0) - Training MSE: 2.8588, R2: 0.9997
Validation MSE: 9.4152, R2: 0.9988
