# Import Libraries & Global Constants

In [29]:
# Imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import seaborn as sns 
import matplotlib.pyplot as plt
from datetime import datetime
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
# Seed for reproducibility
SEED = 42

# Load Data & filter relevant columns

In [30]:
# Load the data from the CSV file
listings = pd.read_csv('data/listings.csv')

In [31]:
# keep only useful columns (see Lukas' list on GoogleDrive)
# plus remove neighbourhood & all things 'review'
# also remove 'property type' as mapping this on an ordinal scale is too complex
columns_to_keep = [
    'price', 'host_since', 'host_response_time', 'host_response_rate',
    'host_acceptance_rate', 'host_is_superhost', 'host_listings_count',
    'host_total_listings_count', 'host_has_profile_pic',
    'host_identity_verified',
    'latitude', 'longitude', 'room_type', 'accommodates', 
    'bathrooms', 'bedrooms', 'beds', 'minimum_nights', 'maximum_nights',
    'minimum_nights_avg_ntm', 'maximum_nights_avg_ntm', 'availability_30',
    'availability_60', 'availability_90', 'availability_365', 'instant_bookable', 'calculated_host_listings_count',
    'calculated_host_listings_count_entire_homes', 'calculated_host_listings_count_private_rooms',
    'calculated_host_listings_count_shared_rooms'
]

# Keep only the specified columns
listings = listings[columns_to_keep]


# Exploratory Data Analysis

In [32]:
# List the columns in the data
print(listings.columns)


Index(['price', 'host_since', 'host_response_time', 'host_response_rate',
       'host_acceptance_rate', 'host_is_superhost', 'host_listings_count',
       'host_total_listings_count', 'host_has_profile_pic',
       'host_identity_verified', 'latitude', 'longitude', 'room_type',
       'accommodates', 'bathrooms', 'bedrooms', 'beds', 'minimum_nights',
       'maximum_nights', 'minimum_nights_avg_ntm', 'maximum_nights_avg_ntm',
       'availability_30', 'availability_60', 'availability_90',
       'availability_365', 'instant_bookable',
       'calculated_host_listings_count',
       'calculated_host_listings_count_entire_homes',
       'calculated_host_listings_count_private_rooms',
       'calculated_host_listings_count_shared_rooms'],
      dtype='object')


In [33]:
# List the types of the data
print(listings.dtypes)


price                                            object
host_since                                       object
host_response_time                               object
host_response_rate                               object
host_acceptance_rate                             object
host_is_superhost                                object
host_listings_count                               int64
host_total_listings_count                         int64
host_has_profile_pic                             object
host_identity_verified                           object
latitude                                        float64
longitude                                       float64
room_type                                        object
accommodates                                      int64
bathrooms                                       float64
bedrooms                                        float64
beds                                            float64
minimum_nights                                  

In [34]:
listings.describe()


Unnamed: 0,host_listings_count,host_total_listings_count,latitude,longitude,accommodates,bathrooms,bedrooms,beds,minimum_nights,maximum_nights,minimum_nights_avg_ntm,maximum_nights_avg_ntm,availability_30,availability_60,availability_90,availability_365,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms
count,10099.0,10099.0,10099.0,10099.0,10099.0,8884.0,9926.0,8889.0,10099.0,10099.0,10099.0,10099.0,10099.0,10099.0,10099.0,10099.0,10099.0,10099.0,10099.0,10099.0
mean,2.864046,3.471532,59.921761,10.759973,3.439251,1.144023,1.603768,1.63348,3.973067,314.948411,4.126527,393.328042,13.831072,25.142489,38.363996,134.049609,2.466482,2.093178,0.353401,0.019507
std,10.04627,11.127133,0.020585,0.043214,1.810203,0.405963,0.944112,1.410624,12.245705,363.470626,12.988633,416.183568,11.006389,20.821545,31.670901,122.260856,9.174954,9.098139,1.397438,0.266123
min,1.0,1.0,59.81726,10.59105,1.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
25%,1.0,1.0,59.91329,10.736792,2.0,1.0,1.0,1.0,2.0,30.0,2.0,30.0,2.0,6.0,10.0,22.0,1.0,1.0,0.0,0.0
50%,1.0,1.0,59.92282,10.76267,3.0,1.0,1.0,1.0,2.0,365.0,2.0,365.0,14.0,21.0,30.0,90.0,1.0,1.0,0.0,0.0
75%,2.0,2.0,59.931701,10.779821,4.0,1.0,2.0,2.0,3.0,365.0,3.9,365.0,25.0,44.0,69.0,241.0,1.0,1.0,0.0,0.0
max,131.0,178.0,60.041562,10.942936,16.0,5.0,10.0,16.0,500.0,1500.0,500.0,1500.0,30.0,60.0,90.0,365.0,94.0,94.0,21.0,6.0


In [35]:
# Identify missing values in the listings DataFrame
missing_values = listings.isnull().sum()

#Print the count of missing values for all the columns with missing values
print("Count of missing values for all the columns with missing values:")
print(missing_values[missing_values > 0])

# Amount of columns with missing values
print("\nAmount of columns with missing values:")
print(len(missing_values[missing_values > 0]))


Count of missing values for all the columns with missing values:
price                   1249
host_response_time      2117
host_response_rate      2117
host_acceptance_rate    1309
host_is_superhost        104
bathrooms               1215
bedrooms                 173
beds                    1210
dtype: int64

Amount of columns with missing values:
8


In [36]:
# Identify outliers
def count_outliers_z_score(df):
    outlier_counts = {}
    for column in df.select_dtypes(include=['float64', 'int64']).columns:
        mean = df[column].mean()
        std_dev = df[column].std()
        
        # Calculate Z-scores
        z_scores = (df[column] - mean) / std_dev
        
        # Count outliers
        outliers = df[(z_scores < -3) | (z_scores > 3)]
        outlier_counts[column] = len(outliers)
    
    # Converts to DF for pretty printing
    outlier_counts_df = pd.DataFrame(list(outlier_counts.items()), columns=['Column', 'Outlier Count'])
    return outlier_counts_df

# Get the number of outliers in each column using Z-score
outlier_counts_listings = count_outliers_z_score(listings)

# Display the outlier counts as a table
print("\nOutlier counts using Z-score:")
print(outlier_counts_listings)



Outlier counts using Z-score:
                                          Column  Outlier Count
0                            host_listings_count            111
1                      host_total_listings_count            114
2                                       latitude            208
3                                      longitude            121
4                                   accommodates            152
5                                      bathrooms            226
6                                       bedrooms             95
7                                           beds            157
8                                 minimum_nights             55
9                                 maximum_nights              1
10                        minimum_nights_avg_ntm             60
11                        maximum_nights_avg_ntm              0
12                               availability_30              0
13                               availability_60              0
14       

In [37]:
# Select only the columns with numeric data
numeric_listings = listings.select_dtypes(include=['float64', 'int64'])

# Visualize correlations between features for float values
""" plt.figure(figsize=(12, 8))
sns.heatmap(numeric_listings.corr(), annot=False, cmap='coolwarm', linewidths=0.5)
plt.show() """

" plt.figure(figsize=(12, 8))\nsns.heatmap(numeric_listings.corr(), annot=False, cmap='coolwarm', linewidths=0.5)\nplt.show() "

# Data cleaning



In [38]:
def clean_price(df):
    df_copy = df.copy()
    # Remove the commas and dollar signs from the price column
    df_copy['price'] = df_copy['price'].str.replace("$", "").str.replace(",", "").astype(float)
    return df_copy

def clean_host_since(df):
    df_copy = df.copy()
    # convert to datetime column
    df_copy['host_since'] = pd.to_datetime(df_copy['host_since'], format='%Y-%m-%d')
    # Get today's date
    today = pd.to_datetime(datetime.now().date())
    # Calculate the difference in days between 'host_since' and today
    df_copy['host_since'] = (today - df_copy['host_since']).dt.days.astype('float64')
    return df_copy

def clean_host_response_time(df):
    df_copy = df.copy()
    # Define the mapping from response times to ordinal values
    response_time_map = {
        'within an hour': 1,
        'within a few hours': 2,
        'within a day': 3,
        'a few days or more': 4
    }    
    # Apply the mapping to the 'host_response_time' column, replacing values with the ordinal ones
    df_copy['host_response_time'] = df_copy['host_response_time'].map(response_time_map)
    return df_copy

def clean_room_type(df):
    df_copy = df.copy()
    # Define the mapping from room type to ordinal values
    room_type_map = {
        'Shared room': 1,
        'Hotel room': 2,
        'Private room': 2,
        'Entire home/apt': 3
    }    
    # Apply the mapping to the 'room_type' column, replacing values with the ordinal ones
    df_copy['room_type'] = df_copy['room_type'].map(room_type_map)
    return df_copy

def clean_host_response_rate(df):
    df_copy = df.copy()
    # Remove the percent sign from the column
    df_copy['host_response_rate'] = df_copy['host_response_rate'].str.replace("%", "").astype(float)
    return df_copy

def clean_host_acceptance_rate(df):
    df_copy = df.copy()
    # Remove the percent sign from the column
    df_copy['host_acceptance_rate'] = df_copy['host_acceptance_rate'].str.replace("%", "").astype(float)
    return df_copy

def clean_host_is_superhost(df):
    df_copy = df.copy()
    # Define the mapping to boolean values
    boolean_map = {
        't': True,
        'f': False
    }    
    # Apply the mapping to the 'host_is_superhost' column, replacing values with the boolean ones
    df_copy['host_is_superhost'] = df_copy['host_is_superhost'].map(boolean_map).astype(bool)
    return df_copy

def clean_host_has_profile_pic(df):
    df_copy = df.copy()
    # Define the mapping to boolean values
    boolean_map = {
        't': True,
        'f': False
    }    
    # Apply the mapping to the 'host_has_profile_pic' column, replacing values with the boolean ones
    df_copy['host_has_profile_pic'] = df_copy['host_has_profile_pic'].map(boolean_map).astype(bool)
    return df_copy

def clean_host_identity_verified(df):
    df_copy = df.copy()
    # Define the mapping to boolean values
    boolean_map = {
        't': True,
        'f': False
    }    
    # Apply the mapping to the 'host_identity_verified' column, replacing values with the boolean ones
    df_copy['host_identity_verified'] = df_copy['host_identity_verified'].map(boolean_map).astype(bool)
    return df_copy

def clean_instant_bookable(df):
    df_copy = df.copy()
    # Define the mapping to boolean values
    boolean_map = {
        't': True,
        'f': False
    }    
    # Apply the mapping to the 'instant_bookable' column, replacing values with the boolean ones
    df_copy['instant_bookable'] = df_copy['instant_bookable'].map(boolean_map).astype(bool)
    return df_copy

def drop_string_columns(df):
    #Drop all non-numeric columns
    df_copy = df.select_dtypes(include=['float64', 'int64'])
    return df_copy

def drop_empty_columns(df):
    # Drop columns with no values at all
    df_copy = df.dropna(axis=1, how='all')
    return df_copy

def fill_missing_values_mean(df):
    # Fill missing values with the mean of the column
    df_copy = df.fillna(df.mean())
    return df_copy

def fill_missing_values_mice(df):
    # Initialize the IterativeImputer (MICE)
    imputer = IterativeImputer()
    # Fit and transform the data
    df_copy = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)
    return df_copy


In [39]:
# Clean columns in the listings data


cleaned_listings = clean_price(listings)
cleaned_listings = clean_host_since(cleaned_listings)
cleaned_listings = clean_host_response_time(cleaned_listings)
cleaned_listings = clean_room_type(cleaned_listings)
cleaned_listings = clean_host_response_rate(cleaned_listings)
cleaned_listings = clean_host_acceptance_rate(cleaned_listings)
cleaned_listings = clean_host_is_superhost(cleaned_listings)
cleaned_listings = clean_host_has_profile_pic(cleaned_listings)
cleaned_listings = clean_host_identity_verified(cleaned_listings)
cleaned_listings = clean_instant_bookable(cleaned_listings)
# cleaned_listings = drop_string_columns(cleaned_listings)
cleaned_listings = drop_empty_columns(cleaned_listings)
# cleaned_listings = fill_missing_values_mean(cleaned_listings)
cleaned_listings = fill_missing_values_mice(cleaned_listings)

# Assert all values are numeric
assert cleaned_listings.dtypes.all() != np.dtype('O'), 'Not all values are numeric'


  df_copy['price'] = df_copy['price'].str.replace("$", "").str.replace(",", "").astype(float)


In [40]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_info_rows', 100)
cleaned_listings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10099 entries, 0 to 10098
Data columns (total 30 columns):
 #   Column                                        Dtype  
---  ------                                        -----  
 0   price                                         float64
 1   host_since                                    float64
 2   host_response_time                            float64
 3   host_response_rate                            float64
 4   host_acceptance_rate                          float64
 5   host_is_superhost                             float64
 6   host_listings_count                           float64
 7   host_total_listings_count                     float64
 8   host_has_profile_pic                          float64
 9   host_identity_verified                        float64
 10  latitude                                      float64
 11  longitude                                     float64
 12  room_type                                     float64
 13  a

# Preprocessing 

In [41]:
# Split the data into training and test sets where y is the price and X is the rest of the data
y = cleaned_listings['price']
X = cleaned_listings.drop('price', axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)


In [42]:
# Initialize the scaler
scaler = StandardScaler()

# Fit the scaler on the training data and transform both training and test data
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Convert the scaled data back to DataFrame for better readability
X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train.columns)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=X_test.columns)


# Modeling



## Linear Regressor

In [43]:
# Initialize the model
model = LinearRegression()

# Fit the model
model.fit(X_train, y_train)

# Print the coefficients and intercept
print("Coefficients:", model.coef_)
print("Intercept:", model.intercept_)
print("R^2:", model.score(X_test, y_test))


Coefficients: [ 1.29239693e-02  7.09659602e+01 -4.41660780e+00  5.36404114e+00
 -1.38714327e+01 -3.01073615e+01  3.10899262e+01  2.92482606e+02
  4.90053782e+01 -4.35344310e+03 -2.77028265e+03  4.31582907e+02
  1.72692014e+02  6.25188003e+02  2.61894510e+02 -1.89024968e+01
  2.70693601e-01  1.55865446e-01  2.08818675e-01 -2.51403946e-02
 -1.50809625e+01  2.01036421e+01  3.46589851e+00 -1.38457549e-01
  2.81343722e+01 -8.46640903e+02  8.44266393e+02  8.02203237e+02
  7.59331235e+02]
Intercept: 288482.9582557991
R^2: 0.27227037345203386


### Results of Linear regression
- Surprisingly good

## DecisionTreeRegressor

In [44]:
model = DecisionTreeRegressor(max_depth=15, random_state=SEED)
model.fit(X_train, y_train)
predictions = model.predict(X_test)

# Print the R^2 score
print("R^2:", model.score(X_test, y_test))

# Print the first 5 predictions rounding to 1 decimal place
print(np.round(predictions[:5], 1))

# Print the first 5 actual values
print(np.round(y_test[:5].values,1))


R^2: -1.773695927644197
[1384.5 1159.5 1262.7 1255.7  769.8]
[ 700. 1000. 1200. 1429.  800.]


### Results of Decision Tree
- Bad. Forget about it.


## Random Forest Regressor 


In [None]:
# Define the parameter grid
param_grid = {
    'n_estimators': [25, 50],
    'max_depth': [None, 10,],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [2, 4]
}

# Initialize the model
rf_model = RandomForestRegressor(random_state=SEED)

# Initialize the grid search
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)

# Fit the grid search to the data
grid_search.fit(X_train_scaled, y_train)

Fitting 3 folds for each of 16 candidates, totalling 48 fits


[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=2, n_estimators=25; total time=   1.1s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=5, n_estimators=25; total time=   1.2s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=5, n_estimators=25; total time=   1.2s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=2, n_estimators=25; total time=   1.3s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=2, n_estimators=25; total time=   1.3s
[CV] END max_depth=None, min_samples_leaf=4, min_samples_split=2, n_estimators=25; total time=   1.0s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=2, n_estimators=50; total time=   2.4s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=5, n_estimators=25; total time=   1.2s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=2, n_estimators=50; total time=   2.5s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=2, n_estimators=50;

GridSearchCV(cv=3, estimator=RandomForestRegressor(random_state=42), n_jobs=-1,
             param_grid={'max_depth': [None, 10], 'min_samples_leaf': [2, 4],
                         'min_samples_split': [2, 5],
                         'n_estimators': [25, 50]},
             verbose=2)

In [None]:
# Print the best parameters and the best score
print("Best parameters found: ", grid_search.best_params_)
print("Best R^2 score: ", grid_search.best_score_)

# Pick the best estimator to make predictions
best_rf_model = grid_search.best_estimator_
rf_predictions = best_rf_model.predict(X_test_scaled)

# Print the R^2 score on the test set
print("R^2 on test set: ", best_rf_model.score(X_test_scaled, y_test))

# Print the first 5 predictions rounding to 1 decimal place
print(np.round(rf_predictions[:5], 1))

# Print the first 5 actual values
print(np.round(y_test[:5].values, 1))

Best parameters found:  {'max_depth': None, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 50}
Best R^2 score:  0.1497939979033689
R^2 on test set:  0.2642691511720283
[1496.2 1151.3 1172.9 1438.5 1077.6]
[ 700. 1000. 1200. 1429.  800.]


In [None]:
residuals = y_test - rf_predictions
""" plt.scatter(rf_predictions, residuals)
plt.hlines(0, min(rf_predictions), max(rf_predictions), colors='r', linestyles='dashed')
plt.xlabel('Predicted Values')
plt.ylabel('Residuals')
plt.title('Residual Plot')
plt.show() """


" plt.scatter(rf_predictions, residuals)\nplt.hlines(0, min(rf_predictions), max(rf_predictions), colors='r', linestyles='dashed')\nplt.xlabel('Predicted Values')\nplt.ylabel('Residuals')\nplt.title('Residual Plot')\nplt.show() "

#### Results Random Forest
- Better

### Gradient Boosting Regressor

In [None]:
learning_rates = [0.01, 0.02, 0.022, 0.025, 0.05]

for lr in learning_rates:
    gb_model = GradientBoostingRegressor(learning_rate=lr, random_state=SEED)
    gb_model.fit(X_train_scaled, y_train)
    gb_predictions = gb_model.predict(X_test_scaled)
    
    print(f"Learning rate: {lr}")
    print("R^2:", gb_model.score(X_test_scaled, y_test))
    print("First 5 predictions:", np.round(gb_predictions[:5], 1))
    print("First 5 actual values:", np.round(y_test[:5].values, 1))
    print("\n")


Learning rate: 0.01
R^2: 0.22486948965833142
First 5 predictions: [1582.  1277.2 1413.4 1413.4 1277.2]
First 5 actual values: [ 700. 1000. 1200. 1429.  800.]


Learning rate: 0.02
R^2: 0.27783933299832464
First 5 predictions: [1702.9 1166.9 1332.9 1359.9 1189.4]
First 5 actual values: [ 700. 1000. 1200. 1429.  800.]


Learning rate: 0.022
R^2: 0.2855549910086481
First 5 predictions: [1866.4 1145.4 1335.  1353.7 1199.6]
First 5 actual values: [ 700. 1000. 1200. 1429.  800.]


Learning rate: 0.025
R^2: 0.2846546391366154
First 5 predictions: [1951.5 1128.8 1349.6 1363.  1203.5]
First 5 actual values: [ 700. 1000. 1200. 1429.  800.]


Learning rate: 0.05
R^2: 0.08986095870945021
First 5 predictions: [1874.6 1116.1 1266.9 1311.  1207.7]
First 5 actual values: [ 700. 1000. 1200. 1429.  800.]




### Results Gradient Boosting
- Okayish, best with low learning rate 

## k-Nearest Neighbors 


In [None]:
# Define the parameter grid
param_grid = {
    'n_neighbors': list(range(10, 30)),  
    'weights': ['uniform', 'distance']
}

# Initialize the model
knn_model = KNeighborsRegressor()

# Initialize the grid search
grid_search_knn = GridSearchCV(estimator=knn_model, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)

# Fit the grid search to the data
grid_search_knn.fit(X_train_scaled, y_train)



Fitting 3 folds for each of 40 candidates, totalling 120 fits
[CV] END ....................n_neighbors=10, weights=uniform; total time=   0.8s
[CV] END ....................n_neighbors=10, weights=uniform; total time=   0.9s
[CV] END ...................n_neighbors=10, weights=distance; total time=   0.8s
[CV] END ...................n_neighbors=10, weights=distance; total time=   0.8s
[CV] END ....................n_neighbors=11, weights=uniform; total time=   0.8s
[CV] END ....................n_neighbors=11, weights=uniform; total time=   0.7s
[CV] END ...................n_neighbors=10, weights=distance; total time=   0.9s
[CV] END ....................n_neighbors=10, weights=uniform; total time=   1.1s
[CV] END ...................n_neighbors=11, weights=distance; total time=   0.8s
[CV] END ...................n_neighbors=12, weights=distance; total time=   0.6s
[CV] END ...................n_neighbors=11, weights=distance; total time=   0.8s
[CV] END ...................n_neighbors=11, wei

GridSearchCV(cv=3, estimator=KNeighborsRegressor(), n_jobs=-1,
             param_grid={'n_neighbors': [10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
                                         20, 21, 22, 23, 24, 25, 26, 27, 28,
                                         29],
                         'weights': ['uniform', 'distance']},
             verbose=2)

In [None]:
# Print the best parameters and the best score
print("Best parameters found: ", grid_search_knn.best_params_)
print("Best R^2 score: ", grid_search_knn.best_score_)

# Use the best estimator to make predictions
best_knn_model = grid_search_knn.best_estimator_
knn_predictions = best_knn_model.predict(X_test_scaled)

# Print the R^2 score on the test set
print("R^2 on test set: ", best_knn_model.score(X_test_scaled, y_test))

# Print the first 5 predictions rounding to 1 decimal place
print(np.round(knn_predictions[:5], 1))

# Print the first 5 actual values
print(np.round(y_test[:5].values, 1))

Best parameters found:  {'n_neighbors': 29, 'weights': 'distance'}
Best R^2 score:  0.10717125733733461
R^2 on test set:  0.2583721192716234
[1391.4  924.  1107.2 1407.8 1027.3]
[ 700. 1000. 1200. 1429.  800.]


### Results
- quite okay

## Neural Network 

In [None]:
# Define the parameter grid
param_grid = {
    'hidden_layer_sizes': [(50,), (100,)],
    'activation': ['relu', 'tanh'],
    'solver': ['adam'],
    'alpha': [0.0001, 0.001],
    'learning_rate': ['constant', 'adaptive']
}

# Initialize the model
mlp_model = MLPRegressor(random_state=SEED, max_iter=500)

# Initialize the grid search
grid_search_mlp = GridSearchCV(estimator=mlp_model, param_grid=param_grid, cv=2, verbose=2)

# Fit the grid search to the data
grid_search_mlp.fit(X_train_scaled, y_train)



Fitting 2 folds for each of 16 candidates, totalling 32 fits




[CV] END activation=relu, alpha=0.0001, hidden_layer_sizes=(50,), learning_rate=constant, solver=adam; total time=   3.1s




[CV] END activation=relu, alpha=0.0001, hidden_layer_sizes=(50,), learning_rate=constant, solver=adam; total time=   3.0s




[CV] END activation=relu, alpha=0.0001, hidden_layer_sizes=(50,), learning_rate=adaptive, solver=adam; total time=   3.0s




[CV] END activation=relu, alpha=0.0001, hidden_layer_sizes=(50,), learning_rate=adaptive, solver=adam; total time=   3.2s




[CV] END activation=relu, alpha=0.0001, hidden_layer_sizes=(100,), learning_rate=constant, solver=adam; total time=   4.8s




[CV] END activation=relu, alpha=0.0001, hidden_layer_sizes=(100,), learning_rate=constant, solver=adam; total time=   4.7s




[CV] END activation=relu, alpha=0.0001, hidden_layer_sizes=(100,), learning_rate=adaptive, solver=adam; total time=   4.8s




[CV] END activation=relu, alpha=0.0001, hidden_layer_sizes=(100,), learning_rate=adaptive, solver=adam; total time=   4.9s




[CV] END activation=relu, alpha=0.001, hidden_layer_sizes=(50,), learning_rate=constant, solver=adam; total time=   3.2s




[CV] END activation=relu, alpha=0.001, hidden_layer_sizes=(50,), learning_rate=constant, solver=adam; total time=   3.3s




[CV] END activation=relu, alpha=0.001, hidden_layer_sizes=(50,), learning_rate=adaptive, solver=adam; total time=   3.4s




[CV] END activation=relu, alpha=0.001, hidden_layer_sizes=(50,), learning_rate=adaptive, solver=adam; total time=   3.5s




[CV] END activation=relu, alpha=0.001, hidden_layer_sizes=(100,), learning_rate=constant, solver=adam; total time=   5.3s




[CV] END activation=relu, alpha=0.001, hidden_layer_sizes=(100,), learning_rate=constant, solver=adam; total time=   5.5s




[CV] END activation=relu, alpha=0.001, hidden_layer_sizes=(100,), learning_rate=adaptive, solver=adam; total time=   5.4s




[CV] END activation=relu, alpha=0.001, hidden_layer_sizes=(100,), learning_rate=adaptive, solver=adam; total time=   5.4s




[CV] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(50,), learning_rate=constant, solver=adam; total time=   4.5s




[CV] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(50,), learning_rate=constant, solver=adam; total time=   4.5s




[CV] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(50,), learning_rate=adaptive, solver=adam; total time=   4.6s




[CV] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(50,), learning_rate=adaptive, solver=adam; total time=   4.4s




[CV] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(100,), learning_rate=constant, solver=adam; total time=   7.1s




[CV] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(100,), learning_rate=constant, solver=adam; total time=   7.4s




[CV] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(100,), learning_rate=adaptive, solver=adam; total time=   7.2s




[CV] END activation=tanh, alpha=0.0001, hidden_layer_sizes=(100,), learning_rate=adaptive, solver=adam; total time=   7.4s




[CV] END activation=tanh, alpha=0.001, hidden_layer_sizes=(50,), learning_rate=constant, solver=adam; total time=   5.2s




[CV] END activation=tanh, alpha=0.001, hidden_layer_sizes=(50,), learning_rate=constant, solver=adam; total time=   4.7s




[CV] END activation=tanh, alpha=0.001, hidden_layer_sizes=(50,), learning_rate=adaptive, solver=adam; total time=   4.6s




[CV] END activation=tanh, alpha=0.001, hidden_layer_sizes=(50,), learning_rate=adaptive, solver=adam; total time=   4.7s




[CV] END activation=tanh, alpha=0.001, hidden_layer_sizes=(100,), learning_rate=constant, solver=adam; total time=   7.5s




[CV] END activation=tanh, alpha=0.001, hidden_layer_sizes=(100,), learning_rate=constant, solver=adam; total time=   7.6s




[CV] END activation=tanh, alpha=0.001, hidden_layer_sizes=(100,), learning_rate=adaptive, solver=adam; total time=   7.4s




[CV] END activation=tanh, alpha=0.001, hidden_layer_sizes=(100,), learning_rate=adaptive, solver=adam; total time=   7.7s




GridSearchCV(cv=2, estimator=MLPRegressor(max_iter=500, random_state=42),
             param_grid={'activation': ['relu', 'tanh'],
                         'alpha': [0.0001, 0.001],
                         'hidden_layer_sizes': [(50,), (100,)],
                         'learning_rate': ['constant', 'adaptive'],
                         'solver': ['adam']},
             verbose=2)

In [None]:
# Print the best parameters and the best score
print("Best parameters found: ", grid_search_mlp.best_params_)
print("Best R^2 score: ", grid_search_mlp.best_score_)

# Use the best estimator to make predictions
best_mlp_model = grid_search_mlp.best_estimator_
mlp_predictions = best_mlp_model.predict(X_test_scaled)

# Print the R^2 score on the test set
print("R^2 on test set: ", best_mlp_model.score(X_test_scaled, y_test))

# Print the first 5 predictions rounding to 1 decimal place
print(np.round(mlp_predictions[:5], 1))

# Print the first 5 actual values
print(np.round(y_test[:5].values, 1))


# Create a function


Best parameters found:  {'activation': 'relu', 'alpha': 0.0001, 'hidden_layer_sizes': (100,), 'learning_rate': 'constant', 'solver': 'adam'}
Best R^2 score:  0.12465880670204954
R^2 on test set:  0.2845684004661386
[3313.2  905.8  925.9 1449.6  867.1]
[ 700. 1000. 1200. 1429.  800.]


### Results
- Needs way more experimenting