**Finding the right per night listing price for an airbnb listing in the new york city**

in order to do data cleaning we need to load the dataset:

In [77]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from scipy import stats
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, cross_val_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.metrics import r2_score, mean_absolute_error,explained_variance_score,max_error
df=pd.read_csv('AB_NYC_2019.csv')

after loading the dataset we will start to do datacleaning in several steps:


1.dentifying and dealing with missing values

In [78]:
a=len(df)
print('missing values: ', df.isnull().sum())
df.dropna(inplace=True)
print('number of rows before dropping: ', a)
print('number of rows after dropping: ', len(df))
print('number of reduced rows', a-len(df))

missing values:  id                                    0
name                                 16
host_id                               0
host_name                            21
neighbourhood_group                   0
neighbourhood                         0
latitude                              0
longitude                             0
room_type                             0
price                                 0
minimum_nights                        0
number_of_reviews                     0
last_review                       10052
reviews_per_month                 10052
calculated_host_listings_count        0
availability_365                      0
dtype: int64
number of rows before dropping:  48895
number of rows after dropping:  38821
number of reduced rows 10074


2.Check for duplicates and drop them

In [79]:
df.drop_duplicates(inplace=True)

3.Check for data types and correct them if necessary

In [80]:
df['price'] = df['price'].astype(float)

4.Handling outlier

In [81]:

# Define function to identify and remove outliers using IQR method
def remove_outliers_iqr(data, column, threshold=1.5):
    # Calculate first and third quartiles
    q1, q3 = np.percentile(data[column], [25, 75])
    # Calculate IQR (Interquartile Range)
    iqr = q3 - q1
    # Calculate upper and lower bounds
    lower_bound = q1 - (iqr * threshold)
    upper_bound = q3 + (iqr * threshold)
    # Return filtered data without outliers
    return data.loc[(data[column] >= lower_bound) & (data[column] <= upper_bound)]

# Apply function to remove outliers for each column of interest
df = remove_outliers_iqr(df, 'price')
df = remove_outliers_iqr(df, 'minimum_nights')
df = remove_outliers_iqr(df, 'calculated_host_listings_count')

5.Correct inconsistencies by removing any rows with a minimum_nights value of 0

In [82]:
df = df[df['minimum_nights'] > 0]

6.Useless columns:

some feature does not have any meaning in our project scope(price prediction using regression) so we will drop them :

In [83]:
df.drop(['id', 'name', 'host_name', 'last_review','host_id','reviews_per_month'], axis=1, inplace=True)

7.handling categorical variables:

some columns such as 'neighbourhood_group', 'neighbourhood', and 'room_type' are categorical variables. You can convert them to numerical data using the get_dummies() method.

In [84]:
df = pd.get_dummies(df, columns=['neighbourhood_group','neighbourhood', 'room_type'])
df[:2]

Unnamed: 0,latitude,longitude,price,minimum_nights,number_of_reviews,calculated_host_listings_count,availability_365,neighbourhood_group_Bronx,neighbourhood_group_Brooklyn,neighbourhood_group_Manhattan,...,neighbourhood_Williamsbridge,neighbourhood_Williamsburg,neighbourhood_Willowbrook,neighbourhood_Windsor Terrace,neighbourhood_Woodhaven,neighbourhood_Woodlawn,neighbourhood_Woodside,room_type_Entire home/apt,room_type_Private room,room_type_Shared room
1,40.75362,-73.98377,225.0,1,45,2,355,0,0,1,...,0,0,0,0,0,0,0,1,0,0
3,40.68514,-73.95976,89.0,1,270,1,194,0,1,0,...,0,0,0,0,0,0,0,1,0,0


8.Handling errors:

In [85]:
# Remove data with errors
df = df.loc[df['price'] > 0]

Total number of dataset and features after cleaning:

In [86]:
num_features = df.shape[1]
num_samples = df.shape[0]

print(f"The dataset has {num_features} features and {num_samples} samples.")

The dataset has 232 features and 29181 samples.


**Comparing different models:**

1.KNN**

we have outlier so we use min max and standard scaler technique:

In [87]:

# Step 1: Split the dataset
X_train, X_test, y_train, y_test = train_test_split(df.drop('price', axis=1), df['price'], test_size=0.2, random_state=42)

# Step 2: Scale the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Step 3: Define the parameter grids for uniform and distance weights
param_grid_uniform = {
    'n_neighbors': [3, 5, 7, 9, 11, 13, 15],
    'weights': ['uniform']
}

param_grid_distance = {
    'n_neighbors': [3, 5, 7, 9, 11, 13, 15],
    'weights': ['distance']
}

# Step 4: Define the cross-validation scheme
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Step 5: Train and evaluate the models for uniform and distance weights
knn_uniform = KNeighborsRegressor()
grid_search_uniform = GridSearchCV(knn_uniform, param_grid_uniform, cv=kf, scoring='neg_mean_squared_error')
grid_search_uniform.fit(X_train_scaled, y_train)

knn_distance = KNeighborsRegressor()
grid_search_distance = GridSearchCV(knn_distance, param_grid_distance, cv=kf, scoring='neg_mean_squared_error')
grid_search_distance.fit(X_train_scaled, y_train)

# Step 6: Select the best model
best_knn_uniform = grid_search_uniform.best_estimator_
best_knn_distance = grid_search_distance.best_estimator_

# Step 7: Evaluate the final model on the test set
yhat_uniform = best_knn_uniform.predict(X_test_scaled)
mse_uniform = mean_squared_error(y_test, yhat_uniform)

yhat_distance = best_knn_distance.predict(X_test_scaled)
mse_distance = mean_squared_error(y_test, yhat_distance)

# Step 8: Calculate R-squared score, absolute error, root mean square error, explained variance score, and max error
r2_uniform = r2_score(y_test, yhat_uniform)
r2_distance = r2_score(y_test, yhat_distance)

ae_uniform = mean_absolute_error(y_test, yhat_uniform)
ae_distance = mean_absolute_error(y_test, yhat_distance)

rmse_uniform = mean_squared_error(y_test, yhat_uniform, squared=False)
rmse_distance = mean_squared_error(y_test, yhat_distance, squared=False)

evs_uniform = explained_variance_score(y_test, yhat_uniform)
evs_distance = explained_variance_score(y_test, yhat_distance)

me_uniform = max_error(y_test, yhat_uniform)
me_distance = max_error(y_test, yhat_distance)

# Step 9: Create a table of results for different parameter options using Pandas
results = pd.DataFrame({
    'Weights': ['Uniform', 'Distance'],
    'Best KNN Model': [best_knn_uniform, best_knn_distance],
    'MSE': [mse_uniform, mse_distance],
    'R-squared': [r2_uniform, r2_distance],
    'Absolute Error': [ae_uniform, ae_distance],
    'RMSE': [rmse_uniform, rmse_distance],
    'Explained Variance Score': [evs_uniform, evs_distance],
    'Max Error': [me_uniform, me_distance]
})

print(results)



    Weights                                     Best KNN Model          MSE  \
0   Uniform                KNeighborsRegressor(n_neighbors=15)  1988.188828   
1  Distance  KNeighborsRegressor(n_neighbors=15, weights='d...  1942.615834   

   R-squared  Absolute Error       RMSE  Explained Variance Score   Max Error  
0   0.520746       32.374690  44.589111                  0.520749  241.400000  
1   0.531732       31.801213  44.075116                  0.531831  245.881398  


**2.Decision tree**

In [88]:

# Step 1: Define the parameter grid
param_grid = {
    'max_depth': [3, 5, 7, 9, 11, 13, 15],
    'min_samples_split': [2, 5, 10, 15, 20, 25],
    'min_samples_leaf': [1, 2, 4, 8],
    'criterion': ['mse', 'friedman_mse', 'mae'],
    'splitter': ['best', 'random']
}

# Step 2: Define the cross-validation scheme
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Step 3: Train and evaluate the models
dt = DecisionTreeRegressor()
grid_search = GridSearchCV(dt, param_grid, cv=kf, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

# Step 4: Select the best model
best_dt = grid_search.best_estimator_

# Step 5: Evaluate the final model on the test set
y_pred = best_dt.predict(X_test)

# Calculate evaluation metrics
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)
evs = explained_variance_score(y_test, y_pred)
me = max_error(y_test, y_pred)

print("Best Decision tree model:", best_dt)
print("R-squared score: {:.2f}".format(r2))
print("Absolute error: {:.2f}".format(mae))
print("Root mean square error: {:.2f}".format(rmse))
print("Explained variance score: {:.2f}".format(evs))
print("Max error: {:.2f}".format(me))
print("MSE:", mean_squared_error(y_test, y_pred))


Best Decision tree model: DecisionTreeRegressor(criterion='friedman_mse', max_depth=7, min_samples_leaf=8,
                      min_samples_split=25)
R-squared score: 0.53
Absolute error: 32.04
Root mean square error: 43.94
Explained variance score: 0.53
Max error: 232.61
MSE: 1930.8039534937818


3360 fits failed out of a total of 5040.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1680 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.9/dist-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.9/dist-packages/sklearn/tree/_classes.py", line 1247, in fit
    super().fit(
  File "/usr/local/lib/python3.9/dist-packages/sklearn/tree/_classes.py", line 177, in fit
    self._validate_params()
  File "/usr/local/lib/python3.9/dist-packages/sklearn/base.py", line 600, in _validate_params
    validate_parameter_constraints(
  File "/usr/local/lib/python3.9/dist-packages/sklearn/

**3.Linear regression**

In [89]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, explained_variance_score, max_error

# Step 1: Define the parameter grid
param_grid = {
    'fit_intercept': [True, False],
}

# Step 2: Define the cross-validation scheme
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Step 3: Train and evaluate the models
lr = LinearRegression()
grid = GridSearchCV(lr, param_grid, cv=kf, scoring='neg_mean_squared_error')
grid.fit(X_train, y_train)

# Step 4: Select the best model and print the results
print("Linear regression model:")
results = pd.DataFrame(grid.cv_results_)
cols_to_show = ['params', 'mean_test_score', 'std_test_score']
print(results[cols_to_show].sort_values(by='mean_test_score', ascending=False))
print("Best parameters:", grid.best_params_)
print("MSE:", -grid.best_score_)

# Step 5: Evaluate the best model on the test set
best_lr = grid.best_estimator_
y_pred = best_lr.predict(X_test)

# R-squared score
r2 = r2_score(y_test, y_pred)
print("R-squared score:", r2)

# Mean absolute error
mae = mean_absolute_error(y_test, y_pred)
print("Mean absolute error:", mae)

# Root mean square error
rmse = mean_squared_error(y_test, y_pred, squared=False)
print("Root mean square error:", rmse)

# Explained variance score
evs = explained_variance_score(y_test, y_pred)
print("Explained variance score:", evs)

# Maximum error
max_err = max_error(y_test, y_pred)
print("Maximum error:", max_err)



Linear regression model:
                     params  mean_test_score  std_test_score
1  {'fit_intercept': False}    -2.019074e+03    9.657079e+01
0   {'fit_intercept': True}    -2.018485e+16    2.966088e+16
Best parameters: {'fit_intercept': False}
MSE: 2019.0744426458011
R-squared score: 0.5324967403269235
Mean absolute error: 32.34937624427801
Root mean square error: 44.03909610023903
Explained variance score: 0.5325643007432128
Maximum error: 241.72930853569778


4.Ridge

In [90]:
# Step 1: Define the parameter grid
param_grid = {
    'fit_intercept': [True, False],
}

# Step 2: Define the cross-validation scheme
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Step 3: Train and evaluate the models
lr = LinearRegression()
grid = GridSearchCV(lr, param_grid, cv=kf, scoring='neg_mean_squared_error')
grid.fit(X_train, y_train)

# Step 4: Select the best model and print the results
print("Linear regression model:")
results = pd.DataFrame(grid.cv_results_)
cols_to_show = ['params', 'mean_test_score', 'std_test_score']
print(results[cols_to_show].sort_values(by='mean_test_score', ascending=False))
print("Best parameters:", grid.best_params_)
print("MSE:", -grid.best_score_)

# Step 5: Evaluate the best model on the test set
best_lr = grid.best_estimator_
y_pred = best_lr.predict(X_test)

# R-squared score
r2 = r2_score(y_test, y_pred)
print("R-squared score:", r2)

# Mean absolute error
mae = mean_absolute_error(y_test, y_pred)
print("Mean absolute error:", mae)

# Root mean square error
rmse = mean_squared_error(y_test, y_pred, squared=False)
print("Root mean square error:", rmse)

# Explained variance score
evs = explained_variance_score(y_test, y_pred)
print("Explained variance score:", evs)

# Maximum error
max_err = max_error(y_test, y_pred)
print("Maximum error:", max_err)



Linear regression model:
                     params  mean_test_score  std_test_score
1  {'fit_intercept': False}    -2.019074e+03    9.657079e+01
0   {'fit_intercept': True}    -2.018485e+16    2.966088e+16
Best parameters: {'fit_intercept': False}
MSE: 2019.0744426458011
R-squared score: 0.5324967403269235
Mean absolute error: 32.34937624427801
Root mean square error: 44.03909610023903
Explained variance score: 0.5325643007432128
Maximum error: 241.72930853569778


5.Lasso

In [91]:
# Step 1: Define the parameter grid
param_grid = {'alpha': [0.001, 0.01, 0.1, 1, 10, 100]}

# Step 2: Define the cross-validation scheme
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Step 3: Train and evaluate the models
lasso = Lasso()
grid_search = GridSearchCV(lasso, param_grid=param_grid, cv=kf, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
mse = -grid_search.best_score_

# Step 4: Select the best model
print("Lasso model:")
print("Best alpha:", best_model.alpha)
print("MSE:", mse)

# Step 5: Calculate and print additional evaluation metrics
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)
evs = explained_variance_score(y_test, y_pred)
me = max_error(y_test, y_pred)

print("R-squared score:", r2)
print("Absolute error:", mae)
print("Root mean square error:", rmse)
print("Explained variance score:", evs)
print("Max error:", me)



  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Lasso model:
Best alpha: 0.001
MSE: 1963.140104611334
R-squared score: 0.5370094646965045
Absolute error: 32.232724765138904
Root mean square error: 43.82602996837923
Explained variance score: 0.5370484248184892
Max error: 229.56455865329553


  model = cd_fast.enet_coordinate_descent(


**Conclusion:** Ensemble Learning:


In [92]:
knn_best = KNeighborsRegressor(n_neighbors=15,weights= 'distance')
dt_best = DecisionTreeRegressor(criterion='friedman_mse', max_depth=7, min_samples_leaf=8,min_samples_split=20)
lr = LinearRegression()
ridge_best = Ridge(alpha=10)
lasso_best = Lasso(alpha=0.01)

# Fit the models with the best hyperparameters
knn_best.fit(X_train, y_train)
dt_best.fit(X_train, y_train)
ridge_best.fit(X_train, y_train)
lasso_best.fit(X_train, y_train)
lr.fit(X_train, y_train)
# Use each individual model to predict the target variable on the test set
knn_pred = knn_best.predict(X_test)
dt_pred = dt_best.predict(X_test)
ridge_pred = ridge_best.predict(X_test)
lasso_pred = lasso_best.predict(X_test)
lr_pred = lr.predict(X_test)
# Step 3: Concatenate the predictions into a new feature matrix
X_meta = np.column_stack((knn_pred, dt_pred, ridge_pred, lasso_pred,lr_pred))

# Fine-tune the meta-model
meta_params = {'fit_intercept': [True, False]}
meta_grid = GridSearchCV(LinearRegression(), meta_params, cv=5, scoring='neg_mean_squared_error')
meta_grid.fit(X_meta, y_test)
meta_best = meta_grid.best_estimator_

# Use the best meta-model to predict the target variable on new data
ensemble_pred = meta_best.predict(X_meta)

# Compute the mean squared error of the ensemble model on the test set
mse = mean_squared_error(y_test, ensemble_pred)
print("Ensemble model MSE:", mse)

Ensemble model MSE: 1871.9161379030688


In [93]:

# Calculate R-squared score
r2 = r2_score(y_test, ensemble_pred)
print("Ensemble model R-squared:", r2)

# Calculate mean absolute error
mae = mean_absolute_error(y_test, ensemble_pred)
print("Ensemble model MAE:", mae)

# Calculate root mean squared error
rmse = mean_squared_error(y_test, ensemble_pred, squared=False)
print("Ensemble model RMSE:", rmse)

# Calculate explained variance score
evs = explained_variance_score(y_test, ensemble_pred)
print("Ensemble model explained variance score:", evs)

# Calculate max error
max_error = max_error(y_test, ensemble_pred)
print("Ensemble model max error:", max_error)

Ensemble model R-squared: 0.5487738726264704
Ensemble model MAE: 31.5186528274675
Ensemble model RMSE: 43.265646163013315
Ensemble model explained variance score: 0.5487755331468478
Ensemble model max error: 229.70795098043374
