<a href="https://colab.research.google.com/github/amanmehra-23/AirBnB-price-prediction/blob/main/AirBnBModel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import  LabelEncoder, StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge,Lasso
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import RandomizedSearchCV
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV
import warnings
warnings.simplefilter('ignore')

In [None]:
df = pd.read_csv("/content/drive/MyDrive/Airbnb/final_data.csv")

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 66660 entries, 0 to 66659
Data columns (total 28 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   id                      66660 non-null  int64  
 1   log_price               66660 non-null  float64
 2   property_type           66660 non-null  object 
 3   room_type               66660 non-null  object 
 4   amenities               66660 non-null  object 
 5   accommodates            66660 non-null  int64  
 6   bathrooms               66660 non-null  float64
 7   bed_type                66660 non-null  object 
 8   cancellation_policy     66660 non-null  object 
 9   cleaning_fee            66660 non-null  bool   
 10  city                    66660 non-null  object 
 11  description             66660 non-null  object 
 12  first_review            66660 non-null  object 
 13  host_has_profile_pic    66660 non-null  object 
 14  host_identity_verified  66660 non-null

In [None]:
amenities_count = []
for i in df["amenities"]:
    amenities_count.append(len(i))

df["amenities"] = amenities_count


In [None]:
categorical_col = []
numerical_col = []
for column in df.columns:

    if df[column].dtypes != "float64" and df[column].dtypes != "int64":
        categorical_col.append(column)
    else:
        numerical_col.append(column)

In [None]:
print(numerical_col)
len(numerical_col)

['id', 'log_price', 'amenities', 'accommodates', 'bathrooms', 'host_response_rate', 'latitude', 'longitude', 'number_of_reviews', 'review_scores_rating', 'bedrooms', 'beds']


12

In [None]:
print(categorical_col)
len(categorical_col)

['property_type', 'room_type', 'bed_type', 'cancellation_policy', 'cleaning_fee', 'city', 'description', 'first_review', 'host_has_profile_pic', 'host_identity_verified', 'host_since', 'instant_bookable', 'last_review', 'name', 'neighbourhood', 'zipcode']


16

In [None]:
categorical_col = [col for col in categorical_col if col not in ["zipcode", "last_review", "first_review", "name","description"]]

In [None]:
numerical_col = [col for col in numerical_col if col not in ["id", "log_price"]]

In [None]:
print(categorical_col)
print(numerical_col)

['property_type', 'room_type', 'bed_type', 'cancellation_policy', 'cleaning_fee', 'city', 'host_has_profile_pic', 'host_identity_verified', 'host_since', 'instant_bookable', 'neighbourhood']
['amenities', 'accommodates', 'bathrooms', 'host_response_rate', 'latitude', 'longitude', 'number_of_reviews', 'review_scores_rating', 'bedrooms', 'beds']


In [None]:
y = df["log_price"]

In [None]:
le = LabelEncoder()
for col in categorical_col:
    df[col] = le.fit_transform(df[col])

In [None]:

numerical_df = df[numerical_col]
categorical_df = df[categorical_col]
X = pd.concat([numerical_df, categorical_df], axis=1)

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=101)
X_train.shape, X_test.shape


((53328, 21), (13332, 21))

In [None]:
def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [None]:
models = {
    "Linear Regression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "K-Neighbors Regressor": KNeighborsRegressor(),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest Regressor": RandomForestRegressor(),
    "XGBRegressor": XGBRegressor(),
    "CatBoosting Regressor": CatBoostRegressor(verbose=False),
    "AdaBoost Regressor": AdaBoostRegressor()
}

model_list = []
r2_list = []

for model_name, model in models.items():
    print(f"Training {model_name}")

    # Train model
    model.fit(X_train, y_train)

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Evaluate Train and Test dataset
    model_train_mae, model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_pred)
    model_test_mae, model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)

    print(model_name)
    model_list.append(model_name)

    print('Model performance for Training set')
    print("- Root Mean Squared Error: {:.4f}".format(model_train_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_train_mae))
    print("- R2 Score: {:.4f}".format(model_train_r2))
    print('----------------------------------')
    print('Model performance for Test set')
    print("- Root Mean Squared Error: {:.4f}".format(model_test_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_test_mae))
    print("- R2 Score: {:.4f}".format(model_test_r2))
    r2_list.append(model_test_r2)
    print('='*35)
    print('\n')


Training Linear Regression
Linear Regression
Model performance for Training set
- Root Mean Squared Error: 0.4811
- Mean Absolute Error: 0.3628
- R2 Score: 0.5393
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 0.4786
- Mean Absolute Error: 0.3605
- R2 Score: 0.5503


Training Lasso
Lasso
Model performance for Training set
- Root Mean Squared Error: 0.6987
- Mean Absolute Error: 0.5467
- R2 Score: 0.0284
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 0.7022
- Mean Absolute Error: 0.5511
- R2 Score: 0.0318


Training Ridge
Ridge
Model performance for Training set
- Root Mean Squared Error: 0.4811
- Mean Absolute Error: 0.3628
- R2 Score: 0.5393
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 0.4786
- Mean Absolute Error: 0.3605
- R2 Score: 0.5503


Training K-Neighbors Regressor
K-Neighbors Regressor
Model performance for Training set
- Root Mean Square

In [None]:
pd.DataFrame(list(zip(model_list, r2_list)), columns=['Model Name', 'R2_Score']).sort_values(by=["R2_Score"],ascending=False)


Unnamed: 0,Model Name,R2_Score
7,CatBoosting Regressor,0.720768
6,XGBRegressor,0.71343
5,Random Forest Regressor,0.700225
2,Ridge,0.550329
0,Linear Regression,0.550328
8,AdaBoost Regressor,0.473604
4,Decision Tree,0.411149
1,Lasso,0.031767
3,K-Neighbors Regressor,-0.036423


In [80]:
# Define the best parameters obtained from hyperparameter tuning
xgb_best_params = {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 300}
catboost_best_params = {'depth': 7, 'learning_rate': 0.2, 'n_estimators': 300}

# Initialize and train XGBoost model with best parameters
xgb_model = XGBRegressor(**xgb_best_params)
xgb_model.fit(X_train, y_train)

# Initialize and train CatBoost model with best parameters
catboost_model = CatBoostRegressor(**catboost_best_params, verbose=False)
catboost_model.fit(X_train, y_train)

# Make predictions
y_pred_xgb = xgb_model.predict(X_test)
y_pred_catboost = catboost_model.predict(X_test)

# Evaluate models using the evaluate_model function
xgb_mae, xgb_rmse, xgb_r2 = evaluate_model(y_test, y_pred_xgb)
catboost_mae, catboost_rmse, catboost_r2 = evaluate_model(y_test, y_pred_catboost)

# Print evaluation metrics
print("XGBoost Model Evaluation:")
print("- MAE:", xgb_mae)
print("- RMSE:", xgb_rmse)
print("- R2 Score:", xgb_r2)
print("\nCatBoost Model Evaluation:")
print("- MAE:", catboost_mae)
print("- RMSE:", catboost_rmse)
print("- R2 Score:", catboost_r2)

XGBoost Model Evaluation:
- MAE: 0.2732536829955739
- RMSE: 0.3778894623213432
- R2 Score: 0.7196204932259123

CatBoost Model Evaluation:
- MAE: 0.27355529034156234
- RMSE: 0.37784529994617044
- R2 Score: 0.719686022978254


The R2 score Of Catboost On Test case came out to be the highest at .072 on test case and 0.76 on training data
on hyper parameter tunning also there wasnt an increase in r2 score