In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Load the dataset
df = pd.read_excel('FinalDataset-2.xlsx')

#Print first few rows and shape of the dataset
print(df.head())
print(df.shape)

          RentID  Year  RegionID     Region Province UnitID  \
0   Q05013T45654  2020       185  Montmagny   Quebec     IV   
1   Q05020T15153  2020       186     Granby   Quebec      I   
2   Q05020T18234  2020       185  Montmagny   Quebec      I   
3   Q05021T21664  2020       185  Montmagny   Quebec     II   
4   Q05021T28563  2020       186     Granby   Quebec     II   

              UnitType StructureID  \
0  Three bedroom units           C   
1       Bachelor units           D   
2       Bachelor units           D   
3    One bedroom units           D   
4    One bedroom units           D   

                                       StructureType  RentValue Status  
0       Apartment structures of three units and over        685      T  
1  Row and apartment structures of three units an...        455      T  
2  Row and apartment structures of three units an...        513      T  
3  Row and apartment structures of three units an...        496      T  
4  Row and apartment struct

In [5]:
# Define features and target variable
#separate the features (X) from the target variable (y). 
#The RentID and RentValue columns are dropped from X, and RentValue is assigned to y as the target variable we want to predict.
X = df.drop(columns=['RentID', 'RentValue'])
y = df['RentValue']

# Identify categorical and numerical columns - identify which columns in the dataset are categorical and which are numerical. 
categorical_columns = ['Region', 'Province', 'UnitType', 'StructureType', 'Status']
numerical_columns = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Preprocessing pipeline for categorical features 
# one-hot coding - converting categorical data into binary 1s or 0s
#The parameter handle_unknown='ignore' ensures that any unknown categories in the test set are ignored.
categorical_preprocessor = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Preprocessing pipeline for numerical features using one-hot encoding
# 'StandardScaler' standardizes the features by removing the mean and scaling to unit variance
numerical_preprocessor = Pipeline(steps=[
    ('scaler', StandardScaler())
])

# Combine preprocessing pipelines- numerical features using standard scaling
# combines the preprocessing steps for both categorical and numerical features into a single ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_preprocessor, numerical_columns),
        ('cat', categorical_preprocessor, categorical_columns)
    ])

# Define the model
#fits a RandomForestRegressor model. 
#The RandomForestRegressor is set with n_estimators=100 (the number of trees in the forest) and random_state=42 (for reproducibility
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=100, random_state=42))
])

# Split the data into training and testing sets (70-30 split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train the model
model.fit(X_train, y_train)

# Model Evaluation
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f"Random Forest Regressor Root Mean Squared Error: {rmse}")

# Generate new data for the years 2025, 2026, 2027 for all combinations of existing regions, unit types, etc.
unique_combinations = df.drop(columns=['RentID', 'RentValue', 'Year']).drop_duplicates()

years = [2025, 2026, 2027]
new_data_list = []

for year in years:
    temp_data = unique_combinations.copy()
    temp_data['Year'] = year
    new_data_list.append(temp_data)

new_data = pd.concat(new_data_list, ignore_index=True)

# Ensure the lengths of columns are correct by verifying each field
assert all(len(new_data[col]) == len(new_data['Year']) for col in new_data.columns)

# Since the new_data contains categorical values, they need to be preprocessed using the same encoder used in the training pipeline
# Transform the new data using the preprocessor and make predictions
new_data_preprocessed = model.named_steps['preprocessor'].transform(new_data)
predictions = model.named_steps['regressor'].predict(new_data_preprocessed)

# Add the predictions to the new_data DataFrame
new_data['PredictedRentValue'] = predictions

# Save the new dataset with predictions to a CSV file
output_file_path = 'PredictedRentValues_2025_2027_regression_new.xlsx'
new_data.to_excel(output_file_path, index=False)

# Return the new data as output
print(new_data.head(101))


Random Forest Regressor Root Mean Squared Error: 89.25821098094676
     RegionID     Region Province UnitID             UnitType StructureID  \
0         185  Montmagny   Quebec     IV  Three bedroom units           C   
1         186     Granby   Quebec      I       Bachelor units           D   
2         185  Montmagny   Quebec      I       Bachelor units           D   
3         185  Montmagny   Quebec     II    One bedroom units           D   
4         186     Granby   Quebec     II    One bedroom units           D   
..        ...        ...      ...    ...                  ...         ...   
96        194   Val-d'Or   Quebec    III    Two bedroom units           C   
97        194   Val-d'Or   Quebec     IV  Three bedroom units           C   
98        194   Val-d'Or   Quebec      I       Bachelor units           D   
99        194   Val-d'Or   Quebec     II    One bedroom units           D   
100       194   Val-d'Or   Quebec    III    Two bedroom units           D   

        

In [9]:
# Define features and target variable
#separate the features (X) from the target variable (y). 
#The RentID and RentValue columns are dropped from X, and RentValue is assigned to y as the target variable we want to predict.
X = df.drop(columns=['RentID', 'RentValue'])
y = df['RentValue']

# Identify categorical and numerical columns - identify which columns in the dataset are categorical and which are numerical. 
categorical_columns = ['Region', 'Province', 'UnitType', 'StructureType', 'Status']
numerical_columns = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Preprocessing pipeline for categorical features 
# one-hot coding - converting categorical data into binary 1s or 0s
#The parameter handle_unknown='ignore' ensures that any unknown categories in the test set are ignored.
categorical_preprocessor = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Preprocessing pipeline for numerical features using one-hot encoding
# 'StandardScaler' standardizes the features by removing the mean and scaling to unit variance
numerical_preprocessor = Pipeline(steps=[
    ('scaler', StandardScaler())
])

# Combine preprocessing pipelines- numerical features using standard scaling
# combines the preprocessing steps for both categorical and numerical features into a single ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_preprocessor, numerical_columns),
        ('cat', categorical_preprocessor, categorical_columns)
    ])

# Define the model
#fits a RandomForestRegressor model. 
#The RandomForestRegressor is set with n_estimators=100 (the number of trees in the forest) and random_state=42 (for reproducibility
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=100, random_state=42))
])

# Split the data into training and testing sets (70-30 split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train the model
model.fit(X_train, y_train)

# Model Evaluation
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f"Random Forest Regressor Root Mean Squared Error: {rmse}")

# Generate new data for the years 2025, 2026, 2027 for all combinations of existing regions, unit types, etc.
unique_combinations = df.drop(columns=['RentID', 'RentValue', 'Year']).drop_duplicates()

years = [2025, 2026, 2027]
new_data_list = []

for year in years:
    temp_data = unique_combinations.copy()
    temp_data['Year'] = year
    new_data_list.append(temp_data)

new_data = pd.concat(new_data_list, ignore_index=True)

# Ensure the lengths of columns are correct by verifying each field
assert all(len(new_data[col]) == len(new_data['Year']) for col in new_data.columns)

# Since the new_data contains categorical values, they need to be preprocessed using the same encoder used in the training pipeline
# Transform the new data using the preprocessor and make predictions
new_data_preprocessed = model.named_steps['preprocessor'].transform(new_data)
predictions = model.named_steps['regressor'].predict(new_data_preprocessed)

# Add the predictions to the new_data DataFrame
new_data['PredictedRentValue'] = predictions

# Save the new dataset with predictions to a CSV file
output_file_path = 'PredictedRentValues_2025_2027_regression.csv'
new_data.to_csv(output_file_path, index=False)

# Return the new data as output
print(new_data.head(101))


Root Mean Squared Error: 89.25821098094676
     RegionID     Region Province UnitID             UnitType StructureID  \
0         185  Montmagny   Quebec     IV  Three bedroom units           C   
1         186     Granby   Quebec      I       Bachelor units           D   
2         185  Montmagny   Quebec      I       Bachelor units           D   
3         185  Montmagny   Quebec     II    One bedroom units           D   
4         186     Granby   Quebec     II    One bedroom units           D   
..        ...        ...      ...    ...                  ...         ...   
96        194   Val-d'Or   Quebec    III    Two bedroom units           C   
97        194   Val-d'Or   Quebec     IV  Three bedroom units           C   
98        194   Val-d'Or   Quebec      I       Bachelor units           D   
99        194   Val-d'Or   Quebec     II    One bedroom units           D   
100       194   Val-d'Or   Quebec    III    Two bedroom units           D   

                                

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

# Define features and target variable
X = df.drop(columns=['RentID', 'RentValue'])
y = df['RentValue']

# Identify categorical and numerical columns
categorical_columns = ['Region', 'Province', 'UnitType', 'StructureType', 'Status']
numerical_columns = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Preprocessing pipeline for categorical features
categorical_preprocessor = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Preprocessing pipeline for numerical features
numerical_preprocessor = Pipeline(steps=[
    ('scaler', StandardScaler())
])

# Combine preprocessing pipelines
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_preprocessor, numerical_columns),
        ('cat', categorical_preprocessor, categorical_columns)
    ])

# Define the model
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', GradientBoostingRegressor(n_estimators=100, random_state=42))
])

# Split the data into training and testing sets (70-30 split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train the model
model.fit(X_train, y_train)

# Model Evaluation
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f"Gradient Boosting Regressor Root Mean Squared Error: {rmse}")

# Generate new data for the years 2025, 2026, 2027
unique_combinations = df.drop(columns=['RentID', 'RentValue', 'Year']).drop_duplicates()

years = [2025, 2026, 2027]
new_data_list = []

for year in years:
    temp_data = unique_combinations.copy()
    temp_data['Year'] = year
    new_data_list.append(temp_data)

new_data = pd.concat(new_data_list, ignore_index=True)

# Ensure the lengths of columns are correct by verifying each field
assert all(len(new_data[col]) == len(new_data['Year']) for col in new_data.columns)

# Transform the new data using the preprocessor and make predictions
new_data_preprocessed = model.named_steps['preprocessor'].transform(new_data)
predictions = model.named_steps['regressor'].predict(new_data_preprocessed)

# Add the predictions to the new_data DataFrame
new_data['PredictedRentValue'] = predictions

# Save the new dataset with predictions to a CSV file
output_file_path = 'PredictedRentValues_2025_2027_gradient_boosting.csv'
new_data.to_csv(output_file_path, index=False)

# Return the new data as output
print(new_data.head(101))


Gradient Boosting Regressor Root Mean Squared Error: 153.99158515240853
     RegionID     Region Province UnitID             UnitType StructureID  \
0         185  Montmagny   Quebec     IV  Three bedroom units           C   
1         186     Granby   Quebec      I       Bachelor units           D   
2         185  Montmagny   Quebec      I       Bachelor units           D   
3         185  Montmagny   Quebec     II    One bedroom units           D   
4         186     Granby   Quebec     II    One bedroom units           D   
..        ...        ...      ...    ...                  ...         ...   
96        194   Val-d'Or   Quebec    III    Two bedroom units           C   
97        194   Val-d'Or   Quebec     IV  Three bedroom units           C   
98        194   Val-d'Or   Quebec      I       Bachelor units           D   
99        194   Val-d'Or   Quebec     II    One bedroom units           D   
100       194   Val-d'Or   Quebec    III    Two bedroom units           D   

   

In [14]:
pip install xgboost lightgbm catboost

Collecting xgboostNote: you may need to restart the kernel to use updated packages.

  Downloading xgboost-2.1.0-py3-none-win_amd64.whl.metadata (2.1 kB)
Collecting lightgbm
  Downloading lightgbm-4.4.0-py3-none-win_amd64.whl.metadata (19 kB)
Collecting catboost
  Downloading catboost-1.2.5-cp311-cp311-win_amd64.whl.metadata (1.2 kB)
Collecting graphviz (from catboost)
  Downloading graphviz-0.20.3-py3-none-any.whl.metadata (12 kB)
Downloading xgboost-2.1.0-py3-none-win_amd64.whl (124.9 MB)
   ---------------------------------------- 0.0/124.9 MB ? eta -:--:--
   ---------------------------------------- 0.1/124.9 MB 2.0 MB/s eta 0:01:03
   ---------------------------------------- 1.0/124.9 MB 12.1 MB/s eta 0:00:11
    --------------------------------------- 2.4/124.9 MB 19.4 MB/s eta 0:00:07
   - -------------------------------------- 4.0/124.9 MB 23.1 MB/s eta 0:00:06
   - -------------------------------------- 4.5/124.9 MB 20.6 MB/s eta 0:00:06
   - ----------------------------------

In [16]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from catboost import CatBoostRegressor

# Define features and target variable
X = df.drop(columns=['RentID', 'RentValue'])
y = df['RentValue']

# Identify categorical and numerical columns
categorical_columns = ['Region', 'Province', 'UnitType', 'StructureType', 'Status']
numerical_columns = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Preprocessing pipeline for categorical features
categorical_preprocessor = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Preprocessing pipeline for numerical features
numerical_preprocessor = Pipeline(steps=[
    ('scaler', StandardScaler())
])

# Combine preprocessing pipelines
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_preprocessor, numerical_columns),
        ('cat', categorical_preprocessor, categorical_columns)
    ])

# Define the model
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', CatBoostRegressor(iterations=200, random_state=42, verbose=0))
])

# Split the data into training and testing sets (70-30 split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train the model
model.fit(X_train, y_train)

# Model Evaluation
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f"CatBoost Regressor RMSE: {rmse}")

# Generate new data for the years 2025, 2026, 2027 for all combinations of existing regions, unit types, etc.
unique_combinations = df.drop(columns=['RentID', 'RentValue', 'Year']).drop_duplicates()

years = [2025, 2026, 2027]
new_data_list = []

for year in years:
    temp_data = unique_combinations.copy()
    temp_data['Year'] = year
    new_data_list.append(temp_data)

new_data = pd.concat(new_data_list, ignore_index=True)

# Ensure the lengths of columns are correct by verifying each field
assert all(len(new_data[col]) == len(new_data['Year']) for col in new_data.columns)

# Since the new_data contains categorical values, they need to be preprocessed using the same encoder used in the training pipeline
# Transform the new data using the preprocessor and make predictions
new_data_preprocessed = model.named_steps['preprocessor'].transform(new_data)
predictions = model.named_steps['regressor'].predict(new_data_preprocessed)

# Add the predictions to the new_data DataFrame
new_data['PredictedRentValue'] = predictions

# Save the new dataset with predictions to a CSV file
output_file_path = 'PredictedRentValues_2025_2027_catboost.csv'
new_data.to_csv(output_file_path, index=False)

# Return the new data as output
print(new_data.head())


CatBoost Regressor RMSE: 99.15600569179853
   RegionID     Region Province UnitID             UnitType StructureID  \
0       185  Montmagny   Quebec     IV  Three bedroom units           C   
1       186     Granby   Quebec      I       Bachelor units           D   
2       185  Montmagny   Quebec      I       Bachelor units           D   
3       185  Montmagny   Quebec     II    One bedroom units           D   
4       186     Granby   Quebec     II    One bedroom units           D   

                                       StructureType Status  Year  \
0       Apartment structures of three units and over      T  2025   
1  Row and apartment structures of three units an...      T  2025   
2  Row and apartment structures of three units an...      T  2025   
3  Row and apartment structures of three units an...      T  2025   
4  Row and apartment structures of three units an...      T  2025   

   PredictedRentValue  
0          875.939861  
1          574.984552  
2          516.8350

In [17]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from lightgbm import LGBMRegressor


# Define features and target variable
X = df.drop(columns=['RentID', 'RentValue'])
y = df['RentValue']

# Identify categorical and numerical columns
categorical_columns = ['Region', 'Province', 'UnitType', 'StructureType', 'Status']
numerical_columns = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Preprocessing pipeline for categorical features
categorical_preprocessor = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Preprocessing pipeline for numerical features
numerical_preprocessor = Pipeline(steps=[
    ('scaler', StandardScaler())
])

# Combine preprocessing pipelines
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_preprocessor, numerical_columns),
        ('cat', categorical_preprocessor, categorical_columns)
    ])

# Define the model
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LGBMRegressor(n_estimators=200, random_state=42))
])

# Split the data into training and testing sets (60-40 split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

# Train the model
model.fit(X_train, y_train)

# Model Evaluation
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f"LightGBM Regressor RMSE: {rmse}")

# Generate new data for the years 2025, 2026, 2027 for all combinations of existing regions, unit types, etc.
unique_combinations = df.drop(columns=['RentID', 'RentValue', 'Year']).drop_duplicates()

years = [2025, 2026, 2027]
new_data_list = []

for year in years:
    temp_data = unique_combinations.copy()
    temp_data['Year'] = year
    new_data_list.append(temp_data)

new_data = pd.concat(new_data_list, ignore_index=True)

# Ensure the lengths of columns are correct by verifying each field
assert all(len(new_data[col]) == len(new_data['Year']) for col in new_data.columns)

# Since the new_data contains categorical values, they need to be preprocessed using the same encoder used in the training pipeline
# Transform the new data using the preprocessor and make predictions
new_data_preprocessed = model.named_steps['preprocessor'].transform(new_data)
predictions = model.named_steps['regressor'].predict(new_data_preprocessed)

# Add the predictions to the new_data DataFrame
new_data['PredictedRentValue'] = predictions

# Save the new dataset with predictions to a CSV file
output_file_path = 'PredictedRentValues_2025_2027_lgbm.csv'
new_data.to_csv(output_file_path, index=False)

# Return the new data as output
print(new_data.head())


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000342 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 446
[LightGBM] [Info] Number of data points in the train set: 3970, number of used features: 133
[LightGBM] [Info] Start training from score 1018.056927
LightGBM Regressor RMSE: 101.72071691487274
   RegionID     Region Province UnitID             UnitType StructureID  \
0       185  Montmagny   Quebec     IV  Three bedroom units           C   
1       186     Granby   Quebec      I       Bachelor units           D   
2       185  Montmagny   Quebec      I       Bachelor units           D   
3       185  Montmagny   Quebec     II    One bedroom units           D   
4       186     Granby   Quebec     II    One bedroom units           D   

                                       StructureType Status  Year  \
0       Apartment structures