In [1]:
import pandas as pd

# Load the training set
train_set_path = './Datasets/final-data/train_set.csv'
train_set = pd.read_csv(train_set_path)

# Display the first few rows of the training set to understand its structure
train_set.head()


Unnamed: 0,town,flat_model,floor_area_sqm,lease_commence_date,latitude,longitude,subzone,planning_area,region,monthly_rent,...,mall_nearest_distance,mall_nearest,mall_count_within_1km,mrt_planned_nearest_distance,mrt_planned_nearest,mrt_planned_count_within_1km,flat_type_ordinal,distance_to_centroid_marina_bay,lease_duration,school_score
0,ang mo kio,new generation,67.0,1979,1.3666,103.855579,chong boon,ang mo kio,north-east region,2450,...,1036.0,Jubilee Square,0,765.0,Ang Mo Kio,1,2,8.879467,44,336.6
1,choa chu kang,improved,110.0,2003,1.401158,103.751472,yew tee,choa chu kang,west region,2950,...,598.0,Yew Tee Square,2,609.0,Yew Tee,1,4,17.988626,20,159.2
2,clementi,new generation,67.0,1978,1.316994,103.768507,clementi north,clementi,west region,1950,...,247.0,321 Clementi,3,424.0,Commonwealth,2,2,11.391051,45,330.5
3,bukit batok,model a,74.0,1984,1.347162,103.742808,bukit batok west,bukit batok,west region,1800,...,788.0,West Mall,1,775.0,Bukit Batok,1,2,15.280442,39,275.9
4,sembawang,model a,93.0,2002,1.450165,103.819307,sembawang central,sembawang,north region,2200,...,220.0,Sun Plaza,1,157.0,Sembawang,1,3,18.83574,21,319.5


In [4]:
import seaborn as sns
import matplotlib.pyplot as plt

# Correlation matrix
correlation_matrix = train_set.corr()

# # Plotting the heatmap of the correlation matrix
# plt.figure(figsize=(12, 8))
# sns.heatmap(correlation_matrix, annot=False, cmap='coolwarm')
# plt.title('Correlation Heatmap')
# plt.show()

# Showing the correlation of features with the target variable 'monthly_rent'
correlation_with_target = correlation_matrix['monthly_rent'].sort_values(ascending=False)
correlation_with_target

monthly_rent                       1.000000
coe_price                          0.523555
rent_approval_year                 0.504592
Year                               0.504592
flat_type_ordinal                  0.325429
floor_area_sqm                     0.310135
Stock_Price                        0.298779
lease_commence_date                0.229695
mrt_planned_count_within_1km       0.086577
mrt_count_within_1km               0.086457
mall_count_within_1km              0.062640
mrt_planned_nearest                0.039457
mrt_nearest                        0.035967
rent_approval_month                0.033383
Month                              0.033383
subzone                            0.007981
planning_area                      0.004862
longitude                         -0.000876
town                              -0.002161
mall_nearest                      -0.024042
school_score                      -0.039169
mrt_nearest_distance              -0.063785
mrt_planned_nearest_distance    

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Preprocessing: Encoding categorical features
categorical_features = train_set.select_dtypes(include=['object']).columns
label_encoders = {col: LabelEncoder() for col in categorical_features}

for col in categorical_features:
    train_set[col] = label_encoders[col].fit_transform(train_set[col])

# Feature Selection: Dropping features with very low correlation with the target
# We keep some features with moderate correlation and all categorical features
features_to_drop = correlation_with_target[abs(correlation_with_target) < 0.05].index
train_set_reduced = train_set.drop(features_to_drop, axis=1)

# Splitting the data into training and validation sets
X = train_set_reduced.drop('monthly_rent', axis=1)
y = train_set_reduced['monthly_rent']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Training a Random Forest Regressor
rf_regressor = RandomForestRegressor(n_estimators=100, random_state=42)
rf_regressor.fit(X_train, y_train)

# Predicting on the validation set
y_pred = rf_regressor.predict(X_val)

# Evaluating the model
mae = mean_absolute_error(y_val, y_pred)
mse = mean_squared_error(y_val, y_pred)
r2 = r2_score(y_val, y_pred)

mae, mse, r2


(390.4274266323954, 269936.60460674297, 0.4700704576062311)

In [6]:
from sklearn.preprocessing import PolynomialFeatures

# Feature Engineering: Experimenting with Polynomial Features
poly = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly.fit_transform(X)

# Splitting the polynomial features into training and validation sets
X_train_poly, X_val_poly, y_train_poly, y_val_poly = train_test_split(X_poly, y, test_size=0.2, random_state=42)

# Training a Random Forest Regressor with Polynomial Features
rf_regressor_poly = RandomForestRegressor(n_estimators=100, random_state=42)
rf_regressor_poly.fit(X_train_poly, y_train_poly)

# Predicting on the validation set with Polynomial Features
y_pred_poly = rf_regressor_poly.predict(X_val_poly)

# Evaluating the model with Polynomial Features
mae_poly = mean_absolute_error(y_val_poly, y_pred_poly)
mse_poly = mean_squared_error(y_val_poly, y_pred_poly)
r2_poly = r2_score(y_val_poly, y_pred_poly)

mae_poly, mse_poly, r2_poly


KeyboardInterrupt: 

In [1]:
import pandas as pd
from tpot import TPOTRegressor
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Load your datasets
train_set = pd.read_csv('./Datasets/final-data/train_set.csv')
test_set = pd.read_csv('./Datasets/final-data/test_set.csv')

# Assume 'X' contains the features and 'y' is the target variable in your datasets
X_train = train_set.drop('monthly_rent', axis=1)
y_train = train_set['monthly_rent']
X_test = test_set.drop('monthly_rent', axis=1)
y_test = test_set['monthly_rent']

# Drop the specified columns
columns_to_drop = ['town', 'region', 'mrt_nearest', 'mall_nearest', 'mrt_planned_nearest']
X_train = X_train.drop(columns_to_drop, axis=1)
X_test = X_test.drop(columns_to_drop, axis=1)

# Identify the remaining categorical columns
remaining_categorical_columns = ['flat_model', 'subzone', 'planning_area']

# Apply one-hot encoding to 'flat_model' and 'planning_area'
one_hot_columns = ['flat_model', 'planning_area']

# Reinitialize the encoders
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
label_encoder = LabelEncoder()

# Apply one-hot encoding to 'flat_model' and 'planning_area'
X_train_one_hot = encoder.fit_transform(X_train[one_hot_columns])
X_test_one_hot = encoder.transform(X_test[one_hot_columns])
X_train_one_hot = pd.DataFrame(X_train_one_hot, columns=encoder.get_feature_names_out(one_hot_columns))
X_test_one_hot = pd.DataFrame(X_test_one_hot, columns=encoder.get_feature_names_out(one_hot_columns))

# Apply label encoding to 'subzone'
X_train['subzone'] = label_encoder.fit_transform(X_train['subzone'])
X_test['subzone'] = label_encoder.transform(X_test['subzone'])

# Combine the processed data
X_train = X_train.drop(one_hot_columns, axis=1)
X_test = X_test.drop(one_hot_columns, axis=1)
X_train = pd.concat([X_train, X_train_one_hot], axis=1)
X_test = pd.concat([X_test, X_test_one_hot], axis=1)

# Align the training and test sets
X_train, X_test = X_train.align(X_test, join='left', axis=1, fill_value=0)

X_train.shape, X_test.shape

((48000, 69), (12000, 69))

In [2]:
# Create a TPOT regressor
tpot = TPOTRegressor(
    generations=5,  # Number of iterations to the run pipeline optimization process
    population_size=20,  # Number of individuals to retain in the genetic programming population every generation
    verbosity=2,  # Show progress
    random_state=42,  # Seed for reproducibility
    n_jobs=-1,  # Use all available cores
)

# Run the TPOT optimization
tpot.fit(X_train, y_train)

# Score on the test set
print("Test Score: ", tpot.score(X_test, y_test))

# Export the best pipeline as a Python script
tpot.export('best_pipeline.py')


Optimization Progress:   0%|          | 0/120 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: -240012.45349061466

Generation 2 - Current best internal CV score: -240012.45349061466

Generation 3 - Current best internal CV score: -239978.23408832456

Generation 4 - Current best internal CV score: -238751.41967053938

Generation 5 - Current best internal CV score: -238253.8070183262

Best pipeline: ExtraTreesRegressor(RidgeCV(input_matrix), bootstrap=False, max_features=0.4, min_samples_leaf=17, min_samples_split=10, n_estimators=100)
Test Score:  -236929.47970789435




In [3]:
from math import sqrt

# Create a TPOT regressor
tpot = TPOTRegressor(
    generations=5,  # Number of iterations to the run pipeline optimization process
    population_size=30,  # Number of individuals to retain in the genetic programming population every generation
    verbosity=2,  # Show progress
    random_state=42,  # Seed for reproducibility
    n_jobs=-1,  # Use all available cores
    scoring='neg_mean_squared_error',  # Use negative mean squared error for optimization
    warm_start=True  # Reuse the best model from the previous run
)

# Run the TPOT optimization
tpot.fit(X_train, y_train)

# Score on the test set using RMSE
neg_mse = tpot.score(X_test, y_test)
rmse = sqrt(-neg_mse)
print("Test RMSE: ", rmse)

# Export the best pipeline as a Python script
tpot.export('best_pipeline.py')

Optimization Progress:   0%|          | 0/180 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: -240012.45349061466

Generation 2 - Current best internal CV score: -240012.45349061466

Generation 3 - Current best internal CV score: -239736.105788069

Generation 4 - Current best internal CV score: -238388.2665978133

Generation 5 - Current best internal CV score: -238007.00834292313

Best pipeline: RandomForestRegressor(ElasticNetCV(StandardScaler(input_matrix), l1_ratio=0.75, tol=0.01), bootstrap=True, max_features=0.5, min_samples_leaf=17, min_samples_split=14, n_estimators=100)
Test RMSE:  487.8109410530799


In [4]:
from math import sqrt

# Create a TPOT regressor
tpot = TPOTRegressor(
    generations=10,  # Number of iterations to the run pipeline optimization process
    population_size=20,  # Number of individuals to retain in the genetic programming population every generation
    verbosity=2,  # Show progress
    random_state=21,  # Seed for reproducibility
    n_jobs=-1,  # Use all available cores
    scoring='neg_mean_squared_error',  # Use negative mean squared error for optimization
    warm_start=True  # Reuse the best model from the previous run
)

# Run the TPOT optimization
tpot.fit(X_train, y_train)

# Score on the test set using RMSE
neg_mse = tpot.score(X_test, y_test)
rmse = sqrt(-neg_mse)
print("Test RMSE: ", rmse)

# Export the best pipeline as a Python script
tpot.export('best_pipeline.py')

Optimization Progress:   0%|          | 0/220 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: -241887.7362343186

Generation 2 - Current best internal CV score: -241887.7362343186

Generation 3 - Current best internal CV score: -240823.10553396968

Generation 4 - Current best internal CV score: -240365.91596491076

Generation 5 - Current best internal CV score: -240365.91596491076

Generation 6 - Current best internal CV score: -240107.4151266945

Generation 7 - Current best internal CV score: -240107.4151266945

Generation 8 - Current best internal CV score: -240023.23758409685

Generation 9 - Current best internal CV score: -240023.23758409685

Generation 10 - Current best internal CV score: -237891.94109066128

Best pipeline: ExtraTreesRegressor(RidgeCV(input_matrix), bootstrap=True, max_features=1.0, min_samples_leaf=20, min_samples_split=20, n_estimators=100)
Test RMSE:  487.0264432087495




In [6]:
from math import sqrt

# Create a TPOT regressor
tpot = TPOTRegressor(
    generations=20,  # Number of iterations to the run pipeline optimization process
    population_size=30,  # Number of individuals to retain in the genetic programming population every generation
    verbosity=2,  # Show progress
    random_state=42,  # Seed for reproducibility
    n_jobs=-1,  # Use all available cores
    scoring='neg_mean_squared_error',  # Use negative mean squared error for optimization
    warm_start=True  # Reuse the best model from the previous run
)

# Run the TPOT optimization
tpot.fit(X_train, y_train)

# Score on the test set using RMSE
neg_mse = tpot.score(X_test, y_test)
rmse = sqrt(-neg_mse)
print("Test RMSE: ", rmse)

# Export the best pipeline as a Python script
tpot.export('best_pipeline.py')

Optimization Progress:   0%|          | 0/630 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: -240012.45349061466

Generation 2 - Current best internal CV score: -240012.45349061466

Generation 3 - Current best internal CV score: -240012.45349061466

Generation 4 - Current best internal CV score: -239909.02814533917

Generation 5 - Current best internal CV score: -238253.8070183262

Generation 6 - Current best internal CV score: -238253.8070183262

Generation 7 - Current best internal CV score: -238253.8070183262

Generation 8 - Current best internal CV score: -238197.862959142

Generation 9 - Current best internal CV score: -238197.862959142

Generation 10 - Current best internal CV score: -238197.862959142

Generation 11 - Current best internal CV score: -238098.93263668864

Generation 12 - Current best internal CV score: -237737.3505477355

Generation 13 - Current best internal CV score: -237728.15459490352

Generation 14 - Current best internal CV score: -237728.15459490352

Generation 15 - Current best internal CV score: -237

  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)


Test RMSE:  484.13895153749746


In [8]:
from math import sqrt

config_dict = {
    'xgboost.XGBRegressor': {
        'n_estimators': [5, 50, 100, 250, 300, 500],
        'max_depth': [1, 3, 5, 7, 9],
        'learning_rate': [0.01, 0.02, 0.05, 0.1, 0.2, 0.5, 1, 10],
        'subsample': [0.6, 0.7, 0.8, 0.9, 1.0],
        'min_child_weight': [1, 2, 3, 4, 5],
        'n_jobs': [1]  # TPOT is already parallelized, so set n_jobs to 1 for XGBoost
    }
}

# Create a TPOT regressor
tpot = TPOTRegressor(
    generations=5,  # Number of iterations to the run pipeline optimization process
    population_size=10,  # Number of individuals to retain in the genetic programming population every generation
    verbosity=2,  # Show progress
    random_state=42,  # Seed for reproducibility
    n_jobs=-1,  # Use all available cores
    scoring='neg_mean_squared_error',  # Use negative mean squared error for optimization
    config_dict=config_dict,  # Custom configuration
    warm_start=True  # Reuse the best model from the previous run
)

# Run the TPOT optimization
tpot.fit(X_train, y_train)

# Score on the test set using RMSE
neg_mse = tpot.score(X_test, y_test)
rmse = sqrt(-neg_mse)
print("Test RMSE: ", rmse)

# Export the best pipeline as a Python script
tpot.export('best_pipeline.py')

Optimization Progress:   0%|          | 0/60 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: -234589.1031424426

Generation 2 - Current best internal CV score: -234589.1031424426

Generation 3 - Current best internal CV score: -234589.1031424426

Generation 4 - Current best internal CV score: -234584.95256375504

Generation 5 - Current best internal CV score: -234584.95256375504

Best pipeline: XGBRegressor(input_matrix, learning_rate=0.02, max_depth=5, min_child_weight=5, n_estimators=500, n_jobs=1, subsample=1.0)
Test RMSE:  483.54129762004953


  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)


In [10]:
from math import sqrt

config_dict = {
    'xgboost.XGBRegressor': {
        'n_estimators': [400, 500, 600, 700, 1000],
        'max_depth': [4, 5, 6, 9],
        'learning_rate': [0.01, 0.015, 0.02, 0.025, 0.05, 0.1, 0.5, 1],
        'subsample': [0.8, 0.9, 1.0, 1.1, 1.2, 1.3,1.5],
        'min_child_weight': [4, 5, 6, 7, 8],
        'n_jobs': [1]  # TPOT is already parallelized, so set n_jobs to 1 for XGBoost
    }
}

# Create a TPOT regressor
tpot = TPOTRegressor(
    generations=10,  # Number of iterations to the run pipeline optimization process
    population_size=20,  # Number of individuals to retain in the genetic programming population every generation
    verbosity=2,  # Show progress
    random_state=42,  # Seed for reproducibility
    n_jobs=-1,  # Use all available cores
    scoring='neg_mean_squared_error',  # Use negative mean squared error for optimization
    config_dict=config_dict,  # Custom configuration
    warm_start=True  # Reuse the best model from the previous run
)

# Run the TPOT optimization
tpot.fit(X_train, y_train)

# Score on the test set using RMSE
neg_mse = tpot.score(X_test, y_test)
rmse = sqrt(-neg_mse)
print("Test RMSE: ", rmse)

# Export the best pipeline as a Python script
tpot.export('best_pipeline.py')

Optimization Progress:   0%|          | 0/220 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: -233827.96483994595

Generation 2 - Current best internal CV score: -233827.96483994595

Generation 3 - Current best internal CV score: -233827.96483994595

Generation 4 - Current best internal CV score: -233688.91533545233

Generation 5 - Current best internal CV score: -233688.91533545233

Generation 6 - Current best internal CV score: -233688.91533545233

Generation 7 - Current best internal CV score: -233688.91533545233

Generation 8 - Current best internal CV score: -233688.91533545233

Generation 9 - Current best internal CV score: -233688.91533545233

Generation 10 - Current best internal CV score: -233688.91533545233

Best pipeline: XGBRegressor(input_matrix, learning_rate=0.025, max_depth=5, min_child_weight=8, n_estimators=600, n_jobs=1, subsample=0.8)
Test RMSE:  482.99408559647253


  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)


In [13]:
# Re-import necessary libraries and re-save the file after code execution state reset
import pandas as pd
import numpy as np

# Re-load the original dataset
train_file_path = ['./Datasets/final-data/train_set.csv', './Datasets/final-data/test_set.csv']
output_file_path = ['./Datasets/final-data/enhanced_train_set.csv', './Datasets/final-data/enhanced_test_set.csv']
for idx, path in enumerate(train_file_path):
    train_data = pd.read_csv(train_file_path[idx])
    
    # Reapply the transformations and feature engineering
    # Logarithmic transformation of 'monthly_rent'
    train_data['log_monthly_rent'] = np.log(train_data['monthly_rent'])
    
    # Temporal features
    train_data['lease_age_at_rent_approval'] = train_data['rent_approval_year'] - train_data['lease_commence_date']

    # Pad the 'Month' column with leading zeros to ensure two-digit format
    train_data['Month'] = train_data['Month'].apply(lambda x: f'{x:02d}')
    
    # Combine 'Year' and 'Month' into a single string and then convert to integer
    train_data['year_month'] = (train_data['Year'].astype(str) + train_data['Month']).astype(int)
    
    # Geographical clustering
    from sklearn.cluster import KMeans
    geo_features = train_data[['latitude', 'longitude']]
    kmeans = KMeans(n_clusters=5, random_state=0).fit(geo_features)
    train_data['geo_cluster'] = kmeans.labels_
    
    # Saving the enhanced dataset to a CSV file
    enhanced_dataset_csv_path = output_file_path[idx]
    train_data.to_csv(enhanced_dataset_csv_path, index=False)

enhanced_dataset_csv_path


  super()._check_params_vs_input(X, default_n_init=10)


PermissionError: [Errno 13] Permission denied: './Datasets/final-data/enhanced_train_set.csv'

In [14]:
import pandas as pd
from tpot import TPOTRegressor
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Load your datasets
train_set = pd.read_csv('./Datasets/final-data/enhanced_train_set.csv')
test_set = pd.read_csv('./Datasets/final-data/enhanced_test_set.csv')

# Assume 'X' contains the features and 'y' is the target variable in your datasets
X_train = train_set.drop('log_monthly_rent', axis=1)
y_train = train_set['log_monthly_rent']
X_test = test_set.drop('log_monthly_rent', axis=1)
y_test = test_set['log_monthly_rent']

# Drop the specified columns
columns_to_drop = ['town', 'region', 'mrt_nearest', 'mall_nearest', 'mrt_planned_nearest', 'Year', 'Month', 'monthly_rent']
X_train = X_train.drop(columns_to_drop, axis=1)
X_test = X_test.drop(columns_to_drop, axis=1)

# Identify the remaining categorical columns
remaining_categorical_columns = ['flat_model', 'subzone', 'planning_area']

# Apply one-hot encoding to 'flat_model' and 'planning_area'
one_hot_columns = ['flat_model', 'planning_area']

# Reinitialize the encoders
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
label_encoder = LabelEncoder()

# Apply one-hot encoding to 'flat_model' and 'planning_area'
X_train_one_hot = encoder.fit_transform(X_train[one_hot_columns])
X_test_one_hot = encoder.transform(X_test[one_hot_columns])
X_train_one_hot = pd.DataFrame(X_train_one_hot, columns=encoder.get_feature_names_out(one_hot_columns))
X_test_one_hot = pd.DataFrame(X_test_one_hot, columns=encoder.get_feature_names_out(one_hot_columns))

# Apply label encoding to 'subzone'
X_train['subzone'] = label_encoder.fit_transform(X_train['subzone'])
X_test['subzone'] = label_encoder.transform(X_test['subzone'])

# Combine the processed data
X_train = X_train.drop(one_hot_columns, axis=1)
X_test = X_test.drop(one_hot_columns, axis=1)
X_train = pd.concat([X_train, X_train_one_hot], axis=1)
X_test = pd.concat([X_test, X_test_one_hot], axis=1)

# Align the training and test sets
X_train, X_test = X_train.align(X_test, join='left', axis=1, fill_value=0)

X_train.shape, X_test.shape

((48000, 70), (12000, 70))

In [15]:
config_dict = {
    'xgboost.XGBRegressor': {
        'n_estimators': [400, 500, 600, 700, 1000],
        'max_depth': [4, 5, 6, 9],
        'learning_rate': [0.01, 0.015, 0.02, 0.025, 0.05, 0.1, 0.5, 1],
        'subsample': [0.8, 0.9, 1.0, 1.1, 1.2, 1.3,1.5],
        'min_child_weight': [4, 5, 6, 7, 8],
        'n_jobs': [1],  # TPOT is already parallelized, so set n_jobs to 1 for XGBoost
    }
}

# Create a TPOT regressor
tpot = TPOTRegressor(
    generations=10,  # Number of iterations to the run pipeline optimization process
    population_size=20,  # Number of individuals to retain in the genetic programming population every generation
    verbosity=2,  # Show progress
    random_state=42,  # Seed for reproducibility
    n_jobs=-1,  # Use all available cores
    scoring='neg_mean_squared_error',  # Use negative mean squared error for optimization
    config_dict=config_dict,  # Custom configuration
    warm_start=True  # Reuse the best model from the previous run
)

# Run the TPOT optimization
tpot.fit(X_train, y_train)

Optimization Progress:   0%|          | 0/220 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: -0.035066762105337296

Generation 2 - Current best internal CV score: -0.035066762105337296

Generation 3 - Current best internal CV score: -0.035066762105337296

Generation 4 - Current best internal CV score: -0.035066762105337296

Generation 5 - Current best internal CV score: -0.03504479889238586

Generation 6 - Current best internal CV score: -0.03501728804707829

Generation 7 - Current best internal CV score: -0.03501728804707829

Generation 8 - Current best internal CV score: -0.03501728804707829

Generation 9 - Current best internal CV score: -0.03501728804707829

Generation 10 - Current best internal CV score: -0.03501728804707829

Best pipeline: XGBRegressor(input_matrix, learning_rate=0.05, max_depth=4, min_child_weight=4, n_estimators=500, n_jobs=1, subsample=0.8)


  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)


NameError: name 'mean_squared_error' is not defined

In [18]:
best_model = tpot.fitted_pipeline_
best_model

In [16]:
from math import sqrt, exp
from sklearn.metrics import mean_squared_error

# Extract the best fitted pipeline from TPOT
best_model = tpot.fitted_pipeline_

# Make predictions on the test data
predictions_log_scale = best_model.predict(X_test)

# Convert predictions back to the original scale
predictions = np.exp(predictions_log_scale)

# Ensure y_test is on the original scale (monthly_rent, not log_monthly_rent)
# Calculate RMSE on the original scale of monthly rent
rmse = sqrt(mean_squared_error(y_test, predictions))
print("Test RMSE: ", rmse)

# Export the best pipeline as a Python script
tpot.export('best_pipeline_1.py')


Test RMSE:  2585.2487913373884


  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_categorical_dtype(dtype)
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)


In [19]:
# Create a TPOT regressor
tpot = TPOTRegressor(
    generations=10,  # Number of iterations to the run pipeline optimization process
    population_size=20,  # Number of individuals to retain in the genetic programming population every generation
    verbosity=2,  # Show progress
    random_state=42,  # Seed for reproducibility
    n_jobs=-1,  # Use all available cores
    scoring='neg_mean_squared_error',  # Use negative mean squared error for optimization
    warm_start=True  # Reuse the best model from the previous run
)

# Run the TPOT optimization
tpot.fit(X_train, y_train)

# Extract the best fitted pipeline from TPOT
best_model = tpot.fitted_pipeline_

# Make predictions on the test data
predictions_log_scale = best_model.predict(X_test)

# Convert predictions back to the original scale
predictions = np.exp(predictions_log_scale)

# Ensure y_test is on the original scale (monthly_rent, not log_monthly_rent)
# Calculate RMSE on the original scale of monthly rent
y_test_original_scale = np.exp(y_test)
rmse = sqrt(mean_squared_error(y_test_original_scale, predictions))
print("Test RMSE: ", rmse)

# Export the best pipeline as a Python script
tpot.export('best_pipeline_1.py')

Optimization Progress:   0%|          | 0/220 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: -0.03595816267649651

Generation 2 - Current best internal CV score: -0.03595816267649651

Generation 3 - Current best internal CV score: -0.03580272778775873

Generation 4 - Current best internal CV score: -0.03576255521206333

Generation 5 - Current best internal CV score: -0.03576255521206333

Generation 6 - Current best internal CV score: -0.03576255521206333

Generation 7 - Current best internal CV score: -0.03571233439338654

Generation 8 - Current best internal CV score: -0.03571233439338654

Generation 9 - Current best internal CV score: -0.03567305049940886

Generation 10 - Current best internal CV score: -0.03567305049940886

Best pipeline: ExtraTreesRegressor(ElasticNetCV(RobustScaler(input_matrix), l1_ratio=0.75, tol=0.01), bootstrap=True, max_features=0.7000000000000001, min_samples_leaf=18, min_samples_split=6, n_estimators=100)
Test RMSE:  2581.64943442527


In [20]:

# Ensure y_test is on the original scale (monthly_rent, not log_monthly_rent)
# Calculate RMSE on the original scale of monthly rent
y_test_original_scale = np.exp(y_test)
rmse = sqrt(mean_squared_error(y_test_original_scale, predictions))
print("Test RMSE: ", rmse)

# Export the best pipeline as a Python script
tpot.export('best_pipeline_1.py')

Test RMSE:  489.892912764877
