In [1]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [2]:
import warnings
warnings.filterwarnings("ignore", category=UserWarning, module="mlflow")

In [3]:
import mlflow

# mlflow.set_tracking_uri("http://localhost:8090")
mlflow.set_tracking_uri("postgresql+psycopg2://airflow:airflow@localhost:5432/mlflow")
# mlflow.set_tracking_uri("sqlite:///mlflow.db") # Local Database
# mlflow.set_experiment("rightmove-rent-prediction")
# mlflow.create_experiment(name="rightmove-rent-prediction", artifact_location="gs://rightmove-ml-artifacts/artifacts")
experiment_name = "rightmove-rent-prediction"
mlflow.set_experiment(experiment_name)


<Experiment: artifact_location='gs://rightmove-ml-artifacts/artifacts', creation_time=1707837412102, experiment_id='2', last_update_time=1707837412102, lifecycle_stage='active', name='rightmove-rent-prediction', tags={}>

# Load Data

In [4]:
from mlflow.data.pandas_dataset import PandasDataset
train_dataset_source_url = "data/train.csv"
val_dataset_source_url = "data/val.csv"

train_df = pd.read_csv(train_dataset_source_url, index_col=0)
val_df = pd.read_csv(val_dataset_source_url, index_col=0)

In [5]:
train_df.head()

Unnamed: 0,bedrooms,bathrooms,price,commercial,development,students,longitude,latitude,text,walk_score,restaurants,shopping,coffee,banks,parks,schools,books,entertainment,grocery
38830,3,1.0,24000.0,0,0,0,-0.056192,51.667706,"THREE BEDROOM HOUSE, TWO RECEPTION ROOMS, FITT...",15.133139,0.3179574,0.0,5.457925e-08,2.781163e-09,0.992856,0.823671,2.903329e-07,6.413508e-45,0.134351
55467,1,1.0,19800.0,0,0,0,-0.110176,51.568771,", Hunters Stanmore are proud to market this o...",85.781982,2.999974,0.8963245,1.999985,0.0,0.999051,0.991928,0.9999584,0.973646,3.0
39597,3,1.0,10500.0,0,0,0,-1.42608,53.68705,"Well Presented, Semi Detached, Unfurnished, Th...",17.884093,1.087511e-08,1.955617e-14,3.639962e-09,7.58971e-09,0.06292,0.287689,2.679829e-07,0.0,2.330664
58613,2,1.0,11400.0,0,0,0,-4.25954,55.820202,"Mod Ground Floor Flat, 2 Bedrooms, Unfurnished...",64.970217,1.795519,1.638024e-05,1.809908,0.6343372,0.991111,0.91047,0.5993718,0.0,2.999929
25056,3,,48000.0,0,0,0,-0.736478,51.425994,"Garden, Cooking basics, Dedicated workspace, W...",41.593268,3.87029e-06,1.016736e-19,1.092181,1.785391e-15,0.895333,0.999893,0.8105341,1.367258e-30,2.437928


In [6]:
train_df = train_df.dropna()
val_df = val_df.dropna()

In [7]:
def create_dataset(train_df, val_df, features):
    X_train = train_df[features]
    y_train = train_df[['price']]
    
    X_val = val_df[features]
    y_val = val_df[['price']]

    train_dataset: PandasDataset = mlflow.data.from_pandas(train_df[features], source=train_dataset_source_url)
    val_dataset: PandasDataset = mlflow.data.from_pandas(val_df[features], source=val_dataset_source_url)

    return X_train, y_train, X_val, y_val, train_dataset, val_dataset

# Linear Regression

## Numerical features

In [8]:
numerical_features = ['bedrooms', 'bathrooms', 'longitude', 'latitude']

In [9]:
X_train, y_train, X_val, y_val, train_dataset, val_dataset = create_dataset(train_df, val_df, numerical_features)

  string_columns = trimmed_df.columns[(df.applymap(type) == str).all(0)]
  string_columns = trimmed_df.columns[(df.applymap(type) == str).all(0)]


In [10]:
with mlflow.start_run():
    # Log the dataset to the MLflow Run. Specify the "training" context to indicate that the
    # dataset is used for model training
    mlflow.log_input(train_dataset, context="training")
    mlflow.log_input(val_dataset, context="validation")
    
    mlflow.set_tag("developer", "Alex") # Addind metadata
    
    mlflow.log_param("Model type", "Linear Regression")
    mlflow.log_param("Included Data", numerical_features)
    
    model = LinearRegression()
    model.fit(X_train, y_train)
    
    y_pred = model.predict(X_val)
    
    rmse = mean_squared_error(y_val, y_pred, squared=False)
    
    mlflow.log_metric("rmse", rmse)

    mlflow.sklearn.log_model(model, artifact_path="linear-regression")



## Walk Score

In [16]:
numerical_features_walk_score = ['bedrooms', 'bathrooms', 'longitude', 'latitude', 'walk_score']

In [17]:
X_train, y_train, X_val, y_val, train_dataset, val_dataset = create_dataset(train_df, val_df, numerical_features_walk_score)

  string_columns = trimmed_df.columns[(df.applymap(type) == str).all(0)]
  string_columns = trimmed_df.columns[(df.applymap(type) == str).all(0)]


In [18]:
with mlflow.start_run():
    # Log the dataset to the MLflow Run. Specify the "training" context to indicate that the
    # dataset is used for model training
    mlflow.log_input(train_dataset, context="training")
    mlflow.log_input(val_dataset, context="validation")
    
    mlflow.set_tag("developer", "Alex") # Addind metadata
    
    mlflow.log_param("Model type", "Linear Regression")
    mlflow.log_param("Included Data", numerical_features_walk_score)
    
    model = LinearRegression()
    model.fit(X_train, y_train)
    
    y_pred = model.predict(X_val)
    
    rmse = mean_squared_error(y_val, y_pred, squared=False)
    
    mlflow.log_metric("rmse", rmse)

    mlflow.sklearn.log_model(model, artifact_path="linear-regression-walkscore")



# Random Forest

In [21]:
from sklearn.ensemble import RandomForestRegressor

## Numerical

In [22]:
X_train, y_train, X_val, y_val, train_dataset, val_dataset = create_dataset(train_df, val_df, numerical_features)

  string_columns = trimmed_df.columns[(df.applymap(type) == str).all(0)]
  string_columns = trimmed_df.columns[(df.applymap(type) == str).all(0)]


In [23]:
with mlflow.start_run():
    # Log the dataset to the MLflow Run. Specify the "training" context to indicate that the
    # dataset is used for model training
    mlflow.log_input(train_dataset, context="training")
    mlflow.log_input(val_dataset, context="validation")
    
    mlflow.set_tag("developer", "Alex") # Addind metadata
    
    mlflow.log_param("Model type", "Random Forest")
    mlflow.log_param("Included Data", numerical_features)
    
    model = RandomForestRegressor()
    model.fit(X_train, y_train)
    
    y_pred = model.predict(X_val)
    
    rmse = mean_squared_error(y_val, y_pred, squared=False)
    
    mlflow.log_metric("rmse", rmse)

    mlflow.sklearn.log_model(model, artifact_path="random-forest")

  return fit_method(estimator, *args, **kwargs)


## Walk score

In [None]:
X_train, y_train, X_val, y_val, train_dataset, val_dataset = create_dataset(train_df, val_df, numerical_features_walk_score)

with mlflow.start_run():
    # Log the dataset to the MLflow Run. Specify the "training" context to indicate that the
    # dataset is used for model training
    mlflow.log_input(train_dataset, context="training")
    mlflow.log_input(val_dataset, context="validation")
    
    mlflow.set_tag("developer", "Alex") # Addind metadata
    
    mlflow.log_param("Model type", "Random Forest")
    mlflow.log_param("Included Data", numerical_features_walk_score)
    
    model = RandomForestRegressor()
    model.fit(X_train, y_train)
    
    y_pred = model.predict(X_val)
    
    rmse = mean_squared_error(y_val, y_pred, squared=False)
    
    mlflow.log_metric("rmse", rmse)

    mlflow.sklearn.log_model(model, artifact_path="random-forest-walkscore")

# XGBoost

In [28]:
import xgboost as xgb

In [29]:
X_train, y_train, X_val, y_val, train_dataset, val_dataset = create_dataset(train_df, val_df, numerical_features)

  string_columns = trimmed_df.columns[(df.applymap(type) == str).all(0)]
  string_columns = trimmed_df.columns[(df.applymap(type) == str).all(0)]


In [31]:
with mlflow.start_run():
    # Log the dataset to the MLflow Run. Specify the "training" context to indicate that the
    # dataset is used for model training
    mlflow.log_input(train_dataset, context="training")
    mlflow.log_input(val_dataset, context="validation")
    
    mlflow.set_tag("developer", "Alex") # Addind metadata
    
    mlflow.log_param("Model type", "XGBoost")
    mlflow.log_param("Included Data", numerical_features)
    
    model = xgb.XGBRegressor()
    model.fit(X_train, y_train)
    
    y_pred = model.predict(X_val)
    
    rmse = mean_squared_error(y_val, y_pred, squared=False)
    
    mlflow.log_metric("rmse", rmse)
    
    mlflow.xgboost.log_model(model, artifact_path="XGBoost")



## Walk score

In [33]:
X_train, y_train, X_val, y_val, train_dataset, val_dataset = create_dataset(train_df, val_df, numerical_features_walk_score)

  string_columns = trimmed_df.columns[(df.applymap(type) == str).all(0)]
  string_columns = trimmed_df.columns[(df.applymap(type) == str).all(0)]


In [34]:
with mlflow.start_run():
    # Log the dataset to the MLflow Run. Specify the "training" context to indicate that the
    # dataset is used for model training
    mlflow.log_input(train_dataset, context="training")
    mlflow.log_input(val_dataset, context="validation")
    
    mlflow.set_tag("developer", "Alex") # Addind metadata
    
    mlflow.log_param("Model type", "XGBoost")
    mlflow.log_param("Included Data", numerical_features_walk_score)
    
    model = xgb.XGBRegressor()
    model.fit(X_train, y_train)
    
    y_pred = model.predict(X_val)
    
    rmse = mean_squared_error(y_val, y_pred, squared=False)
    
    mlflow.log_metric("rmse", rmse)

    mlflow.xgboost.log_model(model, artifact_path="XGBoost-walkscore")



# Preprocessing

In [38]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

In [None]:

X_train, y_train, X_val, y_val, train_dataset, val_dataset = create_dataset(train_df, val_df, numerical_features_walk_score)

with mlflow.start_run():
    # Log the dataset to the MLflow Run. Specify the "training" context to indicate that the
    # dataset is used for model training
    mlflow.log_input(train_dataset, context="training")
    mlflow.log_input(val_dataset, context="validation")
    
    mlflow.set_tag("developer", "Alex") # Addind metadata
    
    mlflow.log_param("Model type", "Random Forest")
    mlflow.log_param("Included Data", numerical_features_walk_score)
    mlflow.log_param("Standardized", True)

    pipeline_steps = [
    ('standardize', StandardScaler()),  # Standardize features
    ('regressor', RandomForestRegressor())  # Replace with any model, e.g., XGBClassifier()
    ]
    
    model = Pipeline(steps=pipeline_steps)
    model.fit(X_train, y_train)
    
    y_pred = model.predict(X_val)
    
    rmse = mean_squared_error(y_val, y_pred, squared=False)
    
    mlflow.log_metric("rmse", rmse)

    mlflow.sklearn.log_model(model, artifact_path="random-forest-walkscore-pipeline")

## XGBoost preprocessed

In [41]:
with mlflow.start_run():
    # Log the dataset to the MLflow Run. Specify the "training" context to indicate that the
    # dataset is used for model training
    mlflow.log_input(train_dataset, context="training")
    mlflow.log_input(val_dataset, context="validation")
    
    mlflow.set_tag("developer", "Alex") # Addind metadata
    
    mlflow.log_param("Model type", "XGBoost")
    mlflow.log_param("Included Data", numerical_features)
    mlflow.log_param("Standardized", True)
    
    pipeline_steps = [
    ('standardize', StandardScaler()),  # Standardize features
    ('regressor', xgb.XGBRegressor())  # Replace with any model, e.g., XGBClassifier()
    ]
    
    model = Pipeline(steps=pipeline_steps)

    model.fit(X_train, y_train)
    
    y_pred = model.predict(X_val)
    
    rmse = mean_squared_error(y_val, y_pred, squared=False)
    
    mlflow.log_metric("rmse", rmse)
    
    mlflow.sklearn.log_model(model, artifact_path="XGBoost-pipeline")



In [42]:
X_train, y_train, X_val, y_val, train_dataset, val_dataset = create_dataset(train_df, val_df, numerical_features_walk_score)

with mlflow.start_run():
    # Log the dataset to the MLflow Run. Specify the "training" context to indicate that the
    # dataset is used for model training
    mlflow.log_input(train_dataset, context="training")
    mlflow.log_input(val_dataset, context="validation")
    
    mlflow.set_tag("developer", "Alex") # Addind metadata
    
    mlflow.log_param("Model type", "XGBoost")
    mlflow.log_param("Included Data", numerical_features_walk_score)
    mlflow.log_param("Standardized", True)
    
    pipeline_steps = [
    ('standardize', StandardScaler()),  # Standardize features
    ('regressor', xgb.XGBRegressor())  # Replace with any model, e.g., XGBClassifier()
    ]
    
    model = Pipeline(steps=pipeline_steps)

    model.fit(X_train, y_train)
    
    y_pred = model.predict(X_val)
    
    rmse = mean_squared_error(y_val, y_pred, squared=False)
    
    mlflow.log_metric("rmse", rmse)
    
    mlflow.sklearn.log_model(model, artifact_path="XGBoost-pipeline")

  string_columns = trimmed_df.columns[(df.applymap(type) == str).all(0)]
  string_columns = trimmed_df.columns[(df.applymap(type) == str).all(0)]


# Finetune XGBoost

In [21]:
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

ModuleNotFoundError: No module named 'hyperopt'

In [22]:
X_train, y_train, X_val, y_val, train_dataset, val_dataset = create_dataset(train_df, val_df, numerical_features_walk_score_binary)

NameError: name 'numerical_features_walk_score_binary' is not defined

In [82]:
train = xgb.DMatrix(X_train, label=y_train)
valid = xgb.DMatrix(X_val, label=y_val)

In [84]:
def objective(params):
    with mlflow.start_run(): # Start a new MLflow run for each evaluation
        mlflow.set_tag("Model type", "XGBoost") # Log model tag
        mlflow.log_params(params) # Log parameters

        # Train the model
        booster = xgb.train(
            params=params,
            dtrain=train, # Your training dataset
            num_boost_round=1000,
            evals=[(valid, 'validation')], # Your validation dataset
            early_stopping_rounds=50
        )

        # Make predictions and calculate RMSE
        y_pred = booster.predict(valid) # Your validation dataset
        rmse = mean_squared_error(y_val, y_pred, squared=False) # y_val should be your validation target
        mlflow.log_metric("rmse", rmse) # Log RMSE metric

    return {'loss': rmse, 'status': STATUS_OK}

In [85]:
# Define the search space for Hyperopt
search_space = {
    'max_depth': scope.int(hp.quniform('max_depth', 4, 100, 1)),
    'learning_rate': hp.loguniform('learning_rate', -3, 0),
    'reg_alpha': hp.loguniform('reg_alpha', -5, -1),
    'reg_lambda': hp.loguniform('reg_lambda', -6, -1),
    'min_child_weight': hp.loguniform('min_child_weight', -1, 3),
    'objective': 'reg:squarederror', # Updated objective for regression
    'seed': 42
}

In [None]:
# Run Hyperopt
best_result = fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=50,
    trials=Trials()
)

print("Best Hyperparameters:", best_result)

In [96]:
best_result['max_depth'] = 94

## Save Random Forest

In [20]:
X_train, y_train, X_val, y_val, train_dataset, val_dataset = create_dataset(train_df, val_df, numerical_features_walk_score_binary)

NameError: name 'numerical_features_walk_score_binary' is not defined

In [None]:
with mlflow.start_run():
    
    train = xgb.DMatrix(X_train, label=y_train)
    valid = xgb.DMatrix(X_val, label=y_val)

    mlflow.log_params(best_result)

    mlflow.set_tag("developer", "Alex") # Addind metadata
    
    mlflow.log_param("Model type", "XGBoost")
    mlflow.log_param("Included Data", numerical_features_walk_score_binary)

    booster = xgb.train(
        params=best_result,
        dtrain=train,
        num_boost_round=1000,
        evals=[(valid, 'validation')],
        early_stopping_rounds=50
    )

    y_pred = booster.predict(valid)
    rmse = mean_squared_error(y_val, y_pred, squared=False)
    mlflow.log_metric("rmse", rmse)

    # with open("models/preprocessor.b", "wb") as f_out:
    #     pickle.dump(dv, f_out)
    # mlflow.log_artifact("models/preprocessor.b", artifact_path="preprocessor")

    mlflow.xgboost.log_model(booster, artifact_path="models_mlflow")

In [14]:
X_train, y_train, X_val, y_val, train_dataset, val_dataset = create_dataset(train_df, val_df, numerical_features_walk_score)

In [17]:
from sklearn.ensemble import RandomForestRegressor

with mlflow.start_run():
    # Log the dataset to the MLflow Run. Specify the "training" context to indicate that the
    # dataset is used for model training
    mlflow.log_input(train_dataset, context="training")
    mlflow.log_input(val_dataset, context="validation")
    
    mlflow.set_tag("developer", "Alex") # Addind metadata
    
    mlflow.log_param("Model type", "Random Forest")
    mlflow.log_param("Included Data", numerical_features_walk_score)
    
    model = RandomForestRegressor()
    model.fit(X_train, y_train)
    
    y_pred = model.predict(X_val)
    
    rmse = mean_squared_error(y_val, y_pred, squared=False)
    
    mlflow.log_metric("rmse", rmse)

    mlflow.sklearn.log_model(model, artifact_path="random-forest-model")

  return fit_method(estimator, *args, **kwargs)
