In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import KFold, cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler,LabelEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer

from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,ExtraTreesRegressor,GradientBoostingRegressor,AdaBoostRegressor

from sklearn.metrics import mean_absolute_error,r2_score

import category_encoders as ce

import seaborn as sns
import matplotlib.pyplot as plt

import mlflow
import mlflow.sklearn

In [2]:
# Read the post-feature-selection dataset from the processed-data folder.
path = '../data/processed/'
df = pd.read_csv(f'{path}properties_post_feature_selection_V2.csv')

# Target is the 'price' column; every other column is a feature.
y = df['price']
X = df.drop('price', axis=1)

# log(1 + price) version of the target (np.expm1 is the inverse).
y_transformed = np.log1p(y)

In [3]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
# The target passed in is the log1p-transformed price, not the raw price.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_transformed,
    test_size=0.2,
    random_state=42,
)

In [4]:
# Categorical features that are turned into integer codes by OrdinalEncoder.
columns_to_encode = [
    'property_type', 'balcony', 'agePossession',
    'furnishing_type', 'luxury_category', 'floor_category',
]
# Numeric features that are standardised by StandardScaler.
numerical_columns = ['bedRoom', 'bathroom', 'built_up_area', 'pooja room', 'store room']

preprocessor = ColumnTransformer(
    transformers=[
        # Reuse `numerical_columns` so the list that is scaled and the list
        # that is reported elsewhere can never drift apart.
        ('num', StandardScaler(), numerical_columns),
        # One-hot encode 'sector'; categories unseen during fit become an
        # all-zero row instead of raising.
        ('sector', OneHotEncoder(drop='first', handle_unknown='ignore'), ['sector']),
        # Map categories unseen during fit to -1 rather than raising, so this
        # encoder tolerates unknowns the same way the 'sector' encoder does.
        ('cat',
         OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1),
         columns_to_encode),
    ],
    # Columns not named above are forwarded to the model unchanged.
    remainder='passthrough',
)

pipeline = Pipeline([
    ('preprocessor', preprocessor),
    # Fixed seed: without it every run trains a different forest and the
    # reported score cannot be reproduced.
    ('regressor', RandomForestRegressor(random_state=42)),
])

# Baseline fit and evaluation on the held-out split.
pipeline.fit(X_train, y_train)
predicted = pipeline.predict(X_test)
score = r2_score(y_test, predicted)

In [7]:
with mlflow.start_run(run_name='first_run'):

    # Train on the training split, then predict the held-out rows in one chain
    predicted = pipeline.fit(X_train, y_train).predict(X_test)

    # R2 of the predictions against the held-out target
    score = r2_score(y_test, predicted)

    # Record the feature groups used for this run, then the resulting metric
    mlflow.log_params({
        'numerical_columns': numerical_columns,
        'columns_to_encode': columns_to_encode,
    })
    mlflow.log_metric('r2_score', score)

    # Persist the fitted pipeline with the run under 'model'
    mlflow.sklearn.log_model(pipeline, 'model')

# Echo the score in the notebook output
print(f'R2 Score: {score}')

R2 Score: 0.9049781887233079
