#### Installing the Weights & Biases library

In [None]:
!pip install wandb

## Loading the dataset: Used Car Price Prediction

In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error
import wandb
import os

In [None]:
# Download the used-car dataset (a CSV served from Google Drive) into a DataFrame.
CARS_DATA_URL = "https://drive.google.com/uc?export=download&id=10ABViLN4Q7vgIlLvepCduU4B3C6BneJR"
cars_df = pd.read_csv(CARS_DATA_URL)

In [None]:
# Preview the first five rows (five is the default for head()).
cars_df.head()

In [None]:
# Predictor columns fed to the estimators.
# The car's model name is not included among the features.
x_columns = [
    'KM_Driven',
    'Fuel_Type',
    'age',
    'Transmission',
    'Owner_Type',
    'Seats',
    'make',
    'mileage',
    'engine',
    'power',
    'Location',
]

In [None]:
# (rows, columns) of the dataset as loaded, before any rows are dropped.
cars_df.shape

In [None]:
# Keep only the predictors plus the target, then discard every row that has
# a missing value in any of those columns.
model_columns = [*x_columns, 'Price']
cars_df = cars_df[model_columns].dropna()

In [None]:
# (rows, columns) after selecting the modelling columns and dropping incomplete rows.
cars_df.shape

## Identifying numerical and categorical features

In [None]:
# Predictors treated as categorical (one-hot encoded later in the notebook).
cat_features = [
    'Fuel_Type',
    'Transmission',
    'Owner_Type',
    'make',
    'Location',
]

In [None]:
# Numerical features are the predictors that are not categorical.
# Filtering x_columns (instead of taking a set difference) keeps the columns in
# a fixed order; iteration order of a set of strings can change between
# interpreter sessions, which would reorder the model's input features.
num_features = [col for col in x_columns if col not in cat_features]

## Utility method for preparing the data

- Splitting the dataset
- Encoding Categorical Variables

In [None]:
# Separate the predictor matrix from the target column (Price).
X = cars_df.loc[:, x_columns]
y = cars_df['Price']

In [None]:
# Use 80% of the rows for training and the remainder for testing;
# the fixed random_state makes the split repeatable.
split_parts = train_test_split(X, y, train_size=0.8, random_state=100)
x_train, x_test, y_train, y_test = split_parts

### Creating ML Pipeline

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [None]:
# One-hot encoder for the categorical columns; handle_unknown='ignore' makes
# categories not seen during fit encode without raising at transform time.
ohe_encoder = OneHotEncoder(handle_unknown='ignore')
scaler = StandardScaler()

# Numeric columns that receive mean imputation before scaling.
imputed_num_vars = ['Seats']
# Remaining numeric columns, kept in num_features order so the transformer's
# input column order does not depend on set iteration order.
non_imputed_num_vars = [col for col in num_features
                        if col not in imputed_num_vars]
mean_imputer = SimpleImputer(strategy='mean')


# Pipeline applying imputation and then scaling.
imputed_num_transformer = Pipeline(steps=[
    ('imputation', mean_imputer),
    ('scaler', scaler)])

# Scaling only. A separate scaler instance is used so the two numeric
# pipelines do not hold a reference to the same estimator object.
non_imputed_num_transformer = Pipeline(steps=[('scaler', StandardScaler())])


# Pipeline for one-hot encoding the categorical columns.
cat_transformer = Pipeline(steps=[('ohencoder', ohe_encoder)])

# The complete preprocessing step: each transformer is applied to its own
# list of columns.
preprocessor = ColumnTransformer(
    transformers=[
        ('num_imputed', imputed_num_transformer, imputed_num_vars),
        ('num_not_imputed', non_imputed_num_transformer, non_imputed_num_vars),
        ('catvars', cat_transformer, cat_features)])

## Initializing Weights and Biases

In [None]:
# SECURITY: never hard-code credentials in a notebook. The API key that was
# previously committed on this line must be treated as leaked and revoked.
# wandb.login() picks up the WANDB_API_KEY environment variable when it is set
# outside the notebook, and otherwise prompts for the key interactively.
wandb.login()

## Baseline Model: Linear Regression

In [None]:
# Baseline model: the preprocessing step followed by ordinary linear regression.
linear_reg = LinearRegression()

linear_model = Pipeline(steps=[('preprocessor', preprocessor),
                               ('linear_model', linear_reg)])

linear_model.fit(x_train, y_train)

# Predict on the held-out split once and derive both metrics from the same
# predictions (r2_score on the predictions is what Pipeline.score reports).
test_predictions = linear_model.predict(x_test)
rmse = np.sqrt(mean_squared_error(y_test, test_predictions))
r2 = r2_score(y_test, test_predictions)

# The context manager finishes the run even if logging raises. The run name is
# passed to init directly. The Artifact object that used to be constructed here
# was never logged, and the argument-less wandb.save() call is deprecated and
# does nothing, so both were removed.
with wandb.init(project='mlops_usedcar',
                name="LinearModel",
                config=None,
                tags=['Linear Model', 'baseline', 'OHE Encoding']) as run:
    run.log({"rmse": rmse,
             "r2": r2})

In [None]:
# Hyperparameters for the tree; the same dict is recorded as the run's config.
params = {"max_depth": 10}

dtree = DecisionTreeRegressor(**params)

dtree_model = Pipeline(steps=[('preprocessor', preprocessor),
                              ('dt_model', dtree)])

dtree_model.fit(x_train, y_train)

# Predict on the held-out split once and derive both metrics from the same
# predictions (r2_score on the predictions is what Pipeline.score reports).
test_predictions = dtree_model.predict(x_test)
rmse = np.sqrt(mean_squared_error(y_test, test_predictions))
r2 = r2_score(y_test, test_predictions)

# The context manager finishes the run even if logging raises. The Artifact
# object that used to be constructed here was never logged (and was given a
# dict where a text description belongs), and the argument-less wandb.save()
# call is deprecated and does nothing, so both were removed.
with wandb.init(project='mlops_usedcar',
                name="DecisionTree",
                config=params,
                tags=['Decision Tree', 'OHE Encoding']) as run:
    run.log({"rmse": rmse,
             "r2": r2})

## Manual Grid Search

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Search space for the grid search: tree depths 5 through 9. The "dt_model__"
# prefix routes the parameter to the pipeline step named "dt_model".
params = dict(dt_model__max_depth=range(5, 10))

In [None]:
# Preprocessing + decision tree pipeline with default tree settings; this is
# the estimator handed to the grid search.
dtree = DecisionTreeRegressor()
pipeline_steps = [('preprocessor', preprocessor), ('dt_model', dtree)]
dtree_model = Pipeline(steps=pipeline_steps)

In [None]:
# 10-fold cross-validated search over `params`, scoring each candidate by R².
grid_search_options = {"param_grid": params, "cv": 10, "scoring": 'r2'}
dt_grid = GridSearchCV(dtree_model, **grid_search_options)

In [None]:
# Run the cross-validated grid search on the training split only.
dt_grid.fit(x_train, y_train)

In [None]:
# Parameter combination that achieved the best mean cross-validated score.
dt_grid.best_params_

In [None]:
# Mean cross-validated R² (the configured scoring) of the best candidate.
dt_grid.best_score_

In [None]:
# Tabulate the per-candidate cross-validation results for inspection.
pd.DataFrame(dt_grid.cv_results_)

### Using Sweep Features

In [None]:
def train_decision_tree(config=None):
    """Train and evaluate one decision-tree pipeline inside a W&B run.

    Starts a run with ``config``, reads ``max_depth`` from the run's config,
    fits the preprocessing + tree pipeline on the training split, and logs
    the test-set RMSE and R² together with the depth that was used.

    Args:
        config: Passed unchanged to ``wandb.init``. The run's resulting
            config must expose ``max_depth``.
    """
    with wandb.init(config=config) as run:
        # Read the hyperparameter from the run's config.
        depth = run.config.max_depth

        tree = DecisionTreeRegressor(max_depth=depth)
        model = Pipeline(steps=[('preprocessor', preprocessor),
                                ('dt_model', tree)])
        model.fit(x_train, y_train)

        # Held-out evaluation.
        test_mse = mean_squared_error(y_test, model.predict(x_test))
        metrics = {
            "rmse": np.sqrt(test_mse),
            "r2": model.score(x_test, y_test),
            "max_depth": depth,
        }

        # Send the metrics to the active run.
        run.log(metrics)


In [None]:
# Sweep definition: try every listed tree depth ("grid"; "random" and "bayes"
# are the other method names) and compare runs on the logged "r2" metric,
# where larger is better.
depth_candidates = [4, 6, 8, 12]
sweep_config = {
    "method": "grid",
    "metric": {"name": "r2", "goal": "maximize"},
    "parameters": {"max_depth": {"values": depth_candidates}},
}

In [None]:
# Register the sweep under the project and keep the identifier the agent needs.
sweep_id = wandb.sweep(sweep_config, project="mlops_usedcar")

In [None]:
# Start an agent for the sweep; it calls train_decision_tree for the sweep's runs.
wandb.agent(sweep_id,
            function=train_decision_tree)  # Run all experiments

## Get Experiment Details

In [None]:
# List the project's runs through the W&B public API; the "+" prefix on the
# order key requests ascending order of the summary RMSE.
api = wandb.Api()
project_path = "awesomestats/mlops_usedcar"
all_runs = api.runs(project_path, order="+summary_metrics.rmse")

# Show each run's name, its logged R² and the config it was launched with.
for run in all_runs:
    print(f"Model Name: {run.name} and R2 {run.summary.get('r2')}")
    print(run.config)

### Storing the model into a file

In [None]:
from joblib import dump

# Directory that holds the serialized model (later uploaded as an artifact).
MODEL_DIR = "./carsmodel"

# exist_ok=True lets this cell be re-run; os.mkdir raised FileExistsError
# whenever the directory was already present.
os.makedirs(MODEL_DIR, exist_ok=True)
dump(linear_model, os.path.join(MODEL_DIR, 'cars.pkl'))

### Logging the model artifact in the tracking tool (Weights & Biases)

In [None]:
# Open the run that will carry the final model artifact and give it a readable
# name. The run stays open across the following cells until wandb.finish().
final_run = wandb.init(project='mlops_usedcar', config=None, tags=['Final Model'])
final_run.name = "FinalModel"

In [None]:
# Create the (still empty) model artifact; its files are attached separately.
artifact_description = 'Linear Model for used car price prediction'
model_artifact = wandb.Artifact(name="Linear_Model_UsedCar",
                                type='model',
                                description=artifact_description)

In [None]:
# Attach the contents of the model directory to the artifact.
model_artifact.add_dir(MODEL_DIR)

In [None]:
# Log the artifact against the currently active run.
wandb.run.log_artifact(model_artifact)

In [None]:
# Close the active run. The argument-less wandb.save() that used to precede
# this call is deprecated and does nothing, so it was dropped.
wandb.finish()

In [None]:
# Show the scikit-learn version used in this session; the pickled pipeline
# should be loaded with a compatible version.
import sklearn
sklearn.__version__