# Model to predict the age of Abalone

In this notebook, you should implement a first version of a working machine learning model to predict the age of an Abalone.

A few guidelines:
- The model does not have to be complex. A simple linear regression model is enough.
- You should use MLflow to track your experiments. You can use the MLflow UI to compare your experiments.
- Do not push any MLflow data (for example the generated `mlruns/` directory) to the repository. Only the code needed to run the experiments should be committed.

### 01 - Import Statements

In [1]:
import mlflow
import numpy as np
import pandas as pd
import mlflow.sklearn
from sklearn.svm import SVR
from sklearn.pipeline import Pipeline
from catboost import CatBoostRegressor
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder

### 02 - Loading Dataset

In [2]:
# Location of the raw Abalone CSV, given relative to the notebook's working directory
data_path = '../../abalone.csv'

# Parse the CSV into a DataFrame, then preview its first five rows as the cell output
df = pd.read_csv(filepath_or_buffer=data_path)
df.head(n=5)

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


### 03 - Data Preprocessing 

In [3]:
# Target is the ring count; every remaining column is used as a feature
y = df['Rings'].copy()
X = df.drop(columns=['Rings']).copy()

# 'Sex' is one-hot encoded; all other feature columns are standardised
numeric_columns = X.columns.difference(['Sex'])
preprocessor = ColumnTransformer(
    transformers=[
        ('sex', OneHotEncoder(), ['Sex']),
        ('num', StandardScaler(), numeric_columns),
    ]
)

### 04 - Data model

In [4]:
# Candidate regressors, each paired with the label used in the results table
models = [
    ('Linear Regression', LinearRegression()),
    ('Random Forest', RandomForestRegressor(n_estimators=100, random_state=42)),
    ('Support Vector Regressor', SVR(kernel='linear')),
    ('CatBoost', CatBoostRegressor(iterations=100, depth=6, learning_rate=0.1, verbose=False))
]

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# One record per model, turned into a DataFrame once after the loop.
# Growing a DataFrame with pd.concat inside the loop re-copies the table on
# every iteration and, when the starting frame is empty, raises a
# FutureWarning on recent pandas releases.
result_rows = []

# Fit and score every candidate model
for model_name, model in models:

    # Preprocessing and estimator are chained so that both are fitted on the
    # training split only
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('model', model)
    ])
    pipeline.fit(X_train, y_train)
    y_train_pred = pipeline.predict(X_train)
    y_test_pred = pipeline.predict(X_test)

    # RMSE and R2 on both splits
    result_rows.append({
        'Model': model_name,
        'Train RMSE': np.sqrt(mean_squared_error(y_train, y_train_pred)),
        'Test RMSE': np.sqrt(mean_squared_error(y_test, y_test_pred)),
        'Train R2': r2_score(y_train, y_train_pred),
        'Test R2': r2_score(y_test, y_test_pred),
    })

# Build the summary table in one go; `columns` pins the column order
results = pd.DataFrame(
    result_rows,
    columns=['Model', 'Train RMSE', 'Test RMSE', 'Train R2', 'Test R2'],
)

# Print results
print(results)


                      Model  Train RMSE  Test RMSE  Train R2   Test R2
0         Linear Regression    2.187288   2.210428  0.534788  0.548647
1             Random Forest    0.792708   2.257935  0.938897  0.529037
2  Support Vector Regressor    2.237391   2.256017  0.513231  0.529837
3                  CatBoost    1.911039   2.208460  0.644878  0.549450


### 05 - Tracking experiments on MLflow

In [5]:
# Show where MLflow stores runs; the URI is wrapped in a matching pair of quotes
print(f"tracking URI : '{mlflow.get_tracking_uri()}'")

tracking URI : 'file:///Users/vitrac/Desktop/M2%20HEC/13%20-%20MLops/project/xhec-mlops-project-student/notebooks/mlruns


In [6]:
from mlflow.tracking import MlflowClient

In [7]:
# Make "Abalone Project" the active experiment for the runs started below
mlflow.set_experiment(experiment_name="Abalone Project")

2023/10/23 13:34:39 INFO mlflow.tracking.fluent: Experiment with name 'Abalone Project' does not exist. Creating a new experiment.


<Experiment: artifact_location='file:///Users/vitrac/Desktop/M2%20HEC/13%20-%20MLops/project/xhec-mlops-project-student/notebooks/mlruns/868079838670296174', creation_time=1698060879513, experiment_id='868079838670296174', last_update_time=1698060879513, lifecycle_stage='active', name='Abalone Project', tags={}>

In [8]:
# Work on a copy so the originally loaded frame stays untouched
df_copy = df.copy()
y = df_copy['Rings']
X = df_copy.drop(columns=['Rings'])

# Preprocessing: one-hot encode 'Sex', standardise every other feature column
preprocessor = ColumnTransformer(
    transformers=[
        ('sex', OneHotEncoder(), ['Sex']),
        ('num', StandardScaler(), X.columns.difference(['Sex'])),
    ]
)

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Empty table with one column per tracked quantity
results = pd.DataFrame(columns=['Model', 'Train RMSE', 'Test RMSE'])

In [9]:
# Model 1: Linear Regression

model = LinearRegression()
model_name = 'Linear Regression'

# Chain preprocessing and estimator into a single pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', model)
])

# Close any run that an earlier (possibly interrupted) cell left active
mlflow.end_run()

# Track fitting and evaluation as one MLflow run
with mlflow.start_run() as run:

    run_id = run.info.run_id

    pipeline.fit(X_train, y_train)
    y_train_pred = pipeline.predict(X_train)
    y_test_pred = pipeline.predict(X_test)
    train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
    test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))

    model_params = model.get_params()

    # Record tag, artifact, hyper-parameters and metrics for this run
    mlflow.set_tag("model_name", model_name)
    # Log the whole fitted pipeline rather than the bare estimator: the
    # estimator alone expects inputs that are already one-hot encoded and
    # scaled, so it could not be applied to raw rows when reloaded.
    mlflow.sklearn.log_model(pipeline, "model")
    mlflow.log_params(model_params)
    mlflow.log_metric("train_rmse", train_rmse)
    mlflow.log_metric("test_rmse", test_rmse)

In [10]:
# Model 2: Random Forest
model = RandomForestRegressor(n_estimators=100, random_state=42)
# The label goes in `model_name`; assigning it to `model` would replace the
# estimator with a string and break both the pipeline and `get_params()`.
model_name = "Random Forest Regressor"

# Chain preprocessing and estimator into a single pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', model)
])

# Close any run that an earlier (possibly interrupted) cell left active
mlflow.end_run()

# Track fitting and evaluation as one MLflow run
with mlflow.start_run() as run:

    run_id = run.info.run_id

    pipeline.fit(X_train, y_train)
    y_train_pred = pipeline.predict(X_train)
    y_test_pred = pipeline.predict(X_test)
    train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
    test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))

    model_params = model.get_params()

    # Record tag, artifact, hyper-parameters and metrics for this run
    mlflow.set_tag("model_name", model_name)
    # Log the whole fitted pipeline so the stored artifact includes the
    # preprocessing step; "model" matches the artifact path used for the
    # Linear Regression run.
    mlflow.sklearn.log_model(pipeline, "model")
    mlflow.log_params(model_params)
    mlflow.log_metric("train_rmse", train_rmse)
    mlflow.log_metric("test_rmse", test_rmse)

In [11]:
# Model 3: Support Vector Regressor
model = SVR(kernel='linear')
# Label for the run; without it the run would carry no `model_name` tag
model_name = 'Support Vector Regressor'

# Chain preprocessing and estimator into a single pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', model)
])

# Close any run that an earlier (possibly interrupted) cell left active
mlflow.end_run()

# Track fitting and evaluation as one MLflow run
with mlflow.start_run() as run:

    run_id = run.info.run_id

    pipeline.fit(X_train, y_train)
    y_train_pred = pipeline.predict(X_train)
    y_test_pred = pipeline.predict(X_test)
    train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
    test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))

    model_params = model.get_params()

    # Record tag, artifact, hyper-parameters and metrics for this run
    mlflow.set_tag("model_name", model_name)
    # Log the whole fitted pipeline so the stored artifact includes the
    # preprocessing step; "model" matches the artifact path used for the
    # Linear Regression run.
    mlflow.sklearn.log_model(pipeline, "model")
    mlflow.log_params(model_params)
    mlflow.log_metric("train_rmse", train_rmse)
    mlflow.log_metric("test_rmse", test_rmse)

In [12]:
# Model 4: CatBoost
model = CatBoostRegressor(n_estimators=100, verbose=False)
# Label for the run; without it the run would carry no `model_name` tag
model_name = 'CatBoost'

# Chain preprocessing and estimator into a single pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', model)
])

# Close any run that an earlier (possibly interrupted) cell left active
mlflow.end_run()

# Track fitting and evaluation as one MLflow run
with mlflow.start_run() as run:

    run_id = run.info.run_id

    pipeline.fit(X_train, y_train)
    y_train_pred = pipeline.predict(X_train)
    y_test_pred = pipeline.predict(X_test)
    train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
    test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))

    model_params = model.get_params()

    # Record tag, artifact, hyper-parameters and metrics for this run
    mlflow.set_tag("model_name", model_name)
    # Log the whole fitted pipeline so the stored artifact includes the
    # preprocessing step; "model" matches the artifact path used for the
    # Linear Regression run.
    mlflow.sklearn.log_model(pipeline, "model")
    mlflow.log_params(model_params)
    mlflow.log_metric("train_rmse", train_rmse)
    mlflow.log_metric("test_rmse", test_rmse)

In [13]:
# Query the tracking store through a client and display every experiment it returns
client = MlflowClient()
experiments = client.search_experiments()
experiments

[<Experiment: artifact_location='file:///Users/vitrac/Desktop/M2%20HEC/13%20-%20MLops/project/xhec-mlops-project-student/notebooks/mlruns/868079838670296174', creation_time=1698060879513, experiment_id='868079838670296174', last_update_time=1698060879513, lifecycle_stage='active', name='Abalone Project', tags={}>,
 <Experiment: artifact_location='file:///Users/vitrac/Desktop/M2%20HEC/13%20-%20MLops/project/xhec-mlops-project-student/notebooks/mlruns/0', creation_time=1698060879506, experiment_id='0', last_update_time=1698060879506, lifecycle_stage='active', name='Default', tags={}>]

In [14]:
# Start the MLflow tracking UI on port 5003 to browse the logged runs.
# NOTE(review): --host 0.0.0.0 listens on every network interface; 127.0.0.1
# would restrict the UI to this machine -- confirm which is intended.
# NOTE: the server stays in the foreground (see the gunicorn log below), so
# this cell presumably keeps running until the kernel is interrupted.
!mlflow ui --host 0.0.0.0 --port 5003

[2023-10-23 13:34:45 +0200] [18151] [INFO] Starting gunicorn 21.2.0
[2023-10-23 13:34:45 +0200] [18151] [INFO] Listening at: http://0.0.0.0:5003 (18151)
[2023-10-23 13:34:45 +0200] [18151] [INFO] Using worker: sync
[2023-10-23 13:34:45 +0200] [18153] [INFO] Booting worker with pid: 18153
[2023-10-23 13:34:45 +0200] [18154] [INFO] Booting worker with pid: 18154
[2023-10-23 13:34:46 +0200] [18155] [INFO] Booting worker with pid: 18155
[2023-10-23 13:34:46 +0200] [18156] [INFO] Booting worker with pid: 18156
