In [7]:
import os
import mlflow
from mlflow import log_metric, log_param, log_artifacts
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import  OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score


# Read the cleaned pricing dataset from the current working directory
df = pd.read_csv("get_around_pricing_project_cleaned.csv")

# Split the frame into the feature matrix X and the target Y.
# By position: the last column is used as the target, all preceding columns as features.
print("Separating labels from features...")
target_variable = df.columns[-1]
features_list = df.columns[:-1]

Y = df[target_variable]
X = df[features_list]

# Same stdout as two separate prints: the "...Done." line followed by an empty line
print("...Done.\n")

# Automatically detect names of numeric/categorical columns.
# The dtype objects are tested with pandas' dtype predicates instead of substring-matching
# the dtype name: the name match is case-sensitive, so pandas' nullable extension dtypes
# ("Int64", "Float64") were classified as categorical, and any dtype whose name merely
# contains "int" (e.g. "interval[...]") was classified as numeric.
# Boolean columns are neither integer nor float dtypes, so they remain categorical.
numeric_features = []
categorical_features = []
for column_name, column_dtype in X.dtypes.items():
    is_numeric = (
        pd.api.types.is_integer_dtype(column_dtype)
        or pd.api.types.is_float_dtype(column_dtype)
    )
    if is_numeric:
        numeric_features.append(column_name)
    else:
        categorical_features.append(column_name)

print('Found numeric features ', numeric_features)
print('Found categorical features ', categorical_features)

# Train/test splitting: 20% of the rows are held out; random_state is fixed so the
# split is the same on every run
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

# Pipeline for numeric features: standardisation only
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())])

# Pipeline for categorical features: one-hot encoding, dropping the first level of each
# feature.
# handle_unknown='ignore' keeps transform() from raising when the test split (or later
# prediction data) contains a category that was absent from the training split; such a
# value is encoded as all zeros for that feature. Combining it with drop='first' requires
# scikit-learn >= 1.0.
categorical_transformer = Pipeline(
    steps=[
    ('encoder', OneHotEncoder(drop='first', handle_unknown='ignore'))
    ])

# Use ColumnTransformer to make a preprocessor object that describes all the treatments to be done
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

# Fit the preprocessing on the training split only, then apply the fitted
# transformation to the test split. Both names are rebound to the transformed data.
X_train = preprocessor.fit_transform(X_train)

X_test = preprocessor.transform(X_test)


# Name of the MLflow experiment under which the runs of this model are grouped
EXPERIMENT_NAME="baseline_multivariate_linear_regression"

# Tracking URI of the MLflow server, read from the APP_URI environment variable
APP_URI =  os.getenv("APP_URI")
if not APP_URI:
    # Without APP_URI the runs do not reach the intended server; say so explicitly
    # instead of silently logging somewhere else.
    print("WARNING: APP_URI is not set; MLflow will use its default tracking URI "
          "(MLFLOW_TRACKING_URI if defined, otherwise a local store).")
mlflow.set_tracking_uri(APP_URI)

# Select the experiment as the active one
mlflow.set_experiment(EXPERIMENT_NAME)

# Fetch the experiment object; its experiment_id is needed to start the run
experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)

# Enable MLflow autologging for scikit-learn
mlflow.sklearn.autolog()

# Train the baseline linear regression inside an MLflow run and record its R2 scores
# and the fitted model.
# No "C" parameter is logged: LinearRegression has no such hyperparameter and the value
# was never passed to the model, so recording it would attach false metadata to the run.
with mlflow.start_run(experiment_id = experiment.experiment_id):
    # Instantiate and fit the model
    regressor = LinearRegression()
    regressor.fit(X_train, Y_train)

    # Predictions on train and test set
    Y_train_pred = regressor.predict(X_train)
    Y_test_pred = regressor.predict(X_test)

    # Model metrics
    r2_train = r2_score(Y_train, Y_train_pred)
    r2_test = r2_score(Y_test, Y_test_pred)

    # Print results
    print("Multivariate Linear Regression model")
    print("R2 on train: {}".format(r2_train))
    print("R2 on test: {}".format(r2_test))

    # Log metrics
    mlflow.log_metric("R2 on train", r2_train)
    mlflow.log_metric("R2 on test", r2_test)

    # Log the fitted regressor. The preprocessor is not part of this artifact, so the
    # logged model expects input that has already been transformed.
    mlflow.sklearn.log_model(regressor, "model")

    # Show where the artifacts of this run are stored
    print(mlflow.get_artifact_uri())



2023/03/13 17:17:52 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'b3d03a29b300476d8ad17f250cf1a5bc', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


Separating labels from features...
...Done.

Found numeric features  ['mileage', 'engine_power']
Found categorical features  ['model_key', 'fuel', 'paint_color', 'car_type', 'private_parking_available', 'has_gps', 'has_air_conditioning', 'automatic_car', 'has_getaround_connect', 'has_speed_regulator', 'winter_tires']


2023/03/13 17:17:55 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'b265e4f28c3741ddb0189eadcd998954', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


Multivariate Linear Regression model
R2 on train: 0.7148524507554002
R2 on test: 0.7092296278847003
file:///C:/Study/Jedha/Fullstack/Projects/Bloc_5/getaround_working_copy/getaround/mlflow/mlruns/614718149335542315/761c49831f2040aba12a00d508b8979e/artifacts


In [8]:
# Check which container type the ColumnTransformer returned for the transformed training features
type(X_train)

scipy.sparse._csr.csr_matrix

In [10]:
# Print the transformed training features; for the sparse matrix this lists the stored
# entries as "(row, column)  value"
print(X_train)

  (0, 0)	0.9428963206116653
  (0, 1)	-0.23310784743001145
  (0, 4)	1.0
  (0, 32)	1.0
  (0, 38)	1.0
  (0, 45)	1.0
  (0, 50)	1.0
  (1, 0)	-0.18076700154970962
  (1, 1)	-0.23310784743001145
  (1, 4)	1.0
  (1, 28)	1.0
  (1, 38)	1.0
  (1, 44)	1.0
  (1, 45)	1.0
  (1, 50)	1.0
  (2, 0)	-0.5497853496018934
  (2, 1)	0.15214060182196848
  (2, 19)	1.0
  (2, 32)	1.0
  (2, 40)	1.0
  (2, 44)	1.0
  (2, 45)	1.0
  (2, 48)	1.0
  (2, 49)	1.0
  (2, 50)	1.0
  :	:
  (3864, 50)	1.0
  (3865, 0)	-0.03036230063637398
  (3865, 1)	-0.23310784743001145
  (3865, 19)	1.0
  (3865, 29)	1.0
  (3865, 38)	1.0
  (3865, 44)	1.0
  (3865, 45)	1.0
  (3865, 48)	1.0
  (3865, 50)	1.0
  (3866, 0)	1.3383492292764143
  (3866, 1)	-0.4899401469313314
  (3866, 19)	1.0
  (3866, 28)	1.0
  (3866, 40)	1.0
  (3866, 44)	1.0
  (3866, 45)	1.0
  (3866, 50)	1.0
  (3867, 0)	-0.7618030411176446
  (3867, 1)	-1.1320208956846314
  (3867, 19)	1.0
  (3867, 29)	1.0
  (3867, 40)	1.0
  (3867, 45)	1.0
  (3867, 50)	1.0
