# In [0]:
import os
import time
import mlflow
import mlflow.pyfunc
import mlflow.sklearn
import sklearn
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import pandas as pd
import pickle
import datarobot
from mlflow.models.signature import infer_signature
from mlflow.utils.environment import _mlflow_conda_env
from datarobot.mlops.mlops import MLOps
from datarobot.mlops.common.enums import OutputType
from mlflow.models.signature import infer_signature


# File location and type
file_location = "/FileStore/tables/diabetes.csv"
file_type = "csv"

# CSV options
# Schema inference is enabled on purpose: with inferSchema disabled Spark reads
# every CSV column as a string, so the pandas frame built below would carry text
# instead of numbers and everything derived from it (model signature, reported
# feature data) would be typed as strings.
infer_schema = "true"
first_row_is_header = "true"
delimiter = ","

# The applied options are for CSV files. For other file types, these will be ignored.
# NOTE: `spark` is not defined in this file; presumably it is the SparkSession
# injected by the notebook runtime.
dfSpark = (
  spark.read.format(file_type)
  .option("inferSchema", infer_schema)
  .option("header", first_row_is_header)
  .option("sep", delimiter)
  .load(file_location)
)

# Collect the Spark DataFrame into a pandas DataFrame for scikit-learn.
df = dfSpark.toPandas()



class SklearnModelWrapper(mlflow.pyfunc.PythonModel):
  """MLflow pyfunc wrapper that scores with a fitted model and reports to DataRobot MLOps.

  Every ``predict`` call configures an ``MLOps`` client (deployment id, model id,
  SQS spooler), runs the wrapped model, and reports the batch size, the execution
  time and the feature/prediction data for that call.
  """

  def __init__(self, model):
    """Store the fitted estimator used by ``predict``.

    Args:
      model: Object exposing ``predict(model_input)``. Its result must support
        ``len()`` and ``.tolist()`` (e.g. a NumPy array).
    """
    self.model = model

  def predict(self, context, model_input):
    """Score ``model_input`` and report the call to DataRobot MLOps.

    Args:
      context: pyfunc context supplied by MLflow; not used here.
      model_input: Feature data passed unchanged to the wrapped model and to
        ``report_predictions_data`` as ``features_df``.

    Returns:
      Whatever ``self.model.predict(model_input)`` returns.
    """
    # NOTE(review): these blank values overwrite any AWS settings already present
    # in the process environment on every call. They look like redacted
    # placeholders -- confirm how credentials are meant to be supplied.
    os.environ["AWS_DEFAULT_REGION"] = ""
    os.environ["AWS_SECRET_ACCESS_KEY"] = ""
    os.environ["AWS_ACCESS_KEY_ID"] = ""
    os.environ["AWS_SESSION_TOKEN"] = ""

    DEPLOYMENT_ID = '6141037f78e6b571743f443c'
    MODEL_ID = '6141035d31fa15750719564c'

    mlops = MLOps()

    try:
      mlops.set_deployment_id(DEPLOYMENT_ID)
      mlops.set_model_id(MODEL_ID)
      mlops.set_sqs_spooler("https://sqs.us-east-1.amazonaws.com/293058073847/mlops-agent-alp")
    except Exception:
      # Configuration stays best-effort, but a bare ``except:`` would also trap
      # KeyboardInterrupt / SystemExit, so only ordinary errors are swallowed.
      print('already configured')

    # NOTE(review): the client is initialised on every call and never shut down
    # in this block -- confirm whether a matching shutdown is required.
    mlops.init()

    start_time = time.time()
    prediction = self.model.predict(model_input)
    end_time = time.time()

    # Report the real batch size instead of a hard-coded 1, and convert the
    # elapsed seconds from time.time() to the milliseconds the API expects.
    elapsed_ms = (end_time - start_time) * 1000
    mlops.report_deployment_stats(len(prediction), elapsed_ms)
    mlops.report_predictions_data(features_df=model_input, predictions=prediction.tolist())

    return prediction

# Train a linear regression on `df`, wrap it for DataRobot reporting and log the
# wrapped model to MLflow inside a single run.
mlflow.sklearn.autolog()
with mlflow.start_run():

  # Init LinearRegression object / class
  lm = LinearRegression()

  # Set data and target
  # Both are selected by column name: the positional `axis` argument of
  # DataFrame.drop is no longer accepted by current pandas, and picking the
  # target by position only worked while "Y" happened to be the last column.
  X = df.drop(columns="Y")
  y = df["Y"]

  # Split our data by 70% training (for fitting) and 30% testing (for prediction)
  X_train, x_test, Y_train, y_test = train_test_split(X, y, train_size=.7)
  print(X_train)
  # Fit our model
  model = lm.fit(X_train, Y_train)

  # Wrap Model
  wrappedModel = SklearnModelWrapper(lm)
  # Infer the signature from the bare estimator. SklearnModelWrapper.predict
  # returns the same values but also reports its inputs and predictions to the
  # DataRobot deployment, which would push the whole training set into
  # monitoring and require spooler access at training time.
  signature = infer_signature(X_train, lm.predict(X_train))

  #print(signature)
  # MLflow contains utilities to create a conda environment used to serve models.
  # The necessary dependencies are added to a conda.yaml file which is logged along with the model.
  conda_env = _mlflow_conda_env(
        additional_conda_deps=[],
        additional_pip_deps=[
            f"datarobot=={datarobot.__version__}",
            "/dbfs/FileStore/jars/d5f69632_a4f8_423b_872a_edc8378208b5/datarobot_mlops-7.3.1-py2.py3-none-any.whl",
            f"scikit-learn=={sklearn.__version__}",
        ],
        additional_conda_channels=None,
    )

  # NOTE(review): the artifact path says "random_forest_model" although the
  # estimator is a LinearRegression; left unchanged so existing model URIs keep
  # resolving.
  mlflow.pyfunc.log_model("random_forest_model", python_model=wrappedModel, conda_env=conda_env, signature=signature)