In [1]:
import os
import mlflow
import pandas as pd
from sqlalchemy import create_engine
from dotenv import load_dotenv
load_dotenv()

### библиотеки для работы с S3 
import boto3
import joblib
import json
from io import BytesIO

In [2]:
session = boto3.Session(
    aws_access_key_id=(os.environ["AWS_ACCESS_KEY_ID"]),
    aws_secret_access_key=(os.environ["AWS_SECRET_ACCESS_KEY"]),
)

s3_client = session.client("s3", endpoint_url=os.environ['MLFLOW_S3_ENDPOINT_URL'])

In [3]:
import os
import boto3
from botocore.exceptions import ClientError

s3_client = boto3.client('s3')

BASELINE_MODEL_S3KEY = 'models/fitted_model.pkl'

try:
    bucket_name = os.environ["S3_BUCKET_NAME"]
    print(f"Checking bucket: {bucket_name}")
    
    response = s3_client.list_objects(Bucket=bucket_name)
    
    if 'Contents' in response:
        print(f"Found {len(response['Contents'])} objects in the bucket.")
        for obj in response['Contents']:
            print(f"Checking object: {obj['Key']}")
            if obj['Key'] == BASELINE_MODEL_S3KEY:
                print("Model found:", obj)
                break
        else:
            print("Model not found in the bucket.")
    else:
        print("No objects found in the bucket.")
except ClientError as e:
    print(f"An error occurred: {e}")
except KeyError:
    print("Environment variable S3_BUCKET_NAME is not set.")

Checking bucket: s3-student-mle-20240824-ff21c1bdfa
Found 10 objects in the bucket.
Checking object: 3/0459fca77dfa4fefb8d9e25ba23d74f8/artifacts/dataframe/columns.txt
Checking object: 3/0459fca77dfa4fefb8d9e25ba23d74f8/artifacts/dataframe/users_churn.csv
Checking object: cv_results/cv_res.json
Checking object: data/initial_data.csv
Checking object: files/md5/58/0c6563118ddac3897dabc6644f0454
Checking object: files/md5/79/dd7bd0a1d4cbe944c5242766bae460
Checking object: files/md5/9d/402cf397138eb0776019f3d791b6c4
Checking object: models/fitted_model.pkl
Model found: {'Key': 'models/fitted_model.pkl', 'LastModified': datetime.datetime(2024, 11, 4, 17, 30, 28, 810000, tzinfo=tzlocal()), 'ETag': '"0ee5af8e77c115902a4ff5e67d1df885"', 'Size': 1123460, 'StorageClass': 'STANDARD', 'Owner': {'DisplayName': 'ajeq26s98m0ssapaa27m', 'ID': 'ajeq26s98m0ssapaa27m'}}


In [4]:
BASELINE_MODEL_S3KEY = 'models/fitted_model.pkl'
for obj in s3_client.list_objects(Bucket=os.environ["S3_BUCKET_NAME"])['Contents']:
    if obj['Key'] == BASELINE_MODEL_S3KEY:
        print(obj)

{'Key': 'models/fitted_model.pkl', 'LastModified': datetime.datetime(2024, 11, 4, 17, 30, 28, 810000, tzinfo=tzlocal()), 'ETag': '"0ee5af8e77c115902a4ff5e67d1df885"', 'Size': 1123460, 'StorageClass': 'STANDARD', 'Owner': {'DisplayName': 'ajeq26s98m0ssapaa27m', 'ID': 'ajeq26s98m0ssapaa27m'}}


In [5]:
response = s3_client.list_object_versions(Bucket=os.environ["S3_BUCKET_NAME"], Prefix=BASELINE_MODEL_S3KEY)

for version in response.get('Versions', []):
    print(f"Key: {version['Key']}, VersionId: {version['VersionId']}, Latest: {version['IsLatest']}")

Key: models/fitted_model.pkl, VersionId: 00062619A40FF251, Latest: True
Key: models/fitted_model.pkl, VersionId: 000625B8E8959D21, Latest: False
Key: models/fitted_model.pkl, VersionId: 000625B7D93E5093, Latest: False
Key: models/fitted_model.pkl, VersionId: 000623C282A7B519, Latest: False
Key: models/fitted_model.pkl, VersionId: 000623C212314601, Latest: False
Key: models/fitted_model.pkl, VersionId: 00062382E407019E, Latest: False
Key: models/fitted_model.pkl, VersionId: 0006235B17374247, Latest: False
Key: models/fitted_model.pkl, VersionId: 000622A28BCEBEDE, Latest: False
Key: models/fitted_model.pkl, VersionId: 000622A2152B9B25, Latest: False
Key: models/fitted_model.pkl, VersionId: 00062293736E3F3E, Latest: False


In [6]:
response = s3_client.get_object(
    Bucket=os.environ["S3_BUCKET_NAME"],
    Key=BASELINE_MODEL_S3KEY,
    VersionId='00062619A40FF251'
)

binary_data = response['Body'].read()
loaded_pipeline = joblib.load(BytesIO(binary_data))
print(f"Тип загруженного объекта: {type(loaded_pipeline)}")

Тип загруженного объекта: <class 'sklearn.pipeline.Pipeline'>


In [7]:
loaded_pipeline.steps

[('preprocessor',
  ColumnTransformer(transformers=[('CatBoostEncoder', CatBoostEncoder(),
                                   ['building_type_int', 'rooms', 'has_elevator',
                                    'is_apartment']),
                                  ('StandardScaler', StandardScaler(),
                                   ['build_year', 'latitude', 'longitude',
                                    'ceiling_height', 'flats_count',
                                    'floors_total', 'floor', 'living_area',
                                    'kitchen_area', 'flats_count',
                                    'total_area'])])),
 ('model', <catboost.core.CatBoostRegressor at 0x7f14b5249ea0>)]

In [8]:
for obj in s3_client.list_objects(Bucket=os.environ["S3_BUCKET_NAME"])['Contents']:
    if obj['Key'] == 'cv_results/cv_res.json':
        print(obj)

{'Key': 'cv_results/cv_res.json', 'LastModified': datetime.datetime(2024, 10, 2, 18, 48, 58, 607000, tzinfo=tzlocal()), 'ETag': '"2569afbee23fbfe817067e435f041cc7"', 'Size': 138, 'StorageClass': 'STANDARD', 'Owner': {'DisplayName': 'ajeq26s98m0ssapaa27m', 'ID': 'ajeq26s98m0ssapaa27m'}}


In [9]:
response = s3_client.get_object(
    Bucket=os.environ["S3_BUCKET_NAME"],
    Key='cv_results/cv_res.json'
)
json_data = response['Body'].read().decode('utf-8')
metrics_dict = json.loads(json_data)

metrics_dict

{'fit_time': 30.491,
 'score_time': 0.062,
 'test_neg_root_mean_squared_error': -64999224.288,
 'test_neg_mean_absolute_error': -5087615.134}

In [10]:
# host = os.environ['DB_DESTINATION_HOST']
# port = os.environ['DB_DESTINATION_PORT']
# db = os.environ['DB_DESTINATION_NAME']
# username = os.environ['DB_DESTINATION_USER']
# password = os.environ['DB_DESTINATION_PASSWORD']

# conn = create_engine(f'postgresql://{username}:{password}@{host}:{port}/{db}', connect_args={'sslmode':'require'})
# data = pd.read_sql('select * from flat_prices', conn, index_col='id')
# data = data.drop(columns=['price'])
# conn.dispose()
# data

In [11]:
data = pd.read_csv('data/test_data.csv')

In [12]:
data.head()

Unnamed: 0,build_year,building_type_int,latitude,longitude,ceiling_height,flats_count,floors_total,has_elevator,building_id,floor,kitchen_area,living_area,rooms,is_apartment,studio,total_area,target
0,1964,6,55.812397,37.65337,2.64,68,9,1,5726,6,5.0,22.0,1,0,0,32,11300000
1,1974,4,55.895039,37.607204,2.64,287,9,1,10985,9,6.0,46.0,4,0,0,63,10300000
2,2010,2,55.536835,37.154732,3.0,35,3,0,21489,2,19.700001,54.700001,3,0,0,104,9200000
3,1978,4,55.897942,37.56469,2.64,142,12,1,12496,9,8.3,30.0,2,0,0,54,10999000
4,1987,4,55.604198,37.521301,2.64,252,22,1,14705,4,0.0,0.0,1,0,0,39,7250000


In [13]:
data.drop(columns=['target']).head()

Unnamed: 0,build_year,building_type_int,latitude,longitude,ceiling_height,flats_count,floors_total,has_elevator,building_id,floor,kitchen_area,living_area,rooms,is_apartment,studio,total_area
0,1964,6,55.812397,37.65337,2.64,68,9,1,5726,6,5.0,22.0,1,0,0,32
1,1974,4,55.895039,37.607204,2.64,287,9,1,10985,9,6.0,46.0,4,0,0,63
2,2010,2,55.536835,37.154732,3.0,35,3,0,21489,2,19.700001,54.700001,3,0,0,104
3,1978,4,55.897942,37.56469,2.64,142,12,1,12496,9,8.3,30.0,2,0,0,54
4,1987,4,55.604198,37.521301,2.64,252,22,1,14705,4,0.0,0.0,1,0,0,39


In [14]:
preds = loaded_pipeline.predict(data.drop(columns=['target']))
signature = mlflow.models.infer_signature(data, preds)
signature

  inputs = _infer_schema(model_input) if model_input is not None else None


inputs: 
  ['build_year': long, 'building_type_int': long, 'latitude': double, 'longitude': double, 'ceiling_height': double, 'flats_count': long, 'floors_total': long, 'has_elevator': long, 'building_id': long, 'floor': long, 'kitchen_area': double, 'living_area': double, 'rooms': long, 'is_apartment': long, 'studio': long, 'total_area': long, 'target': long]
outputs: 
  [Tensor('float64', (-1,))]
params: 
  None

In [15]:
input_example = data.iloc[1,:].to_numpy().reshape(1,-1)
input_example

array([[1.97400000e+03, 4.00000000e+00, 5.58950386e+01, 3.76072044e+01,
        2.64000010e+00, 2.87000000e+02, 9.00000000e+00, 1.00000000e+00,
        1.09850000e+04, 9.00000000e+00, 6.00000000e+00, 4.60000000e+01,
        4.00000000e+00, 0.00000000e+00, 0.00000000e+00, 6.30000000e+01,
        1.03000000e+07]])

In [16]:
TRACKING_SERVER_HOST = '127.0.0.1'
TRACKING_SERVER_PORT = 5000

mlflow.set_tracking_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")
mlflow.set_registry_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")

In [3]:
!export $(cat .env | xargs)

In [20]:
!echo $S3_BUCKET_NAME

s3-student-mle-20240824-ff21c1bdfa


In [21]:
mlflow.get_artifact_uri() 

's3://s3-student-mle-20240824-ff21c1bdfa/0/1e27e97108214c609b3031c4d7b18201/artifacts'

In [24]:
mlflow.end_run()

In [26]:
EXPERIMENT_NAME = "logging_baseline_model_1"
RUN_NAME = "logging_baseline_model_2"
REGISTRY_MODEL_NAME = "baseline_model"


pip_requirements="requirements.txt"
metadata = {'created_at': 'sprint 1'}

experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
if not experiment:
    experiment_id = mlflow.create_experiment(EXPERIMENT_NAME)
else:
    experiment_id = experiment.experiment_id

with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    run_id = run.info.run_id
    
    print(run_id)
    print(experiment_id)
    print(mlflow.get_artifact_uri())
    
    mlflow.log_metrics(metrics_dict)
    model_info = mlflow.sklearn.log_model( 
        sk_model=loaded_pipeline, 
        artifact_path='project2',
        registered_model_name=REGISTRY_MODEL_NAME, 
        signature=signature,
        input_example=input_example, 
        await_registration_for=20, 
        pip_requirements=pip_requirements,
        metadata=metadata
    )

83cc9a7001b842db90d33a830794f1fb
2
s3://s3-student-mle-20240824-ff21c1bdfa/2/83cc9a7001b842db90d33a830794f1fb/artifacts


Registered model 'baseline_model' already exists. Creating a new version of this model...
2024/11/04 20:03:20 INFO mlflow.tracking._model_registry.client: Waiting up to 20 seconds for model version to finish creation. Model name: baseline_model, version 2
Created version '2' of model 'baseline_model'.
