# Experiment Approach 1: "TBD"

TODO-Intro

TODO-TOC

## Imports, common settings and environment overview

In [1]:
# ________________________________
# Imports
# ________________________________

# External imports
import mlflow
from mlflow.tracking import MlflowClient
import mysql.connector as connection
import pandas as pd
import os
from sklearn.model_selection import train_test_split
# Internal imports
# NA

# ________________________________
# Common settings
# ________________________________

# Set tracking uri (tracking server and registry server are the same / not separated)
# No need to set the registry uri in addition, because it defaults to the tracking URI
mlflow.set_tracking_uri("http://mlflow_tracking_server:5555")
# Set experiment
experiment = mlflow.set_experiment("fraud_detection_approach_1")
# Add experiment description
client = MlflowClient()
client.set_experiment_tag(experiment.experiment_id, "mlflow.note.content", "TBD with application data from 2021")

# ________________________________
# Environment overview
# ________________________________

print("#---> Environment overview <---#")

# Get current installed versions
!conda --version
!python3 --version
!mlflow --version
!great_expectations --version

# Get the current tracking uri
tracking_uri = mlflow.get_tracking_uri()
print(f"Current tracking uri: {tracking_uri}")

# Get the current model registry uri
model_registry_uri = mlflow.get_registry_uri()
print(f"Current model registry uri: {model_registry_uri}")

# Get the current experiment
print(f"Current experiment: {experiment.name} (ID {experiment.experiment_id})")

#---> Environment overview <---#
conda 4.10.3
Python 3.9.7
mlflow, version 1.25.1
great_expectations, version 0.15.3
[0mCurrent tracking uri: http://mlflow_tracking_server:5555
Current model registry uri: http://mlflow_tracking_server:5555
Current experiment: fraud_detection_approach_1 (ID 2)


## Ingest Data

In [2]:
# Load raw data from datasource, split and save to csv.
#
# Steps:
#   1. Query the `applications` table for rows whose application_request_time
#      starts with "2021" and whose is_fraud_verified flag equals 1.
#   2. Split the result into a train and a test set.
#   3. Write the full, train and test sets as ";"-separated CSV files.
#   4. Log the three files as artifacts of a new MLflow run and print its run id.
#
# Any failure is reported by printing the exception message (best effort), so
# check the cell output before relying on the files or the run.
try:
    # Open connection to database
    # NOTE(review): host, user and (empty) password are hard-coded here;
    # consider reading them from environment variables or a secrets store.
    db = connection.connect(host="database",
                            database="db",
                            user="root",
                            passwd="",
                            use_pure=True)
    try:
        # Read data (2021) from application table to pandas dataframe
        query = "SELECT * FROM applications WHERE application_request_time LIKE '2021%' AND is_fraud_verified = 1;"
        raw_data_applications = pd.read_sql(query, db)
    finally:
        # Always release the connection, also when the query fails. Closing
        # here (and not in the outer handler) avoids touching `db` when
        # connect() itself raised and the name was never bound.
        db.close()

    # Output directory and file names
    path_to_data = "output/data"
    output_file_all = "raw_data.csv"
    output_file_train = "raw_data_train.csv"
    output_file_test = "raw_data_test.csv"

    # Create dir if not yet exists
    os.makedirs(path_to_data, exist_ok=True)

    # Split raw data: 30 % test share, fixed random_state for a repeatable split
    raw_data_applications_train, raw_data_applications_test = train_test_split(
        raw_data_applications, test_size=0.30, random_state=42)

    # Save the full, train and test sets with identical CSV settings
    for data_frame, file_name in (
        (raw_data_applications, output_file_all),
        (raw_data_applications_train, output_file_train),
        (raw_data_applications_test, output_file_test),
    ):
        data_frame.to_csv(
            os.path.join(path_to_data, file_name),
            sep=";",
            encoding="utf-8",
            index=False)

    # Start a new MLflow run and log the raw data files as its artifacts
    with mlflow.start_run(run_name="applications in 2021") as run:
        for file_name in (output_file_all, output_file_train, output_file_test):
            mlflow.log_artifact(os.path.join(path_to_data, file_name), artifact_path=path_to_data)

    print(run.info.run_id)

except Exception as e:
    # Best-effort reporting: show the error message instead of a traceback
    print(str(e))

8e503565ef2c437293361cdf965d694e
