In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report

# TODO: work out how to expose a validation flag that the MLflow experiment class
# can read in order to change the name of the experiment run.
class Dataset_Analyzer:
    """Prepare a CSV dataset, train a classifier on it and print its evaluation.

    The dataset is read with ``pandas.read_csv`` when the instance is created.
    ``main`` runs the whole pipeline: string columns are dropped, missing
    values in numeric columns are replaced by the column mean, the features
    are max-abs scaled, the data is split 80/20 and the selected model is
    trained and evaluated on the test part.
    """

    def __init__(self, dataset_url: str):
        """Load the dataset from ``dataset_url`` and initialise pipeline state.

        Args:
            dataset_url: Location passed unchanged to ``pandas.read_csv``.
        """
        self.CSV_dataset_URL = dataset_url
        self.CSV_dataset = self.get_dataset_in_CSV_from_URL(self.CSV_dataset_URL)
        self.dataset_column_names = self.get_dataset_columns_names()
        self.dataset_columns_types = dict(self.CSV_dataset.dtypes)
        self.dataset_columns_with_missing_values = []
        self.missing_values_columns_with_int_float_dtypes = []

        self.target_y_columns = ['Survived']
        self.features_set_X = pd.DataFrame()
        self.target_set_y = pd.DataFrame()

        # Filled by split_dataset_into_train_test_parts(); empty until then.
        self.train_set_X = pd.DataFrame()
        self.train_set_y = np.array([])
        self.test_set_X = pd.DataFrame()
        self.test_set_y = np.array([])

        # Maps a model name to its class; the class is instantiated with
        # default arguments in init_classifier_model_for_experiment().
        self.ML_classifiers_model = {'random_forest': RandomForestClassifier}  # in the future - maybe I add new models
        self.selected_model_name_for_experiment = 'random_forest'
        self.model_for_experiment = None
        self.prediction_of_target_y = []

    def main(self):
        """Run preparation, training, prediction and evaluation on this instance."""
        self.make_data_preparation_for_training()
        print('Training ML model.')
        self.init_classifier_model_for_experiment(self.selected_model_name_for_experiment)
        # Every step runs on ``self``; the pipeline must not depend on a
        # module-level variable that happens to hold an instance.
        self.train_classifier_model()
        print('Make prediction of target y values from test set X')
        self.make_prediction_on_test_dataset()
        print('Model Evaluation:')
        self.evaluate_quality_of_ML_model()

    def make_data_preparation_for_training(self):
        """Clean the dataset, build X / y, normalise X and split into train/test."""
        self.show_dataset()
        self.show_dataset_columns()
        self.remove_all_string_type_columns_from_dataset()
        # The cached names/types must be refreshed after columns were dropped,
        # because the following steps read the cached values.
        self.update_dataset_columns_names()
        self.update_dataset_columns_types()
        self.define_columns_with_missing_values_from_dataset()
        self.define_int_float_dtype_missing_value_columns()
        self.replace_columns_NaN_values_with_mean_values()
        print('After data cleaning and replacing:')
        self.show_dataset()
        print('=' * 40)
        self.define_features_set_X()
        self.define_target_set_y()
        self.features_max_abs_normalization()
        print('Features set X after max abs normalization:')
        self.show_features_set_X()
        print('=' * 40)
        print('Splitting dataset for train and test stages.')
        self.split_dataset_into_train_test_parts()
        print('=' * 40)

    def show_dataset(self):
        """Print the current dataset."""
        print(self.CSV_dataset)

    def get_dataset_columns_names(self):
        """Return the column index of the current dataset."""
        return self.CSV_dataset.columns

    def update_dataset_columns_names(self):
        """Refresh the cached column names from the current dataset."""
        self.dataset_column_names = self.get_dataset_columns_names()

    def update_dataset_columns_types(self):
        """Refresh the cached ``{column name: dtype}`` mapping."""
        self.dataset_columns_types = dict(self.CSV_dataset.dtypes)

    def show_dataset_columns_types(self):
        """Print the cached column dtypes."""
        print(self.dataset_columns_types)

    def show_dataset_columns(self):
        """Print the cached column names."""
        print(self.dataset_column_names)

    def get_dataset_in_CSV_from_URL(self, url: str):
        """Read the CSV at ``url`` into a DataFrame."""
        return pd.read_csv(url)

    def remove_all_string_type_columns_from_dataset(self):
        """Drop every column whose cached dtype is an object or string dtype.

        Returns:
            True, always.
        """
        # Checking only ``dtype == object`` misses the dedicated pandas string
        # dtype, so both kinds are tested explicitly.
        string_columns = [
            column_name
            for column_name, column_dtype in self.dataset_columns_types.items()
            if pd.api.types.is_object_dtype(column_dtype)
            or pd.api.types.is_string_dtype(column_dtype)
        ]
        self.CSV_dataset = self.CSV_dataset.drop(columns=string_columns)
        return True

    def define_columns_with_missing_values_from_dataset(self):
        """Record the cached column names that contain at least one missing value.

        Returns:
            True, always.
        """
        # Start from a clean list so that running the preparation twice does
        # not register the same column more than once.
        self.dataset_columns_with_missing_values = []
        missing_mask = self.CSV_dataset.isna()

        for dataset_column in self.dataset_column_names:
            if missing_mask[dataset_column].any():
                self.dataset_columns_with_missing_values.append(dataset_column)
                print(f'Missing values column - {dataset_column}!')
        return True

    def define_int_float_dtype_missing_value_columns(self):
        """Select the numeric columns among those with missing values.

        Returns:
            True, always.
        """
        self.missing_values_columns_with_int_float_dtypes = [
            column_name
            for column_name in self.dataset_columns_with_missing_values
            if pd.api.types.is_numeric_dtype(self.dataset_columns_types[column_name])
        ]
        return True

    def replace_columns_NaN_values_with_mean_values(self):
        """Fill missing values of the selected numeric columns with the column mean.

        Returns:
            True, always.
        """
        for int_float_column in self.missing_values_columns_with_int_float_dtypes:
            column_mean_value = self.CSV_dataset[int_float_column].mean(skipna=True)
            self.CSV_dataset[int_float_column] = (
                self.CSV_dataset[int_float_column].fillna(column_mean_value)
            )
        return True

    def define_features_set_X(self):
        """Build the feature set: the dataset without any of the target columns.

        Returns:
            True, always.
        """
        # Drop all targets in one call; dropping them one at a time from the
        # full dataset would keep only the last drop.
        self.features_set_X = self.CSV_dataset.drop(columns=self.target_y_columns)
        return True

    def show_features_set_X(self):
        """Print the feature set."""
        print(self.features_set_X)

    def define_target_set_y(self):
        """Build the target set as a DataFrame of the target columns.

        Returns:
            True, always.
        """
        self.target_set_y = self.CSV_dataset[self.target_y_columns]
        return True

    def split_dataset_into_train_test_parts(self):
        """Split X / y into 80 % train and 20 % test with a fixed random state.

        ``train_set_y`` is flattened to a 1-D array; ``test_set_y`` is kept in
        the form returned by ``train_test_split``.

        Returns:
            True, always.
        """
        (
            self.train_set_X,
            self.test_set_X,
            self.train_set_y,
            self.test_set_y,
        ) = train_test_split(
            self.features_set_X, self.target_set_y, test_size=0.2, random_state=42
        )
        self.train_set_y = self.train_set_y.values.reshape(-1)
        return True

    def features_max_abs_normalization(self):
        """Divide every feature column by its largest absolute value.

        Returns:
            True, always.
        """
        max_abs_per_column = self.features_set_X.abs().max()
        # A maximum of 0 means the column holds only zeros; dividing by 1
        # leaves it at 0 instead of producing NaN from 0 / 0.
        safe_divisors = max_abs_per_column.replace(0, 1)
        self.features_set_X = self.features_set_X / safe_divisors
        return True

    def init_classifier_model_for_experiment(self, model_name: str):
        """Instantiate the classifier registered under ``model_name``.

        Raises:
            KeyError: If ``model_name`` is not a key of ``ML_classifiers_model``.
        """
        self.model_for_experiment = self.ML_classifiers_model[model_name]()

    def train_classifier_model(self):
        """Fit the current model on the training part."""
        self.model_for_experiment.fit(self.train_set_X, self.train_set_y)

    def make_prediction_on_test_dataset(self):
        """Predict the target for the test features and store the result."""
        self.prediction_of_target_y = self.model_for_experiment.predict(self.test_set_X)

    # TODO: change the output format so that MLflow can detect it and log it into the Run.
    def evaluate_quality_of_ML_model(self):
        """Print the confusion matrix and classification report for the test part."""
        print('Confusion matrix:')
        print(confusion_matrix(self.test_set_y, self.prediction_of_target_y))
        print('\nClassification Report:')
        print(classification_report(self.test_set_y, self.prediction_of_target_y))


In [None]:
# MLflow experiment environment: initialization and run.
# This notebook is only for testing; it serves as an API for the main classifier program.
import mlflow as mlf
import os

class MLFlow_Experiment_Runner:
    """Run a classifier program inside an MLflow run of a per-model experiment.

    The experiment is named ``"<classifier_model_name>_classifier"``.  The
    tracking server address is built from the ``MLFLOW_IP`` environment
    variable and port 5000.
    """

    def __init__(self, classifier_model_name: str):
        """Store the model name and derive experiment / tracking-server settings.

        Args:
            classifier_model_name: Name used as the prefix of the experiment name.
        """
        self.classifier_model_name = classifier_model_name
        self.experiment_name = self.get_experiment_name()
        # Set by main() to the value returned by set_experiment_as_active().
        self.active_experiment = None
        self.current_run_name = ''
        self.max_experiment_runs = 10
        # NOTE(review): when MLFLOW_IP is unset, os.getenv returns None and the
        # URI below becomes "http://None:5000" - confirm the variable is always set.
        self.tracking_server_IP = os.getenv('MLFLOW_IP')
        self.tracking_server_port = 5000
        self.tracking_server_URI = f"http://{self.tracking_server_IP}:{self.tracking_server_port}"

    def main(self, target_classifier_program):
        """Prepare the experiment, prune old runs and execute the program in a run.

        Args:
            target_classifier_program: Zero-argument callable executed inside
                the MLflow run after scikit-learn autologging is enabled.
        """
        self.set_mlflow_server_URI()
        if self.experiment_exist():
            print(f'[INFO] Experiment - {self.experiment_name} - active!')
        else:
            print(f'[INFO] Experiment - {self.experiment_name} does not exist!')
            self.create_new_experiment()
        # Activate in both branches: creating an experiment does not make it
        # the active one, and start_run() logs into the active experiment.
        self.active_experiment = self.set_experiment_as_active()

        runs_name_endtime_pairs = self._sort_runs_by_end_time(
            self.get_experiment_runs_name_endtime_pairs()
        )
        if self.experiment_exceed_max_runs(runs_name_endtime_pairs):
            print(f"[INFO] Experiment has more than {self.max_experiment_runs} runs. Deleting old runs...")
            self.delete_old_time_experiment_runs(runs_name_endtime_pairs)

        self.current_run_name = self.get_current_run_name()
        with mlf.start_run(run_name=self.current_run_name):
            print("[INFO] Start classifier experiment...")
            mlf.sklearn.autolog()
            target_classifier_program()

    @staticmethod
    def _sort_runs_by_end_time(runs: list) -> list:
        """Return ``runs`` ordered from oldest to newest ``end_time``.

        Entries whose ``end_time`` is None are placed last, so they count as
        the newest; comparing None with a number would raise ``TypeError``.
        """
        return sorted(
            runs,
            key=lambda run: (run['end_time'] is None, run['end_time'] or 0),
        )

    def set_mlflow_server_URI(self):
        """Point the MLflow client at the configured tracking server."""
        print('[INFO] Setting Tracking Server URI...')
        mlf.set_tracking_uri(self.tracking_server_URI)

    def get_experiment_name(self):
        """Return the experiment name derived from the classifier model name."""
        return f"{self.classifier_model_name}_classifier"

    def experiment_exist(self):
        """Return True when an experiment with the configured name is found."""
        if mlf.get_experiment_by_name(self.experiment_name):
            print(f'[INFO] Experiment {self.experiment_name} already exist!')
            return True
        return False

    def create_new_experiment(self):
        """Create the experiment; it still has to be activated separately."""
        print(f'[INFO] Creating experiment - {self.experiment_name}...')
        mlf.create_experiment(self.experiment_name)

    def set_experiment_as_active(self):
        """Make the experiment the active one and return the MLflow result."""
        return mlf.set_experiment(self.experiment_name)

    def get_current_run_name(self):
        """Return ``train_test_<k>`` where k is the number of found runs plus one."""
        runs = self.get_experiment_runs_name_endtime_pairs()
        current_run_number = len(runs) + 1
        return f"train_test_{current_run_number}"

    def delete_old_time_experiment_runs(self, runs: list):
        """Delete the runs selected by ``get_target_runs_for_deleting``.

        Args:
            runs: Run entries sorted from oldest to newest.

        Returns:
            True, always.
        """
        target_runs_for_deleting = self.get_target_runs_for_deleting(runs)
        for run in target_runs_for_deleting:
            run_ID = run['run_ID']
            mlf.delete_run(run_ID)
            print(f"[INFO] Deleted Run with ID: {run_ID}")
        return True

    def get_target_runs_for_deleting(self, sorted_runs_by_time: list):
        """Return the first ``max_experiment_runs`` entries of the sorted list.

        NOTE(review): this selects the oldest ``max_experiment_runs`` runs, not
        only the surplus above the limit - confirm that this is intended.
        """
        return sorted_runs_by_time[:self.max_experiment_runs]

    def experiment_exceed_max_runs(self, runs: list):
        """Return True when ``runs`` holds more than ``max_experiment_runs`` entries."""
        return len(runs) > self.max_experiment_runs

    def get_experiment_runs_name_endtime_pairs(self) -> list[dict]:
        """Return ``{'run_ID', 'end_time'}`` dicts for the runs of the experiment."""
        runs = mlf.search_runs(
            experiment_names=[self.experiment_name],
            output_format='list'
        )
        return [
            {'run_ID': run.info.run_id, 'end_time': run.info.end_time}
            for run in runs
        ]
	
# Driver: load the Titanic CSV, then run the analyzer pipeline inside an
# MLflow run of the 'random_forest' experiment.
dataset_analyzer = Dataset_Analyzer(
    dataset_url="https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
)
mlflow_experiment = MLFlow_Experiment_Runner(classifier_model_name='random_forest')
mlflow_experiment.main(target_classifier_program=dataset_analyzer.main)