# **Libraries**

In [1]:
import os

import pandas as pd
import numpy as np
import sqlalchemy
import pymysql
import mlflow

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer # for dummies
from sklearn.pipeline import Pipeline # creating a pipeline
from sklearn.model_selection import GridSearchCV

from sklearn.ensemble import RandomForestClassifier

# **Reading Data**

Connection details

In [2]:
# MySQL connection settings. Each value can be overridden with an environment
# variable of the same name so credentials do not have to live in the notebook;
# the fallbacks are the values previously hardcoded here.
DB_HOST = os.environ.get("DB_HOST", "10.56.1.20")  # Using MySQL IP address (ipv4_address in docker-compose)
DB_USER = os.environ.get("DB_USER", "root")
DB_PASSWORD = os.environ.get("DB_PASSWORD", "airflow")
DB_NAME = os.environ.get("DB_NAME", "project_2")

Read table from MySQL

In [4]:
# Open a MySQL connection. DictCursor returns every row as a dict keyed by
# column name, so the result can be passed straight to pd.DataFrame.
connection = pymysql.connect(host=DB_HOST,
                             user=DB_USER,
                             password=DB_PASSWORD,
                             database=DB_NAME,
                             cursorclass=pymysql.cursors.DictCursor)
try:
    with connection.cursor() as cursor:
        # Query the database
        cursor.execute("SELECT * FROM project_2.dataset_covertype;")
        result = cursor.fetchall()
    # Convert into a pd.DataFrame
    df = pd.DataFrame(result)
finally:
    # Any database error propagates unchanged (the previous handler raised an
    # undefined HTTPException, masking the real error with a NameError);
    # the connection is closed on both the success and the failure path.
    connection.close()
# Show df
print(f"The dataframe has {len(df)} rows")
df.head()

The dataframe has 5810 rows


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,3169,99,13,492,39,509,241,221,108,5773,Neota,C7202,0
1,3259,137,21,391,99,1746,247,226,97,1396,Neota,C7757,0
2,3277,92,14,42,7,3331,241,217,103,1353,Commanche,C7201,0
3,2631,13,9,421,156,2126,212,222,148,499,Commanche,C2703,2
4,3276,95,22,309,33,2288,249,203,72,4380,Neota,C8772,0


# **Data Understanding**

In [4]:
# Inspect the dtype of every column as loaded from the database
df.dtypes

0     object
1     object
2     object
3     object
4     object
5     object
6     object
7     object
8     object
9     object
10    object
11    object
12    object
dtype: object

In [46]:
# Distinct values present in column "10"
df["10"].unique()

array(['Commanche', 'Neota'], dtype=object)

In [45]:
# Distinct values present in column "11"
df["11"].unique()

array(['C8772', 'C4703', 'C8776', 'C7757', 'C7201', 'C7756', 'C7202',
       'C2703', 'C7790', 'C7701', 'C7102', 'C4704', 'C5101', 'C8771',
       'C4758', 'C2705', 'C2704', 'C7700', 'C6102', 'C7755', 'C6101',
       'C8703', 'C7709', 'C7101', 'C7702', 'C6731'], dtype=object)

In [47]:
# Distinct values present in column "12" (used later as the target y)
df["12"].unique()

array(['6', '1', '0', '5', '2', '4'], dtype=object)

Checking whether there are any null values

In [53]:
# Treat empty strings as missing values so that isna() detects them
df = df.replace(to_replace="", value=np.nan)

In [56]:
# Fraction of missing values per column: null count divided by number of rows
null_counts = df.isna().sum()
null_counts.to_frame() / len(df)

Unnamed: 0,0
0,0.0
1,0.0
2,0.0
3,0.0
4,0.0
5,0.0
6,0.0
7,0.0
8,0.0
9,0.0


Checking for duplicated rows

In [62]:
# Count the rows that are exact duplicates of an earlier row
df.duplicated().sum()

0

# **Data Processing**

Converting the first 10 columns and the target variable to numeric

In [5]:
# Cast the feature columns "0".."9" to numeric. A column that cannot be
# converted is reported and left as it was, so the loop keeps going.
numeric_feature_names = [str(position) for position in range(10)]
for i in numeric_feature_names:
    try:
        df[i] = pd.to_numeric(df[i], errors='raise')
    except Exception as e:
        print(f"Can't convert column {i} to number: {e}")

# The target column "12" is converted outside the try block: a failure here
# is not caught and stops the cell.
df["12"] = pd.to_numeric(df["12"], errors='raise')

In [5]:
# Re-check the column dtypes after the numeric conversion
df.dtypes

0      int64
1      int64
2      int64
3      int64
4      int64
5      int64
6      int64
7      int64
8      int64
9      int64
10    object
11    object
12     int64
dtype: object

## _Split data into train and test_

In [6]:
# Separate the target (column "12") from the predictor columns
target_column = "12"
X = df.drop(target_column, axis=1)
y = df[target_column]

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.20)

## _Dummy_

Dummy variables for categorical features

In [7]:
# categorical_columns = X_train.select_dtypes(exclude=[int, float]).columns
# numerical_columns = X_train.select_dtypes(include=[int, float]).columns

# X_train = pd.get_dummies(
#     X_train, columns=categorical_columns, drop_first=True, dtype=float
# )
# X_test = pd.get_dummies(
#     X_test, columns=categorical_columns, drop_first=True, dtype=float
# )

Align X_train and X_test so that both have the same columns

In [8]:
# X_train, X_test = X_train.align(X_test, fill_value=0, axis=1, join="left")

# What the model receives
# print(X_train.shape)
# print(X_test.shape)

In [7]:
# One-hot encode the two categorical columns; categories not seen during fit
# are ignored at transform time instead of raising an error.
categorical_features = ["10", "11"]
one_hot_encoder = OneHotEncoder(handle_unknown='ignore')

column_trans = make_column_transformer(
    (one_hot_encoder, categorical_features),
    remainder='passthrough',  # all other (numeric) columns pass through unchanged
)

column_trans

## _Standardization_

In [9]:
# columns = X_train.columns
# scaler = StandardScaler()

# scaler.fit(X_train)  # Fit the scaler on the training data
# X_train.values[:] = scaler.transform(X_train)
# X_test.values[:] = scaler.transform(X_test)

In [8]:
# Full modelling pipeline: encode categoricals -> scale -> random forest.
# The step names are referenced by the hyperparameter grid
# ("RandomForestClassifier__..."), so they must not be renamed.
pipe = Pipeline(steps=[
    ("column_trans", column_trans),
    # with_mean=False scales without centering; presumably chosen because the
    # one-hot encoded output may be sparse -- TODO confirm.
    ("scaler", StandardScaler(with_mean=False)),
    # Seed the forest so repeated runs of the notebook yield the same model
    # (same seed as the train/test split).
    ("RandomForestClassifier", RandomForestClassifier(random_state=42)),
])

pipe

# **Modeling with MLFlow**

Hyperparameters

In [9]:
# Hyperparameter grid; each key is "<pipeline step name>__<parameter>"
param_grid = {
    "RandomForestClassifier__max_depth": [1, 2, 3, 10],
    'RandomForestClassifier__n_estimators': [10, 11],
}

# Exhaustive search over the grid, running with two parallel jobs
search = GridSearchCV(estimator=pipe, param_grid=param_grid, n_jobs=2)
search

In [10]:
# Artifact-store endpoint and credentials read by MLflow's S3 client.
# NOTE(review): the credentials are hardcoded in the notebook; consider
# supplying them from outside the notebook instead.
os.environ.update({
    'MLFLOW_S3_ENDPOINT_URL': "http://minio:8083",  # "http://0.0.0.0:8083"
    'AWS_ACCESS_KEY_ID': 'admin',
    'AWS_SECRET_ACCESS_KEY': 'supersecret',
})

# Connect to the MLflow tracking server and select the experiment
mlflow.set_tracking_uri("http://mlflow:8087")  # "http://0.0.0.0:8087")
mlflow.set_experiment("mlflow_prooject_2")

# Autologging is configured to record signatures and input examples and to
# register the logged model under the name "modelo1".
mlflow.sklearn.autolog(
    registered_model_name="modelo1",
    log_input_examples=True,
    log_model_signatures=True,
)

# Fit the grid search inside a named run; `run` is kept for later inspection
with mlflow.start_run(run_name="autolog_pipe_model_reg") as run:
    search.fit(X_train, y_train)

2024/03/28 04:15:51 INFO mlflow.tracking.fluent: Experiment with name 'mlflow_prooject_2' does not exist. Creating a new experiment.
  _warn_prf(average, modifier, msg_start, len(result))


In [11]:
# Report where the training run was tracked and where its artifacts live.
# The artifact URI is read from the `run` captured by the training cell:
# calling mlflow.get_artifact_uri() with no active run would implicitly start
# a brand-new run (and leave it open) just to answer the question.
print('tracking uri:', mlflow.get_tracking_uri())
print('artifact uri:', run.info.artifact_uri)

tracking uri: http://mlflow:8087
artifact uri: s3://project2bucket/artifacts/1/f9e351d7f57643e5b1e50836c4d73ded/artifacts


In [18]:
# Artifact-store endpoint and credentials, repeated so this cell also works
# on its own in a fresh kernel.
os.environ['MLFLOW_S3_ENDPOINT_URL'] = "http://minio:8083" # "http://0.0.0.0:8083"
os.environ['AWS_ACCESS_KEY_ID'] = 'admin'
os.environ['AWS_SECRET_ACCESS_KEY'] = 'supersecret'

# connect to mlflow
mlflow.set_tracking_uri("http://mlflow:8087") # "http://0.0.0.0:8087")

model_name = "modelo1"

# logged_model = 'runs:/71428bebed2b4feb9635714ea3cdb562/model'
# NOTE(review): this URI resolves only if a version of the registered model is
# in the "production" stage. The recorded output of this cell is a
# RESOURCE_DOES_NOT_EXIST error for name=modelo1 -- confirm the model was
# registered and promoted on the tracking server before running.
model_production_uri = f"models:/{model_name}/production"

# Load model as a PyFuncModel.
loaded_model = mlflow.pyfunc.load_model(model_uri=model_production_uri)

# Take the first test row as a one-row DataFrame. iloc[[0]] keeps each column's
# dtype, whereas iloc[0].to_frame().T would turn every column into object.
example_test = X_test.iloc[[0]]
#print(example_test)
print('real: ', y_test.iloc[0])
print('prediction: ', loaded_model.predict(example_test))



RestException: RESOURCE_DOES_NOT_EXIST: Registered Model with name=modelo1 not found