# **Libraries**

In [1]:
import os

import pandas as pd
import numpy as np
import sqlalchemy
import pymysql
import mlflow

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer # for dummies
from sklearn.pipeline import Pipeline # creating a pipeline
from sklearn.model_selection import GridSearchCV

from sklearn.ensemble import RandomForestClassifier

# **Reading Data**

Connection details

In [2]:
# MySQL connection settings.
# Each value is read from the environment so that credentials do not have to be
# committed with the notebook; the fallbacks are the values this notebook
# originally hardcoded (the host is the MySQL ipv4_address from docker-compose).
DB_HOST = os.environ.get("DB_HOST", "10.56.1.20")
DB_USER = os.environ.get("DB_USER", "root")
DB_PASSWORD = os.environ.get("DB_PASSWORD", "airflow")
DB_NAME = os.environ.get("DB_NAME", "project_2")

Read table from MySQL

In [3]:
# Load the whole dataset_covertype table from MySQL into a DataFrame.
# DictCursor returns every row as a dict, so pandas builds the columns from
# the dict keys.
connection = pymysql.connect(host=DB_HOST,
                             user=DB_USER,
                             password=DB_PASSWORD,
                             database=DB_NAME,
                             cursorclass=pymysql.cursors.DictCursor)
try:
    with connection.cursor() as cursor:
        # Query the database
        cursor.execute("SELECT * FROM project_2.dataset_covertype;")
        result = cursor.fetchall()
    # Convert into a pd.DataFrame
    df = pd.DataFrame(result)
except Exception as e:
    # Surface the underlying error with context. The previous handler raised
    # HTTPException, which is never imported in this notebook, so any failure
    # turned into a NameError that hid the real cause.
    raise RuntimeError(
        f"Could not read project_2.dataset_covertype from MySQL: {e}"
    ) from e
finally:
    # Always release the connection, whether the query succeeded or not.
    connection.close()
# Show df
print(f"The dataframe has {len(df)} rows")
df.head()

The dataframe has 5810 rows


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,2649,122,35,524,241,497,253,189,31,1445,Commanche,C2705,2
1,2338,123,17,108,15,108,246,225,101,618,Commanche,C2705,2
2,2581,41,17,175,69,1256,220,201,111,633,Commanche,C4704,1
3,3306,218,20,492,159,1294,194,254,191,4302,Neota,C7756,1
4,2573,48,29,297,145,888,217,164,63,924,Commanche,C4703,1


# **Data Understanding**

In [4]:
# Column dtypes straight after loading, before any numeric conversion.
df.dtypes

0     object
1     object
2     object
3     object
4     object
5     object
6     object
7     object
8     object
9     object
10    object
11    object
12    object
dtype: object

In [15]:
# Distinct values of column "10" (one of the two columns that get one-hot encoded).
df["10"].unique()

array(['Commanche', 'Neota'], dtype=object)

In [14]:
# Distinct values of column "11" (one of the two columns that get one-hot encoded).
df["11"].unique()

array(['C7202', 'C7757', 'C4703', 'C4704', 'C7700', 'C7790', 'C7756',
       'C7201', 'C2704', 'C2703', 'C2705', 'C8772', 'C8771', 'C8776',
       'C7755', 'C4758', 'C7101', 'C7702', 'C6102', 'C5101', 'C7102',
       'C7701', 'C7709', 'C8703', 'C6731', 'C6101'], dtype=object)

In [47]:
# Distinct values of column "12", which is used as the target variable.
df["12"].unique()

array(['6', '1', '0', '5', '2', '4'], dtype=object)

Checking whether there are any null values

In [12]:
# Treat empty strings as missing values so that isna() can detect them
df = df.replace(to_replace="", value=np.nan)

In [13]:
# Fraction of missing values per column: null count divided by the row count
df.isna().sum().to_frame().div(len(df))

Unnamed: 0,0
0,0.0
1,0.0
2,0.0
3,0.0
4,0.0
5,0.0
6,0.0
7,0.0
8,0.0
9,0.0


Reviewing duplicated values

In [17]:
# Number of rows that are exact duplicates of an earlier row.
df.duplicated().sum()

0

# **Data Processing**

Converting the first 10 columns and the target variable to numeric

In [5]:
# Columns "0".."9" are cast to numeric one at a time; a column that cannot be
# converted is reported and left as it was.
for column_name in map(str, range(10)):
    try:
        df[column_name] = pd.to_numeric(df[column_name], errors='raise')
    except Exception as conversion_error:
        print(f"Can't convert column {column_name} to number: {conversion_error}")

# The target column must be numeric, so a failure here is allowed to propagate.
df["12"] = pd.to_numeric(df["12"], errors='raise')

In [6]:
# Re-check the column dtypes after the numeric conversion.
df.dtypes

0      int64
1      int64
2      int64
3      int64
4      int64
5      int64
6      int64
7      int64
8      int64
9      int64
10    object
11    object
12     int64
dtype: object

## _Split data into train and test_

In [6]:
# Features are every column except "12"; column "12" is the target
X = df.drop("12", axis=1)
y = df["12"]

# Hold out 20% of the rows for testing (the fixed seed keeps the split repeatable)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.20)

## _Dummy_

Dummy variables for categorical features

In [7]:
# categorical_columns = X_train.select_dtypes(exclude=[int, float]).columns
# numerical_columns = X_train.select_dtypes(include=[int, float]).columns

# X_train = pd.get_dummies(
#     X_train, columns=categorical_columns, drop_first=True, dtype=float
# )
# X_test = pd.get_dummies(
#     X_test, columns=categorical_columns, drop_first=True, dtype=float
# )

Align X_train and X_test so that they have the same columns

In [8]:
# X_train, X_test = X_train.align(X_test, fill_value=0, axis=1, join="left")

# What the model receives
# print(X_train.shape)
# print(X_test.shape)

In [7]:
# One-hot encode columns "10" and "11" (unknown categories are ignored instead
# of raising); every other column is passed through the transformer unchanged.
categorical_encoder = OneHotEncoder(handle_unknown='ignore')
column_trans = make_column_transformer(
    (categorical_encoder, ["10", "11"]),
    remainder='passthrough',
)

column_trans

## _Standardization_

In [9]:
# columns = X_train.columns
# scaler = StandardScaler()

# scaler.fit(X_train)  # Se realiza el fit con la data de entrenamiento
# X_train.values[:] = scaler.transform(X_train)
# X_test.values[:] = scaler.transform(X_test)

In [8]:
# Modelling pipeline: column transformer -> scaler -> random forest.
# NOTE(review): with_mean=False is presumably required because the one-hot
# step can produce a sparse matrix, which cannot be centered; confirm.
pipe = Pipeline(
    steps=[
        ("column_trans", column_trans),
        ("scaler", StandardScaler(with_mean=False)),
        # A fixed seed makes the forest reproducible between runs; 42 is the
        # same seed value this notebook uses for train_test_split.
        ("RandomForestClassifier", RandomForestClassifier(random_state=42)),
    ]
)

pipe

# **Modeling with MLFlow**

Hyperparameters

In [9]:
# Hyperparameter grid; keys use the "<pipeline step name>__<parameter>" form
param_grid = {
    "RandomForestClassifier__max_depth": [1, 2, 3, 10],
    'RandomForestClassifier__n_estimators': [10, 11],
}

# Exhaustive search over the grid with 10-fold cross-validation on 2 workers
search = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=10, n_jobs=2)
search

In [10]:
# The S3 endpoint must be the object store's API port (9000), not its web app
# (which in this setup was served at "http://0.0.0.0:8083").
# The network IP is used here; the container name (s3 in this setup) also works.
#
# setdefault keeps any value that is already exported in the environment, so
# real credentials can be injected without editing the notebook. The fallbacks
# are the values this notebook originally hardcoded.
os.environ.setdefault('MLFLOW_S3_ENDPOINT_URL', "http://10.56.1.22:9000")
os.environ.setdefault('AWS_ACCESS_KEY_ID', 'admin')
os.environ.setdefault('AWS_SECRET_ACCESS_KEY', 'supersecret')

# connect to mlflow
mlflow.set_tracking_uri("http://mlflow:8087") # "http://0.0.0.0:8087")
mlflow.set_experiment("mlflow_project_2")

# Autologging records the fitted estimator and registers it as "modelo1".
mlflow.sklearn.autolog(log_model_signatures=True, log_input_examples=True, registered_model_name="modelo1")

# `run` is kept so that later cells can refer to this training run.
with mlflow.start_run(run_name="autolog_pipe_model_reg") as run:
    search.fit(X_train, y_train)
print("fin")

  _warn_prf(average, modifier, msg_start, len(result))
Successfully registered model 'modelo1'.
2024/03/31 00:42:58 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: modelo1, version 1
Created version '1' of model 'modelo1'.
2024/03/31 00:43:00 INFO mlflow.sklearn.utils: Logging the 5 best runs, 3 runs will be omitted.


fin


In [18]:
# Show where this session logs to.
# The artifact URI is taken from the training run captured as `run`:
# calling mlflow.get_artifact_uri() while no run is active would implicitly
# start a new, empty run and report that run's location instead.
print('tracking uri:', mlflow.get_tracking_uri())
print('artifact uri:', run.info.artifact_uri)

tracking uri: http://mlflow:8087
artifact uri: s3://project2bucket/1/c86c4bc824d54852a28907acd1b89bfc/artifacts


In [31]:
# The S3 endpoint must be the object store's API port (9000), not its web app
# (which in this setup was served at "http://0.0.0.0:8083").
# The network IP is used here; the container name (s3 in this setup) also works.
#
# setdefault keeps any value that is already exported in the environment; the
# fallbacks are the values this notebook originally hardcoded.
os.environ.setdefault('MLFLOW_S3_ENDPOINT_URL', "http://10.56.1.22:9000")
os.environ.setdefault('AWS_ACCESS_KEY_ID', 'admin')
os.environ.setdefault('AWS_SECRET_ACCESS_KEY', 'supersecret')

# connect to mlflow
mlflow.set_tracking_uri("http://mlflow:8087") # "http://0.0.0.0:8087")

model_name = "modelo1"

# logged_model = 'runs:/71428bebed2b4feb9635714ea3cdb562/model'
model_production_uri = f"models:/{model_name}/production"

print(model_production_uri)

# Load model as a PyFuncModel.
loaded_model = mlflow.pyfunc.load_model(model_uri=model_production_uri)

# Score the first two test rows once and reuse the result; `a` is read by the
# cells that follow.
example_test = X_test.iloc[0:2]
a = loaded_model.predict(example_test)

print('real: ', y_test.iloc[0:2])
print('prediction: ', a)



models:/modelo1/production
real:  2042    6
157     1
Name: 12, dtype: int64
prediction:  [6 1]


In [20]:
# The two test rows that were passed to the loaded model as example input.
X_test.iloc[0:2]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
2042,3448,311,25,127,1,1518,146,214,204,1869,Neota,C8772
157,2693,68,14,256,76,2391,234,210,105,883,Commanche,C2703


In [35]:
# Number of predictions returned for the example input.
len(a)

2

In [39]:
# Unwrap every prediction into a plain Python scalar via .item()
final_response = [prediction.item() for prediction in a]
final_response

[6, 1]