In [1]:
import numpy as np
from numpy import array
import os
import requests
import pandas as pd
import sklearn
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import accuracy_score, precision_score, recall_score
import pprint
import mysql.connector

In [2]:
# Conexión a la base de datos
db = mysql.connector.connect(
  host="db",
  user="root",
  password="123456",
  database="db",
)
cursor = db.cursor()
# Consulta a la base de datos
cursor.execute("SELECT * FROM train_table")
data = cursor.fetchall()
df = pd.DataFrame(data, columns=[col[0] for col in cursor.description])

In [3]:
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams["figure.figsize"] = (20,10)
plt.rcParams.update({'font.size': 15})

In [4]:
import mlflow
#mlflow.set_tracking_uri("sqlite:///mlflow.db")
mlflow.set_tracking_uri("http://10.43.102.113:5000/")
mlflow.set_experiment("Cover_type")

<Experiment: artifact_location='mlflow-artifacts:/531043269374479168', creation_time=1683813250476, experiment_id='531043269374479168', last_update_time=1683813250476, lifecycle_stage='active', name='Cover_type', tags={}>

### Procesamiento data

In [5]:
#Selección de las variables numericas
num = df.select_dtypes(include = 'number')

In [6]:
# Extracción de todas las filas y todas las columnas de la variable num, excepto la última columna.
# features_num es un DataFrame que contiene solo las características numéricas del conjunto de datos original,
# sin la variable objetivo.

features_num = num.iloc[:, 0:-1] 

#Extracción de la columna "Cover_Type" del DataFrame num para asignarla a la variable label_num.
#Se crea una serie que contiene los valores de la variable objetivo para el conjunto de datos original.

label_num = num['Cover_Type'] 

In [7]:
# Se crea un objeto SelectKBest que se utiliza para realizar la selección de características univariante.
# El parámetro score_func especifica la función de puntuación que se utilizará para evaluar la importancia
# de cada característica individual.

#En este caso, se están seleccionando las 8 características con la puntuación más alta según la función 
#f_classif.

select = SelectKBest(score_func=f_classif,k = 8)
# El método fit_transform se utiliza para ajustar el objeto SelectKBest a las características de 
# entrada features_num y la variable objetivo label_num, 
# y para transformar las características originales en un nuevo conjunto de datos que contiene solo 
# las características seleccionadas.
z = select.fit_transform(features_num,label_num)

In [8]:
# Se obtiene una lista de booleanos que indican si cada característica en features_num ha sido
# seleccionada o no por el objeto SelectKBest.

filter = list(select.get_support())
filter

[True,
 False,
 False,
 False,
 False,
 True,
 False,
 False,
 False,
 False,
 False,
 True,
 True,
 True,
 False,
 False,
 False,
 False,
 False,
 True,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 True,
 True,
 False]

In [9]:
# feature_names contiene la lista de nombres de características, que se utiliza para etiquetar 
# las características en los resultados de la selección de características.

feature_names = list(features_num.columns)
feature_names

['Elevation',
 'Aspect',
 'Slope',
 'Horizontal_Distance_To_Hydrology',
 'Vertical_Distance_To_Hydrology',
 'Horizontal_Distance_To_Roadways',
 'Hillshade_9am',
 'Hillshade_Noon',
 'Hillshade_3pm',
 'Horizontal_Distance_To_Fire_Points',
 'Soil_Type_C2702',
 'Soil_Type_C2703',
 'Soil_Type_C2704',
 'Soil_Type_C2705',
 'Soil_Type_C2706',
 'Soil_Type_C2717',
 'Soil_Type_C3501',
 'Soil_Type_C3502',
 'Soil_Type_C4201',
 'Soil_Type_C4703',
 'Soil_Type_C4704',
 'Soil_Type_C4744',
 'Soil_Type_C4758',
 'Soil_Type_C5101',
 'Soil_Type_C5151',
 'Soil_Type_C6101',
 'Soil_Type_C6102',
 'Soil_Type_C6731',
 'Soil_Type_C7101',
 'Soil_Type_C7102',
 'Soil_Type_C7103',
 'Soil_Type_C7201',
 'Soil_Type_C7202',
 'Soil_Type_C7700',
 'Soil_Type_C7701',
 'Soil_Type_C7702',
 'Soil_Type_C7709',
 'Soil_Type_C7710',
 'Soil_Type_C7745',
 'Soil_Type_C7746',
 'Soil_Type_C7755',
 'Soil_Type_C7756',
 'Soil_Type_C7757',
 'Soil_Type_C7790',
 'Soil_Type_C8703',
 'Soil_Type_C8707',
 'Soil_Type_C8708',
 'Soil_Type_C8771',
 'S

In [10]:
# Crea un objeto DataFrame de pandas que contiene los nombres de las características seleccionadas 
# después de la selección de características univariada.
numss = pd.DataFrame(feature_names,filter)
numss

Unnamed: 0,0
True,Elevation
False,Aspect
False,Slope
False,Horizontal_Distance_To_Hydrology
False,Vertical_Distance_To_Hydrology
True,Horizontal_Distance_To_Roadways
False,Hillshade_9am
False,Hillshade_Noon
False,Hillshade_3pm
False,Horizontal_Distance_To_Fire_Points


In [11]:
# Una vez se han identificado las 2 columnas numericas con menor relevancia, se proceda a 
# eliminarlas del dataset original

df_2 = df.drop(['Hillshade_3pm','Aspect'], axis = 1)

In [12]:
df_2.head()

Unnamed: 0,Elevation,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Horizontal_Distance_To_Fire_Points,Wilderness_Area,Soil_Type_C2702,...,Soil_Type_C7756,Soil_Type_C7757,Soil_Type_C7790,Soil_Type_C8703,Soil_Type_C8707,Soil_Type_C8708,Soil_Type_C8771,Soil_Type_C8772,Soil_Type_C8776,Cover_Type
0,2991.0,7.0,67.0,11.0,1015.0,233.0,234.0,1570.0,Commanche,0,...,0,0,0,0,0,0,0,0,0,1
1,2876.0,18.0,485.0,71.0,2495.0,192.0,202.0,1557.0,Commanche,0,...,0,1,0,0,0,0,0,0,0,1
2,3171.0,2.0,277.0,9.0,4374.0,213.0,237.0,1052.0,Rawah,0,...,0,0,0,0,0,0,0,0,0,0
3,3087.0,13.0,190.0,31.0,4774.0,193.0,221.0,752.0,Rawah,0,...,0,0,0,0,0,0,0,0,0,0
4,2835.0,10.0,212.0,41.0,3596.0,231.0,242.0,3280.0,Rawah,0,...,0,0,0,0,0,0,0,0,0,1


In [13]:
df_2 = pd.get_dummies(df_2)

In [14]:
df_2

Unnamed: 0,Elevation,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Horizontal_Distance_To_Fire_Points,Soil_Type_C2702,Soil_Type_C2703,...,Soil_Type_C8707,Soil_Type_C8708,Soil_Type_C8771,Soil_Type_C8772,Soil_Type_C8776,Cover_Type,Wilderness_Area_Cache,Wilderness_Area_Commanche,Wilderness_Area_Neota,Wilderness_Area_Rawah
0,2991.0,7.0,67.0,11.0,1015.0,233.0,234.0,1570.0,0,0,...,0,0,0,0,0,1,False,True,False,False
1,2876.0,18.0,485.0,71.0,2495.0,192.0,202.0,1557.0,0,0,...,0,0,0,0,0,1,False,True,False,False
2,3171.0,2.0,277.0,9.0,4374.0,213.0,237.0,1052.0,0,0,...,0,0,0,0,0,0,False,False,False,True
3,3087.0,13.0,190.0,31.0,4774.0,193.0,221.0,752.0,0,0,...,0,0,0,0,0,0,False,False,False,True
4,2835.0,10.0,212.0,41.0,3596.0,231.0,242.0,3280.0,0,0,...,0,0,0,0,0,1,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
116198,3150.0,16.0,285.0,47.0,2275.0,200.0,253.0,866.0,0,0,...,0,0,0,0,0,1,False,True,False,False
116199,3125.0,13.0,234.0,2.0,2430.0,224.0,212.0,1426.0,0,0,...,0,0,0,0,0,0,False,False,False,True
116200,3166.0,11.0,67.0,0.0,1275.0,234.0,240.0,2404.0,0,0,...,0,0,0,0,0,0,False,False,False,True
116201,3154.0,14.0,738.0,46.0,6012.0,181.0,239.0,1320.0,0,0,...,0,0,0,0,0,1,False,False,False,True


In [15]:
NUMERIC_FEATURES = ['Elevation','Slope','Horizontal_Distance_To_Hydrology','Vertical_Distance_To_Hydrology','Horizontal_Distance_To_Roadways','Hillshade_9am','Hillshade_Noon','Horizontal_Distance_To_Fire_Points']

In [16]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

X = df_2.drop('Cover_Type', axis=1)
y = df_2['Cover_Type']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

sc = StandardScaler()
X_train.loc[:, NUMERIC_FEATURES] = sc.fit_transform(X_train[NUMERIC_FEATURES])
X_test.loc[:, NUMERIC_FEATURES] = sc.transform(X_test[NUMERIC_FEATURES])

### Modelo 1: Randomforest

In [17]:
from sklearn.ensemble import RandomForestClassifier

with mlflow.start_run(run_name='rfc'):
    params = {
        "n_estimators": 100,
        "max_depth": 20
    }

    mlflow.set_tag("model_name", "RF")
    mlflow.log_params(params)

    rf = RandomForestClassifier(n_estimators=100, max_depth=20)
    rf.fit(X_train, y_train)

    rf_preds = rf.predict(X_test)
    rf_acc = accuracy_score(y_test, rf_preds)
    rf_prec = precision_score(y_test, rf_preds, average='weighted')
    rf_rec = recall_score(y_test, rf_preds, average='weighted')

    mlflow.log_metric("test_acc", rf_acc)
    mlflow.log_metric("test_prec", rf_prec)
    mlflow.log_metric("test_rec", rf_rec)
    mlflow.sklearn.log_model(rf, "sk_models")


In [18]:
print(rf_acc,rf_prec,rf_rec)

0.8630274518803247 0.8658340894649335 0.8630274518803247


### Modelo 2: Regresión logística


In [19]:
from sklearn.linear_model import LogisticRegression

with mlflow.start_run(run_name='logistic_regression'):
    params = {
        "C": 1.0,
        "penalty": "l2",
        "solver": "lbfgs"
    }

    mlflow.set_tag("model_name", "Logistic Regression")
    mlflow.log_params(params)

    log_reg = LogisticRegression(C=1.0, penalty="l2", solver="lbfgs")
    log_reg.fit(X_train, y_train)

    log_reg_preds = log_reg.predict(X_test)
    log_reg_acc = accuracy_score(y_test, log_reg_preds)
    log_reg_prec = precision_score(y_test, log_reg_preds, average='weighted')
    log_reg_rec = recall_score(y_test, log_reg_preds, average='weighted')

    mlflow.log_metric("test_acc", log_reg_acc)
    mlflow.log_metric("test_prec", log_reg_prec)
    mlflow.log_metric("test_rec", log_reg_rec)
    mlflow.sklearn.log_model(log_reg, "sk_models")


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [20]:
print(log_reg_acc,log_reg_prec,log_reg_rec)

0.7265712400676975 0.7141817360338469 0.7265712400676975


### Modelo 3: Catboost

In [21]:
import catboost as cb
catb_train_dataset = cb.Pool(X_train, y_train) 
catb_test_dataset = cb.Pool(X_test, y_test) 

In [22]:
with mlflow.start_run(run_name="catboost"):
    mlflow.set_tag("model_name", "CatBoost")
    catb = cb.CatBoostClassifier()
    catb.fit(catb_train_dataset, eval_set=catb_test_dataset, early_stopping_rounds=50)
    catb_preds = catb.predict(catb_test_dataset)
    catb_acc = accuracy_score(y_test, catb_preds)
    catb_prec = precision_score(y_test, catb_preds, average='weighted')
    catb_rec = recall_score(y_test, catb_preds, average='weighted')


    mlflow.log_metric("test_acc", catb_acc)
    mlflow.log_metric("test_prec", catb_prec)
    mlflow.log_metric("test_rec", catb_rec)
    mlflow.catboost.log_model(catb, "cb_models")

Learning rate set to 0.117906
0:	learn: 1.6472704	test: 1.6454564	best: 1.6454564 (0)	total: 82.3ms	remaining: 1m 22s
1:	learn: 1.4594574	test: 1.4566369	best: 1.4566369 (1)	total: 110ms	remaining: 54.9s
2:	learn: 1.3318209	test: 1.3288458	best: 1.3288458 (2)	total: 136ms	remaining: 45.2s
3:	learn: 1.2314741	test: 1.2281950	best: 1.2281950 (3)	total: 158ms	remaining: 39.4s
4:	learn: 1.1508376	test: 1.1474916	best: 1.1474916 (4)	total: 180ms	remaining: 35.8s
5:	learn: 1.0855357	test: 1.0825495	best: 1.0825495 (5)	total: 217ms	remaining: 36s
6:	learn: 1.0305837	test: 1.0280020	best: 1.0280020 (6)	total: 244ms	remaining: 34.6s
7:	learn: 0.9841438	test: 0.9814835	best: 0.9814835 (7)	total: 266ms	remaining: 33s
8:	learn: 0.9450722	test: 0.9421718	best: 0.9421718 (8)	total: 295ms	remaining: 32.5s
9:	learn: 0.9121536	test: 0.9092906	best: 0.9092906 (9)	total: 319ms	remaining: 31.6s
10:	learn: 0.8816552	test: 0.8788145	best: 0.8788145 (10)	total: 346ms	remaining: 31.1s
11:	learn: 0.8540452	tes

In [23]:
print(catb_acc,catb_rec,catb_prec)

0.8734689194228508 0.8734689194228508 0.8735711305964579
