In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Read the raw orders CSV into a DataFrame and preview its first ten rows.
orders_csv = 'orders_challengue_sep2023_PE_CO.csv'
df = pd.read_csv(orders_csv, engine='python', sep=",")
df.head(10)

## 1. Data exploration

In [None]:
# Print a structural summary of the frame: columns, non-null counts and dtypes.
df.info()

In [None]:
# Number of missing (NaN) entries in each column
nan_count = df.isnull().sum()
print(nan_count)


### 1.1. Numeric variables

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

In [None]:
# Histogram grid (2 rows x 3 columns) for the numeric features plus the TAKEN target.
columns = [
    "TO_USER_DISTANCE",
    "TOTAL_EARNINGS",
    "DISTANCE_TO_STORE",
    "TIP",
    "SATURATION",
    "TAKEN",
]
df.hist(column=columns, layout=(2, 3), bins=120, figsize=(15, 5))

In [None]:
# One figure with side-by-side panels: a box plot per numeric feature.
boxplot_columns = ["TO_USER_DISTANCE", "TOTAL_EARNINGS", "DISTANCE_TO_STORE", "TIP"]

# The number of panels follows the column list, so adding a feature needs one edit only.
fig, axes = plt.subplots(ncols=len(boxplot_columns), figsize=(15, 5))

# Each column is drawn on its own axis so every panel keeps an independent y-scale.
# Ending the cell with a loop also avoids echoing a stray Axes repr as cell output.
for ax, column in zip(axes, boxplot_columns):
    df[[column]].boxplot(ax=ax)

### 1.2. Categorical variables

In [None]:
# Descriptive statistics restricted to object-dtype columns (count, unique, top, freq).
df.describe(include=[object])  

In [None]:
# List the distinct values present in each geographic column.
for label, column in (("Countries:", "COUNTRY"), ("Cities:", "CITY")):
    print(label, df[column].unique())

In [None]:
# Row counts per (COUNTRY, CITY) pair, drawn as a bar chart.
# The column list is defined once and reused instead of repeating the literal,
# and the x-axis label names both columns because each bar is a (country, city) pair.
columns = ["COUNTRY", "CITY"]
df[columns].value_counts().plot(kind='bar', xlabel='COUNTRY, CITY', ylabel='Count', rot=90, figsize=(15,5))


In [None]:
# Bar chart of how many rows belong to each country.
country_counts = df["COUNTRY"].value_counts()
country_counts.plot(kind='bar', xlabel='COUNTRY', ylabel='Count', rot=0)

### 1.3. Target

In [None]:
# Class balance of the TAKEN target, shown as a bar chart.
taken_counts = df["TAKEN"].value_counts()
taken_counts.plot(kind='bar', xlabel='TAKEN', ylabel='Count', rot=0)

## 2. Machine learning model: classification of the TAKEN variable

In [1]:
from ClassifierModel import DTCTrainer, DTCModel

# Usage example: train, evaluate and persist the classifier.
if __name__ == "__main__":

    # Instantiate the training helper with a 30% hold-out split
    trainer = DTCTrainer(test_size=0.3)

    # Get data (already split into train and test by the trainer)
    dataset_path = "orders_challengue_sep2023_PE_CO.csv"
    X_train, X_test, y_train, y_test = trainer.get_data(dataset_path)

    # Transform data: fit_transform on the training split, transform-only on the test split
    X_train = trainer.fit_transform(X_train)
    X_test = trainer.transform(X_test)

    # Train model
    trainer.train(X_train, y_train)

    # Evaluate model on the held-out split
    accuracy_score, precision_score, recall_score, f1_score = trainer.evaluate(X_test, y_test)
    print("Accuracy Score:", accuracy_score)  # label typo ("Acurracy") fixed
    print("Precision Score:", precision_score)
    print("Recall Score:", recall_score)
    print("f1 Score:", f1_score)

    # Persist the fitted model and the preprocessor so they can be reloaded for inference
    trainer.save_model_scaler_enc(filepath_model="model.pkl", filepath_preprocessor="preprocessor.pkl")

Leyendo datos y creando nuevas features...
Dividiendo en conjunto de train y test (test_size=0.3)...
Datos cargados!
Ajustando preprocessor...
Peprocessor ajustando!
Transformando datos...
Datos transformados!
Transformando datos...
Datos transformados!
Ajustando el modelo de clasificación...
Modelo ajustado!
Acurracy Score: 0.6590150275212462
Precision Score: 0.7633080216281484
Recall Score: 0.7672350610678025
f1 Score: 0.7652665033850017
****************************************************************************************************
Modelo guardado en: model.pkl
Preprocessor guardado en: preprocessor.pkl
****************************************************************************************************


In [None]:
# Usage example: score a new dataset with the previously saved artifacts.
if __name__ == "__main__":

    # Locations of the saved preprocessor, the saved model and the data to score
    filepath_preprocessor = "preprocessor.pkl"
    filepath_model = "model.pkl"
    data_path = "test_set.csv"

    # Instantiate the DTCModel wrapper from the two artifact paths
    model = DTCModel(filepath_model, filepath_preprocessor)

    # Read the data and pass it straight through the wrapper's transform step
    X = model.transform(model.get_data(data_path))

    # Predictions (kept in y_pred so the next cell can display them)
    y_pred = model.predict(X)
    print("Predictions:\n", y_pred)

In [None]:
# Display the predictions produced by the inference cell above.
y_pred

In [1]:
# Import mlflow
import mlflow
import mlflow.sklearn
from ClassifierModel import DTCTrainer, DTCModel
from mlflow.models import infer_signature


# Start an MLflow run; the "with" block closes the run even if this cell raises.
with mlflow.start_run():
    # Hyperparameter of this experiment: fraction of rows held out for testing
    test_size = 0.3

    # Instantiate the training helper
    trainer = DTCTrainer(test_size=test_size)

    # Get data (already split into train and test by the trainer)
    dataset_path = "orders_challengue_sep2023_PE_CO.csv"
    X_train, X_test, y_train, y_test = trainer.get_data(dataset_path)

    # Transform data: fit_transform on the training split, transform-only on the test split
    X_train = trainer.fit_transform(X_train)
    X_test = trainer.transform(X_test)

    # Train model
    trainer.train(X_train, y_train)

    # Evaluate model on the held-out split
    accuracy_score, precision_score, recall_score, f1_score = trainer.evaluate(X_test, y_test)

    # Print model metrics
    print("Accuracy Score:", accuracy_score)  # label typo ("Acurracy") fixed
    print("Precision Score:", precision_score)
    print("Recall Score:", recall_score)
    print("f1 Score:", f1_score)

    # Params are for run inputs (hyperparameters); metrics are for measured results.
    # All four scores are logged as metrics so the MLflow UI can chart and compare
    # them across runs; previously accuracy and precision were stored as params.
    mlflow.log_param("test_size", test_size)
    mlflow.log_metric("accuracy", accuracy_score)
    mlflow.log_metric("precision", precision_score)
    mlflow.log_metric("recall", recall_score)
    mlflow.log_metric("f1_score", f1_score)

    # Signature inferred from the transformed training features and the training labels
    signature = infer_signature(X_train, y_train)

    # Log the fitted estimator as an artifact of this run under the name "DTCmodel"
    mlflow.sklearn.log_model(trainer.model, "DTCmodel", signature=signature)
    

    
    
    


Leyendo datos y creando nuevas features...
Dividiendo en conjunto de train y test (test_size=0.3)...
Datos cargados!
Ajustando preprocessor...
Peprocessor ajustando!
Transformando datos...
Datos transformados!
Transformando datos...
Datos transformados!
Ajustando el modelo de clasificación...
Modelo ajustado!
Acurracy Score: 0.6590150275212462
Precision Score: 0.7633080216281484
Recall Score: 0.7672350610678025
f1 Score: 0.7652665033850017


MlflowException: Path 'DTCmodel' already exists and is not empty