In [9]:
# DataFrame handling
import pandas as pd

# Datetime
from datetime import datetime

# Split data
from sklearn.model_selection import train_test_split

# Feature engineering
from feature_engine.datetime import DatetimeFeatures
from feature_engine.encoding import MeanEncoder
from feature_engine.selection import DropFeatures
from feature_engine.imputation import CategoricalImputer
from sklearn.preprocessing import StandardScaler

# Model
from sklearn.linear_model import LogisticRegression

# Metrics
from sklearn.metrics import accuracy_score

# Pipeline
from sklearn.pipeline import Pipeline

# Serialization
import joblib

# Paths
from pathlib import Path

# Sklearn settings: have transformers output pandas DataFrames
from sklearn import set_config

set_config(transform_output="pandas")

## **1. Read data**

In [10]:
# Locations: root_dir is the parent of the current working directory
root_dir = Path.cwd().parent
data_path = root_dir.joinpath("data", "data.csv")

# Load the CSV (first column becomes the index, "period" is parsed as a
# datetime) and discard the columns that are not used downstream
columns_to_discard = ["status_str", "department_code", "country_client"]
data = (
    pd.read_csv(data_path, index_col=0, parse_dates=["period"])
    .drop(columns=columns_to_discard)
)

data.head()

Unnamed: 0_level_0,period,status,city_client,department_client
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,2022-03-16 19:56:42,0,Soracá,Boyacá
2,2022-04-30 19:14:36,1,Puerto Libertador,Córdoba
3,2023-05-20 21:17:44,0,Iles,Nariño
4,2023-05-05 04:50:31,0,Solita,Caquetá
5,2022-02-14 05:17:56,1,San Andrés,"San Andrés, Providencia y Santa Catalina"


## **2. Prepare data**

In [11]:
# Column the model has to predict
target = "status"

# Separate predictors from the label, then hold out 20% of the rows as a
# test set (fixed seed so the split is reproducible)
features = data.drop(columns=target)
labels = data[target]
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=42,
)

# Sanity-check the resulting shapes
print(f"{X_train.shape= }, {y_train.shape= }")
print(f"{X_test.shape= }, {y_test.shape= }")
X_train.head()

X_train.shape= (1600, 3), y_train.shape= (1600,)
X_test.shape= (400, 3), y_test.shape= (400,)


Unnamed: 0_level_0,period,city_client,department_client
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
969,2023-12-21 18:28:49,El Zulia,Norte de Santander
241,2023-10-15 00:03:56,Silos,Norte de Santander
820,2022-12-20 13:25:32,San Antonio,Tolima
693,2022-11-10 02:03:47,Úmbita,Boyacá
421,2022-08-29 21:43:14,Busbanzá,Boyacá


## **3. Feature engineering pipeline**

In [12]:
# Step 1 - drop the city column (it is not passed on to the model)
drop_features = DropFeatures(features_to_drop=["city_client"])

# Step 2 - label missing categorical values with the explicit category "Otro"
nan_cat_imputer = CategoricalImputer(imputation_method="missing",
                                     fill_value="Otro")

# Step 3 - derive calendar features from the "period" timestamp
datetime_features = DatetimeFeatures(variables="period",
                                     features_to_extract=["day_of_week",
                                                          "day_of_month",
                                                          "month"])

# Step 4 - replace each department by the mean of the target observed for it.
# unseen="encode" makes departments that were absent from the training data
# fall back to the prior (overall target mean). With the default
# unseen="ignore" they would be encoded as NaN, which the scaler lets through
# and which makes LogisticRegression.predict raise on new data.
mean_encoder = MeanEncoder(variables="department_client",
                           missing_values="ignore",
                           unseen="encode")

# Step 5 - standardize every resulting numeric feature
scaler = StandardScaler()

# Define feature engineering pipeline (step names and order are part of the
# serialized artifact, so they are kept unchanged)
pipeline = Pipeline(steps=[("DropFeatures", drop_features),
                           ("CategoricalImputer", nan_cat_imputer),
                           ("DatetimeFeatures", datetime_features),
                           ("MeanEncoder", mean_encoder),
                           ("Scaler", scaler)])

# Fit the pipeline on the training split only; y_train is required by the
# mean encoder
pipeline.fit(X_train, y_train)

In [13]:
# Data transformation: run the training features through the fitted
# feature-engineering pipeline.
# NOTE(review): this rebinds X_train to the transformed frame, so the raw
# training features are no longer reachable under this name and the cell is
# not idempotent - re-run the train/test split cell before re-running it.
X_train = pipeline.transform(X_train)
X_train.head()

Unnamed: 0_level_0,department_client,period_day_of_week,period_day_of_month,period_month
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
969,-0.312566,-0.027554,0.595746,1.622576
241,-0.312566,1.475366,-0.084055,1.062705
820,-0.411793,-1.0295,0.482446,1.622576
693,-0.180159,-0.027554,-0.650555,1.34264
421,-0.180159,-1.530473,1.502148,0.502834


## **4. Model training**

In [14]:
# Logistic regression hyper-parameters: L2 penalty, C = 0.001
logreg_params = {"penalty": "l2", "C": 0.001}
model = LogisticRegression(**logreg_params)

# Train on the transformed training features
model.fit(X_train, y_train)

## **5. Model evaluation**

In [15]:
# Transform test data with the pipeline that was fitted on the training split
# (the pipeline is only applied here, not re-fitted).
# NOTE(review): this rebinds X_test to the transformed frame, so the cell is
# not idempotent - re-run the train/test split cell before re-running it.
X_test = pipeline.transform(X_test)
X_test.head()

Unnamed: 0_level_0,department_client,period_day_of_week,period_day_of_month,period_month
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1861,1.526057,-1.0295,0.595746,-0.057037
354,-0.201108,0.974393,1.502148,1.062705
1334,-0.106761,0.47342,1.502148,-0.616908
906,-0.83687,-0.027554,-0.763856,-0.896843
1290,-0.201108,-0.528527,1.048947,1.062705


In [16]:
# Predict the class of every row in the transformed test set
preds = model.predict(X_test)

# Fraction of test rows whose predicted class matches the true one
accuracy = accuracy_score(y_true=y_test, y_pred=preds)

# Report
print(" --- Model performance ---")
print(f"{accuracy= }")

 --- Model performance ---
accuracy= 0.495


## **6. ML pipeline serialization**

In [17]:
# Define ML pipeline: chain the fitted feature-engineering pipeline and the
# fitted model so that raw rows can be scored with a single predict() call
ml_pipeline = Pipeline(steps=[("FeatureEngineering", pipeline),
                              ("Model", model)])

# Serialize pipeline
pipeline_path = root_dir / "models" / "pipeline.pkl"
# joblib.dump does not create missing folders; make sure "models/" exists so
# the cell also works on a fresh checkout
pipeline_path.parent.mkdir(parents=True, exist_ok=True)
joblib.dump(ml_pipeline, pipeline_path)

['c:\\Users\\carlo\\OneDrive\\Escritorio\\Prueba_ScotiaTech\\models\\pipeline.pkl']

In [18]:
# Reload the artifact written in the previous cell
test_pipeline = joblib.load(pipeline_path)

# One hand-made raw record with the same columns as the training features
dummy_columns = ["period", "city_client", "department_client"]
dummy_record = (datetime(2023, 12, 5, 10, 30, 10), "Neiva", "Huila")
dummy = pd.DataFrame([dummy_record], columns=dummy_columns)

# Score the record end to end (feature engineering + model)
dummy_pred = test_pipeline.predict(dummy)
print(dummy_pred)

[1]
