In [41]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

Empezamos analizando un poco la estructura del csv

In [42]:
data = pd.read_csv('data/synthetic_customer_data.csv')

In [43]:
print(data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 12 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   customer_id               1000 non-null   object 
 1   age                       1000 non-null   int64  
 2   gender                    1000 non-null   object 
 3   annual_income             1000 non-null   int64  
 4   total_spent               1000 non-null   float64
 5   num_purchases             1000 non-null   int64  
 6   avg_purchase_value        1000 non-null   float64
 7   online_activity_score     1000 non-null   float64
 8   loyalty_program           1000 non-null   int64  
 9   days_since_last_purchase  1000 non-null   int64  
 10  num_site_visits           1000 non-null   int64  
 11  customer_segment          1000 non-null   object 
dtypes: float64(3), int64(6), object(3)
memory usage: 93.9+ KB
None


In [44]:
print(data.describe())

              age  annual_income   total_spent  num_purchases  \
count  1000.00000    1000.000000   1000.000000    1000.000000   
mean     43.81900   87130.677000  25083.705801     100.044000   
std      14.99103   38333.193443  14427.156274      56.211812   
min      18.00000   20207.000000    106.721181       1.000000   
25%      31.00000   52723.250000  13058.712181      54.000000   
50%      44.00000   88482.000000  25332.129881     100.000000   
75%      56.00000  121396.750000  37736.172303     149.000000   
max      69.00000  149839.000000  49917.540819     199.000000   

       avg_purchase_value  online_activity_score  loyalty_program  \
count         1000.000000            1000.000000      1000.000000   
mean           502.587993              48.950592         0.289000   
std            286.999641              28.469967         0.453525   
min              5.185596               0.092942         0.000000   
25%            256.496539              24.407169         0.000000   


In [45]:
print("Cantidad de valores nulos por columna:")
print(data.isnull().sum())

Cantidad de valores nulos por columna:
customer_id                 0
age                         0
gender                      0
annual_income               0
total_spent                 0
num_purchases               0
avg_purchase_value          0
online_activity_score       0
loyalty_program             0
days_since_last_purchase    0
num_site_visits             0
customer_segment            0
dtype: int64


In [46]:
print("\nCantidad de registros duplicados:", data.duplicated().sum())


Cantidad de registros duplicados: 0


Los datos parecen ser aptos para utilizar en el modelo

In [47]:
#Separamos nuestras variables
X = data.drop(columns=['age', 'gender', 'annual_income', 'total_spent', 'num_purchases', 'avg_purchase_value', 'online_activity_score', 'loyalty_program', 'days_since_last_purchase'])
y = data['customer_segment']

In [48]:
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

Normalización

In [49]:
scaler = StandardScaler()
X[numeric_features] = scaler.fit_transform(X[numeric_features])

In [50]:
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
encoded_categorical = encoder.fit_transform(X[categorical_features])

In [51]:
encoded_columns = encoder.get_feature_names_out(categorical_features)
encoded_df = pd.DataFrame(encoded_categorical, columns=encoded_columns, index=X.index)
X = pd.concat([X.drop(columns=categorical_features), encoded_df], axis=1)

División de datos en entrenamiento y prueba

In [52]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

Voy a probar un modelo de random forest

In [53]:
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

rf_model.fit(X_train, y_train)

In [54]:
y_pred = rf_model.predict(X_test)

In [55]:
print("Matriz de confusion:")
print(confusion_matrix(y_test, y_pred))

Matriz de confusion:
[[ 27   0   0]
 [  0   1   0]
 [  0   0 172]]


In [56]:
print("Análisis de desempeño:")
print(classification_report(y_test, y_pred))

Análisis de desempeño:
              precision    recall  f1-score   support

  high_value       1.00      1.00      1.00        27
   low_value       1.00      1.00      1.00         1
medium_value       1.00      1.00      1.00       172

    accuracy                           1.00       200
   macro avg       1.00      1.00      1.00       200
weighted avg       1.00      1.00      1.00       200



In [57]:
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print(f"Precisión: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1-score: {f1:.2f}")
print(f"Exactitud: {accuracy:.2f}")

Precisión: 1.00
Recall: 1.00
F1-score: 1.00
Exactitud: 1.00


El modelo parece tener un desempeño exitoso, ahora voy a continuar empaquetando el modelo para descargarlo de manera local

In [58]:
import joblib

In [59]:
joblib.dump(rf_model, 'src/random_forest_model.pkl')

['src/random_forest_model.pkl']