# CatBoost regression


In [1]:

import pandas as pd
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
# Load the modelling dataset from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer="intrafamiliar_modelov2.csv")

In [3]:
# Preview the first five rows to check the columns and their values.
df.head(n=5)

Unnamed: 0,Grupo de Edad judicial,Escolaridad,Departamento del hecho DANE,Escenario del Hecho,Actividad Durante el Hecho,Sexo del Agresor,Presunto Agresor Detallado,Factor Desencadenante de la Agresión,Porcentaje de riesgo
0,(00 a 04),Sin escolaridad,Meta,Vivienda,Actividades Vitales / Cuidado Personal,Hombre,Padre,Intolerancia o Machismo,0.785714
1,(05 a 09),Preescolar,Huila,Otros,Estudio Y Aprendizaje,Mujer,Otros familiares civiles o consanguíneos,Intolerancia o Machismo,0.642857
2,(50 a 54),Primaria,"Bogotá, D.C.",Vía Pública,Desplazamiento / Transporte,Mujer,Hermano (a),Otros,0.785714
3,(50 a 54),Primaria,Antioquia,Vía Pública,Desplazamiento / Transporte,Hombre,Primo (a),Intolerancia o Machismo,0.857143
4,(60 a 64),Sin escolaridad,Valle del Cauca,Vivienda,Actividades Vitales / Cuidado Personal,Mujer,Nuera,Intolerancia o Machismo,0.571429


In [4]:
# Column the model learns to predict.
target = "Porcentaje de riesgo"

# The preview of `df` shows an "Unnamed: 0" column holding 0, 1, 2, ... -- this
# looks like a row index that was written into the CSV when it was saved. It is
# a row identifier, not a property of the case, so it is excluded from the
# features along with the target (any other "Unnamed: N" artifact is excluded
# the same way; when none exist the list is empty and only the target is dropped).
index_artifacts = [col for col in df.columns if str(col).startswith("Unnamed:")]

# Feature matrix: every remaining column. Target vector: the risk column.
X = df.drop(columns=[target, *index_artifacts])
y = df[target]

In [5]:
# Names of the text/categorical feature columns; these are handed to CatBoost
# as `cat_features` when the model is built.
categorical_view = X.select_dtypes(include=["object", "category"])
cat_features = categorical_view.columns.tolist()

In [6]:
# Hold out 20% of the rows for evaluation. The split is seeded for
# reproducibility and stratified on the target values.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.2,
)

In [7]:
# Gradient-boosted regressor trained with RMSE loss; progress is logged every
# 200 iterations and the seed is fixed so training is repeatable.
model = CatBoostRegressor(
    loss_function="RMSE",
    iterations=1000,
    learning_rate=0.05,
    depth=6,
    cat_features=cat_features,
    random_seed=42,
    verbose=200,
)

# Fit on the training split only; the fitted model is the cell's output.
model.fit(X=X_train, y=y_train)

0:	learn: 0.1285114	total: 810ms	remaining: 13m 29s
200:	learn: 0.0124776	total: 1m 14s	remaining: 4m 54s
400:	learn: 0.0113231	total: 3m 4s	remaining: 4m 35s
600:	learn: 0.0109885	total: 5m 19s	remaining: 3m 32s
800:	learn: 0.0107865	total: 7m 7s	remaining: 1m 46s
999:	learn: 0.0106344	total: 8m 32s	remaining: 0us


<catboost.core.CatBoostRegressor at 0x2124d228ce0>

In [8]:
# Predict the target for every row of the held-out split.
y_pred = model.predict(data=X_test)

In [9]:
# RMSE is computed as the square root of the MSE rather than by passing
# `squared=False`: that argument was deprecated in scikit-learn 1.4 and removed
# in 1.6, where it raises a TypeError. Taking the root explicitly gives the
# same value and works on every scikit-learn version.
rmse = mean_squared_error(y_test, y_pred) ** 0.5

# Coefficient of determination on the held-out split.
r2 = r2_score(y_test, y_pred)



In [10]:
# Report both hold-out metrics, rounded to four decimals, one per line.
print(f"RMSE: {rmse:.4f}\nR²: {r2:.4f}")



RMSE: 0.0107
R²: 0.9936


In [11]:

# Score a single row as a usage example. The row is drawn from X_test (rather
# than X, which also contains the training rows) so the model has not been
# fitted on it, and random_state pins the draw so re-running the notebook
# selects the same row and prints the same prediction.
nueva_muestra = X_test.sample(1, random_state=42)
prediccion = model.predict(nueva_muestra)
print(f"Predicción de riesgo: {prediccion[0]:.2f}")

Predicción de riesgo: 1.00


In [13]:
# Side-by-side table of actual vs. predicted target values for the held-out
# rows; it keeps the index of y_test.
comparacion = pd.DataFrame(dict(Real=y_test, Predicho=y_pred))

# Print the first 100 rows of the comparison.
print(comparacion.head(n=100))


            Real  Predicho
2839    0.642857  0.641142
126787  0.857143  0.858302
99989   0.928571  0.928799
57457   0.857143  0.857221
63321   0.642857  0.645871
...          ...       ...
123068  0.857143  0.856530
13819   1.000000  1.000428
25416   0.785714  0.787359
111234  1.000000  1.000164
19703   0.714286  0.717316

[100 rows x 2 columns]
