In [275]:
import pandas as pd
from fuzzywuzzy import fuzz

In [276]:
data = pd.read_excel("etiquetado.xlsx", engine="openpyxl")

In [277]:
data["TIPO"] = data["TIPO"].astype("category")
data["TIPO PREDICHO"] = data["TIPO PREDICHO"].astype("category")
data["COLOR"] = data["COLOR"].astype("category")
data["COLOR PREDICHO"] = data["COLOR PREDICHO"].astype("category")
data["PLACA ESPECIAL"].fillna("", inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data["PLACA ESPECIAL"].fillna("", inplace=True)


In [278]:
# Seleccion de conjunto test
random_state = 42
porcentaje_dificultad_sencilla = 0.9
porcentaje_dificultad_normal = 0.3
porcentaje_dificultad_pesadilla = 0.1
data["test"] = 0
tipo_dificultad_sencilla = ["", "*"]
tipo_dificultad_normal = ["**"]
tipo_dificultad_pesadilla = ["***"]
data_select_sencilla = data[data["PLACA ESPECIAL"].isin(tipo_dificultad_sencilla)]
data_select_sencilla = data_select_sencilla.sample(
    n=int(len(data_select_sencilla) * porcentaje_dificultad_sencilla), random_state=random_state
)
data_select_normal = data[data["PLACA ESPECIAL"].isin(tipo_dificultad_normal)]
data_select_normal = data_select_normal.sample(n=int(len(data_select_normal)*porcentaje_dificultad_normal), random_state=random_state)

data_select_pesadilla = data[data["PLACA ESPECIAL"].isin(tipo_dificultad_pesadilla)]
data_select_pesadilla = data_select_pesadilla.sample(n=int(len(data_select_pesadilla)*porcentaje_dificultad_pesadilla), random_state=random_state)
data_select_test = pd.concat([data_select_sencilla, data_select_normal, data_select_pesadilla])
data.loc[data_select_test.index, "test"] = 1

In [279]:
# Conteo
data[data["test"]==1]["PLACA ESPECIAL"].value_counts(dropna=False)

PLACA ESPECIAL
       53
*      15
**      6
***     3
Name: count, dtype: int64

In [280]:
# Funcion para comparacion de cadenas
def compare_strings(
    str1, str2, threshold_1=70, threshold_2=80, threshold_3=90, chars=",.¡!¿?[]"
):
    if isinstance(str1, str) and isinstance(str2, str):
        str1 = str1.replace(" ", "").translate(str.maketrans("","", chars))
        str2 = str2.replace(" ", "").translate(str.maketrans("","", chars))
        similarity = fuzz.ratio(str1, str2)
        if similarity == 100:
            return 5
        elif similarity > threshold_3:
            return 4
        elif similarity > threshold_2 and similarity <= threshold_3:
            return 3
        elif similarity > threshold_1 and similarity <= threshold_2:
            return 2
        elif similarity <= threshold_1:
            return 1
    else:
        return 0
    
# Analisis test
data["similitud"] = data[["PLACA", "PLACA PREDICHA"]].apply(lambda strs: compare_strings(strs[0],strs[1] ), axis=1)

  data["similitud"] = data[["PLACA", "PLACA PREDICHA"]].apply(lambda strs: compare_strings(strs[0],strs[1] ), axis=1)


In [281]:
data_evaluada = data[data["test"]==1].copy()

In [282]:
matriz_tipo = pd.crosstab(
    data_evaluada["TIPO"],
    data_evaluada["TIPO PREDICHO"],
    rownames=["Real"],
    colnames=["Predicho"],
    # normalize="index",
    dropna=False,
)
display(matriz_tipo)

Predicho,AUTOBUS,AUTOMOVIL,CAMIONETA,MOTOCICLETA,NO DETECTADO
Real,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
AUTOMOVIL,0,53,3,0,0
BICICLETA,0,2,0,0,0
CAMIONETA,1,9,4,0,0
MOTOCICLETA,0,0,0,1,0
OTRO,0,0,0,1,3


In [283]:
matriz_color = pd.crosstab(
    data_evaluada["COLOR"],
    data_evaluada["COLOR PREDICHO"],
    rownames=["Real"],
    colnames=["Predicho"],
    # normalize=True,
    dropna=False,
)
display(matriz_color)

Predicho,ALMENDRA,AZUL,BLANCO,CAFÉ OBSCURO,CARBON,COBRE,GRAFITO,GRIS,GRIS CENIZA,GRIS OBSCURO,GRIS OSCURO,GRIS PERLA,LAVANDA,MARFIL,NEGRO,PELTRE,PERLA,PLATA
Real,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
AMARILLO,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
AZUL,0,1,0,0,0,0,1,0,0,0,0,1,1,0,0,1,0,2
BLANCO,0,0,0,0,2,0,5,0,0,0,0,2,1,0,6,3,0,1
CAFÉ,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
GRIS,0,0,0,0,1,0,2,3,1,3,0,2,1,1,2,0,0,0
GRIS CLARO,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,1,0,0
NEGRO,0,0,0,0,1,0,1,1,2,0,1,0,1,0,2,2,0,0
PLATA,0,0,0,1,0,0,0,0,0,0,0,0,0,0,2,0,0,2
ROJO,0,0,0,3,1,1,0,0,0,0,0,2,0,0,1,1,1,0


In [284]:
with pd.ExcelWriter("etiquetado_evaluado.xlsx", engine="openpyxl") as writer:
    data_evaluada.to_excel(writer, sheet_name="data_evaluada", index=False)
    matriz_tipo.to_excel(writer, sheet_name="matriz_tipo")
    matriz_color.to_excel(writer, sheet_name="matriz_color")
    