In [2]:
import pandas as pd 
from sklearn.utils import resample
from sklearn.preprocessing import OneHotEncoder
from scipy.sparse import hstack, csr_matrix
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import cross_validate #base 
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import glob,os,random
import mlflow
from mlflow.models import infer_signature
import pickle

In [5]:
def get_recent_c_df(Carpeta_path: str, sheet_name: str):
    """ Devuelve el df de la hoja especifica, del archivo mas reciente creado de la carpeta especificada"""
    Path_n= Carpeta_path + '/*'
    tipo_de_archivo = '*.xlsx'
    # Busca el archivo más reciente
    archivos = glob.glob(Path_n + tipo_de_archivo)
    archivo_mas_reciente = max(archivos, key=os.path.getctime)
    nombre_del_archivo_N = os.path.basename(archivo_mas_reciente)
    print(archivo_mas_reciente)
    # Lee el archivo sin especificar el tipo de datos
    df = pd.read_excel(archivo_mas_reciente , sheet_name=sheet_name)
    # Si la columna "COMENTARIO" existe, cambia su tipo de datos a str
    if 'COMENTARIO' in df.columns:
        df['COMENTARIO'] = df['COMENTARIO'].astype(str)
    return df,nombre_del_archivo_N

# Proceso:
--------------------------------
   1. Cargo data 
   2. Analisis exploratorio
   3. Balancer Datos
   4. Aplicar encoder para las columnas categoricas
   7. Dividar en train y test y Entrenar modelo
   8. Dummy Clasifier 
   10. Iniciar MLFlow para capturar metricas
   14. Prueba con data Real
   15. Serializacion del Modelo

### 1 Cargo la Data ##################################

In [6]:

dataB,_ = get_recent_c_df(r'C:\Users\C26764\America Movil Peru S.A.C\EAS - 1','Sheet1')
analistas_map = {'JENNY PIZAN':'JENNY',
                 'DANNER YARLEQUE':'DANNER',
                 'LAURA RAFAEL':'LAURA'}
### Normalizo columna Tarjet 
dataB.loc[:,'RESPONSABLE_DE_EA'] = dataB.loc[:,'RESPONSABLE_DE_EA'].str.upper()
dataB.loc[:,'RESPONSABLE_DE_EA'] = dataB.loc[:,'RESPONSABLE_DE_EA'].replace(analistas_map)
dataB.RESPONSABLE_DE_EA.replace(pd.NA,'POR ASIGNAR',inplace=True)

C:\Users\C26764\America Movil Peru S.A.C\EAS - 1\EAUPDATE01.08.xlsx


In [7]:
data4Train = dataB[dataB.RESPONSABLE_DE_EA != 'POR ASIGNAR']
data4Test = dataB[dataB.RESPONSABLE_DE_EA == 'POR ASIGNAR']

data4Test = data4Test[['TEXTO','PROVEEDOR','NOMBRE PROYECTO','TIPO_PROYECTOS','CLASIF_FINANZAS','CLASIF_RED_1','CLASIF_RED_2']]

data4Train = data4Train[['TEXTO','PROVEEDOR','NOMBRE PROYECTO','TIPO_PROYECTOS','CLASIF_FINANZAS',
                                                'CLASIF_RED_1','CLASIF_RED_2','RESPONSABLE_DE_EA']]
for column in data4Train.columns:
    data4Train[column] = data4Train[column].astype('category')

### 2. Analisis Exploratorio

In [8]:
data4Train.describe()

Unnamed: 0,TEXTO,PROVEEDOR,NOMBRE PROYECTO,TIPO_PROYECTOS,CLASIF_FINANZAS,CLASIF_RED_1,CLASIF_RED_2,RESPONSABLE_DE_EA
count,1262,1262,1262,1262,1262,1262,1262,1262
unique,32,23,19,17,11,19,7,8
top,MEDICION DE RADIACIONES NO IONIZANTES,SGA TELECOMUNICACIONES S.A.C.,Ampliaciones de Capacidad,ANTENAS,"Radio Bases (Sitios nuevos, ampliacion capacidad)",Ampliación de Capacidad Radio,Radio,ANGGIE
freq,394,394,614,471,1118,614,1095,609


In [9]:
dataB.RESPONSABLE_DE_EA.value_counts(dropna=False)

RESPONSABLE_DE_EA
ANGGIE      609
JHORDAN     245
DEMETRIO    228
JORGE        52
JENNY        42
LAURA        42
DANNER       32
FERNANDO     12
Name: count, dtype: int64

### 3. Balanceo los datos

In [10]:
# Suponemos que 'df' es tu DataFrame y que 'RESPONSABLE EA' es la columna objetivo
class_counts = data4Train['RESPONSABLE_DE_EA'].value_counts()

# Calculamos la media de muestras por clase
mean_samples = int(class_counts.mean())

# Creamos un DataFrame vacío para almacenar los datos balanceados
df_balanced = pd.DataFrame()

# Iteramos sobre cada clase
for class_name in class_counts.index:
    # Seleccionamos todas las muestras de la clase actual
    df_class = data4Train[data4Train['RESPONSABLE_DE_EA'] == class_name]
    
    # Si el número de muestras en la clase actual es mayor que la media
    if class_counts[class_name] > mean_samples:
        # Realizamos un downsampling: eliminamos aleatoriamente muestras de la clase actual hasta que su número de muestras sea igual a la media
        df_class_balanced = resample(df_class, 
                                     replace=False,  # No permitimos muestrear la misma muestra más de una vez
                                     n_samples=mean_samples,  # El número de muestras a extraer
                                     random_state=123)  # La semilla para el generador de números aleatorios
    else:
        # Si el número de muestras en la clase actual es menor que la media
        # Realizamos un upsampling: añadimos aleatoriamente muestras de la clase actual hasta que su número de muestras sea igual a la media
        df_class_balanced = resample(df_class, 
                                     replace=True,  # Permitimos muestrear la misma muestra más de una vez
                                     n_samples=mean_samples,  # El número de muestras a extraer
                                     random_state=123)  # La semilla para el generador de números aleatorios
    
    # Añadimos la clase balanceada al DataFrame balanceado
    df_balanced = pd.concat([df_balanced, df_class_balanced])
# Imprimimos el número de muestras por clase en el DataFrame balanceado
display(df_balanced['RESPONSABLE_DE_EA'].value_counts())
print("Clases balanceadas")

RESPONSABLE_DE_EA
ANGGIE      157
DANNER      157
DEMETRIO    157
FERNANDO    157
JENNY       157
JHORDAN     157
JORGE       157
LAURA       157
Name: count, dtype: int64

Clases balanceadas


In [11]:
df_balanced.describe()
df_balanced.drop(columns=['CLASIF_RED_2','CLASIF_FINANZAS'],inplace=True)
### Decido dropear 2 columnas que solo introducen ruido al modelo, porque muy alta freq

### 4. Categorizo cada columna (one hot encoder)

In [12]:
# Crea una copia del DataFrame para no modificar el original
df_encoded = df_balanced.copy()
# Selecciona todas las columnas excepto la columna objetivo
columns_to_encode = df_balanced.columns[df_balanced.columns != 'RESPONSABLE_DE_EA']
# Crea el codificador OneHot
encoder = OneHotEncoder(sparse_output=False, drop='first')

# Aplica el codificador a las columnas seleccionadas
encoded_data = encoder.fit_transform(df_balanced[columns_to_encode])

# Convierte el resultado en un DataFrame y utiliza get_feature_names_out para obtener los nombres de las columnas
df_encoded = pd.DataFrame(encoded_data, columns=encoder.get_feature_names_out(columns_to_encode))

# Asegúrate de mantener el índice original
df_encoded.index = df_balanced.index

# Agrega la columna objetivo al DataFrame codificado
#df_encoded = pd.concat([df_encoded, ], axis=1)
df_encoded

Unnamed: 0,TEXTO_ADECUACION OBRA CIVIL SEDE,TEXTO_ADECUACION OBRA CIVIL SITIO TECNICO,TEXTO_AUMENTO DE CARGA ELECTRICA,TEXTO_CAJONERA AEREA,TEXTO_DISENO DE PROYECTOS,TEXTO_ESCRITORIO DE MELAMINE GRIS,TEXTO_EVALUACIÓN ESTRUCTURAL DE TORRE,TEXTO_HABILITACIÓN CAPACIDAD 10GEN LAMBDA 100G,TEXTO_IMPLEM ENLACE MW COUBICADO,TEXTO_IMPLEMENTACION CELL SITE ROUTER,...,CLASIF_RED_1_IP Fotónico,CLASIF_RED_1_IPRAN,CLASIF_RED_1_Migraciones a FO Urbana,CLASIF_RED_1_Modernización MW Ultima Milla,CLASIF_RED_1_Red de Fibra Óptica Nacional,CLASIF_RED_1_Rollout Sitios Nuevos Macro,CLASIF_RED_1_Sedes,CLASIF_RED_1_Sitios 5G,CLASIF_RED_1_Sitios Nuevos Comerciales,CLASIF_RED_1_Street Cell
1049,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
32,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
928,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
583,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
717,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1233,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
690,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1233,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1237,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [13]:
# Esto solo se hace una vez para crear el experimento de MLFlow 


### 5. Divido en set de pruebas y entrenamiento & entreno el modelo

In [14]:
X_train, X_test, y_train, y_test = train_test_split(df_encoded
                                                    , df_balanced['RESPONSABLE_DE_EA'], test_size=0.2, random_state=42)

In [17]:
mlflow.set_experiment("MLflow Quickstart") #Defino el experimento en MLFLOW

<Experiment: artifact_location='file:///D:/Scripts1/sklrn/mlruns/762750926988870532', creation_time=1723061274626, experiment_id='762750926988870532', last_update_time=1723061274626, lifecycle_stage='active', name='MLflow Quickstart', tags={}>

In [24]:
# Define los hiperparámetros// Es el mejor mas rapido, comprobado por interacion
params = {
    'ccp_alpha': 0.0,
    'class_weight': 'balanced',
    'criterion': 'gini',
    'max_depth': None,
    'max_features': 1.0,
    'max_leaf_nodes': None,
    'min_impurity_decrease': 0.0,
    'min_samples_leaf': 0.00029163021289005544,
    'min_samples_split': 0.0005832604257801109,
    'min_weight_fraction_leaf': 0.0,
    'random_state': 5,
    'splitter': 'best'
}

# Crea el clasificador
clf = DecisionTreeClassifier(**params)
# Realizamos validación cruzada en el conjunto de entrenamiento
scores = cross_val_score(clf, X_train, y_train, cv=5)

print(f'La precisión del modelo en los datos de entrenamiento con validación cruzada es: {scores.mean()}')

# Ajustamos el modelo en el conjunto de entrenamiento
with mlflow.start_run():
    mlflow.autolog() ## Activo el autolog(aumenta el tiempo de ejecucion)
    clf.fit(X_train, y_train)
    mlflow.log_param("estimator", "DecisionTreeClasifier")
    #mlflow.sklearn.log_model(clf, "model")
# Ahora puedes usar 'clf.predict(X_test)' para predecir los responsables de los proyectos en tu conjunto de prueba
# Calculamos la precisión del modelo en los datos de prueba
precision = clf.score(X_test, y_test)

print(f'La precisión del modelo en los datos de prueba es: {precision}')
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)


2024/08/07 16:12:41 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '91217abcc2cf4a14aefa1dbb98c3a28b', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow
2024/08/07 16:12:45 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'a93daacbae5a4ebda063edb943c24820', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow
2024/08/07 16:12:49 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '7775110527164a918184f4dd11fe921f', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow
2024/08/07 16:12:51 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '644f2fb5ef4a4724accebaac658dd72e', which will track hyperparameters, performance metrics, model artifacts, and lineage i

La precisión del modelo en los datos de entrenamiento con validación cruzada es: 0.9960149253731343
La precisión del modelo en los datos de prueba es: 0.9841269841269841


### 5.5 Dummy Clasifier 

In [63]:
modelo = DummyClassifier()
results = cross_validate(modelo, X_test, y_test, cv = 5, return_train_score=False)
media = results['test_score'].mean()
desviacion_estandar = results['test_score'].std()
print("Accuracy con dummy stratified, 10 = [%.2f, %.2f]" % ((media - 2 * desviacion_estandar)*100, (media + 2 * desviacion_estandar) * 100))

Accuracy con dummy stratified, 10 = [13.10, 17.10]


### 6. Guardo la data en el MLFlow

#### Registro el model en el MLFlow ( De forma manueal, inecesario is ya active el autolog) 

In [18]:
# Start an MLflow run
with mlflow.start_run() as run:
     # Log the hyperparameters
    mlflow.log_params(params)

    # Log the loss metric
    mlflow.log_metric("accuracy", accuracy)
     # Set a tag that we can use to remind ourselves what this run was for
    mlflow.set_tag("Training Info", "Basic DesicionTree model for analist data")
    # Infer the model signature
    signature = infer_signature(X_train, clf.predict(X_train))
    # Log the model
    model_info = mlflow.sklearn.log_model(
        sk_model=clf,
        artifact_path="DecisionTreeModel",
        signature=signature,
        input_example=X_train,
        registered_model_name="tracking-Responsables",
    )

Registered model 'tracking-Responsables' already exists. Creating a new version of this model...
Created version '5' of model 'tracking-Responsables'.


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

### 7. Pruebas reales 

#### Hago prueba con las filas random


In [289]:
random.seed(15)
lista = random.sample(range(1, 1257), 15)
df_test = dataB.loc[lista]
df_test['RESPONSABLE_DE_EA'].tolist()

['ANGGIE',
 'ANGGIE',
 'ANGGIE',
 'ANGGIE',
 'JHORDAN',
 'ANGGIE',
 'DEMETRIO',
 'ANGGIE',
 'ANGGIE',
 'DEMETRIO',
 'DEMETRIO',
 'ANGGIE',
 'ANGGIE',
 'ANGGIE',
 'JENNY']

In [292]:
data2test = df_test[['TEXTO','PROVEEDOR','NOMBRE PROYECTO','TIPO_PROYECTOS','CLASIF_RED_1']].copy()

In [293]:
df_real = data2test.copy()

#### Normalizo la entrada de que tenga el formato del modelo, igualo columnas

In [319]:
def process_2_model(df_real:pd.DataFrame):
        # Supongamos que 'encoder' es tu OneHotEncoder ya ajustado y 'df_real' es tu DataFrame real
    encoder = OneHotEncoder(sparse_output=False, drop='first')
    encoded_data = encoder.fit_transform(df_real)
    
    # Aplica el codificador a tus datos reales
    df_real_encoded = encoder.transform(df_real)
    
    # Convierte el resultado en un DataFrame
    df_real_encoded = pd.DataFrame(df_real_encoded, columns=encoder.get_feature_names_out(df_real.columns))
    
    # Asegúrate de que todas las columnas en tus datos de entrenamiento también existen en tus datos reales
    for col in df_encoded.columns:
        if col not in df_real_encoded.columns:
            # Si falta alguna columna en tus datos reales, añade una nueva columna llena de ceros
            df_real_encoded[col] = 0
    # Ordena las columnas de df_real_encoded para que coincidan con el orden de las columnas en df_encoded
    df_real_encoded = df_real_encoded.reindex(columns=df_encoded.columns)
    return df_real_encoded


    
    

In [295]:
df_real_encoded = df_real_encoded.drop(columns=['RESPONSABLE_DE_EA'])

In [297]:
list_predic = clf.predict(df_real_encoded).tolist()

In [298]:
list_predic

['LAURA',
 'LAURA',
 'LAURA',
 'LAURA',
 'LAURA',
 'LAURA',
 'DEMETRIO',
 'LAURA',
 'LAURA',
 'DEMETRIO',
 'DEMETRIO',
 'LAURA',
 'LAURA',
 'LAURA',
 'JENNY']

### 8. Serializo el modelo para guardarlo

In [None]:

with open(r'\\LIMBIPBICOV01.claro.pe\Red Región Norte\EAS\model_v1.pkl', 'wb') as f:
    # Usar pickle.dump para serializar y guardar el objeto
    pickle.dump(clf, f)  