In [6]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, recall_score
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV
from sklearn.preprocessing import KBinsDiscretizer, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder

In [3]:
nodes = pd.read_csv("../Tablas/TablaAtributos.csv")

In [4]:
# Definir atributos y objetivo
ac=['Closeness_Centrality','Betweenness_Centrality','Degree_Centrality','Clustering_Coefficient','Triangles','Squares','K_Core', 'Comunidad','asyn_lpa_community']
ad=['name']
atributtes = nodes.loc[:, ['id']+ad + ac  ]

#Elegimos el atributo a predecir
y = nodes['ml_target']
atributtes.head(5)

Unnamed: 0,id,name,Closeness_Centrality,Betweenness_Centrality,Degree_Centrality,Clustering_Coefficient,Triangles,Squares,K_Core,Comunidad,asyn_lpa_community
0,0,Eiryyy,0.275005,0.0,2.7e-05,0.0,0.0,0.0,1,0.0,0.0
1,1,shawflying,0.294956,1.149733e-06,0.000212,0.178571,6.2e-05,0.072344,6,0.002227,0.0
2,2,JpMCarrilho,0.261845,0.0,2.7e-05,0.0,0.0,0.0,1,0.0,0.0
3,3,SuhwanCha,0.278718,5.316292e-05,0.000133,0.0,0.0,0.019178,4,0.004454,0.0
4,4,sunilangadi2,0.243084,6.134318e-09,5.3e-05,0.0,0.0,0.0,2,0.011136,0.0


In [8]:
#Preprocesamiento de datos
codificador_ad = OrdinalEncoder()
codificador_ad.fit(atributtes[ad])

#Codificar atributos
atributtes[ad] = codificador_ad.transform(atributtes[ad])



In [9]:
#Normalizamos la columna nombre 
scaler = MinMaxScaler(
    feature_range=(0, 1)
)
atributtes[ad] = scaler.fit_transform(atributtes[['name']])
atributtes.head(5)

Unnamed: 0,id,name,Closeness_Centrality,Betweenness_Centrality,Degree_Centrality,Clustering_Coefficient,Triangles,Squares,K_Core,Comunidad,asyn_lpa_community
0,0,0.061673,0.275005,0.0,2.7e-05,0.0,0.0,0.0,1,0.0,0.0
1,1,0.929866,0.294956,1.149733e-06,0.000212,0.178571,6.2e-05,0.072344,6,0.002227,0.0
2,2,0.106687,0.261845,0.0,2.7e-05,0.0,0.0,0.0,1,0.0,0.0
3,3,0.191517,0.278718,5.316292e-05,0.000133,0.0,0.0,0.019178,4,0.004454,0.0
4,4,0.969442,0.243084,6.134318e-09,5.3e-05,0.0,0.0,0.0,2,0.011136,0.0


In [10]:
#Discretizamos las columnas numericas
discretizer = KBinsDiscretizer(
    n_bins=500, encode='ordinal', strategy='uniform'
)
atributtes_discretized = atributtes.copy()
atributtes_discretized[ac] = discretizer.fit_transform(atributtes[ac])

In [11]:
#Dividir los datos en entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(
    atributtes_discretized, y, test_size=0.2, random_state=42
)


In [12]:
#Definimos el modelo Random Forest
model = RandomForestClassifier(
    n_estimators=100, random_state=42
)

#Entrenamos el modelo
model.fit(X_train, y_train)

#Predecimos los valores de prueba

y_pred = model.predict(X_test)

In [13]:
#Evaluamos el modelo
confusionM=confusion_matrix(y_test, y_pred)
print(confusionM)
print('Recall:', recall_score(y_test, y_pred))

[[5074  482]
 [ 950 1034]]
Recall: 0.5211693548387096


In [14]:
#Validacion cruzada
cv_results = cross_validate(
    model, atributtes_discretized, y, cv=10, scoring='recall'
)
print('Recall promedio:', cv_results['test_score'].mean())

Recall promedio: 0.5850994299895959


## ANALISIS
Aunque el modelo de Random Forest muestra un recall promedio decente de 0.5851, hay margen para mejorar especialmente en la identificación de casos positivos, dado que el recall es un poco bajo (0.5212).
La alta cantidad de falsos positivos (482) y falsos negativos (950) indica que el modelo no es perfecto en la clasificación de las instancias. Los falsos positivos son instancias negativas que el modelo predijo incorrectamente como positivas, mientras que los falsos negativos son instancias positivas que el modelo predijo incorrectamente como negativas.
El hecho de que haya más casos de desarrolladores web que de desarrolladores de IA puede influir en el desempeño del modelo. 
El modelo Random Forest muestra un rendimiento aceptable,aunque hay margen para ajustes y mejoras que podrían potenciar su capacidad para clasificar con mayor precisión casos positivos y negativos en el conjunto de datos proporcionado.
