In [311]:

import csv 
import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix, recall_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.neighbors import KNeighborsClassifier


In [312]:
#Leemos los csv de las tablas 
nodes = pd.read_csv('../Tablas/TablaAtributos.csv')
nodesNoCentrality = pd.read_csv('../Tablas/TablaNoCentrality.csv')
nodesNoClustering = pd.read_csv('../Tablas/TablaNoClustering.csv')
nodesNoCommunities = pd.read_csv('../Tablas/TablaNoCommunities.csv')
nodesNoKCore = pd.read_csv('../Tablas/TablaNoKCore.csv')

In [313]:
ad=['name']
ac=['Closeness_Centrality','Betweenness_Centrality','Degree_Centrality','Clustering_Coefficient','Triangles','Squares','K_Core', 'Comunidad','asyn_lpa_community']
atributtes = nodes.loc[:, ['id']+ad+ac]

ad_NoCentrality=['name']
ac_NoCentrality=['Clustering_Coefficient','Triangles','Squares','K_Core', 'Comunidad','asyn_lpa_community']
atributtesNoCentrality = nodesNoCentrality.loc[:, ['id']+ad_NoCentrality+ac_NoCentrality]

ad_NoClustering=['name']
ac_NoClustering=['Closeness_Centrality','Betweenness_Centrality','Degree_Centrality','Triangles','Squares','K_Core', 'Comunidad','asyn_lpa_community']
atributtesNoClustering = nodesNoClustering.loc[:, ['id']+ad_NoClustering+ac_NoClustering]

ad_NoCommunities=['name']
ac_NoCommunities=['Closeness_Centrality','Betweenness_Centrality','Degree_Centrality','Clustering_Coefficient','Triangles','Squares','K_Core']
atributtesNoCommunities = nodesNoCommunities.loc[:, ['id']+ad_NoCommunities+ac_NoCommunities]

ad_NoKCore=['name']
ac_NoKCore=['Closeness_Centrality','Betweenness_Centrality','Degree_Centrality','Clustering_Coefficient','Triangles','Squares','Comunidad','asyn_lpa_community']
atributtesNoKCore = nodesNoKCore.loc[:, ['id']+ad_NoKCore+ac_NoKCore]

#Elegimos el objetivo 
y = nodes['ml_target']



In [314]:
#Manipulación de datos 
#Para ello, usamos OrdinalEncoder
codificador_ad= OrdinalEncoder()
codificador_ad.fit(atributtes[ad])
codificador_ad.fit(atributtesNoCentrality[ad_NoCentrality])
codificador_ad.fit(atributtesNoClustering[ad_NoClustering])
codificador_ad.fit(atributtesNoCommunities[ad_NoCommunities])
codificador_ad.fit(atributtesNoKCore[ad_NoKCore])



In [315]:
#Transformamos los datos
atributtes[ad] = codificador_ad.transform(atributtes[ad])
atributtesNoCentrality[ad_NoCentrality] = codificador_ad.transform(atributtesNoCentrality[ad_NoCentrality])
atributtesNoClustering[ad_NoClustering] = codificador_ad.transform(atributtesNoClustering[ad_NoClustering])
atributtesNoCommunities[ad_NoCommunities] = codificador_ad.transform(atributtesNoCommunities[ad_NoCommunities])
atributtesNoKCore[ad_NoKCore] = codificador_ad.transform(atributtesNoKCore[ad_NoKCore])

In [316]:
#Normalizamos name 
scaler = MinMaxScaler(
    feature_range=(0, 1)
)
atributtes['name'] = scaler.fit_transform(atributtes[['name']])
atributtesNoCentrality['name'] = scaler.fit_transform(atributtesNoCentrality[['name']])
atributtesNoClustering['name'] = scaler.fit_transform(atributtesNoClustering[['name']])
atributtesNoCommunities['name'] = scaler.fit_transform(atributtesNoCommunities[['name']])
atributtesNoKCore['name'] = scaler.fit_transform(atributtesNoKCore[['name']])

In [317]:
#Validación por retención
X_train, X_test, y_train, y_test = train_test_split(atributtes, y, test_size=0.2)


In [318]:
#Creamos el modelo
#Para ello , usamos los hiperparámetros obtenidos anteriormente 
modelo = KNeighborsClassifier(
    n_neighbors=1,
    metric='manhattan'
   
)
modelo.fit(X_train, y_train)


In [319]:
#Predecimos 
y_pred = modelo.predict(X_test)
print(confusion_matrix(y_test, y_pred))
recall_all = recall_score(y_test, y_pred)
print(recall_all)


[[4099 1436]
 [1464  541]]
0.26982543640897755


In [320]:
#Sin centrality
X_train, X_test, y_train, y_test = train_test_split(atributtesNoCentrality, y, test_size=0.2)
modelo.fit(X_train, y_train)
y_pred = modelo.predict(X_test)
print(confusion_matrix(y_test, y_pred))
recall_no_centrality = recall_score(y_test, y_pred)
print(recall_no_centrality)


[[4169 1428]
 [1417  526]]
0.27071538857436955


In [321]:
#Sin clustering
X_train, X_test, y_train, y_test = train_test_split(atributtesNoClustering, y, test_size=0.2)
modelo.fit(X_train, y_train)
y_pred = modelo.predict(X_test)
print(confusion_matrix(y_test, y_pred))
recall_no_clustering = recall_score(y_test, y_pred)
print(recall_no_clustering)

[[4127 1458]
 [1415  540]]
0.27621483375959077


In [322]:
#Sin communities
X_train, X_test, y_train, y_test = train_test_split(atributtesNoCommunities, y, test_size=0.2)
modelo.fit(X_train, y_train)
y_pred = modelo.predict(X_test)
print(confusion_matrix(y_test, y_pred))
recall_no_communities = recall_score(y_test, y_pred)
print(recall_no_communities)

[[4166 1450]
 [1386  538]]
0.2796257796257796


In [323]:
#Sin kcore
X_train, X_test, y_train, y_test = train_test_split(atributtesNoKCore, y, test_size=0.2)
modelo.fit(X_train, y_train)
y_pred = modelo.predict(X_test)
print(confusion_matrix(y_test, y_pred))
recall_no_kcore = recall_score(y_test, y_pred)
print(recall_no_kcore)

[[4163 1405]
 [1463  509]]
0.2581135902636917


In [324]:
print('Recall all:', recall_all)
print('Recall no centrality:', recall_no_centrality)
print('Recall no clustering:', recall_no_clustering)
print('Recall no communities:', recall_no_communities)
print('Recall no kcore:', recall_no_kcore)

Recall all: 0.26982543640897755
Recall no centrality: 0.27071538857436955
Recall no clustering: 0.27621483375959077
Recall no communities: 0.2796257796257796
Recall no kcore: 0.2581135902636917


# ANALISIS
- Al evaluar los resultados de recall para diferentes configuraciones del modelo k-NN, se observa un rendimiento consistentemente bajo en todas las variantes probadas
- El recall para todas las configuraciones se mantiene consistentemente bajo, con valores cercanos a 0.265. Este bajo rendimiento sugiere que el modelo k-NN no es eficaz para este conjunto de datos en particular
- El análisis muestra que el modelo k-NN presenta un rendimiento muy bajo en términos de recall en todas las configuraciones probadas. Ninguna de las métricas de centralidad, agrupamiento, comunidades, o k-core mejora significativamente el desempeño del modelo. Esto sugiere que el modelo k-NN puede no ser adecuado para este problema en particular y que otros modelos o enfoques podrían ser más eficaces.