In [1]:

import csv 
import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix, recall_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.neighbors import KNeighborsClassifier


In [2]:
#Leemos los csv de las tablas 
nodes = pd.read_csv('../Tablas/TablaAtributos.csv')
nodesNoCentrality = pd.read_csv('../Tablas/TablaNoCentrality.csv')
nodesNoClustering = pd.read_csv('../Tablas/TablaNoClustering.csv')
nodesNoCommunities = pd.read_csv('../Tablas/TablaNoCommunities.csv')
nodesNoKCore = pd.read_csv('../Tablas/TablaNoKCore.csv')

In [3]:
ad=['name']
ac=['Closeness_Centrality','Betweenness_Centrality','Degree_Centrality','Clustering_Coefficient','Triangles','Squares','K_Core', 'Comunidad','asyn_lpa_community']
atributtes = nodes.loc[:, ['id']+ad+ac]

ad_NoCentrality=['name']
ac_NoCentrality=['Clustering_Coefficient','Triangles','Squares','K_Core', 'Comunidad','asyn_lpa_community']
atributtesNoCentrality = nodesNoCentrality.loc[:, ['id']+ad_NoCentrality+ac_NoCentrality]

ad_NoClustering=['name']
ac_NoClustering=['Closeness_Centrality','Betweenness_Centrality','Degree_Centrality','Triangles','Squares','K_Core', 'Comunidad','asyn_lpa_community']
atributtesNoClustering = nodesNoClustering.loc[:, ['id']+ad_NoClustering+ac_NoClustering]

ad_NoCommunities=['name']
ac_NoCommunities=['Closeness_Centrality','Betweenness_Centrality','Degree_Centrality','Clustering_Coefficient','Triangles','Squares','K_Core']
atributtesNoCommunities = nodesNoCommunities.loc[:, ['id']+ad_NoCommunities+ac_NoCommunities]

ad_NoKCore=['name']
ac_NoKCore=['Closeness_Centrality','Betweenness_Centrality','Degree_Centrality','Clustering_Coefficient','Triangles','Squares','Comunidad','asyn_lpa_community']
atributtesNoKCore = nodesNoKCore.loc[:, ['id']+ad_NoKCore+ac_NoKCore]

#Elegimos el objetivo 
y = nodes['ml_target']

In [4]:
#Manipulación de datos 
#Para ello, usamos OrdinalEncoder
codificador_ad= OrdinalEncoder()
codificador_ad.fit(atributtes[ad])
codificador_ad.fit(atributtesNoCentrality[ad_NoCentrality])
codificador_ad.fit(atributtesNoClustering[ad_NoClustering])
codificador_ad.fit(atributtesNoCommunities[ad_NoCommunities])
codificador_ad.fit(atributtesNoKCore[ad_NoKCore])

In [5]:
#Transformamos los atributos
atributtes[ad]= codificador_ad.transform(atributtes[ad])
atributtesNoCentrality[ad_NoCentrality]= codificador_ad.transform(atributtesNoCentrality[ad_NoCentrality])
atributtesNoClustering[ad_NoClustering]= codificador_ad.transform(atributtesNoClustering[ad_NoClustering])
atributtesNoCommunities[ad_NoCommunities]= codificador_ad.transform(atributtesNoCommunities[ad_NoCommunities])
atributtesNoKCore[ad_NoKCore]= codificador_ad.transform(atributtesNoKCore[ad_NoKCore])

In [6]:
#Normalizamos name 
scaler = MinMaxScaler(
    feature_range=(0, 1)
)
atributtes['name'] = scaler.fit_transform(atributtes[['name']])
atributtesNoCentrality['name'] = scaler.fit_transform(atributtesNoCentrality[['name']])
atributtesNoClustering['name'] = scaler.fit_transform(atributtesNoClustering[['name']])
atributtesNoCommunities['name'] = scaler.fit_transform(atributtesNoCommunities[['name']])
atributtesNoKCore['name'] = scaler.fit_transform(atributtesNoKCore[['name']])

In [7]:
#Discretizamos los atributos
discretizer = KBinsDiscretizer(
    n_bins=5,
    encode='ordinal',
    strategy='uniform'
)
#Copiamos el dataframe y discretizamos los atributos
atributtes_discretized = atributtes.copy()
atributtes_discretized[ac] = discretizer.fit_transform(atributtes[ac])

atributtesNoCentrality_discretized = atributtesNoCentrality.copy()
atributtesNoCentrality_discretized[ac_NoCentrality] = discretizer.fit_transform(atributtesNoCentrality[ac_NoCentrality])

atributtesNoClustering_discretized = atributtesNoClustering.copy()
atributtesNoClustering_discretized[ac_NoClustering] = discretizer.fit_transform(atributtesNoClustering[ac_NoClustering])

atributtesNoCommunities_discretized = atributtesNoCommunities.copy()
atributtesNoCommunities_discretized[ac_NoCommunities] = discretizer.fit_transform(atributtesNoCommunities[ac_NoCommunities])

atributtesNoKCore_discretized = atributtesNoKCore.copy()
atributtesNoKCore_discretized[ac_NoKCore] = discretizer.fit_transform(atributtesNoKCore[ac_NoKCore])

In [8]:
#Modelo 
tub_knn = Pipeline([
    ('preprocessor', discretizer),
    ('knn', KNeighborsClassifier())
    
])

params= {
    'knn__n_neighbors': [1,3,5,7,9],
    'knn__weights': ['manhattan']
}


In [9]:
def evaluate_model(data, target):
    scores = cross_validate(tub_knn, data, target, cv=10, scoring=['accuracy', 'recall', 'f1'])
    mean_accuracy = scores['test_accuracy'].mean()
    mean_recall = scores['test_recall'].mean()
    mean_f1_macro = scores['test_f1'].mean()
    return mean_accuracy, mean_recall, mean_f1_macro

In [10]:
accuracy_all, recall_all, f1_all = evaluate_model(atributtes_discretized, y)
print("Resultados con todos los atributos:")
print(f"Accuracy promedio: {accuracy_all}")
print(f"Recall promedio: {recall_all}")
print(f"F1-Macro promedio: {f1_all}")

Resultados con todos los atributos:
Accuracy promedio: 0.6918567639257295
Recall promedio: 0.26513566500862085
F1-Macro promedio: 0.29496384960804206


In [11]:
atributtesNoCentrality, recall_no_centrality, f1_no_centrality = evaluate_model(atributtesNoCentrality, y)
print("\nResultados sin atributos de Centralidad:")
print(f"Accuracy promedio: {atributtesNoCentrality}")
print(f"Recall promedio: {recall_no_centrality}")
print(f"F1-Macro promedio: {f1_no_centrality}")


Resultados sin atributos de Centralidad:
Accuracy promedio: 0.6530769230769231
Recall promedio: 0.2861017492840577
F1-Macro promedio: 0.2545831941237622


In [12]:
atributtesNoClustering, recall_no_clustering, f1_no_clustering = evaluate_model(atributtesNoClustering, y)
print("\nResultados sin atributos de Centralidad:")
print(f"Accuracy promedio: {atributtesNoClustering}")
print(f"Recall promedio: {recall_no_clustering}")
print(f"F1-Macro promedio: {f1_no_clustering}")


Resultados sin atributos de Centralidad:
Accuracy promedio: 0.6551193633952255
Recall promedio: 0.3567378775184604
F1-Macro promedio: 0.3269818601666438


In [13]:
atributtesNoCommunities, recall_no_communities, f1_no_communities = evaluate_model(atributtesNoCommunities, y)
print("\nResultados sin atributos de Centralidad:")
print(f"Accuracy promedio: {atributtesNoCommunities}")
print(f"Recall promedio: {recall_no_communities}")
print(f"F1-Macro promedio: {f1_no_communities}")



Resultados sin atributos de Centralidad:
Accuracy promedio: 0.6765517241379311
Recall promedio: 0.30662613353142654
F1-Macro promedio: 0.31179789506398575


In [14]:
atributtesNoKCore, recall_no_kcore, f1_no_kcore = evaluate_model(atributtesNoKCore, y)
print("\nResultados sin atributos de Centralidad:")
print(f"Accuracy promedio: {atributtesNoKCore}")
print(f"Recall promedio: {recall_no_kcore}")
print(f"F1-Macro promedio: {f1_no_kcore}")


Resultados sin atributos de Centralidad:
Accuracy promedio: 0.689920424403183
Recall promedio: 0.28968810870927786
F1-Macro promedio: 0.30852152060974264


In [15]:
print("\n\n")
print("Resultados finales:")
print(f"Recall Todos atributos: {accuracy_all}")
print(f"Recall Sin Centralidad: {atributtesNoCentrality}")
print(f"Recall Sin Clustering: {atributtesNoClustering}")
print(f"Recall Sin Comunidades: {atributtesNoCommunities}")
print(f"Recall Sin KCore: {atributtesNoKCore}")





Resultados finales:
Recall Todos atributos: 0.6918567639257295
Recall Sin Centralidad: 0.6530769230769231
Recall Sin Clustering: 0.6551193633952255
Recall Sin Comunidades: 0.6765517241379311
Recall Sin KCore: 0.689920424403183


In [16]:
#ANALISIS
#El mejor modelo es el que tiene todos los atributos, ya que tiene un recall de 0.69
#El peor modelo es el que no tiene atributos de centralidad, ya que tiene un recall de 0.53
#Esto significa que los atributos de centralidad son los más importantes para el modelo
#El modelo sin comunidades tiene un recall de 0.68, lo que significa que las comunidades no son tan importantes
#El modelo sin clustering tiene un recall de 0.68, lo que significa que el clustering no es tan importante
#El modelo sin kcore tiene un recall de 0.68, lo que significa que el kcore no es tan importante
#En general , este modelo no es muy bueno, ya que el recall es bajo