In [10]:

import csv 
import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix, recall_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.neighbors import KNeighborsClassifier


In [11]:
#Leemos los csv de las tablas 
nodes = pd.read_csv('../Tablas/TablaAtributos.csv')
nodesNoCentrality = pd.read_csv('../Tablas/TablaNoCentrality.csv')
nodesNoClustering = pd.read_csv('../Tablas/TablaNoClustering.csv')
nodesNoCommunities = pd.read_csv('../Tablas/TablaNoCommunities.csv')
nodesNoKCore = pd.read_csv('../Tablas/TablaNoKCore.csv')

In [12]:
ad=['name']
ac=['Closeness_Centrality','Betweenness_Centrality','Degree_Centrality','Clustering_Coefficient','Triangles','Squares','K_Core', 'Comunidad','asyn_lpa_community']
atributtes = nodes.loc[:, ['id']+ad+ac]

ad_NoCentrality=['name']
ac_NoCentrality=['Clustering_Coefficient','Triangles','Squares','K_Core', 'Comunidad','asyn_lpa_community']
atributtesNoCentrality = nodesNoCentrality.loc[:, ['id']+ad_NoCentrality+ac_NoCentrality]

ad_NoClustering=['name']
ac_NoClustering=['Closeness_Centrality','Betweenness_Centrality','Degree_Centrality','Triangles','Squares','K_Core', 'Comunidad','asyn_lpa_community']
atributtesNoClustering = nodesNoClustering.loc[:, ['id']+ad_NoClustering+ac_NoClustering]

ad_NoCommunities=['name']
ac_NoCommunities=['Closeness_Centrality','Betweenness_Centrality','Degree_Centrality','Clustering_Coefficient','Triangles','Squares','K_Core']
atributtesNoCommunities = nodesNoCommunities.loc[:, ['id']+ad_NoCommunities+ac_NoCommunities]

ad_NoKCore=['name']
ac_NoKCore=['Closeness_Centrality','Betweenness_Centrality','Degree_Centrality','Clustering_Coefficient','Triangles','Squares','Comunidad','asyn_lpa_community']
atributtesNoKCore = nodesNoKCore.loc[:, ['id']+ad_NoKCore+ac_NoKCore]

#Elegimos el objetivo 
y = nodes['ml_target']

In [13]:
#Manipulación de datos 
#Para ello, usamos OrdinalEncoder
codificador_ad= OrdinalEncoder()
codificador_ad.fit(atributtes[ad])
codificador_ad.fit(atributtesNoCentrality[ad_NoCentrality])
codificador_ad.fit(atributtesNoClustering[ad_NoClustering])
codificador_ad.fit(atributtesNoCommunities[ad_NoCommunities])
codificador_ad.fit(atributtesNoKCore[ad_NoKCore])

In [14]:
#Transformamos los atributos
atributtes[ad]= codificador_ad.transform(atributtes[ad])
atributtesNoCentrality[ad_NoCentrality]= codificador_ad.transform(atributtesNoCentrality[ad_NoCentrality])
atributtesNoClustering[ad_NoClustering]= codificador_ad.transform(atributtesNoClustering[ad_NoClustering])
atributtesNoCommunities[ad_NoCommunities]= codificador_ad.transform(atributtesNoCommunities[ad_NoCommunities])
atributtesNoKCore[ad_NoKCore]= codificador_ad.transform(atributtesNoKCore[ad_NoKCore])

In [15]:
#Normalizamos name 
scaler = MinMaxScaler(
    feature_range=(0, 1)
)
atributtes['name'] = scaler.fit_transform(atributtes[['name']])
atributtesNoCentrality['name'] = scaler.fit_transform(atributtesNoCentrality[['name']])
atributtesNoClustering['name'] = scaler.fit_transform(atributtesNoClustering[['name']])
atributtesNoCommunities['name'] = scaler.fit_transform(atributtesNoCommunities[['name']])
atributtesNoKCore['name'] = scaler.fit_transform(atributtesNoKCore[['name']])

In [16]:
#Discretizamos los atributos
discretizer = KBinsDiscretizer(
    n_bins=5,
    encode='ordinal',
    strategy='uniform'
)
#Copiamos el dataframe y discretizamos los atributos
atributtes_discretized = atributtes.copy()
atributtes_discretized[ac] = discretizer.fit_transform(atributtes[ac])

atributtesNoCentrality_discretized = atributtesNoCentrality.copy()
atributtesNoCentrality_discretized[ac_NoCentrality] = discretizer.fit_transform(atributtesNoCentrality[ac_NoCentrality])

atributtesNoClustering_discretized = atributtesNoClustering.copy()
atributtesNoClustering_discretized[ac_NoClustering] = discretizer.fit_transform(atributtesNoClustering[ac_NoClustering])

atributtesNoCommunities_discretized = atributtesNoCommunities.copy()
atributtesNoCommunities_discretized[ac_NoCommunities] = discretizer.fit_transform(atributtesNoCommunities[ac_NoCommunities])

atributtesNoKCore_discretized = atributtesNoKCore.copy()
atributtesNoKCore_discretized[ac_NoKCore] = discretizer.fit_transform(atributtesNoKCore[ac_NoKCore])

In [21]:
#Modelo 
tub_knn = Pipeline([
    ('preprocessor', discretizer),
    ('knn', KNeighborsClassifier())
    
])


In [None]:
#CON TODOS LOS ATRIBUTOS 
#Validamos el modelo
scores = cross_validate(
    tub_knn,
    atributtes_discretized,
    y,
    cv=10,
    scoring=['accuracy', 'recall', 'f1']
)
scores


In [20]:
mean_accuracy = scores['test_accuracy'].mean()
mean_recall = scores['test_recall'].mean()
mean_f1_macro = scores['test_f1'].mean()

print(f"Accuracy promedio: {mean_accuracy}")
print(f"Recall promedio: {mean_recall}")
print(f"F1-Macro promedio: {mean_f1_macro}")

Accuracy promedio: 0.6918567639257295
Recall promedio: 0.26513566500862085
F1-Macro promedio: 0.29496384960804206


In [22]:
#SIN CENTRALITY
#Validamos el modelo Knn con validación cruzada
scores = cross_validate(
    tub_knn,
    atributtesNoCentrality_discretized,
    y,
    cv=10,
    scoring=['accuracy', 'recall', 'f1']
)
scores


{'fit_time': array([0.0744679 , 0.0707562 , 0.08264256, 0.07693148, 0.08611941,
        0.08144712, 0.06404591, 0.08534145, 0.08503556, 0.08460045]),
 'score_time': array([0.24897575, 0.26023006, 0.25749636, 0.27580404, 0.36603785,
        0.25728345, 0.26566672, 0.32592726, 0.28078532, 0.32449365]),
 'test_accuracy': array([0.58514589, 0.71299735, 0.59310345, 0.71299735, 0.52175066,
        0.70503979, 0.61909814, 0.65809019, 0.68938992, 0.7331565 ]),
 'test_recall': array([0.63103803, 0.05954825, 0.47125257, 0.1201232 , 0.65605749,
        0.12422998, 0.38603696, 0.22792608, 0.14887064, 0.03593429]),
 'test_f1': array([0.43982808, 0.09682805, 0.37438825, 0.17781155, 0.41480039,
        0.17872969, 0.34369287, 0.25620312, 0.19849418, 0.06505576])}

In [23]:
mean_accuracy = scores['test_accuracy'].mean()
mean_recall = scores['test_recall'].mean()
mean_f1_macro = scores['test_f1'].mean()

print(f"Accuracy promedio: {mean_accuracy}")
print(f"Recall promedio: {mean_recall}")
print(f"F1-Macro promedio: {mean_f1_macro}")

Accuracy promedio: 0.6530769230769231
Recall promedio: 0.2861017492840577
F1-Macro promedio: 0.2545831941237622


In [24]:
#SIN CLUSTERING
#Validamos el modelo Knn con validación cruzada
scores = cross_validate(
    tub_knn,
    atributtesNoClustering_discretized,
    y,
    cv=10,
    scoring=['accuracy', 'recall', 'f1']
)
scores

{'fit_time': array([0.10412192, 0.09221601, 0.11502457, 0.09175301, 0.18725777,
        0.0807178 , 0.09665537, 0.11002707, 0.09738135, 0.09588289]),
 'score_time': array([0.34076428, 0.26919055, 0.38094497, 0.4055295 , 0.4134655 ,
        0.33879638, 0.41118646, 0.40279889, 0.34894943, 0.43448567]),
 'test_accuracy': array([0.57453581, 0.70901857, 0.60477454, 0.71061008, 0.55570292,
        0.71538462, 0.61061008, 0.6938992 , 0.67745358, 0.72122016]),
 'test_recall': array([0.62692703, 0.21252567, 0.49486653, 0.22073922, 0.60985626,
        0.32443532, 0.44558522, 0.13449692, 0.36447639, 0.09548255]),
 'test_f1': array([0.43201133, 0.27399073, 0.39282804, 0.28270874, 0.41494935,
        0.37067449, 0.37157534, 0.18502825, 0.36863967, 0.15036378])}

In [25]:
mean_accuracy = scores['test_accuracy'].mean()
mean_recall = scores['test_recall'].mean()
mean_f1_macro = scores['test_f1'].mean()

print(f"Accuracy promedio: {mean_accuracy}")
print(f"Recall promedio: {mean_recall}")
print(f"F1-Macro promedio: {mean_f1_macro}")

Accuracy promedio: 0.6573209549071619
Recall promedio: 0.35293910955131463
F1-Macro promedio: 0.3242769728898023


In [26]:
#SIN COMMUNITIES
#Validamos el modelo Knn con validación cruzada
scores = cross_validate(
    tub_knn,
    atributtesNoCommunities_discretized,
    y,
    cv=10,
    scoring=['accuracy', 'recall', 'f1']
)
scores

{'fit_time': array([0.08024836, 0.0778563 , 0.11216784, 0.09263849, 0.10602498,
        0.0818181 , 0.07315564, 0.07524681, 0.07347035, 0.08789015]),
 'score_time': array([0.22113895, 0.25289059, 0.32018638, 0.26498675, 0.35286713,
        0.29434443, 0.27808809, 0.2755456 , 0.2764101 , 0.31982517]),
 'test_accuracy': array([0.62679045, 0.69363395, 0.66286472, 0.71777188, 0.63501326,
        0.70291777, 0.65384615, 0.64535809, 0.69071618, 0.71564987]),
 'test_recall': array([0.5385406 , 0.21457906, 0.37268994, 0.16221766, 0.48049281,
        0.30800821, 0.43634497, 0.24435318, 0.24845996, 0.08726899]),
 'test_f1': array([0.42688391, 0.26573427, 0.36354532, 0.22898551, 0.40484429,
        0.34883721, 0.39443155, 0.26254826, 0.29333333, 0.13687601])}

In [27]:
mean_accuracy = scores['test_accuracy'].mean()
mean_recall = scores['test_recall'].mean()
mean_f1_macro = scores['test_f1'].mean()

print(f"Accuracy promedio: {mean_accuracy}")
print(f"Recall promedio: {mean_recall}")
print(f"F1-Macro promedio: {mean_f1_macro}")

Accuracy promedio: 0.6744562334217508
Recall promedio: 0.3092955380488803
F1-Macro promedio: 0.31260196581515204


In [28]:
#SIN KCORE
#Validamos el modelo Knn con validación cruzada
scores = cross_validate(
    tub_knn,
    atributtesNoKCore_discretized,
    y,
    cv=10,
    scoring=['accuracy', 'recall', 'f1']
)
scores

{'fit_time': array([0.09926486, 0.09299326, 0.10755801, 0.08173275, 0.09856892,
        0.09566236, 0.09603882, 0.0933125 , 0.10135913, 0.12178969]),
 'score_time': array([0.29764318, 0.26326394, 0.32795882, 0.3402319 , 0.32585311,
        0.43120551, 0.36303067, 0.29989839, 0.48859978, 0.4937191 ]),
 'test_accuracy': array([0.60450928, 0.70583554, 0.7204244 , 0.67718833, 0.66525199,
        0.71750663, 0.64429708, 0.71538462, 0.71883289, 0.7270557 ]),
 'test_recall': array([0.56217883, 0.36755647, 0.24229979, 0.18685832, 0.42710472,
        0.3100616 , 0.38090349, 0.14989733, 0.13449692, 0.13141684]),
 'test_f1': array([0.42321083, 0.39232877, 0.30930537, 0.23023403, 0.39732569,
        0.36189335, 0.35621699, 0.21391941, 0.19818457, 0.19922179])}

In [29]:
mean_accuracy = scores['test_accuracy'].mean()
mean_recall = scores['test_recall'].mean()
mean_f1_macro = scores['test_f1'].mean()

print(f"Accuracy promedio: {mean_accuracy}")
print(f"Recall promedio: {mean_recall}")
print(f"F1-Macro promedio: {mean_f1_macro}")

Accuracy promedio: 0.6896286472148542
Recall promedio: 0.289277431091208
F1-Macro promedio: 0.3081840810588357


In [None]:
#ANALISIS
#Se observa que el modelo Knn tiene un mejor rendimiento cuando se eliminan los atributos de centralidad y clustering.
#En el caso de las comunidades y kcore, el rendimiento es similar al modelo con todos los atributos.
#En general, el modelo Knn tiene un rendimiento bajo, por lo que no es recomendable usarlo para clasificar los nodos.