In [8]:

import csv 
import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix, recall_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeClassifier



In [9]:
# Read the five node tables from CSV: the full attribute table plus four
# variants whose file names indicate which attribute group was left out.
(
    nodes,
    nodesNoCentrality,
    nodesNoClustering,
    nodesNoCommunities,
    nodesNoKCore,
) = map(
    pd.read_csv,
    (
        '../Tablas/TablaAtributos.csv',
        '../Tablas/TablaNoCentrality.csv',
        '../Tablas/TablaNoClustering.csv',
        '../Tablas/TablaNoCommunities.csv',
        '../Tablas/TablaNoKCore.csv',
    ),
)

In [10]:
# Column groups. `ad` holds the column that is ordinal-encoded further down,
# `ac` the columns that are discretised (presumably "discrete" and
# "continuous" attributes -- TODO confirm the naming).
ad = ['name']
ac = [
    'Closeness_Centrality', 'Betweenness_Centrality', 'Degree_Centrality',
    'Clustering_Coefficient', 'Triangles', 'Squares',
    'K_Core',
    'Comunidad', 'asyn_lpa_community',
]
atributtes = nodes.loc[:, ['id', *ad, *ac]]

# Variant without the three *_Centrality columns.
ad_NoCentrality = list(ad)
ac_NoCentrality = [col for col in ac if not col.endswith('_Centrality')]
atributtesNoCentrality = nodesNoCentrality.loc[:, ['id', *ad_NoCentrality, *ac_NoCentrality]]

# Variant without Clustering_Coefficient (Triangles and Squares are kept).
ad_NoClustering = list(ad)
ac_NoClustering = [col for col in ac if col != 'Clustering_Coefficient']
atributtesNoClustering = nodesNoClustering.loc[:, ['id', *ad_NoClustering, *ac_NoClustering]]

# Variant without the two community columns.
ad_NoCommunities = list(ad)
ac_NoCommunities = [col for col in ac if col not in ('Comunidad', 'asyn_lpa_community')]
atributtesNoCommunities = nodesNoCommunities.loc[:, ['id', *ad_NoCommunities, *ac_NoCommunities]]

# Variant without K_Core.
ad_NoKCore = list(ad)
ac_NoKCore = [col for col in ac if col != 'K_Core']
atributtesNoKCore = nodesNoKCore.loc[:, ['id', *ad_NoKCore, *ac_NoKCore]]

# Prediction target, taken from the full table and reused for every variant.
# NOTE(review): this assumes all five tables list the nodes in the same row
# order -- confirm against how the CSVs were generated.
y = nodes['ml_target']

In [11]:
# Data preparation: encode the discrete attribute as ordinal integers.
#
# OrdinalEncoder.fit() replaces whatever categories were learned before, so
# fitting the same encoder once per table leaves only the last table's
# categories in effect. Fit a single time on the 'name' values of all five
# tables instead, so the shared encoder knows every category it is later asked
# to transform, whichever table it comes from.
codificador_ad = OrdinalEncoder()
codificador_ad.fit(
    pd.concat(
        [
            atributtes[ad],
            atributtesNoCentrality[ad_NoCentrality],
            atributtesNoClustering[ad_NoClustering],
            atributtesNoCommunities[ad_NoCommunities],
            atributtesNoKCore[ad_NoKCore],
        ],
        ignore_index=True,
    )
)

In [12]:
# Replace the discrete column(s) of every table with their ordinal codes,
# using the already fitted shared encoder.
for _frame, _cols in (
    (atributtes, ad),
    (atributtesNoCentrality, ad_NoCentrality),
    (atributtesNoClustering, ad_NoClustering),
    (atributtesNoCommunities, ad_NoCommunities),
    (atributtesNoKCore, ad_NoKCore),
):
    _frame[_cols] = codificador_ad.transform(_frame[_cols])

In [13]:
# Rescale the encoded 'name' column of every table to the [0, 1] range.
scaler = MinMaxScaler(feature_range=(0, 1))
for _frame in (
    atributtes,
    atributtesNoCentrality,
    atributtesNoClustering,
    atributtesNoCommunities,
    atributtesNoKCore,
):
    # fit_transform refits the shared scaler on each table in turn, so after
    # the loop it holds the statistics of the last table only.
    _frame['name'] = scaler.fit_transform(_frame[['name']])

In [14]:
# Discretise the numeric attributes into 5 uniform-width bins, encoded as
# ordinal bin indices.
discretizer = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='uniform')


def _discretized_copy(frame, columns):
    """Return a copy of ``frame`` with ``columns`` replaced by their bin indices.

    The shared ``discretizer`` is refitted on ``frame[columns]`` on every call;
    ``frame`` itself is left untouched.
    """
    binned = frame.copy()
    binned[columns] = discretizer.fit_transform(frame[columns])
    return binned


# One discretised copy per table, built in the same order as the tables above.
atributtes_discretized = _discretized_copy(atributtes, ac)

atributtesNoCentrality_discretized = _discretized_copy(atributtesNoCentrality, ac_NoCentrality)

atributtesNoClustering_discretized = _discretized_copy(atributtesNoClustering, ac_NoClustering)

atributtesNoCommunities_discretized = _discretized_copy(atributtesNoCommunities, ac_NoCommunities)

atributtesNoKCore_discretized = _discretized_copy(atributtesNoKCore, ac_NoKCore)

In [15]:
# Model: discretise every input column, then fit a decision tree.
#
# The tree's random_state is pinned because DecisionTreeClassifier permutes
# the features at every split and breaks ties between equally good splits at
# random; without a fixed seed the cross-validation scores below change from
# one run to the next.
tub_model = Pipeline([
    ('preprocessor', discretizer),
    ('model', DecisionTreeClassifier(random_state=42))
])

In [17]:
# ALL ATTRIBUTES
# 10-fold cross-validation of the pipeline, scored on accuracy, recall and F1.
cv_results = cross_validate(
    tub_model, atributtes, y,
    scoring=['accuracy', 'recall', 'f1'],
    cv=10,
)
cv_results

{'fit_time': array([0.05670834, 0.04875231, 0.06108022, 0.05457067, 0.04367089,
        0.05108309, 0.04420424, 0.10723042, 0.05541444, 0.05289221]),
 'score_time': array([0.00897217, 0.00524902, 0.0137291 , 0.01217604, 0.01080465,
        0.00971437, 0.01403522, 0.02274227, 0.01178622, 0.01284146]),
 'test_accuracy': array([0.73633952, 0.73899204, 0.74111406, 0.73501326, 0.73793103,
        0.74429708, 0.74562334, 0.74111406, 0.73236074, 0.73156499]),
 'test_recall': array([0.0606372 , 0.04209446, 0.05749487, 0.06673511, 0.08008214,
        0.08316222, 0.07289528, 0.0687885 , 0.0349076 , 0.05749487]),
 'test_f1': array([0.10611511, 0.07692308, 0.10294118, 0.11514615, 0.13636364,
        0.14387211, 0.12897366, 0.12072072, 0.06313835, 0.09964413])}

In [19]:
# Average the per-fold test scores of the latest cross-validation run.
# NOTE(review): the value read here is 'test_f1', i.e. the scorer requested as
# 'f1', not 'f1_macro'; the "macro" in the variable name and printed label
# looks inaccurate -- confirm before quoting the figure as macro-F1.
mean_accuracy, mean_recall, mean_f1_macro = (
    cv_results[key].mean() for key in ('test_accuracy', 'test_recall', 'test_f1')
)

print(
    f"Accuracy promedio: {mean_accuracy}",
    f"Recall promedio: {mean_recall}",
    f"F1-Macro promedio: {mean_f1_macro}",
    sep="\n",
)

Accuracy promedio: 0.7384350132625994
Recall promedio: 0.062429223532291786
F1-Macro promedio: 0.10938381147847945


In [20]:
# ALL ATTRIBUTES EXCEPT CENTRALITY
# 10-fold cross-validation of the pipeline, scored on accuracy, recall and F1.
cv_results = cross_validate(
    tub_model, atributtesNoCentrality, y,
    scoring=['accuracy', 'recall', 'f1'],
    cv=10,
)
cv_results

{'fit_time': array([0.03987861, 0.03083682, 0.03969646, 0.03239894, 0.03337932,
        0.04013276, 0.03606057, 0.03090262, 0.03137088, 0.03166246]),
 'score_time': array([0.01183581, 0.00851655, 0.00051332, 0.00926518, 0.00700784,
        0.0136528 , 0.0076015 , 0.01456213, 0.01009369, 0.01599383]),
 'test_accuracy': array([0.73899204, 0.7403183 , 0.7403183 , 0.73872679, 0.73740053,
        0.74403183, 0.74217507, 0.73899204, 0.73633952, 0.73819629]),
 'test_recall': array([0.00924974, 0.01129363, 0.01437372, 0.01232033, 0.04928131,
        0.03182752, 0.03182752, 0.0174538 , 0.01026694, 0.00821355]),
 'test_f1': array([0.01796407, 0.02197802, 0.02780536, 0.02378593, 0.08839779,
        0.06037001, 0.05996132, 0.03339882, 0.01972387, 0.01595214])}

In [21]:
# Average the per-fold test scores of the latest cross-validation run.
# NOTE(review): the value read here is 'test_f1', i.e. the scorer requested as
# 'f1', not 'f1_macro'; the "macro" in the variable name and printed label
# looks inaccurate -- confirm before quoting the figure as macro-F1.
mean_accuracy, mean_recall, mean_f1_macro = (
    cv_results[key].mean() for key in ('test_accuracy', 'test_recall', 'test_f1')
)

print(
    f"Accuracy promedio: {mean_accuracy}",
    f"Recall promedio: {mean_recall}",
    f"F1-Macro promedio: {mean_f1_macro}",
    sep="\n",
)

Accuracy promedio: 0.7395490716180372
Recall promedio: 0.019610805928445864
F1-Macro promedio: 0.03693373286950195


In [22]:
# ALL ATTRIBUTES EXCEPT CLUSTERING
# 10-fold cross-validation of the pipeline, scored on accuracy, recall and F1.
cv_results = cross_validate(
    tub_model, atributtesNoClustering, y,
    scoring=['accuracy', 'recall', 'f1'],
    cv=10,
)
cv_results

{'fit_time': array([0.04437685, 0.04432917, 0.04445195, 0.04058194, 0.05245566,
        0.0597558 , 0.06176567, 0.05175757, 0.05236363, 0.04480147]),
 'score_time': array([0.01116633, 0.00478506, 0.00833201, 0.00447798, 0.01076007,
        0.02462316, 0.01711631, 0.01059341, 0.01086354, 0.00931144]),
 'test_accuracy': array([0.73846154, 0.74137931, 0.74244032, 0.73846154, 0.74429708,
        0.74615385, 0.74270557, 0.74217507, 0.73660477, 0.73793103]),
 'test_recall': array([0.05035971, 0.03593429, 0.03080082, 0.04517454, 0.04209446,
        0.04722793, 0.0349076 , 0.03798768, 0.01334702, 0.01950719]),
 'test_f1': array([0.0904059 , 0.06698565, 0.05819593, 0.08193669, 0.07839388,
        0.08770257, 0.0655106 , 0.0707457 , 0.02551521, 0.03703704])}

In [23]:
# Average the per-fold test scores of the latest cross-validation run.
# NOTE(review): the value read here is 'test_f1', i.e. the scorer requested as
# 'f1', not 'f1_macro'; the "macro" in the variable name and printed label
# looks inaccurate -- confirm before quoting the figure as macro-F1.
mean_accuracy, mean_recall, mean_f1_macro = (
    cv_results[key].mean() for key in ('test_accuracy', 'test_recall', 'test_f1')
)

print(
    f"Accuracy promedio: {mean_accuracy}",
    f"Recall promedio: {mean_recall}",
    f"F1-Macro promedio: {mean_f1_macro}",
    sep="\n",
)

Accuracy promedio: 0.7410610079575596
Recall promedio: 0.035734123173740265
F1-Macro promedio: 0.06624291601263552


In [24]:
# ALL ATTRIBUTES EXCEPT COMMUNITIES
# 10-fold cross-validation of the pipeline, scored on accuracy, recall and F1.
cv_results = cross_validate(
    tub_model, atributtesNoCommunities, y,
    scoring=['accuracy', 'recall', 'f1'],
    cv=10,
)
cv_results

{'fit_time': array([0.04797387, 0.0427444 , 0.03791142, 0.04707003, 0.04301596,
        0.0517993 , 0.04022598, 0.04353619, 0.05018115, 0.04017949]),
 'score_time': array([0.01087546, 0.00442815, 0.00828242, 0.01210499, 0.00296783,
        0.01640749, 0.01188493, 0.0040915 , 0.01212287, 0.00995564]),
 'test_accuracy': array([0.73687003, 0.73740053, 0.74244032, 0.73899204, 0.73607427,
        0.74164456, 0.74297082, 0.74190981, 0.7331565 , 0.73156499]),
 'test_recall': array([0.04316547, 0.02977413, 0.05338809, 0.06365503, 0.06776181,
        0.05441478, 0.05544148, 0.06057495, 0.03182752, 0.0513347 ]),
 'test_f1': array([0.07806691, 0.05534351, 0.09674419, 0.11191336, 0.11712511,
        0.09814815, 0.10027855, 0.10815765, 0.05805243, 0.08992806])}

In [25]:
# Average the per-fold test scores of the latest cross-validation run.
# NOTE(review): the value read here is 'test_f1', i.e. the scorer requested as
# 'f1', not 'f1_macro'; the "macro" in the variable name and printed label
# looks inaccurate -- confirm before quoting the figure as macro-F1.
mean_accuracy, mean_recall, mean_f1_macro = (
    cv_results[key].mean() for key in ('test_accuracy', 'test_recall', 'test_f1')
)

print(
    f"Accuracy promedio: {mean_accuracy}",
    f"Recall promedio: {mean_recall}",
    f"F1-Macro promedio: {mean_f1_macro}",
    sep="\n",
)

Accuracy promedio: 0.7383023872679045
Recall promedio: 0.051133795222548864
F1-Macro promedio: 0.09137579255296271


In [26]:
# ALL ATTRIBUTES EXCEPT KCORE
# 10-fold cross-validation of the pipeline, scored on accuracy, recall and F1.
cv_results = cross_validate(
    tub_model, atributtesNoKCore, y,
    scoring=['accuracy', 'recall', 'f1'],
    cv=10,
)
cv_results

{'fit_time': array([0.0446918 , 0.03554249, 0.03864145, 0.03778958, 0.04541016,
        0.04629612, 0.0424974 , 0.04471421, 0.03829885, 0.04221606]),
 'score_time': array([0.01209307, 0.01343131, 0.00974298, 0.01099658, 0.01405382,
        0.01123619, 0.01343536, 0.01431537, 0.00912166, 0.00886035]),
 'test_accuracy': array([0.73899204, 0.7397878 , 0.74217507, 0.73660477, 0.73846154,
        0.7464191 , 0.74429708, 0.74005305, 0.73448276, 0.73448276]),
 'test_recall': array([0.04727646, 0.13449692, 0.04620123, 0.06673511, 0.06057495,
        0.07186858, 0.06570842, 0.05749487, 0.02977413, 0.04312115]),
 'test_f1': array([0.08550186, 0.21078037, 0.08474576, 0.11576135, 0.10688406,
        0.12773723, 0.11721612, 0.1025641 , 0.05476865, 0.07741935])}

In [None]:
#ANALISIS DE RESULTADOS
#Se observa que el modelo con todos los atributos tiene un mejor desempeño que los modelos que no tienen ciertos atributos.
#En particular, el modelo que no tiene el atributo de centralidad tiene un desempeño muy bajo.
#Por otro lado, los modelos que no tienen los atributos de clustering, communities y kcore tienen un desempeño similar.
#Por lo tanto, se concluye que el atributo de centralidad es el más relevante para el modelo.
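#En general, el modelo tiene un accuracy promedio de aproximadamente 0.74, pero un recall promedio muy bajo (aproximadamente 0.06 con todos los atributos), por lo que apenas detecta la clase positiva y su desempeño no puede considerarse aceptable.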