In [1]:
#Vamos a calcular como necesitamos las métricas para realizar el modelo NaiveBayes

import csv 
import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.naive_bayes import CategoricalNB
from sklearn.metrics import confusion_matrix, recall_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer


In [2]:
# Read the node tables from CSV: the full attribute table first, followed by
# four variants whose file names indicate that one family of graph metrics
# (centrality, clustering, communities, k-core) was left out.
(
    nodes,
    nodesNoCentrality,
    nodesNoClustering,
    nodesNoCommunities,
    nodesNoKCore,
) = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../Tablas/TablaAtributos.csv',
        '../Tablas/TablaNoCentrality.csv',
        '../Tablas/TablaNoClustering.csv',
        '../Tablas/TablaNoCommunities.csv',
        '../Tablas/TablaNoKCore.csv',
    )
)


In [4]:
# Column groups used throughout the notebook:
#   ad*  -> the string-valued column that is ordinal-encoded later ('name')
#   ac*  -> the numeric graph-metric columns that are discretized later
# (presumably "atributos discretos" / "atributos continuos" -- TODO confirm).
#
# Every selection is materialised with .copy() because later cells assign
# encoded/scaled values back into these frames; an explicit copy keeps the
# source tables untouched and avoids pandas' SettingWithCopyWarning.

# Full attribute set.
ad = ['name']
ac = [
    'Closeness_Centrality', 'Betweenness_Centrality', 'Degree_Centrality',
    'Clustering_Coefficient', 'Triangles', 'Squares', 'K_Core',
    'Comunidad', 'asyn_lpa_community',
]
atributtes = nodes.loc[:, ['id'] + ad + ac].copy()

# Without the three centrality measures.
ad_NoCentrality = ['name']
ac_NoCentrality = [
    'Clustering_Coefficient', 'Triangles', 'Squares', 'K_Core',
    'Comunidad', 'asyn_lpa_community',
]
atributtesNoCentrality = nodesNoCentrality.loc[:, ['id'] + ad_NoCentrality + ac_NoCentrality].copy()

# Without the clustering coefficient.
ad_NoClustering = ['name']
ac_NoClustering = [
    'Closeness_Centrality', 'Betweenness_Centrality', 'Degree_Centrality',
    'Triangles', 'Squares', 'K_Core', 'Comunidad', 'asyn_lpa_community',
]
atributtesNoClustering = nodesNoClustering.loc[:, ['id'] + ad_NoClustering + ac_NoClustering].copy()

# Without the two community-assignment columns.
ad_NoCommunities = ['name']
ac_NoCommunities = [
    'Closeness_Centrality', 'Betweenness_Centrality', 'Degree_Centrality',
    'Clustering_Coefficient', 'Triangles', 'Squares', 'K_Core',
]
atributtesNoCommunities = nodesNoCommunities.loc[:, ['id'] + ad_NoCommunities + ac_NoCommunities].copy()

# Without the k-core number.
ad_NoKCore = ['name']
ac_NoKCore = [
    'Closeness_Centrality', 'Betweenness_Centrality', 'Degree_Centrality',
    'Clustering_Coefficient', 'Triangles', 'Squares',
    'Comunidad', 'asyn_lpa_community',
]
atributtesNoKCore = nodesNoKCore.loc[:, ['id'] + ad_NoKCore + ac_NoKCore].copy()

# Target variable, taken from the full table only.
# NOTE(review): the same `y` is paired with every ablated table further down,
# which assumes all five CSVs list the nodes in the same row order -- confirm.
y = nodes['ml_target']


In [5]:
# Data preparation.
# The string-valued 'name' column must be turned into numbers before it can be
# fed to the Naive Bayes pipeline, so it is ordinal-encoded.
#
# The encoder is fitted ONCE on the union of the names found in all five
# tables. Calling fit() repeatedly (one table after another) does not
# accumulate categories: every call discards the previous fit, so only the
# last table would be learned and transforming any other table containing a
# name absent from it would raise "unknown categories".
codificador_ad = OrdinalEncoder()
codificador_ad.fit(
    pd.concat(
        [
            atributtes[ad],
            atributtesNoCentrality[ad_NoCentrality],
            atributtesNoClustering[ad_NoClustering],
            atributtesNoCommunities[ad_NoCommunities],
            atributtesNoKCore[ad_NoKCore],
        ],
        ignore_index=True,
    )
)

In [6]:
# Replace the string column(s) of every table with the ordinal codes produced
# by the already-fitted encoder. Each frame is updated in place.
for frame, text_cols in (
    (atributtes, ad),
    (atributtesNoCentrality, ad_NoCentrality),
    (atributtesNoClustering, ad_NoClustering),
    (atributtesNoCommunities, ad_NoCommunities),
    (atributtesNoKCore, ad_NoKCore),
):
    frame[text_cols] = codificador_ad.transform(frame[text_cols])

In [9]:
# Rescale the encoded 'name' column to the [0, 1] interval.
# The scaler is refitted on each table in turn, so every table is scaled by
# its own min/max; after the loop it stays fitted on the last table.
scaler = MinMaxScaler(feature_range=(0, 1))
for frame in (
    atributtes,
    atributtesNoCentrality,
    atributtesNoClustering,
    atributtesNoCommunities,
    atributtesNoKCore,
):
    frame['name'] = scaler.fit_transform(frame[['name']])

In [11]:
# Discretize the numeric graph metrics into 5 equal-width bins, encoded as
# ordinal integers (0..4).
discretizer = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='uniform')


def _with_binned_columns(frame, numeric_cols):
    """Return a copy of `frame` whose `numeric_cols` are replaced by bin indices.

    The shared `discretizer` is refitted on `frame[numeric_cols]` on every call.
    """
    binned = frame.copy()
    binned[numeric_cols] = discretizer.fit_transform(frame[numeric_cols])
    return binned


# NOTE(review): the bin edges are learned here from each complete table, i.e.
# before any cross-validation split, and the pipelines built further down run
# the discretizer a second time over every column of these frames (including
# 'id' and 'name'). Left unchanged so the recorded results stay reproducible.
atributtes_discretized = _with_binned_columns(atributtes, ac)
atributtesNoCentrality_discretized = _with_binned_columns(atributtesNoCentrality, ac_NoCentrality)
atributtesNoClustering_discretized = _with_binned_columns(atributtesNoClustering, ac_NoClustering)
atributtesNoCommunities_discretized = _with_binned_columns(atributtesNoCommunities, ac_NoCommunities)
atributtesNoKCore_discretized = _with_binned_columns(atributtesNoKCore, ac_NoKCore)


In [None]:
#CON TODOS LOS ATRIBUTOS 

In [23]:
# Baseline experiment: all attribute families included.
# The estimator is a two-step pipeline: the discretizer followed by a
# categorical Naive Bayes classifier with smoothing parameter alpha = 1.
tub = Pipeline(steps=[
    ('preprocess', discretizer),
    ('model', CategoricalNB(alpha=1)),
])

# 10-fold cross-validation (train/test split per fold); recall, accuracy and
# macro-averaged F1 are collected for every fold.
cv_results = cross_validate(
    estimator=tub,
    X=atributtes_discretized,
    y=y,
    scoring=['recall', 'accuracy', 'f1_macro'],
    cv=10,
)
cv_results

{'fit_time': array([0.03855562, 0.0364964 , 0.03543043, 0.03202391, 0.03182888,
        0.03390431, 0.03315783, 0.03892279, 0.03412366, 0.03797722]),
 'score_time': array([0.00854397, 0.01392365, 0.0081799 , 0.00910926, 0.00822806,
        0.0096097 , 0.00852013, 0.00990534, 0.0101409 , 0.00972414]),
 'test_recall': array([0.16135663, 0.15811088, 0.15195072, 0.1550308 , 0.14373717,
        0.17453799, 0.16119097, 0.18172485, 0.13963039, 0.16735113]),
 'test_accuracy': array([0.73554377, 0.73262599, 0.73554377, 0.73183024, 0.72970822,
        0.74456233, 0.73687003, 0.73183024, 0.73156499, 0.73209549]),
 'test_f1_macro': array([0.53972701, 0.53604441, 0.53466419, 0.53382507, 0.52613765,
        0.55326757, 0.54065071, 0.54780798, 0.52503665, 0.54060689])}

In [24]:
# Fold-averaged (accuracy, recall, macro-F1) for the baseline experiment.
tuple(
    cv_results[score_key].mean()
    for score_key in ('test_accuracy', 'test_recall', 'test_f1_macro')
)

(0.7342175066312999, 0.15946215160461835, 0.5377768145975297)

In [25]:
# Ablation: every attribute family except the centrality measures.
# Same pipeline as the baseline (discretizer + CategoricalNB, alpha = 1).
tubNoCentrality = Pipeline(steps=[
    ('preprocess', discretizer),
    ('model', CategoricalNB(alpha=1)),
])

# 10-fold cross-validation with the same three scorers as the baseline.
cv_results_NoCentrality = cross_validate(
    estimator=tubNoCentrality,
    X=atributtesNoCentrality_discretized,
    y=y,
    scoring=['recall', 'accuracy', 'f1_macro'],
    cv=10,
)
cv_results_NoCentrality

{'fit_time': array([0.04382443, 0.0338068 , 0.03024936, 0.02776074, 0.02778339,
        0.02927852, 0.03181434, 0.03442693, 0.03158379, 0.02628326]),
 'score_time': array([0.01237369, 0.01002717, 0.01160336, 0.00890183, 0.00982976,
        0.01254702, 0.01314759, 0.01203847, 0.00898099, 0.01004815]),
 'test_recall': array([0.00513875, 0.01129363, 0.01026694, 0.00924025, 0.00718686,
        0.00718686, 0.00821355, 0.01232033, 0.01026694, 0.01026694]),
 'test_accuracy': array([0.73899204, 0.7397878 , 0.74164456, 0.74217507, 0.74297082,
        0.74217507, 0.74323607, 0.74323607, 0.73952255, 0.7403183 ]),
 'test_f1_macro': array([0.42986978, 0.43593268, 0.43566378, 0.43489244, 0.43323056,
        0.4329462 , 0.43430458, 0.43818107, 0.43488062, 0.43517436])}

In [26]:
# Fold-averaged (accuracy, recall, macro-F1) without the centrality measures.
tuple(
    cv_results_NoCentrality[score_key].mean()
    for score_key in ('test_accuracy', 'test_recall', 'test_f1_macro')
)

(0.7414058355437666, 0.009138104594060158, 0.4345076070616452)

In [27]:
# Ablation: every attribute family except the clustering coefficient.
# Same pipeline as the baseline (discretizer + CategoricalNB, alpha = 1).
tubNoClustering = Pipeline(steps=[
    ('preprocess', discretizer),
    ('model', CategoricalNB(alpha=1)),
])

# 10-fold cross-validation with the same three scorers as the baseline.
cv_results_NoClustering = cross_validate(
    estimator=tubNoClustering,
    X=atributtesNoClustering_discretized,
    y=y,
    scoring=['recall', 'accuracy', 'f1_macro'],
    cv=10,
)
cv_results_NoClustering

{'fit_time': array([0.04782486, 0.03381753, 0.03991771, 0.035007  , 0.03920293,
        0.04143715, 0.03982663, 0.04063272, 0.04215837, 0.15429473]),
 'score_time': array([0.01286769, 0.03083849, 0.0126884 , 0.01022696, 0.01433682,
        0.01533031, 0.01165676, 0.0142231 , 0.01790237, 0.01948881]),
 'test_recall': array([0.16238438, 0.16324435, 0.16324435, 0.16427105, 0.14887064,
        0.18069815, 0.16735113, 0.17659138, 0.13963039, 0.17043121]),
 'test_accuracy': array([0.73633952, 0.73527851, 0.73899204, 0.73660477, 0.73262599,
        0.74244032, 0.73872679, 0.73342175, 0.73050398, 0.73262599]),
 'test_f1_macro': array([0.54083541, 0.54064715, 0.54324838, 0.54212492, 0.53096697,
        0.55493866, 0.54526691, 0.54633564, 0.52433606, 0.54259027])}

In [28]:
# Fold-averaged (accuracy, recall, macro-F1) without the clustering coefficient.
tuple(
    cv_results_NoClustering[score_key].mean()
    for score_key in ('test_accuracy', 'test_recall', 'test_f1_macro')
)

(0.7357559681697612, 0.16367170270823528, 0.5411290382924825)

In [29]:
# Ablation: every attribute family except the community assignments.
# Same pipeline as the baseline (discretizer + CategoricalNB, alpha = 1).
tubNoCommunities = Pipeline(steps=[
    ('preprocess', discretizer),
    ('model', CategoricalNB(alpha=1)),
])

# 10-fold cross-validation with the same three scorers as the baseline.
cv_results_NoCommunities = cross_validate(
    estimator=tubNoCommunities,
    X=atributtesNoCommunities_discretized,
    y=y,
    scoring=['recall', 'accuracy', 'f1_macro'],
    cv=10,
)
cv_results_NoCommunities

{'fit_time': array([0.04207015, 0.0263443 , 0.03162718, 0.02872372, 0.02913094,
        0.03231049, 0.03516269, 0.0411551 , 0.04242396, 0.08003259]),
 'score_time': array([0.00976491, 0.01144743, 0.00943875, 0.00922751, 0.00948286,
        0.00880146, 0.01479506, 0.01280212, 0.01306987, 0.03171158]),
 'test_recall': array([0.15724563, 0.15092402, 0.14476386, 0.15297741, 0.13552361,
        0.15811088, 0.14887064, 0.17761807, 0.13141684, 0.15913758]),
 'test_accuracy': array([0.73713528, 0.7331565 , 0.73501326, 0.73448276, 0.73129973,
        0.74376658, 0.73474801, 0.73554377, 0.73050398, 0.733687  ]),
 'test_f1_macro': array([0.53859564, 0.53246795, 0.53026518, 0.53450814, 0.52249548,
        0.54379645, 0.5324031 , 0.54837393, 0.51958234, 0.53733081])}

In [30]:
# Fold-averaged (accuracy, recall, macro-F1) without the community assignments.
tuple(
    cv_results_NoCommunities[score_key].mean()
    for score_key in ('test_accuracy', 'test_recall', 'test_f1_macro')
)

(0.7349336870026526, 0.15165885478768643, 0.5339819023732753)

In [31]:
# Ablation: every attribute family except the k-core number.
# Same pipeline as the baseline (discretizer + CategoricalNB, alpha = 1).
tubNoKCore = Pipeline(steps=[
    ('preprocess', discretizer),
    ('model', CategoricalNB(alpha=1)),
])

# 10-fold cross-validation with the same three scorers as the baseline.
cv_results_NoKCore = cross_validate(
    estimator=tubNoKCore,
    X=atributtesNoKCore_discretized,
    y=y,
    scoring=['recall', 'accuracy', 'f1_macro'],
    cv=10,
)
cv_results_NoKCore

{'fit_time': array([0.04402161, 0.03216147, 0.03630829, 0.03373718, 0.03734446,
        0.03770709, 0.03647375, 0.04184914, 0.03889394, 0.02954817]),
 'score_time': array([0.00966167, 0.0132103 , 0.01068878, 0.01154709, 0.01313639,
        0.01080275, 0.01145983, 0.011976  , 0.00995636, 0.00980425]),
 'test_recall': array([0.03494347, 0.04928131, 0.04312115, 0.03901437, 0.0349076 ,
        0.06057495, 0.04722793, 0.04722793, 0.0349076 , 0.03901437]),
 'test_accuracy': array([0.73899204, 0.73846154, 0.74270557, 0.73872679, 0.7403183 ,
        0.74801061, 0.74217507, 0.7397878 , 0.73925729, 0.73554377]),
 'test_f1_macro': array([0.45648709, 0.46802287, 0.46507176, 0.45980045, 0.45708485,
        0.48185037, 0.46818676, 0.46702385, 0.45660658, 0.45833107])}

In [32]:
# Fold-averaged (accuracy, recall, macro-F1) without the k-core number.
tuple(
    cv_results_NoKCore[score_key].mean()
    for score_key in ('test_accuracy', 'test_recall', 'test_f1_macro')
)

(0.740397877984085, 0.043022068118459184, 0.4638465638085)

In [33]:
# Summary: one line per experiment with the fold-averaged accuracy, recall and
# macro-F1 (in that order), so the five configurations can be compared.
for experiment_label, fold_scores in (
    ("CON TODOS LOS ATRIBUTOS", cv_results),
    ("CON TODOS LOS ATRIBUTOS EXCEPTO CENTRALITY", cv_results_NoCentrality),
    ("CON TODOS LOS ATRIBUTOS EXCEPTO CLUSTERING", cv_results_NoClustering),
    ("CON TODOS LOS ATRIBUTOS EXCEPTO COMMUNITIES", cv_results_NoCommunities),
    ("CON TODOS LOS ATRIBUTOS EXCEPTO KCORE", cv_results_NoKCore),
):
    print(
        experiment_label,
        fold_scores['test_accuracy'].mean(),
        fold_scores['test_recall'].mean(),
        fold_scores['test_f1_macro'].mean(),
    )


CON TODOS LOS ATRIBUTOS 0.7342175066312999 0.15946215160461835 0.5377768145975297
CON TODOS LOS ATRIBUTOS EXCEPTO CENTRALITY 0.7414058355437666 0.009138104594060158 0.4345076070616452
CON TODOS LOS ATRIBUTOS EXCEPTO CLUSTERING 0.7357559681697612 0.16367170270823528 0.5411290382924825
CON TODOS LOS ATRIBUTOS EXCEPTO COMMUNITIES 0.7349336870026526 0.15165885478768643 0.5339819023732753
CON TODOS LOS ATRIBUTOS EXCEPTO KCORE 0.740397877984085 0.043022068118459184 0.4638465638085


In [36]:
### Análisis y Conclusiones

#1. **Importancia de las Métricas de Comunidad:**
#   - Al eliminar los atributos relacionados con las comunidades, la exactitud (accuracy) prácticamente no cambia (0.7342 → 0.7349), mientras que el recall baja de 0.159 a 0.152 y el F1 macro de 0.538 a 0.534. El efecto es pequeño: estas métricas aportan algo de información para detectar la clase positiva, pero no resultan determinantes.

#2. **Impacto de los Atributos de Centralidad:**
#   - Al eliminar los atributos de centralidad la exactitud (accuracy) sube ligeramente (0.7342 → 0.7414, +0.72 puntos porcentuales), pero el recall se desploma de 0.159 a 0.009 y el F1 macro de 0.538 a 0.435. Es decir, sin ellos el modelo deja prácticamente de detectar la clase positiva; la mejora en exactitud es engañosa (previsiblemente por el desbalance de clases) y los atributos de centralidad resultan ser los más informativos del conjunto.

#3. **Efecto de los Atributos de Agrupamiento y K-Core:**
#   - La exclusión de los atributos de agrupamiento (clustering) tiene un impacto marginal, con una ligera mejora en las tres métricas (exactitud 0.7358, recall 0.164, F1 macro 0.541 frente a 0.7342, 0.159 y 0.538 con todos los atributos). En cambio, excluir K-Core reduce el recall de 0.159 a 0.043 y el F1 macro de 0.538 a 0.464, aunque la exactitud suba a 0.7404; K-Core es, por tanto, el segundo grupo de atributos más relevante.

#En resumen, para nuestro problema específico de clasificación utilizando Naive Bayes, la exactitud (accuracy) por sí sola no permite distinguir entre configuraciones (todas se sitúan entre 0.734 y 0.741), por lo que la comparación debe apoyarse en el recall y el F1 macro. Según estas métricas, los atributos de centralidad son los más relevantes, seguidos de K-Core: al eliminar cualquiera de los dos, el modelo casi deja de reconocer la clase positiva (recall de 0.009 y 0.043, respectivamente, frente a 0.159 con todos los atributos). Las métricas de comunidad tienen una influencia pequeña y los atributos de agrupamiento pueden prescindirse sin pérdida apreciable.