In [2]:
#Vamos a calcular como necesitamos las métricas para realizar el modelo NaiveBayes

import csv 
import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.naive_bayes import CategoricalNB
from sklearn.metrics import confusion_matrix, recall_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer


In [3]:
# Load the attribute tables: the full table first, then one table per
# ablation (centrality, clustering, communities and k-core removed).
(
    nodes,
    nodesNoCentrality,
    nodesNoClustering,
    nodesNoCommunities,
    nodesNoKCore,
) = map(
    pd.read_csv,
    (
        '../Tablas/TablaAtributos.csv',
        '../Tablas/TablaNoCentrality.csv',
        '../Tablas/TablaNoClustering.csv',
        '../Tablas/TablaNoCommunities.csv',
        '../Tablas/TablaNoKCore.csv',
    ),
)


In [4]:
# `ad` holds the string column that is ordinal-encoded in a later cell;
# `ac` holds the numeric columns that are discretised in a later cell.
ad = ['name']
ac = [
    'Closeness_Centrality', 'Betweenness_Centrality', 'Degree_Centrality',
    'Clustering_Coefficient', 'Triangles', 'Squares', 'K_Core',
    'Comunidad', 'asyn_lpa_community',
]
atributtes = nodes.loc[:, ['id', *ad, *ac]]

# Each ablated table keeps the columns of `ac`, in the same order, minus the
# family of attributes that the experiment removes.
ad_NoCentrality = list(ad)
ac_NoCentrality = [
    columna for columna in ac
    if columna not in ('Closeness_Centrality', 'Betweenness_Centrality', 'Degree_Centrality')
]
atributtesNoCentrality = nodesNoCentrality.loc[:, ['id', *ad_NoCentrality, *ac_NoCentrality]]

ad_NoClustering = list(ad)
ac_NoClustering = [columna for columna in ac if columna != 'Clustering_Coefficient']
atributtesNoClustering = nodesNoClustering.loc[:, ['id', *ad_NoClustering, *ac_NoClustering]]

ad_NoCommunities = list(ad)
ac_NoCommunities = [
    columna for columna in ac
    if columna not in ('Comunidad', 'asyn_lpa_community')
]
atributtesNoCommunities = nodesNoCommunities.loc[:, ['id', *ad_NoCommunities, *ac_NoCommunities]]

ad_NoKCore = list(ad)
ac_NoKCore = [columna for columna in ac if columna != 'K_Core']
atributtesNoKCore = nodesNoKCore.loc[:, ['id', *ad_NoKCore, *ac_NoKCore]]

# Classification target, taken from the full table.
# NOTE(review): this same `y` is paired with every table in the
# cross-validation cells, which assumes all five CSVs list the nodes in the
# same row order -- confirm.
y = nodes['ml_target']


In [5]:
# Data preparation.
# CategoricalNB expects every feature as an integer-like category code, so the
# string attribute(s) listed in `ad` are mapped to codes with an OrdinalEncoder.
#
# The encoder is fitted ONCE on the names of all five tables together. Calling
# fit() once per table keeps only the categories of the last table, because
# each fit() call discards the previous state; transform() would then raise
# (handle_unknown='error' is the default) on any name missing from that table.
codificador_ad = OrdinalEncoder()
codificador_ad.fit(
    pd.concat(
        [
            atributtes[ad],
            atributtesNoCentrality[ad_NoCentrality],
            atributtesNoClustering[ad_NoClustering],
            atributtesNoCommunities[ad_NoCommunities],
            atributtesNoKCore[ad_NoKCore],
        ],
        ignore_index=True,
    )
)

In [6]:
# Replace the string attribute(s) of every table with the codes produced by
# `codificador_ad`. Each frame is updated in place.
for tabla, columnas_ad in (
    (atributtes, ad),
    (atributtesNoCentrality, ad_NoCentrality),
    (atributtesNoClustering, ad_NoClustering),
    (atributtesNoCommunities, ad_NoCommunities),
    (atributtesNoKCore, ad_NoKCore),
):
    tabla[columnas_ad] = codificador_ad.transform(tabla[columnas_ad])

In [7]:
# Rescale the encoded `name` column of every table to the [0, 1] range.
# The scaler is refitted on each table, so each table is scaled by its own
# minimum and maximum.
scaler = MinMaxScaler(feature_range=(0, 1))
for tabla in (
    atributtes,
    atributtesNoCentrality,
    atributtesNoClustering,
    atributtesNoCommunities,
    atributtesNoKCore,
):
    tabla['name'] = scaler.fit_transform(tabla[['name']])

In [8]:
# Discretise the numeric attributes into 5 equal-width bins, encoded as the
# ordinal bin index.
discretizer = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='uniform')


def _discretizar(tabla, columnas):
    """Return a copy of `tabla` whose `columnas` hold their bin index.

    The shared `discretizer` is refitted on `tabla[columnas]` at every call;
    the input frame itself is left untouched.
    """
    copia = tabla.copy()
    copia[columnas] = discretizer.fit_transform(tabla[columnas])
    return copia


# One discretised copy per table, built in the same order as the tables.
atributtes_discretized = _discretizar(atributtes, ac)
atributtesNoCentrality_discretized = _discretizar(atributtesNoCentrality, ac_NoCentrality)
atributtesNoClustering_discretized = _discretizar(atributtesNoClustering, ac_NoClustering)
atributtesNoCommunities_discretized = _discretizar(atributtesNoCommunities, ac_NoCommunities)
atributtesNoKCore_discretized = _discretizar(atributtesNoKCore, ac_NoKCore)


In [9]:
#CON TODOS LOS ATRIBUTOS 

In [10]:
# Model validation with ALL attributes.
# The pipeline bins every input column and then fits a categorical Naive
# Bayes; alpha=1 is the Laplace smoothing parameter.
# NOTE(review): the frame passed below still contains `id` and the scaled
# `name` column, and its `ac` columns were already binned in an earlier cell,
# so the pipeline's discretizer bins all of them (again) inside each fold.
# Confirm that `id` is meant to be a feature and that the second
# discretisation is intended.
tub = Pipeline(steps=[
    ('preprocess', discretizer),
    ('model', CategoricalNB(alpha=1)),
])

# 10-fold cross-validation reporting recall, accuracy and macro-averaged F1.
cv_results = cross_validate(
    estimator=tub,
    X=atributtes_discretized,
    y=y,
    cv=10,
    scoring=['recall', 'accuracy', 'f1_macro'],
)
cv_results

{'fit_time': array([0.04928684, 0.03557873, 0.04038954, 0.03411913, 0.03898668,
        0.03918839, 0.03490853, 0.06496692, 0.05340362, 0.04274082]),
 'score_time': array([0.01147389, 0.01025343, 0.011904  , 0.01184273, 0.01081324,
        0.00978994, 0.00914884, 0.01418471, 0.01811147, 0.01038551]),
 'test_recall': array([0.16135663, 0.15811088, 0.15195072, 0.1550308 , 0.14373717,
        0.17453799, 0.16119097, 0.18172485, 0.13963039, 0.16735113]),
 'test_accuracy': array([0.73554377, 0.73262599, 0.73554377, 0.73183024, 0.72970822,
        0.74456233, 0.73687003, 0.73183024, 0.73156499, 0.73209549]),
 'test_f1_macro': array([0.53972701, 0.53604441, 0.53466419, 0.53382507, 0.52613765,
        0.55326757, 0.54065071, 0.54780798, 0.52503665, 0.54060689])}

In [11]:
# Mean accuracy, recall and macro-F1 over the folds, in that order.
tuple(cv_results[clave].mean() for clave in ('test_accuracy', 'test_recall', 'test_f1_macro'))

(0.7342175066312999, 0.15946215160461835, 0.5377768145975297)

In [12]:
# ALL ATTRIBUTES EXCEPT CENTRALITY
# Same pipeline as the full model: discretizer followed by a categorical
# Naive Bayes with Laplace smoothing alpha=1.
# NOTE(review): the input frame still contains `id` and already-binned
# columns, which the pipeline bins again -- confirm this is intended.
tubNoCentrality = Pipeline(steps=[
    ('preprocess', discretizer),
    ('model', CategoricalNB(alpha=1)),
])

# 10-fold cross-validation reporting recall, accuracy and macro-averaged F1.
cv_results_NoCentrality = cross_validate(
    estimator=tubNoCentrality,
    X=atributtesNoCentrality_discretized,
    y=y,
    cv=10,
    scoring=['recall', 'accuracy', 'f1_macro'],
)
cv_results_NoCentrality

{'fit_time': array([0.03120828, 0.0323174 , 0.03089762, 0.03623128, 0.02934265,
        0.03185582, 0.02964735, 0.02956057, 0.02778482, 0.02830887]),
 'score_time': array([0.01139593, 0.00901914, 0.0124352 , 0.00892115, 0.01015878,
        0.01049089, 0.01302838, 0.01045465, 0.00910187, 0.01150799]),
 'test_recall': array([0.00513875, 0.01129363, 0.01026694, 0.00924025, 0.00718686,
        0.00718686, 0.00821355, 0.01232033, 0.01026694, 0.01026694]),
 'test_accuracy': array([0.73899204, 0.7397878 , 0.74164456, 0.74217507, 0.74297082,
        0.74217507, 0.74323607, 0.74323607, 0.73952255, 0.7403183 ]),
 'test_f1_macro': array([0.42986978, 0.43593268, 0.43566378, 0.43489244, 0.43323056,
        0.4329462 , 0.43430458, 0.43818107, 0.43488062, 0.43517436])}

In [13]:
# Mean accuracy, recall and macro-F1 over the folds, in that order.
tuple(
    cv_results_NoCentrality[clave].mean()
    for clave in ('test_accuracy', 'test_recall', 'test_f1_macro')
)

(0.7414058355437666, 0.009138104594060158, 0.4345076070616452)

In [14]:
# ALL ATTRIBUTES EXCEPT CLUSTERING
# Same pipeline as the full model: discretizer followed by a categorical
# Naive Bayes with Laplace smoothing alpha=1.
# NOTE(review): the input frame still contains `id` and already-binned
# columns, which the pipeline bins again -- confirm this is intended.
tubNoClustering = Pipeline(steps=[
    ('preprocess', discretizer),
    ('model', CategoricalNB(alpha=1)),
])

# 10-fold cross-validation reporting recall, accuracy and macro-averaged F1.
cv_results_NoClustering = cross_validate(
    estimator=tubNoClustering,
    X=atributtesNoClustering_discretized,
    y=y,
    cv=10,
    scoring=['recall', 'accuracy', 'f1_macro'],
)
cv_results_NoClustering

{'fit_time': array([0.0324266 , 0.03605461, 0.03369999, 0.03492975, 0.03262281,
        0.03394437, 0.03529406, 0.03641105, 0.03289628, 0.04140925]),
 'score_time': array([0.00953484, 0.01080704, 0.01245165, 0.01276851, 0.00891519,
        0.01023293, 0.009305  , 0.00910687, 0.01288462, 0.01614952]),
 'test_recall': array([0.16238438, 0.16324435, 0.16324435, 0.16427105, 0.14887064,
        0.18069815, 0.16735113, 0.17659138, 0.13963039, 0.17043121]),
 'test_accuracy': array([0.73633952, 0.73527851, 0.73899204, 0.73660477, 0.73262599,
        0.74244032, 0.73872679, 0.73342175, 0.73050398, 0.73262599]),
 'test_f1_macro': array([0.54083541, 0.54064715, 0.54324838, 0.54212492, 0.53096697,
        0.55493866, 0.54526691, 0.54633564, 0.52433606, 0.54259027])}

In [15]:
# Mean accuracy, recall and macro-F1 over the folds, in that order.
tuple(
    cv_results_NoClustering[clave].mean()
    for clave in ('test_accuracy', 'test_recall', 'test_f1_macro')
)

(0.7357559681697612, 0.16367170270823528, 0.5411290382924825)

In [16]:
# ALL ATTRIBUTES EXCEPT COMMUNITIES
# Same pipeline as the full model: discretizer followed by a categorical
# Naive Bayes with Laplace smoothing alpha=1.
# NOTE(review): the input frame still contains `id` and already-binned
# columns, which the pipeline bins again -- confirm this is intended.
tubNoCommunities = Pipeline(steps=[
    ('preprocess', discretizer),
    ('model', CategoricalNB(alpha=1)),
])

# 10-fold cross-validation reporting recall, accuracy and macro-averaged F1.
cv_results_NoCommunities = cross_validate(
    estimator=tubNoCommunities,
    X=atributtesNoCommunities_discretized,
    y=y,
    cv=10,
    scoring=['recall', 'accuracy', 'f1_macro'],
)
cv_results_NoCommunities

{'fit_time': array([0.04969144, 0.04408646, 0.05194712, 0.04292154, 0.0396235 ,
        0.03853846, 0.0368278 , 0.05596304, 0.03755879, 0.05322289]),
 'score_time': array([0.01759601, 0.02010059, 0.01628733, 0.01083541, 0.01504278,
        0.01364255, 0.01647687, 0.01567173, 0.0145762 , 0.01973128]),
 'test_recall': array([0.15724563, 0.15092402, 0.14476386, 0.15297741, 0.13552361,
        0.15811088, 0.14887064, 0.17761807, 0.13141684, 0.15913758]),
 'test_accuracy': array([0.73713528, 0.7331565 , 0.73501326, 0.73448276, 0.73129973,
        0.74376658, 0.73474801, 0.73554377, 0.73050398, 0.733687  ]),
 'test_f1_macro': array([0.53859564, 0.53246795, 0.53026518, 0.53450814, 0.52249548,
        0.54379645, 0.5324031 , 0.54837393, 0.51958234, 0.53733081])}

In [17]:
# Mean accuracy, recall and macro-F1 over the folds, in that order.
tuple(
    cv_results_NoCommunities[clave].mean()
    for clave in ('test_accuracy', 'test_recall', 'test_f1_macro')
)

(0.7349336870026526, 0.15165885478768643, 0.5339819023732753)

In [18]:
# ALL ATTRIBUTES EXCEPT K-CORE
# Same pipeline as the full model: discretizer followed by a categorical
# Naive Bayes with Laplace smoothing alpha=1.
# NOTE(review): the input frame still contains `id` and already-binned
# columns, which the pipeline bins again -- confirm this is intended.
tubNoKCore = Pipeline(steps=[
    ('preprocess', discretizer),
    ('model', CategoricalNB(alpha=1)),
])

# 10-fold cross-validation reporting recall, accuracy and macro-averaged F1.
cv_results_NoKCore = cross_validate(
    estimator=tubNoKCore,
    X=atributtesNoKCore_discretized,
    y=y,
    cv=10,
    scoring=['recall', 'accuracy', 'f1_macro'],
)
cv_results_NoKCore

{'fit_time': array([0.04562879, 0.04253817, 0.03685021, 0.04650187, 0.04608631,
        0.03812456, 0.0377686 , 0.03686309, 0.03395391, 0.03379822]),
 'score_time': array([0.01365399, 0.01271749, 0.0111804 , 0.01238251, 0.01084495,
        0.01107407, 0.01810575, 0.01257539, 0.01080084, 0.00999999]),
 'test_recall': array([0.03494347, 0.04928131, 0.04312115, 0.03901437, 0.0349076 ,
        0.06057495, 0.04722793, 0.04722793, 0.0349076 , 0.03901437]),
 'test_accuracy': array([0.73899204, 0.73846154, 0.74270557, 0.73872679, 0.7403183 ,
        0.74801061, 0.74217507, 0.7397878 , 0.73925729, 0.73554377]),
 'test_f1_macro': array([0.45648709, 0.46802287, 0.46507176, 0.45980045, 0.45708485,
        0.48185037, 0.46818676, 0.46702385, 0.45660658, 0.45833107])}

In [19]:
# Mean accuracy, recall and macro-F1 over the folds, in that order.
tuple(
    cv_results_NoKCore[clave].mean()
    for clave in ('test_accuracy', 'test_recall', 'test_f1_macro')
)

(0.740397877984085, 0.043022068118459184, 0.4638465638085)

In [20]:
# ANALYSIS: mean cross-validated accuracy of each feature configuration.
for etiqueta, resultados in (
    ("CON TODOS LOS ATRIBUTOS", cv_results),
    ("CON TODOS LOS ATRIBUTOS EXCEPTO CENTRALITY", cv_results_NoCentrality),
    ("CON TODOS LOS ATRIBUTOS EXCEPTO CLUSTERING", cv_results_NoClustering),
    ("CON TODOS LOS ATRIBUTOS EXCEPTO COMMUNITIES", cv_results_NoCommunities),
    ("CON TODOS LOS ATRIBUTOS EXCEPTO KCORE", cv_results_NoKCore),
):
    print(etiqueta, resultados['test_accuracy'].mean())


CON TODOS LOS ATRIBUTOS 0.7342175066312999
CON TODOS LOS ATRIBUTOS EXCEPTO CENTRALITY 0.7414058355437666
CON TODOS LOS ATRIBUTOS EXCEPTO CLUSTERING 0.7357559681697612
CON TODOS LOS ATRIBUTOS EXCEPTO COMMUNITIES 0.7349336870026526
CON TODOS LOS ATRIBUTOS EXCEPTO KCORE 0.740397877984085


In [21]:
### Análisis y Conclusiones

#1. **Importancia de las Métricas de Comunidad:**
#   - Al eliminar los atributos de comunidad, la exactitud (accuracy) media prácticamente no cambia (0.7342 → 0.7349, +0.07 puntos porcentuales), mientras que el recall (0.159 → 0.152) y el F1 macro (0.538 → 0.534) bajan ligeramente. El efecto de estas métricas sobre el modelo es, por tanto, pequeño.

#2. **Impacto de los Atributos de Centralidad:**
#   - Al eliminar los atributos de centralidad, la exactitud (accuracy) media sube ligeramente (0.7342 → 0.7414, +0.72 puntos porcentuales), pero el recall cae de 0.159 a 0.009 y el F1 macro de 0.538 a 0.435: el modelo deja de detectar casi por completo la clase positiva. La mejora en exactitud es, por tanto, engañosa, y estos atributos son los que más aportan a la detección de la clase positiva en este contexto.

#3. **Efecto de los Atributos de Agrupamiento y K-Core:**
#   - Excluir el coeficiente de agrupamiento tiene un impacto marginal: exactitud (accuracy) 0.7342 → 0.7358, recall 0.159 → 0.164 y F1 macro 0.538 → 0.541. Excluir el k-core, en cambio, sube ligeramente la exactitud (0.7342 → 0.7404) pero reduce el recall de 0.159 a 0.043 y el F1 macro de 0.538 a 0.464, por lo que el k-core sí es relevante para detectar la clase positiva.

#En resumen, para nuestro problema de clasificación con Naive Bayes, la exactitud (accuracy) se mantiene en torno a 0.73–0.74 en todas las variantes, incluso cuando el recall cae casi a cero, por lo que por sí sola no permite compararlas. Atendiendo al recall y al F1 macro, los atributos de centralidad son los más relevantes, seguidos del k-core; las métricas de comunidad y el coeficiente de agrupamiento tienen un impacto pequeño.