# Lista con métricas por texto (clustering, eficiencia global, centralidad, etc.)

In [None]:
pip install text2graphapi

In [None]:
from text2graphapi.src.Cooccurrence import Cooccurrence #Se importa el grafo tipo co-ocurrencia

In [None]:
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
from PIL import Image
from scipy.stats import entropy

## Subtask_1 - Train/Test

In [None]:
# Absolute locations of the Subtask 1 train and test CSV files.
ruta_1 = r'C:\Users\Yara\Documents\Servicio Social\Autextification-2024\subtask_1\Entrenar modelo\train_S1.csv'
# NOTE(review): unlike ruta_1, this path has no 'subtask_1' folder — confirm it is correct.
ruta_2 = r'C:\Users\Yara\Documents\Servicio Social\Autextification-2024\Entrenar modelo\test_S1.csv'

# Load both splits into DataFrames.
train_data = pd.read_csv(ruta_1)
test_data = pd.read_csv(ruta_2)

In [None]:
# Preview the first rows of the training split.
train_data.head()

In [None]:
# Preview the first rows of the test split.
test_data.head()

In [None]:
# Keep only 'id' and 'text', rename 'text' to 'doc', and turn every row into
# a dict; these record lists are what gets passed to coocc_graph.transform().
df_train = train_data[['id', 'text']].rename(columns={'text': 'doc'})
doc_1 = df_train.to_dict(orient='records')

df_test = test_data[['id', 'text']].rename(columns={'text': 'doc'})
doc_2 = df_test.to_dict(orient='records')

In [None]:
# Report how many documents each split contains.
print('documentos train: ',len(doc_1), '\n','documentos test: ',len(doc_2))

In [None]:
# Configure the co-occurrence graph builder: directed graphs ('DiGraph'),
# language code 'sp', a window size of 3 and networkx objects as output.
# NOTE(review): apply_prep=False presumably disables text preprocessing —
# confirm against the text2graphapi documentation.
coocc_graph = Cooccurrence(graph_type = 'DiGraph',
                                   language = 'sp',
                                   apply_prep = False,
                                   window_size= 3,
                                   output_format = 'networkx')

In [None]:
# Build the co-occurrence graphs for the train and test documents.
coocc_graph_train = coocc_graph.transform(doc_1)
coocc_graph_test = coocc_graph.transform(doc_2)

In [None]:
# Two lists hold the graphs (train here, test in the next cell): take the
# 'graph' entry out of every item returned by transform().
list_graph_train = [item['graph'] for item in coocc_graph_train]

In [None]:
# Take the 'graph' entry out of every item returned by transform() for the
# test split.
list_graph_test = [item['graph'] for item in coocc_graph_test]

## Comienzan las métricas de la paquetería NetworkX

In [None]:
def clustering(list_graph):
    """Return the average clustering coefficient of each graph.

    Args:
        list_graph: iterable of networkx graphs.

    Returns:
        A list with one ``nx.average_clustering`` value per graph, in
        input order.
    """
    return [nx.average_clustering(g) for g in list_graph]

In [None]:
# Average clustering for the train and test graphs.
cl1 = clustering(list_graph_train)
cl2 = clustering(list_graph_test)

In [None]:
def efficiency(list_graph):
    """Return the global efficiency of each graph.

    Every graph is converted with ``to_undirected()`` before it is passed
    to ``nx.global_efficiency``.

    Args:
        list_graph: iterable of networkx graphs.

    Returns:
        A list with one global-efficiency value per graph, in input order.
    """
    return [nx.global_efficiency(g.to_undirected()) for g in list_graph]

In [None]:
# Global efficiency for the train and test graphs.
e1 = efficiency(list_graph_train)
e2 = efficiency(list_graph_test)

In [None]:
def closeness(list_graph):
    """Return the mean closeness centrality of each graph.

    Args:
        list_graph: iterable of networkx graphs.

    Returns:
        A list with, for every graph, the arithmetic mean of the per-node
        values returned by ``nx.closeness_centrality``.

    Raises:
        ZeroDivisionError: if a graph yields no centrality values.
    """
    means = []
    for g in list_graph:
        scores = nx.closeness_centrality(g).values()
        means.append(sum(scores) / len(scores))
    return means

In [None]:
# Mean closeness centrality for the train and test graphs.
c1 = closeness(list_graph_train)
c2 = closeness(list_graph_test)

In [None]:
def degree(list_graph):
    """Return the mean node degree of each graph.

    Args:
        list_graph: iterable of graphs whose ``degree()`` yields
            ``(node, degree)`` pairs.

    Returns:
        A list with one ``np.mean`` of the node degrees per graph, in
        input order.
    """
    means = []
    for g in list_graph:
        per_node = dict(g.degree())
        means.append(np.mean(list(per_node.values())))
    return means

In [None]:
# Mean node degree for the train and test graphs.
d1 = degree(list_graph_train)
d2 = degree(list_graph_test)

In [None]:
def degree_cent(list_graph):
    """Return the mean degree centrality of each graph.

    Args:
        list_graph: iterable of networkx graphs.

    Returns:
        A list with, for every graph, the ``np.mean`` of the per-node
        values returned by ``nx.degree_centrality``.
    """
    return [
        np.mean(list(nx.degree_centrality(g).values()))
        for g in list_graph
    ]

In [None]:
# Mean degree centrality for the train and test graphs.
dc_1 = degree_cent(list_graph_train)
dc_2 = degree_cent(list_graph_test)

In [None]:
def betweenness(list_graph):
    """Return the mean betweenness centrality of each graph.

    Args:
        list_graph: iterable of networkx graphs.

    Returns:
        A list with, for every graph, the ``np.mean`` of the per-node
        values returned by ``nx.betweenness_centrality``.
    """
    return [
        np.mean(list(nx.betweenness_centrality(g).values()))
        for g in list_graph
    ]

In [None]:
# Mean betweenness centrality for the train and test graphs.
b1 = betweenness(list_graph_train)
b2 = betweenness(list_graph_test)

In [None]:
def neighbor_degree(list_graph):
    """Return the mean of the average neighbor degree of each graph.

    Args:
        list_graph: iterable of networkx graphs.

    Returns:
        A list with, for every graph, the ``np.mean`` of the per-node
        values returned by ``nx.average_neighbor_degree``.
    """
    means = []
    for g in list_graph:
        per_node = nx.average_neighbor_degree(g)
        means.append(np.mean(list(per_node.values())))
    return means

In [None]:
# Mean average-neighbor-degree for the train and test graphs.
n1 = neighbor_degree(list_graph_train)
n2 = neighbor_degree(list_graph_test)

In [None]:
def assortativity(list_graph):
    """Return the degree assortativity (Pearson) of each graph.

    Args:
        list_graph: iterable of networkx graphs.

    Returns:
        A list with one ``nx.degree_pearson_correlation_coefficient`` value
        per graph, in input order. A graph for which the computation raises
        gets ``np.nan`` so the list stays aligned with the input.
    """
    list_assortativity = []
    for grafo in list_graph:
        try:
            assort = nx.degree_pearson_correlation_coefficient(grafo)
        except Exception:
            # Best effort: keep one entry per graph. ``Exception`` (not a
            # bare ``except``) lets KeyboardInterrupt/SystemExit propagate,
            # so a long run can still be interrupted.
            assort = np.nan
        list_assortativity.append(assort)
    return list_assortativity

In [None]:
# Degree assortativity for the train and test graphs.
as_1 = assortativity(list_graph_train)
as_2 = assortativity(list_graph_test)

In [None]:
def transitivity(list_graph):
    """Return the transitivity of each graph.

    Args:
        list_graph: iterable of networkx graphs.

    Returns:
        A list with one ``nx.transitivity`` value per graph, in input order.
    """
    return [nx.transitivity(g) for g in list_graph]

In [None]:
# Transitivity for the train and test graphs.
t1 = transitivity(list_graph_train)
t2 = transitivity(list_graph_test)

In [None]:
def number_components(list_graph):
    """Return the number of connected components of each graph.

    Every graph is converted with ``to_undirected()`` before it is passed
    to ``nx.number_connected_components``.

    Args:
        list_graph: iterable of networkx graphs.

    Returns:
        A list with one component count per graph, in input order.
    """
    return [
        nx.number_connected_components(g.to_undirected())
        for g in list_graph
    ]

In [None]:
# Connected-component counts for the train and test graphs.
# NOTE(review): nc_1 / nc_2 are not referenced by the DataFrame cells visible
# in this notebook — confirm whether they should become a column.
nc_1 = number_components(list_graph_train)
nc_2 = number_components(list_graph_test)

In [None]:
def number_strong_comp(list_graph):
    """Return the number of strongly connected components of each graph.

    Args:
        list_graph: iterable of networkx graphs.

    Returns:
        A list with one ``nx.number_strongly_connected_components`` value
        per graph, in input order.
    """
    return [nx.number_strongly_connected_components(g) for g in list_graph]

In [None]:
# Strongly-connected-component counts for the train and test graphs.
nsc_1 = number_strong_comp(list_graph_train)
nsc_2 = number_strong_comp(list_graph_test)

In [None]:
def entropy_centr(list_graph):
    """Return the entropy of the degree-centrality values of each graph.

    ``entropy`` is ``scipy.stats.entropy``, imported at the top of the
    file; the per-node values from ``nx.degree_centrality`` are passed to
    it as a list.

    Args:
        list_graph: iterable of networkx graphs.

    Returns:
        A list with one entropy value per graph, in input order.
    """
    list_entropy_centr = []
    for grafo in list_graph:
        centrality = list(nx.degree_centrality(grafo).values())
        list_entropy_centr.append(entropy(centrality))
    return list_entropy_centr

In [None]:
# Entropy of degree centrality for the train and test graphs.
ect_1 = entropy_centr(list_graph_train)
ect_2 = entropy_centr(list_graph_test)

In [None]:
def entropy_clos(list_graph):
    """Return the entropy of the closeness-centrality values of each graph.

    ``entropy`` is ``scipy.stats.entropy``, imported at the top of the
    file; the per-node values from ``nx.closeness_centrality`` are passed
    to it as a list.

    Args:
        list_graph: iterable of networkx graphs.

    Returns:
        A list with one entropy value per graph, in input order.
    """
    list_entropy_clos = []
    for grafo in list_graph:
        centrality = list(nx.closeness_centrality(grafo).values())
        list_entropy_clos.append(entropy(centrality))
    return list_entropy_clos

In [None]:
# Entropy of closeness centrality for the train and test graphs.
ecl_1 = entropy_clos(list_graph_train)
ecl_2 = entropy_clos(list_graph_test)

In [None]:
def sparseness(list_graph):
    """Return the fraction of nonzero adjacency-matrix entries of each graph.

    Despite the name, the value is nonzero entries divided by total
    entries, so it grows as the graph gets denser.

    Args:
        list_graph: iterable of networkx graphs.

    Returns:
        A list with one ratio per graph, in input order.
    """
    list_sparseness = []
    for grafo in list_graph:
        # Count on the sparse matrix directly; materialising the dense
        # n x n matrix only to count nonzeros wastes O(n^2) memory.
        mat = nx.adjacency_matrix(grafo)
        num_num = mat.count_nonzero()
        num_val = np.prod(mat.shape)
        list_sparseness.append(float(num_num) / num_val)
    return list_sparseness

In [None]:
# Nonzero adjacency ratio for the train and test graphs.
s1 = sparseness(list_graph_train)
s2 = sparseness(list_graph_test)

In [None]:
def nodes(list_graph):
    """Return the number of nodes of each graph.

    Args:
        list_graph: iterable of networkx graphs.

    Returns:
        A list with one ``nx.number_of_nodes`` value per graph, in input
        order.
    """
    return [nx.number_of_nodes(g) for g in list_graph]

In [None]:
# Node counts for the train and test graphs.
nt1 = nodes(list_graph_train)
nt2 = nodes(list_graph_test)

In [None]:
def edges(list_graph):
    """Return the number of edges of each graph.

    Args:
        list_graph: iterable of networkx graphs.

    Returns:
        A list with one ``nx.number_of_edges`` value per graph, in input
        order.
    """
    return [nx.number_of_edges(g) for g in list_graph]

In [None]:
# Edge counts for the train and test graphs.
et1 = edges(list_graph_train)
et2 = edges(list_graph_test)

In [None]:
def node_connectivity(list_graph):
    """Return the approximate node connectivity of each graph.

    The ``approximation`` subpackage is reached through the already
    imported ``nx`` namespace, because this file never imports a bare
    ``approximation`` name.

    Args:
        list_graph: iterable of networkx graphs.

    Returns:
        A list with one approximate node-connectivity value per graph, in
        input order.
    """
    list_connectivity = []
    for grafo in list_graph:
        connectivity = nx.algorithms.approximation.node_connectivity(grafo)
        list_connectivity.append(connectivity)
    return list_connectivity

In [None]:
# Approximate node connectivity for the train and test graphs.
cont1 = node_connectivity(list_graph_train)
cont2 = node_connectivity(list_graph_test)

In [None]:
def density(list_graph):
    """Return the density of each graph.

    Args:
        list_graph: iterable of networkx graphs.

    Returns:
        A list with one ``nx.density`` value per graph, in input order.
    """
    return [nx.density(g) for g in list_graph]

In [None]:
# Density for the train and test graphs.
dt1 = density(list_graph_train)
dt2 = density(list_graph_test)

In [None]:
def self_loop(list_graph):
    """Return the number of self-loops of each graph.

    Args:
        list_graph: iterable of networkx graphs.

    Returns:
        A list with one ``nx.number_of_selfloops`` value per graph, in
        input order.
    """
    return [nx.number_of_selfloops(g) for g in list_graph]

In [None]:
# Self-loop counts for the train and test graphs.
slt1 = self_loop(list_graph_train)
slt2 = self_loop(list_graph_test)

In [None]:
def girth(list_graph):
    """Return the girth of each graph.

    Every graph is converted with ``to_undirected()`` before it is passed
    to ``nx.girth``.

    Args:
        list_graph: iterable of networkx graphs.

    Returns:
        A list with one girth value per graph, in input order.
    """
    return [nx.girth(g.to_undirected()) for g in list_graph]

In [None]:
# Girth for the train and test graphs.
gt1 = girth(list_graph_train)
gt2 = girth(list_graph_test)

### Train

In [None]:
# Build a new DataFrame that stores every metric per training text.
# NOTE(review): nc_1 (number_components) is computed in an earlier cell but
# has no column here — confirm whether that is intended.
df_t1 = pd.DataFrame(
    {
        'id': df_train['id'].tolist(),
        'label': train_data['label'].tolist(),
        'clustering': cl1,
        'global_efficiency': e1,
        'closeness': c1,
        'degree': d1,
        'degree_centrality': dc_1,
        'betweenness': b1,
        'neighbor_degree': n1,
        'assortativity': as_1,
        'transitivity': t1,
        'number_strong_comp': nsc_1,
        'entropy_centr': ect_1,
        'entropy_clos': ecl_1,
        'sparseness': s1,
        'nodes': nt1,
        'edges': et1,
        'node_connectivity': cont1,
        'density': dt1,
        'self_loop': slt1,
        'girth': gt1,
    }
)
df_t1.head()

In [None]:
# Save the training features to CSV without the DataFrame index.
df_t1.to_csv('data-train-feat.csv', index=False)

### Test

In [None]:
# Build the DataFrame that stores every metric per test text.
# NOTE(review): nc_2 (number_components) is computed in an earlier cell but
# has no column here — confirm whether that is intended.
df_t2 = pd.DataFrame(
    {
        'id': df_test['id'].tolist(),
        'label': test_data['label'].tolist(),
        'clustering': cl2,
        'global_efficiency': e2,
        'closeness': c2,
        'degree': d2,
        'degree_centrality': dc_2,
        'betweenness': b2,
        'neighbor_degree': n2,
        'assortativity': as_2,
        'transitivity': t2,
        'number_strong_comp': nsc_2,
        'entropy_centr': ect_2,
        'entropy_clos': ecl_2,
        'sparseness': s2,
        'nodes': nt2,
        'edges': et2,
        'node_connectivity': cont2,
        'density': dt2,
        'self_loop': slt2,
        'girth': gt2,
    }
)
df_t2.head()

In [None]:
# Save the test features to CSV without the DataFrame index.
df_t2.to_csv('data-test-feat.csv', index=False)

## Subtask_2

Para la subtask 2 se hace lo mismo, pero se separan los textos por las etiquetas A, B, C, D, E y F, que corresponden a distintas máquinas generadoras de texto.

In [None]:
# Location of the Subtask 2 JSON Lines file.
# NOTE: this rebinds ruta_2, which previously held the Subtask 1 test path.
ruta_2 = r'C:\Users\Yara\Documents\Servicio Social\Autextification-2024\subtask_2\subtask_2.jsonl'

# lines=True: read the file as one JSON object per line.
df_2 = pd.read_json(ruta_2, lines=True)

df_2.head()

In [None]:
# Keep only 'id' and 'text', rename 'text' to 'doc', and turn every row into
# a dict; this record list is what gets passed to coocc_graph.transform().
df_ABCDEF = df_2[['id', 'text']].rename(columns={'text': 'doc'})
doc_ABCDEF = df_ABCDEF.to_dict(orient='records')

In [None]:
# Report how many Subtask 2 documents were loaded.
print('documentos-generados (A, B, C, D, E, F): ',len(doc_ABCDEF))

In [None]:
# Build the co-occurrence graphs for the Subtask 2 documents, reusing the
# builder configured for Subtask 1.
coocc_graph_ABCDEF = coocc_graph.transform(doc_ABCDEF)

In [None]:
# Take the 'graph' entry out of every item returned by transform().
list_graph_ABCDEF = [item['graph'] for item in coocc_graph_ABCDEF]

In [None]:
# Average clustering per Subtask 2 graph.
cl_ABCDEF = clustering(list_graph_ABCDEF)

In [None]:
# Global efficiency per Subtask 2 graph.
e_ABCDEF = efficiency(list_graph_ABCDEF)

In [None]:
# Mean closeness centrality per Subtask 2 graph.
c_ABCDEF = closeness(list_graph_ABCDEF)

In [None]:
# Mean node degree per Subtask 2 graph.
d_ABCDEF = degree(list_graph_ABCDEF)

In [None]:
# Mean degree centrality per Subtask 2 graph.
dc_ABCDEF = degree_cent(list_graph_ABCDEF)

In [None]:
# Mean betweenness centrality per Subtask 2 graph.
b_ABCDEF = betweenness(list_graph_ABCDEF)

In [None]:
# Mean average-neighbor-degree per Subtask 2 graph.
n_ABCDEF = neighbor_degree(list_graph_ABCDEF)

In [None]:
# Degree assortativity per Subtask 2 graph.
as_ABCDEF = assortativity(list_graph_ABCDEF)

In [None]:
# Transitivity per Subtask 2 graph.
t_ABCDEF = transitivity(list_graph_ABCDEF)

In [None]:
# Connected-component count per Subtask 2 graph.
# NOTE(review): nc_ABCDEF is not referenced by the DataFrame cell visible in
# this notebook — confirm whether it should become a column.
nc_ABCDEF = number_components(list_graph_ABCDEF)

In [None]:
# Strongly-connected-component count per Subtask 2 graph.
nsc_ABCDEF = number_strong_comp(list_graph_ABCDEF)

In [None]:
# Entropy of degree centrality per Subtask 2 graph.
ect_ABCDEF = entropy_centr(list_graph_ABCDEF)

In [None]:
# Entropy of closeness centrality per Subtask 2 graph.
# NOTE(review): the variable is spelled 'ABDCEF' (D and C swapped). The
# DataFrame cell uses the same spelling, so rename both places together.
ecl_ABDCEF = entropy_clos(list_graph_ABCDEF)

In [None]:
# Nonzero adjacency ratio per Subtask 2 graph.
s_ABCDEF = sparseness(list_graph_ABCDEF)

In [None]:
# Node count per Subtask 2 graph.
nt_ABCDEF = nodes(list_graph_ABCDEF)

In [None]:
# Edge count per Subtask 2 graph. The graphs live in list_graph_ABCDEF; no
# variable named list_graph_train_ABCDEF is defined in this notebook.
et_ABCDEF = edges(list_graph_ABCDEF)

In [None]:
# Approximate node connectivity per Subtask 2 graph.
cont_ABCDEF = node_connectivity(list_graph_ABCDEF)

In [None]:
# Density per Subtask 2 graph.
dt_ABCDEF = density(list_graph_ABCDEF)

In [None]:
# Self-loop count per Subtask 2 graph.
slt_ABCDEF = self_loop(list_graph_ABCDEF)

In [None]:
# Girth per Subtask 2 graph.
gt_ABCDEF = girth(list_graph_ABCDEF)

In [None]:
# Document ids as a plain list; print the length as a sanity check.
id_list_ABCDEF = df_ABCDEF['id'].tolist()
print(len(id_list_ABCDEF))

In [None]:
# Labels as a plain list; print the length as a sanity check.
label_list_ABCDEF = df_2['label'].tolist()
print(len(label_list_ABCDEF))

In [None]:
# Build the DataFrame that stores every metric per Subtask 2 text.
# NOTE(review): nc_ABCDEF (number_components) is computed in an earlier cell
# but has no column here — confirm whether that is intended.
df_subtask_2 = pd.DataFrame(
    {
        'id': id_list_ABCDEF,
        'label': label_list_ABCDEF,
        'clustering': cl_ABCDEF,
        'global_efficiency': e_ABCDEF,
        'closeness': c_ABCDEF,
        'degree': d_ABCDEF,
        'degree_centrality': dc_ABCDEF,
        'betweenness': b_ABCDEF,
        'neighbor_degree': n_ABCDEF,
        'assortativity': as_ABCDEF,
        'transitivity': t_ABCDEF,
        'number_strong_comp': nsc_ABCDEF,
        'entropy_centr': ect_ABCDEF,
        'entropy_clos': ecl_ABDCEF,
        'sparseness': s_ABCDEF,
        'nodes': nt_ABCDEF,
        'edges': et_ABCDEF,
        'node_connectivity': cont_ABCDEF,
        'density': dt_ABCDEF,
        'self_loop': slt_ABCDEF,
        'girth': gt_ABCDEF,
    }
)
df_subtask_2.head()

In [None]:
# Save the Subtask 2 features to CSV without the DataFrame index.
df_subtask_2.to_csv('data-subtask_2.csv', index=False)