# Preprocessing

In [None]:
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import ast
import csv

In [None]:
def parse_edgelist(edgelist_str):
    """
    - Parse the edge list string to a list of tuples
    - edgelist_str is normally the textual form stored in the CSV,
      e.g. "[(7, 2), (1, 7)]", and is parsed with ast.literal_eval
      (literals only, no arbitrary code is evaluated)
    - an edge list that is already a list/tuple is returned as a list
      without re-parsing, so calling this twice on the same value is harmless
    - raises ValueError / SyntaxError (from ast.literal_eval) on malformed text
    """
    # Already parsed: literal_eval would raise ValueError on a non-string
    if isinstance(edgelist_str, (list, tuple)):
        return list(edgelist_str)
    return ast.literal_eval(edgelist_str)

In [None]:
def centralities(edgelist):
    """
    - edgelist is a list of node pairs e.g. [(7,2),(1,7),(1,9),...]
    - builds a networkx graph from the pairs and computes four scores per node
    - returns a dictionary of vertex -> (degree centrality, harmonic centrality,
      betweenness centrality, pagerank), in that tuple order
    """
    graph = nx.from_edgelist(edgelist)

    degree_scores = nx.degree_centrality(graph)
    harmonic_scores = nx.harmonic_centrality(graph)
    betweenness_scores = nx.betweenness_centrality(graph)
    pagerank_scores = nx.pagerank(graph)

    # One tuple per node, visiting nodes in the graph's own iteration order
    scores_by_vertex = {}
    for node in graph:
        scores_by_vertex[node] = (
            degree_scores[node],
            harmonic_scores[node],
            betweenness_scores[node],
            pagerank_scores[node],
        )
    return scores_by_vertex

In [None]:
def read_data(file_path):
    """
    - Load a CSV file (e.g. the training data) into a pandas DataFrame
    - file_path is handed unchanged to pd.read_csv
    - returns the resulting DataFrame
    """
    frame = pd.read_csv(file_path)
    return frame

In [None]:
def generate_binary_classification_dataset(data):
    """
    - Generate a binary classification dataset from the input data
    - data is a DataFrame with the columns 'language', 'sentence', 'n',
      'edgelist' (edge list in textual form) and 'root'
    - each input row is expanded into one output row per vertex that
      appears in its edge list
    - output columns: 'language', 'sentence', 'n' (copied from the input row),
      'vertex', 'degree', 'closeness', 'betweenness', 'pagerank'
      (the four values returned by centralities(), in that order) and
      'is_root' (1 when the vertex equals the row's root, otherwise 0)
    - an empty input yields an empty DataFrame that still has all output columns
    """
    # Pinning the schema keeps the columns present even when no rows are
    # produced; pd.DataFrame([]) on its own would have no columns at all.
    output_columns = [
        'language',
        'sentence',
        'n',
        'vertex',
        'degree',
        'closeness',
        'betweenness',
        'pagerank',
        'is_root',
    ]
    all_rows = []

    for _, row in data.iterrows():
        language = row['language']
        sentence_id = row['sentence']
        n = row['n']
        edgelist = parse_edgelist(row['edgelist'])
        root = int(row['root'])

        # Compute centrality metrics using the provided function
        cent_dict = centralities(edgelist)

        # Create rows for each vertex
        for vertex, (degree, closeness, betweenness, pagerank) in cent_dict.items():
            all_rows.append({
                'language': language,
                'sentence': sentence_id,
                'n': n,
                'vertex': vertex,
                'degree': degree,
                'closeness': closeness,
                'betweenness': betweenness,
                'pagerank': pagerank,
                'is_root': 1 if vertex == root else 0,
            })

    return pd.DataFrame(all_rows, columns=output_columns)

## Load Data

In [None]:
# Read the training data through the read_data helper defined above,
# so every CSV load in this notebook goes through one function
train_data = read_data('../data/train.csv')
train_data.head(10)

In [None]:
# Expand the training data into the per-vertex binary classification
# dataset (one row per vertex, labelled by 'is_root'), then preview it
expanded_dataset = generate_binary_classification_dataset(train_data)
expanded_dataset.head(n=10)