# Preprocessing

In [None]:
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import ast
import csv

In [None]:
def parse_edgelist(edgelist_str):
    """
    - Turn the textual edge list into Python objects
    - The string is evaluated as a Python literal with ast.literal_eval,
      e.g. "[(7, 2), (1, 7)]" -> [(7, 2), (1, 7)]
    """
    edges = ast.literal_eval(edgelist_str)
    return edges

In [None]:
def centralities(edgelist):
    """
    Compute per-vertex centrality features for the graph built from an edge list.

    - edgelist is a list of node pairs e.g. [(7,2),(1,7),(1,9),...]
    - returns a dictionary of vertex -> (closeness, betweenness, pagerank,
      load, eigenvector, is_articulation), where is_articulation is 1 if the
      vertex is an articulation point of the graph and 0 otherwise
    """
    T = nx.from_edgelist(edgelist)

    # Degree centrality and eccentricity are currently not part of the
    # feature set.
    cc = nx.closeness_centrality(T)
    bc = nx.betweenness_centrality(T)
    pc = nx.pagerank(T)
    lc = nx.load_centrality(T)
    # Generous iteration cap for the eigenvector power iteration.
    eic = nx.eigenvector_centrality(T, max_iter=3000)

    # A set gives O(1) membership tests; scanning a list once per vertex
    # made the final comprehension needlessly quadratic.
    ap = set(nx.articulation_points(T))

    return {
        v: (cc[v], bc[v], pc[v], lc[v], eic[v], 1 if v in ap else 0)
        for v in T
    }

In [None]:
def generate_features(data):
    """
    Return a copy of the dataset extended with derived feature columns.

    The 'language' column is one-hot encoded into integer 'lang_<value>'
    columns appended after the existing ones. The first category is dropped
    because it is implied when all remaining indicator columns are 0.
    The input frame is left untouched.
    """
    base = data.copy()
    language_dummies = pd.get_dummies(
        base['language'],
        prefix='lang',
        drop_first=True,
        dtype=int,
    )
    return pd.concat([base, language_dummies], axis=1)

In [None]:
def generate_binary_classification_dataset(data, is_test=False):
    """
    Expand a frame of sentences (one edge list each) into one row per vertex.

    - data needs the columns 'language', 'sentence', 'n' and 'edgelist',
      plus 'root' for training data or 'id' for test data
    - every output row carries the sentence-level columns, the vertex and
      its centrality features
    - training rows get 'is_root' (1 when the vertex is the sentence root,
      otherwise 0); test rows get the original 'id' instead
    - the result is passed through generate_features before being returned
    """
    # Order matches the tuple layout returned by centralities().
    feature_names = (
        'closeness',
        'betweenness',
        'pagerank',
        'load',
        'eigenvector',
        'is_articulation',
    )
    records = []

    for _, sentence_row in data.iterrows():
        # Columns shared by every vertex of this sentence.
        shared = {
            'language': sentence_row['language'],
            'sentence': sentence_row['sentence'],
            'n': sentence_row['n'],
        }
        edges = parse_edgelist(sentence_row['edgelist'])
        root_vertex = None if is_test else int(sentence_row['root'])

        for node, scores in centralities(edges).items():
            record = {**shared, 'vertex': node, **dict(zip(feature_names, scores))}

            if not is_test:
                record['is_root'] = int(node == root_vertex)
            else:
                record['id'] = sentence_row['id']

            records.append(record)

    return generate_features(pd.DataFrame(records))

## Load Data

In [None]:
# Read the raw training data (path is relative to the working directory)
train_data = pd.read_csv('../data/train.csv')

# Preview the first 10 rows
train_data.head(10)

In [None]:
# Generate the binary classification dataset from the training data
# (one row per vertex, labelled with is_root)
train_processed = generate_binary_classification_dataset(train_data)

# Save the transformed data to file, without the DataFrame index
train_processed.to_csv('../data/train_processed.csv', index=False)

# Preview the first 10 rows
train_processed.head(10)

In [None]:
# Read the raw test data (path is relative to the working directory)
test_data = pd.read_csv('../data/test.csv')

# Generate the binary classification dataset from the test data
# (is_test=True: rows carry the original 'id' instead of an is_root label)
test_processed = generate_binary_classification_dataset(test_data, True)

# Save the transformed data to file, without the DataFrame index
test_processed.to_csv('../data/test_processed.csv', index=False)

# Preview the first 10 rows
test_processed.head(10)