## Libraries

In [3]:
import networkx as nx
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score

## Load network and node attributes

In [None]:
# Load network
G = nx.read_edgelist('edges_train.edgelist', delimiter=',', nodetype=int)

# Load node attributes
attributes = pd.read_csv('attributes.csv', header=None, names=['attribute'])

# Build mapping: node_id -> attribute_value
attr_map = dict(enumerate(attributes['attribute'].values))

nx.set_node_attributes(G, attr_map, name='attribute')

print(f"Number of nodes: {G.number_of_nodes()}")
print(f"Number of edges: {G.number_of_edges()}")

Number of nodes: 1500
Number of edges: 6600


## Feature engineering

In [10]:
def extract_features(G, u, v):
    # Structural features
    common_neighbors = len(list(nx.common_neighbors(G, u, v)))
    degree_u = G.degree(u)
    degree_v = G.degree(v)

    # Same community / attribute
    attr_u = G.nodes[u]['attribute']
    attr_v = G.nodes[v]['attribute']
    same_attr = int(attr_u == attr_v)

    return [common_neighbors, degree_u, degree_v, same_attr]


## Get positive and negative example

In [11]:
# Positive examples (existing edges)
positive_edges = list(G.edges())
y_pos = np.ones(len(positive_edges))

# Negative examples (non-existent edges)
non_edges = list(nx.non_edges(G))
np.random.shuffle(non_edges)
negative_edges = non_edges[:len(positive_edges)]
y_neg = np.zeros(len(negative_edges))

# Combine
edges = positive_edges + negative_edges
y = np.concatenate([y_pos, y_neg])

# Compute features
X = np.array([extract_features(G, u, v) for u, v in edges])


## Train and test split

In [12]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

## Train supervised model

In [13]:
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_pred))

Accuracy: 0.822979797979798
ROC-AUC: 0.822979797979798


## Make predictions using solutionInput.csv and generate output for Kaggle competition

In [None]:
test_pairs = pd.read_csv('solutionInput.csv')

# Extract features for test pairs
X_final = np.array([extract_features(G, int(u), int(v)) 
                    for u, v in zip(test_pairs['int1'], test_pairs['int2'])])

preds = model.predict(X_final)

# Save output for kaggle
output = pd.DataFrame({
    'ID': test_pairs.index,
    'Predicted': preds.astype(int)
})
output.to_csv('solutionOutput.csv', index=False)


Saved solutionOutput.csv ✅
