Goal: Investigate how graph-structural features of a node influence the performance of classical ML models.

Dataset : 
Citation Network (Cora)
Nodes : papers
Edges : citations
Labels : research topic

Approach : 
Extract the relevant graph features for each node
Perform a stratified (per-class) 80/20 train-test split
Evaluate classical ML models using these features

Future Goals: Study how multi-hop neighbourhood features affect classical ML models

In [84]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler


In [85]:
# Load the per-node feature table (path is relative to the working directory).
# NOTE(review): the preview output shows an "Unnamed: 0" column, which looks like
# a row index that was written into the CSV -- confirm, and consider index_col=0.
NODE_DATASET_PATH = "data/node_dataset.csv"
df = pd.read_csv(NODE_DATASET_PATH)
df.head()

Unnamed: 0,node_id,in_degree,out_degree,clustering_coefficient,triangle_count,betweenness_centrality,closeness_centrality,eigenvector_centrality,mean_neighbor_degree,max_neighbor_degree,mean_neighbor_clustering,mean_neighbor_betweenness,mean_neighbor_closeness,label
0,1033,2,3,0.1,1,0.00243,0.18171,0.052207,39.6,168,0.087736,0.048188,0.175865,1
1,35,166,3,0.011406,160,0.232488,0.222769,0.6543,5.178571,30,0.375302,0.002789,0.182459,1
2,103482,5,1,0.133333,2,0.00451,0.186238,0.050281,30.666667,168,0.201901,0.040584,0.182574,2
3,103515,9,2,0.163636,9,0.001681,0.180501,0.072508,20.0,168,0.34108,0.022325,0.173767,1
4,1050679,0,4,0.166667,1,0.024078,0.207347,0.050567,63.0,168,0.049125,0.095707,0.207885,1


In [86]:
# Columns whose names describe the node itself (degree, clustering, centrality).
node_feature_cols = [
    "in_degree",
    "out_degree",
    "clustering_coefficient",
    "triangle_count",
    "betweenness_centrality",
    "closeness_centrality",
    "eigenvector_centrality",
]

# Columns whose names describe aggregates over the node's neighbours.
neighbor_feature_cols = [
    "mean_neighbor_degree",
    "max_neighbor_degree",
    "mean_neighbor_clustering",
    "mean_neighbor_betweenness",
    "mean_neighbor_closeness",
]

# Model inputs, in a fixed order; "node_id" and "label" are deliberately excluded.
feature_cols = node_feature_cols + neighbor_feature_cols


In [87]:
train_frac = 0.8
random_seed = 42  # fixed seed so every run reproduces the same shuffles
np.random.seed(random_seed)

# Hand-rolled stratified split: shuffle each label's rows independently, then
# cut every shuffled group at the same training fraction so both splits keep
# roughly the original class proportions.
shuffled_by_class = [
    df.loc[df["label"] == label].sample(frac=1, random_state=random_seed)
    for label in df["label"].unique()
]
cut_points = [int(len(group) * train_frac) for group in shuffled_by_class]

train_rows = [group.iloc[:cut] for group, cut in zip(shuffled_by_class, cut_points)]
test_rows = [group.iloc[cut:] for group, cut in zip(shuffled_by_class, cut_points)]

# Shuffle once more after concatenating so rows are no longer grouped by label.
train_df = pd.concat(train_rows).sample(frac=1, random_state=random_seed)
test_df = pd.concat(test_rows).sample(frac=1, random_state=random_seed)

print("Train size:", train_df.shape)
print("Test size:", test_df.shape)

# Optional sanity check -- fraction of each label in the two splits:
# print("\nTrain label distribution:")
# print(train_df["label"].value_counts(normalize=True))
# print("\nTest label distribution:")
# print(test_df["label"].value_counts(normalize=True))

Train size: (2163, 14)
Test size: (545, 14)


In [88]:
# Pull the feature matrix and label vector out of each split as NumPy arrays.
X_train, y_train = train_df[feature_cols].to_numpy(), train_df["label"].to_numpy()
X_test, y_test = test_df[feature_cols].to_numpy(), test_df["label"].to_numpy()

# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits so no test information leaks into it.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [89]:
# Logistic regression on the standardized features (reported below as
# "SoftMax Regression"); max_iter is set very high so lbfgs is not cut short.
softmaxreg = LogisticRegression(solver="lbfgs", max_iter=100000)
y_pred_lr = softmaxreg.fit(X_train_scaled, y_train).predict(X_test_scaled)

# Score the held-out split.
lr_accuracy = accuracy_score(y_test, y_pred_lr)
lr_macro_f1 = f1_score(y_test, y_pred_lr, average="macro")
lr_confusion = confusion_matrix(y_test, y_pred_lr)

print("SoftMax Regression")
print("Accuracy:", lr_accuracy)
print("Macro F1:", lr_macro_f1)
print("Confusion Matrix:\n", lr_confusion)


SoftMax Regression
Accuracy: 0.42568807339449544
Macro F1: 0.29004670851284475
Confusion Matrix:
 [[  0   2  44   8   0   0   6]
 [  0  63  18   1   2   0   0]
 [  1   5 136   7   6   0   9]
 [  0   0  67  13   0   0   6]
 [  0   4  24   0  13   0   3]
 [  0   0  25   5   0   0   6]
 [  0   3  57   4   0   0   7]]


In [90]:
# Gaussian Naive Bayes baseline, trained on the standardized features.
nb = GaussianNB().fit(X_train_scaled, y_train)
y_pred_nb = nb.predict(X_test_scaled)

# Score the held-out split.
nb_accuracy = accuracy_score(y_test, y_pred_nb)
nb_macro_f1 = f1_score(y_test, y_pred_nb, average="macro")
nb_confusion = confusion_matrix(y_test, y_pred_nb)

print("Naive Bayes")
print("Accuracy:", nb_accuracy)
print("Macro F1:", nb_macro_f1)
print("Confusion Matrix:\n", nb_confusion)


Naive Bayes
Accuracy: 0.24220183486238533
Macro F1: 0.21360376417769453
Confusion Matrix:
 [[ 3  1  2 35  0 15  4]
 [34 27  1  7  7  8  0]
 [ 6 18 16 61 14 41  8]
 [ 4  0  3 56  2 20  1]
 [ 6  4  4 10  8 10  2]
 [ 2  0  2  9  1 18  4]
 [ 7  1  3 30  1 25  4]]


In [91]:
# Random forest baseline. It is trained on the raw (unscaled) features; the
# scaled copies are used only by the other two models.
rf = RandomForestClassifier(
    n_estimators=3000,
    max_depth=None,  # no explicit depth cap
    random_state=random_seed,  # reuse the notebook-wide seed (42) instead of a second literal
    # Was n_jobs=-10. Values below -1 mean "n_cpus + 1 + n_jobs" workers, so on a
    # machine with <= 10 cores the 3000 trees were built by a single worker.
    # -1 uses every available core; predictions do not depend on n_jobs.
    n_jobs=-1,
)

rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

print("Random Forest")
print("Accuracy:", accuracy_score(y_test, y_pred_rf))
print("Macro F1:", f1_score(y_test, y_pred_rf, average="macro"))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_rf))


Random Forest
Accuracy: 0.6220183486238532
Macro F1: 0.6097502665198373
Confusion Matrix:
 [[ 27   1  22   3   0   0   7]
 [  4  73   4   0   3   0   0]
 [  4   5 121  13   6   0  15]
 [  1   1  27  47   2   1   7]
 [  2   5  11   0  24   0   2]
 [  0   2   9   1   0  17   7]
 [  2   3  29   6   1   0  30]]
