# Tutorial:2

## Classification of Graph Data using Machine Learning Algorithm based on Graph features

In [1]:
# importing required libraries

import numpy as np
import pandas as pd
import networkx as nx

## Example Graph using NetworkX:

**Karate Club Graph:**

Zachary's karate club is a social network of a university karate club, described in the paper "An Information Flow Model for Conflict and Fission in Small Groups" by Wayne W. Zachary.


Ref: http://vlado.fmf.uni-lj.si/pub/networks/data/Ucinet/UciData.htm#zachary

https://networkx.org/documentation/stable/auto_examples/graph/plot_karate_club.html

In [2]:
# Load the Zachary karate club graph bundled with NetworkX.
# `kg` is the graph that the labels, degrees and features below are read from.

kg = nx.karate_club_graph()

The karate club members are divided into two classes:
- Mr. Hi
- Officer

In [3]:
# Build {node id: club name} from the 'club' attribute stored on each node,
# then display it. Nodes without a 'club' attribute are skipped.

club_labels = {
    node: attrs['club']
    for node, attrs in kg.nodes(data=True)
    if 'club' in attrs
}
club_labels

{0: 'Mr. Hi',
 1: 'Mr. Hi',
 2: 'Mr. Hi',
 3: 'Mr. Hi',
 4: 'Mr. Hi',
 5: 'Mr. Hi',
 6: 'Mr. Hi',
 7: 'Mr. Hi',
 8: 'Mr. Hi',
 9: 'Officer',
 10: 'Mr. Hi',
 11: 'Mr. Hi',
 12: 'Mr. Hi',
 13: 'Mr. Hi',
 14: 'Officer',
 15: 'Officer',
 16: 'Mr. Hi',
 17: 'Mr. Hi',
 18: 'Officer',
 19: 'Mr. Hi',
 20: 'Officer',
 21: 'Mr. Hi',
 22: 'Officer',
 23: 'Officer',
 24: 'Officer',
 25: 'Officer',
 26: 'Officer',
 27: 'Officer',
 28: 'Officer',
 29: 'Officer',
 30: 'Officer',
 31: 'Officer',
 32: 'Officer',
 33: 'Officer'}

In [4]:
# Degree of every node, taken from the graph's own degree view
# (maps node id -> number of incident edges).

degree = kg.degree
degree

DegreeView({0: 16, 1: 9, 2: 10, 3: 6, 4: 3, 5: 4, 6: 4, 7: 4, 8: 5, 9: 2, 10: 3, 11: 1, 12: 2, 13: 5, 14: 2, 15: 2, 16: 2, 17: 2, 18: 2, 19: 3, 20: 2, 21: 2, 22: 2, 23: 5, 24: 3, 25: 3, 26: 2, 27: 4, 28: 3, 29: 4, 30: 4, 31: 6, 32: 12, 33: 17})

## Getting different features of the graph

These are some of the node-level features we compute from this graph for the classification task:
- clustering coefficient
- degree centrality 
- closeness centrality
- betweenness centrality
- eigenvector centrality
- pagerank
- hubs
- authorities

In [5]:
# Compute one score per node for each feature. The results are dicts keyed by
# node id, which is what the later `.values()` calls rely on.

# clustering coefficient
cl_coef = nx.clustering(kg) 

# degree centrality
dc = nx.degree_centrality(kg)

# closeness centrality
cc = nx.closeness_centrality(kg)

# betweenness centrality
bc = nx.betweenness_centrality(kg)

# eigenvector centrality
eigc = nx.eigenvector_centrality(kg)

# pagerank
pagerank = nx.pagerank(kg)


# hubs and authorities: nx.hits returns the pair (hub scores, authority scores)
hub,auth = nx.hits(kg)


In [25]:
# Inspect the PageRank score computed for each node (node id -> score)
print(pagerank)

{0: 0.09700181758983709, 1: 0.05287839103742701, 2: 0.057078423047636745, 3: 0.03586064322306479, 4: 0.021979406974834498, 5: 0.02911334166344221, 6: 0.02911334166344221, 7: 0.024490758039509182, 8: 0.029765339186167028, 9: 0.014308950284462801, 10: 0.021979406974834498, 11: 0.009564916863537148, 12: 0.014645186487916191, 13: 0.029536314977202986, 14: 0.014535161524273825, 15: 0.014535161524273825, 16: 0.016785378110253487, 17: 0.014558859774243493, 18: 0.014535161524273825, 19: 0.019604416711937293, 20: 0.014535161524273825, 21: 0.014558859774243493, 22: 0.014535161524273825, 23: 0.03152091531163228, 24: 0.021075455001162945, 25: 0.021005628174745786, 26: 0.015043395360629753, 27: 0.025638803528350497, 28: 0.01957296050943854, 29: 0.02628726283711208, 30: 0.02458933653429248, 31: 0.03715663592267942, 32: 0.07169213006588289, 33: 0.1009179167487121}


In [26]:
# Inspect the HITS hub score computed for each node (node id -> score)
print(hub)

{0: 0.07141272875773573, 1: 0.053427231205172614, 2: 0.06371906453963268, 3: 0.04242273710428976, 4: 0.01526095969815266, 5: 0.015966913494418547, 6: 0.015966913494418547, 7: 0.034343167206797434, 8: 0.0456819251308063, 9: 0.020625667757182626, 10: 0.01526095969815266, 11: 0.01061789150852051, 12: 0.01692545078543599, 13: 0.04549486406600547, 14: 0.020370345842716076, 15: 0.020370345842716076, 16: 0.004748031841562519, 17: 0.018561637031907358, 18: 0.020370345842716076, 19: 0.02971333389111539, 20: 0.020370345842716076, 21: 0.018561637031907358, 22: 0.020370345842716076, 23: 0.030156497528902444, 24: 0.011460952230139869, 25: 0.01189366439609368, 26: 0.015182734341447207, 27: 0.02681349412708363, 28: 0.0263315057833753, 29: 0.027111539646424865, 30: 0.03510623798827733, 31: 0.03837574188047834, 32: 0.06200184647463986, 33: 0.07500294214634279}


In [27]:
# Pull the plain degree numbers out of the (node, degree) pairs of the degree view

degrees = [node_degree for _node, node_degree in degree]


# Collect the per-node values of every feature: one inner list per feature,
# in the order clustering, degree/closeness/betweenness/eigenvector centrality,
# pagerank, hub, authority

data = [
    [*feature.values()]
    for feature in (cl_coef, dc, cc, bc, eigc, pagerank, hub, auth)
]

# The raw degrees go in as one more feature row
data += [degrees]

In [28]:
# Append the class labels as the final row of `data`.
# The guard keeps this cell idempotent: re-running it must not add a second
# labels row, because `data` would then hold more rows than the ten column
# names used when the DataFrame is built.
club_label_row = list(club_labels.values())
if not data or data[-1] != club_label_row:
    data.append(club_label_row)

In [29]:
# Display the assembled rows: one list per feature, with the class labels last
data

[[0.15,
  0.3333333333333333,
  0.24444444444444444,
  0.6666666666666666,
  0.6666666666666666,
  0.5,
  0.5,
  1.0,
  0.5,
  0,
  0.6666666666666666,
  0,
  1.0,
  0.6,
  1.0,
  1.0,
  1.0,
  1.0,
  1.0,
  0.3333333333333333,
  1.0,
  1.0,
  1.0,
  0.4,
  0.3333333333333333,
  0.3333333333333333,
  1.0,
  0.16666666666666666,
  0.3333333333333333,
  0.6666666666666666,
  0.5,
  0.2,
  0.19696969696969696,
  0.11029411764705882],
 [0.48484848484848486,
  0.2727272727272727,
  0.30303030303030304,
  0.18181818181818182,
  0.09090909090909091,
  0.12121212121212122,
  0.12121212121212122,
  0.12121212121212122,
  0.15151515151515152,
  0.06060606060606061,
  0.09090909090909091,
  0.030303030303030304,
  0.06060606060606061,
  0.15151515151515152,
  0.06060606060606061,
  0.06060606060606061,
  0.06060606060606061,
  0.06060606060606061,
  0.06060606060606061,
  0.09090909090909091,
  0.06060606060606061,
  0.06060606060606061,
  0.06060606060606061,
  0.15151515151515152,
  0.090909090

In [30]:
# Convert the list of rows into a NumPy array (one row per feature, one column per node).
# NOTE(review): `data` mixes numeric rows with the string label row, so the array
# presumably ends up with a single string dtype, which would explain the later
# astype(float) step. Confirm.

data_arr = np.array(data)

In [32]:
# Transpose so that every node becomes a row and every feature a column, then
# wrap the result in a DataFrame. Column names follow the order in which the
# rows were added to `data`.
# NOTE: the 'eigenvalue_centrality' column holds the nx.eigenvector_centrality values.

df1 = pd.DataFrame(
    data_arr.transpose(),
    columns=[
        'clustering_coef',
        'degree_centrality',
        'closeness_centrality',
        'betweenness_centrality',
        'eigenvalue_centrality',
        'pagerank',
        'hub',
        'authority',
        'degrees_of_nodes',
        'club_labels',
    ],
)

In [33]:
# Preview the first rows of the assembled table (labels are still club names here)
df1.head()

Unnamed: 0,clustering_coef,degree_centrality,closeness_centrality,betweenness_centrality,eigenvalue_centrality,pagerank,hub,authority,degrees_of_nodes,club_labels
0,0.15,0.4848484848484848,0.5689655172413793,0.4376352813852814,0.3554834941851943,0.097001817589837,0.0714127287577357,0.0714127288087085,16,Mr. Hi
1,0.3333333333333333,0.2727272727272727,0.4852941176470588,0.0539366883116883,0.2659538704545025,0.052878391037427,0.0534272312051726,0.0534272312287039,9,Mr. Hi
2,0.2444444444444444,0.303030303030303,0.559322033898305,0.1436568061568061,0.3171893899684447,0.0570784230476367,0.0637190645396326,0.0637190645558713,10,Mr. Hi
3,0.6666666666666666,0.1818181818181818,0.4647887323943662,0.0119092712842712,0.2111740783205706,0.0358606432230647,0.0424227371042897,0.0424227371061152,6,Mr. Hi
4,0.6666666666666666,0.0909090909090909,0.3793103448275862,0.0006313131313131,0.0759664588165738,0.0219794069748344,0.0152609596981526,0.0152609596922517,3,Mr. Hi


In [34]:
# Encode the target as a binary label: 'Mr. Hi' -> 0, 'Officer' -> 1.
# (This is label encoding of a two-class target, not one-hot encoding.)
# Values that are not one of the two club names are passed through untouched,
# so re-running this cell on already-encoded labels no longer turns every row into 1.

CLUB_LABEL_CODES = {"Mr. Hi": 0, "Officer": 1}
df1["club_labels"] = df1["club_labels"].map(
    lambda label: CLUB_LABEL_CODES.get(label, label)
)

In [35]:
# Cast every column to float so the whole frame is numeric
# (the encoded labels become 0.0 / 1.0)

df1 = df1.astype(float)


In [36]:
# Preview the final, all-float dataframe that is fed to the classifier

df1.head()

Unnamed: 0,clustering_coef,degree_centrality,closeness_centrality,betweenness_centrality,eigenvalue_centrality,pagerank,hub,authority,degrees_of_nodes,club_labels
0,0.15,0.484848,0.568966,0.437635,0.355483,0.097002,0.071413,0.071413,16.0,0.0
1,0.333333,0.272727,0.485294,0.053937,0.265954,0.052878,0.053427,0.053427,9.0,0.0
2,0.244444,0.30303,0.559322,0.143657,0.317189,0.057078,0.063719,0.063719,10.0,0.0
3,0.666667,0.181818,0.464789,0.011909,0.211174,0.035861,0.042423,0.042423,6.0,0.0
4,0.666667,0.090909,0.37931,0.000631,0.075966,0.021979,0.015261,0.015261,3.0,0.0


In [37]:
# (number of rows, number of columns) of the final dataframe
df1.shape

(34, 10)

### Importing Machine learning algorithms for classification 

We are going to use `logistic regression` for classification using `sklearn` library.

In [38]:
# Importing libraries for classification

import sklearn
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import train_test_split

from sklearn.metrics import classification_report, confusion_matrix,accuracy_score

In [39]:
# Separate the feature columns from the target column
X, y = df1.drop(columns=['club_labels']), df1['club_labels']

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)

# Logistic regression with scikit-learn's default hyper-parameters, fitted on the
# training rows only.
# NOTE(review): the features are used as-is; 'degrees_of_nodes' is on a larger
# scale than the other columns. Consider scaling if the model is tuned further.
logisticRegr = LogisticRegression().fit(X_train, y_train)

# Predicted class for every held-out row
y_pred = logisticRegr.predict(X_test)

In [40]:
# Measure model performance on the held-out rows:
# per-class precision, recall, F1 and support

print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

         0.0       0.50      0.67      0.57         3
         1.0       0.67      0.50      0.57         4

    accuracy                           0.57         7
   macro avg       0.58      0.58      0.57         7
weighted avg       0.60      0.57      0.57         7



In [41]:
# Confusion matrix and overall accuracy on the held-out rows, split by a divider line
print(
    confusion_matrix(y_test, y_pred),
    "------",
    accuracy_score(y_test, y_pred),
    sep="\n",
)

[[2 1]
 [2 2]]
------
0.5714285714285714


In [42]:
# True labels of the held-out rows
y_test

15    1.0
19    0.0
27    1.0
26    1.0
8     0.0
24    1.0
21    0.0
Name: club_labels, dtype: float64

In [43]:
# Labels predicted by the model for the held-out rows
y_pred

array([0., 1., 1., 0., 0., 1., 0.])