# High school contact and friendship networks

### Name: Shovan Biswas


Data Source: http://www.sociopatterns.org/datasets/high-school-contact-and-friendship-networks/


This dataset correspond to the contacts and friendship relations between students in a high school in Marseilles, France, in December 2013, as measured through several techniques.

It gives the contacts of the students of nine classes during 5 days in Dec. 2013, as measured by the SocioPatterns infrastructure. The file contains a tab-separated list representing the active contacts during 20-second intervals of the data collection. Each line has the form "t i j Ci Cj", where i and j are the anonymous IDs of the persons in contact, Ci and Cj are their classes, and the interval during which this contact was active. If multiple contacts are active in a given interval, you will see multiple lines starting with the same value of t. Time is measured in seconds.

In [1]:
import pandas as pd
import networkx as nx
import matplotlib.pylab as pyplot
import scipy.stats

import nxviz as nv
from pyvis import network as net
import seaborn as sns
%matplotlib inline

In [2]:
hight_school = pd.read_csv('High-School_data_2013.csv', 
                     delimiter = ' ', 
                     header = None, 
                     names = ['timestamp','node_from', 'node_to', 'classname_from', 'classname_to'])


In [3]:
# built the graph from the pandas dataset school
G = nx.from_pandas_edgelist(hight_school, 'node_from', 'node_to', 'timestamp')

In [4]:
numNodes = len(G.nodes())
print("# Nodes: ", numNodes)
print("# Edges: ", len(G.edges()))

# print(G.nodes())

#return the list of nodes and their degree of centrality <nx.degree_centrality(G)>
y = list(nx.degree_centrality(G).items())
y.sort
i = 0
for n in y:
    print (n)
    i = i + 1
    if i > 6:
        break

dfObj = pd.DataFrame(list(nx.degree_centrality(G).items()))
dfObj.head(7)


# Nodes:  327
# Edges:  5818
(454, 0.07975460122699386)
(640, 0.08895705521472393)
(1, 0.0705521472392638)
(939, 0.11042944785276074)
(185, 0.13190184049079756)
(258, 0.10429447852760737)
(55, 0.08588957055214724)


Unnamed: 0,0,1
0,454,0.079755
1,640,0.088957
2,1,0.070552
3,939,0.110429
4,185,0.131902
5,258,0.104294
6,55,0.08589


In [5]:
print("Number of Nodes: " + str(nx.number_of_nodes(G)))
print("Number of Edges: " + str(nx.number_of_edges(G)))

#nx.is_connected(G)
#nx.connected_components(G)

Number of Nodes: 327
Number of Edges: 5818


Let us examine the density, diameter and max degree of centrality

In [6]:
print("Density: " + str(nx.density(G)))
print("Diameter: " + str(nx.diameter(G)))
print("Max degree_centrality: ", max(list(nx.degree_centrality(G).values())))

Density: 0.10915367441511416
Diameter: 4
Max degree_centrality:  0.2668711656441718


In [7]:
for n in G.nodes():
    print (str(n) + ' ' + str(len(list(G.neighbors(n)))))

dfObj = pd.DataFrame(list(nx.degree_centrality(G).items()))
dfObj.head(7)

454 26
640 29
1 23
939 36
185 43
258 34
55 28
170 33
9 69
453 27
45 62
14 38
190 48
400 20
637 26
255 30
275 67
176 61
533 22
116 39
151 39
866 51
280 29
484 24
243 37
687 29
54 41
364 41
374 36
295 34
441 38
101 45
425 32
47 15
241 35
179 59
202 49
63 47
564 24
577 17
265 37
494 30
443 33
209 18
843 33
222 19
205 39
894 33
1359 69
1383 16
376 67
638 27
1238 33
1260 21
487 40
984 40
226 29
353 44
1342 32
1518 84
122 52
1067 25
1324 35
70 37
132 43
779 37
279 38
908 22
510 14
545 39
634 43
1332 77
1401 45
582 34
605 72
252 41
3 56
884 51
339 29
691 41
869 39
72 47
954 41
160 43
117 40
346 29
111 38
124 36
276 43
621 35
39 35
871 38
694 42
778 56
513 30
236 19
883 30
1594 38
1828 29
1214 41
196 51
201 33
245 60
390 45
938 42
923 29
106 87
272 76
753 32
486 20
531 29
254 53
382 41
119 42
240 41
447 40
649 39
1204 37
466 20
841 26
199 38
674 61
857 24
945 30
1218 42
1512 51
653 55
502 33
587 16
626 36
420 35
504 32
311 40
267 47
177 52
480 39
771 10
312 39
612 28
450 29
89 42
322 52
520 15

Unnamed: 0,0,1
0,454,0.079755
1,640,0.088957
2,1,0.070552
3,939,0.110429
4,185,0.131902
5,258,0.104294
6,55,0.08589


Calculate the eigenvector centrality and degree centrality for each node

In [7]:
# degree centrality
deg_centrality = nx.degree_centrality(G)

# eigenvector centrality
eigen_centrality = nx.eigenvector_centrality(G)

In [8]:
# Add centrality to dataset
deg_centrality = pd.DataFrame.from_dict(deg_centrality, orient = 'index').rename(columns={0:'degree_centrality'})
# print(deg_centrality)

# since we had to use orient as index for the row indicies, make a column out of the nodes (row indicies)
deg_centrality['node'] = deg_centrality.index
print(deg_centrality)

# drop the row indicies
deg_centrality.reset_index(drop=True, inplace=True)
print(deg_centrality)

# sort by node id this will be easier to import the classnames for analysis
deg_centrality = deg_centrality.sort_values(by=['node'])
print(deg_centrality)

     degree_centrality  node
454           0.079755   454
640           0.088957   640
1             0.070552     1
939           0.110429   939
185           0.131902   185
..                 ...   ...
34            0.104294    34
239           0.042945   239
62            0.006135    62
452           0.116564   452
445           0.030675   445

[327 rows x 2 columns]
     degree_centrality  node
0             0.079755   454
1             0.088957   640
2             0.070552     1
3             0.110429   939
4             0.131902   185
..                 ...   ...
322           0.104294    34
323           0.042945   239
324           0.006135    62
325           0.116564   452
326           0.030675   445

[327 rows x 2 columns]
     degree_centrality  node
2             0.070552     1
76            0.171779     3
202           0.107362     4
8             0.211656     9
11            0.116564    14
..                 ...   ...
187           0.116564  1805
319           0.095092  

In [9]:
# Do the same for the eigenvector centrality

eigen_centrality = pd.DataFrame.from_dict(eigen_centrality, orient = 'index').rename(columns={0: 'eigenvector_centrality'})
# print(eigen_centrality)

# since we had to use orient as index for the row indicies, make a column out of the nodes (row indicies)
eigen_centrality['node'] = eigen_centrality.index
# print(eigen_centrality)

# drop the row indicies
eigen_centrality.reset_index(drop=True, inplace=True)
# print(eigen_centrality)

# sort by node id so this way we can import the classnames and gender easily
eigen_centrality = eigen_centrality.sort_values(by=['node'])
print(eigen_centrality)

     eigenvector_centrality  node
2                  0.036727     1
76                 0.081243     3
202                0.031000     4
8                  0.117096     9
11                 0.055661    14
..                      ...   ...
187                0.056646  1805
319                0.045996  1819
98                 0.043179  1828
303                0.023923  1870
193                0.052882  1894

[327 rows x 2 columns]


In [12]:
# Add gender/classname columns
metadata_school = pd.read_csv('HighSchoolMetadata.txt', delimiter = '\t', header = None,
                              names = ['node', 'classname', 'gender'])
print(metadata_school)
metadata_school = metadata_school.sort_values(by=['node'])
print(metadata_school)

     node classname   gender
0     650     2BIO1        F
1     498     2BIO1        F
2     627     2BIO1        F
3     857     2BIO1        F
4     487     2BIO1        F
..    ...       ...      ...
324    58       PC*  Unknown
325   209       PC*  Unknown
326   979     2BIO2  Unknown
327   205     2BIO3        M
328   520        MP        F

[329 rows x 3 columns]
     node classname gender
71      1     2BIO3      M
264     2       PC*      M
47      3     2BIO2      M
203     4      PSI*      M
214     9        PC      M
..    ...       ...    ...
150  1805      MP*2      M
139  1819      MP*2      M
168  1828      MP*2      M
162  1870      MP*2      M
153  1894      MP*2      F

[329 rows x 3 columns]


In [13]:
deg_centrality['classname'] = metadata_school['classname']
print(deg_centrality)
print(metadata_school['classname'])
eigen_centrality['classname'] = metadata_school['classname']
deg_centrality['gender'] = metadata_school['gender']
eigen_centrality['gender'] = metadata_school['gender']

print(deg_centrality)
# print(eigen_centrality)

     degree_centrality  node classname gender
2             0.070552     1     2BIO1      F
76            0.171779     3     2BIO3      F
202           0.107362     4      PSI*      F
8             0.211656     9     2BIO1      F
11            0.116564    14     2BIO1      F
..                 ...   ...       ...    ...
187           0.116564  1805      PSI*      F
319           0.095092  1819        MP      F
98            0.088957  1828     2BIO3      M
303           0.046012  1870        MP      M
193           0.122699  1894      PSI*      F

[327 rows x 4 columns]
71     2BIO3
264      PC*
47     2BIO2
203     PSI*
214       PC
       ...  
150     MP*2
139     MP*2
168     MP*2
162     MP*2
153     MP*2
Name: classname, Length: 329, dtype: object
     degree_centrality  node classname gender
2             0.070552     1     2BIO1      F
76            0.171779     3     2BIO3      F
202           0.107362     4      PSI*      F
8             0.211656     9     2BIO1      F
11     

In [None]:
deg_centrality['classname'] = metadata_school['classname']
print(deg_centrality)
print(metadata_school['classname'])
eigen_centrality['classname'] = metadata_school['classname']
deg_centrality['gender'] = metadata_school['gender']
eigen_centrality['gender'] = metadata_school['gender']

print(deg_centrality)
# print(eigen_centrality)