In [23]:
import numpy as np
## Util functions
def get_dept_from_roll(roll):
    DEPTS = {"01":" Aerospace Engineering ","02":' Chemical Engineering ',"04":' Civil Engineering ',
             "05":' Computer Science & Engineering ',"07":' Electrical Engineering ',
             "10":' Mechanical Engineering ',"11":' Metallurgical Engineering & Materials Science ',
             "26":' Engineering Physics ',"03":' Chemistry ',"08":' Economics (HSS) ',
             "13":' Industrial Design Centre ',"17":' Energy Science and Engineering ',
             "35":' Random ',"30":' Random ',"09":' Random ','':" Random ","18":' Random ',"06":' Random ',
             "31":' Random '} 
    try:
        return DEPTS[roll[3:5]]
    except:
        print(roll)
        assert False

def get_dict_stats(d1,d2=None,num=1000):
    d3 = {}
    for k in d1:
        if d2 is not None:
            d3[k] = d1[k]/(d2[k] + 1e-10)
        else:
            d3[k] = np.median(d1[k])
    d3 = (sorted(d3.items(), key=lambda item: item[1],reverse=True))
    for i in range(min(num,len(d3))):
        print(d3[i][0],d3[i][1])


In [2]:
## Reading and loading the data
import json
import numpy as np
import matplotlib.pyplot as plt 

file = open("raw_data.json")
data = json.load(file)


## Most popular person, simply looking at who has written the most
written_posts = {}
about_posts = {}
rolls = data.keys()
for id in data:
    written_posts[id] = len(data[id]['Posts Auth'])
for id in data:
    about_posts[id] = len(data[id]['Posts Subj'])

about_posts_sort = (sorted(about_posts.items(), key=lambda item: item[1],reverse=True))
written_posts_sort = (sorted(written_posts.items(), key=lambda item: item[1],reverse=True))
print("--------------------------")
print("Most written about")
print([x[0] for x in about_posts_sort[:10]])
print("Most authored")
print([x[0] for x in written_posts_sort[:10]])


## Average number of posts written and about
written = [written_posts[x] for x in rolls]
about   = [about_posts[x] for x in rolls]
print(f"Average posts written - {np.mean(written)},Average posts about - {np.mean(about)}")

## Scatter plot of posts written vs posts about
plt.scatter(written,about)
plt.show()

--------------------------
Most written about
['17D110013', '170110054', '170050056', '170010039', '170070028', '170070012', '17D100013', '170020025', '170020082', '170050094']
Most authored
['170110054', '17D110013', '170050056', '170110049', '170050094', '170110089', '17D070049', '17B080016', '170020025', '170020006']
Average posts written - 5.809071729957806,Average posts about - 6.236286919831223


<Figure size 640x480 with 1 Axes>

In [26]:

## Most popular department (by looking at avg number of subj posts)
dept_subj  = {}
dept_total = {}

for id in data:
	dept = data[id]["Department"] 
	if dept is None:
		dept = get_dept_from_roll(id)
	if dept not in dept_subj:
		dept_subj[dept] = len(data[id]["Posts Subj"])
		dept_total[dept] = 1
	else:
		dept_subj[dept] += len(data[id]["Posts Subj"])
		dept_total[dept] += 1

print("Department        Number of posts about")
print("---------------------------------------")
get_dict_stats(dept_subj,dept_total)

	
## Most popular department (by looking at avg number of authored posts)
dept_subj  = {}
dept_total = {}

for id in data:
	dept = data[id]["Department"] 
	if dept is None:
		dept = get_dept_from_roll(id)
	if dept not in dept_subj:
		dept_subj[dept] = len(data[id]["Posts Auth"])
		dept_total[dept] = 1
	else:
		dept_subj[dept] += len(data[id]["Posts Auth"])
		dept_total[dept] += 1
print("\n\n")
print("Department        Number of posts authored")
print("---------------------------------------")
get_dict_stats(dept_subj,dept_total)



# Now we compute the medians instead
dept_subj  = {}
dept_total = {}

for id in data:
	dept = data[id]["Department"] 
	if dept is None:
		dept = get_dept_from_roll(id)
	if dept not in dept_subj:
		dept_subj[dept] = [len(data[id]["Posts Subj"])]
	else:
		dept_subj[dept].append(len(data[id]["Posts Subj"]))
print("\n\n")
print("Department        Number of posts about")
print("---------------------------------------")
get_dict_stats(dept_subj)



## Both of these seem to suggest that the CSE department is the most popular!

Department        Number of posts about
---------------------------------------
 Computer Science & Engineering  8.475409836058626
 Economics (HSS)  8.22222222219177
 Electrical Engineering  7.473684210520697
 Chemical Engineering  6.983606557371325
 Mechanical Engineering  6.19594594594176
 Civil Engineering  5.902912621353492
 Metallurgical Engineering & Materials Science  5.688679245277652
 Chemistry  5.5666666666481115
 Physics  5.333333333155555
 Aerospace Engineering  4.999999999991803
 Mathematics  3.799999999924
 Engineering Physics  3.3846153846067057
 Energy Science and Engineering  1.4285714285663267
 Computer Centre  0.9999999999
 Industrial Design Centre  0.0999999999995



Department        Number of posts authored
---------------------------------------
 Computer Science & Engineering  8.581967213107719
 Economics (HSS)  7.629629629601372
 Electrical Engineering  7.473684210520697
 Chemical Engineering  6.106557377044175
 Metallurgical Engineering & Materials Science  5.

In [29]:
## Looking at how many edges are within the department and how many outside
id_dept_map = {}
for id in data:
    id_dept_map[id] = data[id].get('Department') if 'Department' in data[id] else get_dept_from_roll(id)

dept_edges = {}
intra_dept_edges = {}
for id in data:
    dept = data[id].get('Department') if 'Department' in data[id] else get_dept_from_roll(id)
    # We only consider subject as the edge, since each subject has some writer as well
    if dept not in dept_edges:
        dept_edges[dept] = len(data[id]["Posts Subj"])
        intra_dept_edges[dept] = 0
    else:
        dept_edges[dept] += len(data[id]["Posts Subj"])
    for post in data[id]["Posts Subj"]:
        new_dept = id_dept_map.get(post["Author"]) if post["Author"] in id_dept_map else get_dept_from_roll(post["Author"])
        if new_dept == dept:
            intra_dept_edges[dept] += 1


print("Department           Fraction of internal edges")
print("-----------------------------------------------")
get_dict_stats(intra_dept_edges,dept_edges)


Department           Fraction of internal edges
-----------------------------------------------
 Economics (HSS)  0.6081081081078342
 Computer Science & Engineering  0.5434782608695115
 Electrical Engineering  0.5010309278349999
 Metallurgical Engineering & Materials Science  0.4929328621907256
 Civil Engineering  0.481543624160993
 Chemical Engineering  0.41445783132525127
 Chemistry  0.4121621621618837
 Aerospace Engineering  0.38620689655159096
 Mechanical Engineering  0.3363028953229024
 Engineering Physics  0.15315315315301517
 Energy Science and Engineering  0.02631578947361496
None 0.0
 Mathematics  0.0
 Computer Centre  0.0
 Physics  0.0
 Industrial Design Centre  0.0
