In [177]:
from pathlib import Path
from typing import Dict
import os
import json
import numpy as np
import pandas as pd

In [178]:
cwd = Path().resolve()
result_path = cwd.joinpath('result')

majors = os.listdir(result_path)

compulstory = set()
elective = set()

for major in majors:
    major_path = result_path.joinpath(major)
    for wajib in os.listdir(major_path.joinpath('wajib')):
        compulstory.add(wajib)

    for pilihan in os.listdir(major_path.joinpath('pilihan')):
        elective.add(pilihan)

print(f"Majors count {len(majors)}")
print(f"Compulsory count {compulstory.__len__()}")
print(f"Elective count {len(elective)}")

print(f"Total {len(majors) + len(compulstory) + len(elective)}")

Majors count 50
Compulsory count 1584
Elective count 1120
Total 2754


In [179]:
content = None

with open("major_subjects_keyword.json") as r:
    content = json.load(r)

keywords_length = []
extracted_keywords = []
subjects = []

for major in content:
    major_keyword_counter = 0
    for subject in major["subjects"]:
        for keyword in subject['keywords']:
            keywords_length.append(len(keyword))
        keywords = len(subject['keywords'])
        major_keyword_counter+=keywords
        extracted_keywords.append(keywords)

    subjects.append(major_keyword_counter)

print("SUBJECT KEYWORDS")
keywords_pd = pd.DataFrame(np.array(extracted_keywords))
keywords_pd.describe()


# print(f"Keywords count {len(extracted_keywords)}")
# print(f"Subjects (not unique) count {len(subjects)}")
# print(f"Average keywords per subject")


SUBJECT KEYWORDS


Unnamed: 0,0
count,2810.0
mean,9.204626
std,6.519608
min,0.0
25%,5.0
50%,8.0
75%,12.0
max,61.0


In [180]:
keywords_pd.sum()

0    25865
dtype: int64

In [181]:
print("Major keywords")
majors_pd = pd.DataFrame(np.array(subjects))
majors_pd.describe()


Major keywords


Unnamed: 0,0
count,50.0
mean,517.3
std,154.671508
min,226.0
25%,427.0
50%,506.5
75%,600.75
max,986.0


In [182]:
majors_pd.sum()

0    25865
dtype: int64

In [183]:
print(keywords)

keywords_length_pd = pd.DataFrame(np.array(keywords_length))
keywords_length_pd.describe()

13


Unnamed: 0,0
count,25865.0
mean,29.879992
std,24.620447
min,5.0
25%,16.0
50%,22.0
75%,34.0
max,639.0


In [184]:
weight = []
weight_normalized = []

with open('weight.txt') as r:
    content = r.read().split('\n')

    for line in content:
        weight.append(float(line.split(' ')[-1]))

with open('weight_normalized.txt') as r:
    content = r.read().split('\n')

    for line in content:
        weight_normalized.append(float(line.split(' ')[-1]))

In [185]:
weight_pd = pd.DataFrame(weight)
weight_pd.describe()

Unnamed: 0,0
count,1225.0
mean,0.9397
std,0.029977
min,0.801619
25%,0.922746
50%,0.943052
75%,0.961652
max,0.995614


In [186]:
weight_normalized_pd = pd.DataFrame(weight_normalized)
weight_normalized_pd.describe()

Unnamed: 0,0
count,1225.0
mean,71.17789
std,15.452411
min,0.0
25%,62.438108
50%,72.905751
75%,82.493678
max,100.0


In [187]:
reader = open('faculty_major.json')
data = json.load(reader)
reader.close()

majors = {}

for each in data:
    majs = each["majors"]

    for major in majs:
        majors[str(major["code"])] = major["acronym"]

In [188]:
import networkx as nx
from typing import List, Tuple

reader = open('weight_normalized.txt')
raw = reader.read().split('\n')
reader.close()

data: List[Tuple[str,str,float]] = []

numbers = []

nodes = set()

for each in raw:
    splitted = each.split(' ')
    node_a = majors[splitted[0].split("-")[-1]]
    node_b = majors[splitted[1].split('-')[-1]]
    nodes.add(node_a)
    nodes.add(node_b)
    number = float(splitted[2])# rescale to 1-10
    numbers.append(number)
    data.append((node_a, node_b, number))

nodes = list(nodes)

In [189]:
mean = np.array(numbers).mean()
cutoff = np.quantile(numbers, 0.85)
high_cutoff = np.quantile(numbers, 0.98)
mid_cutoff = np.quantile(numbers, 0.95)

degrees: Dict[str, Dict[str, float]] = {}

for node_a, node_b, weight in data:
    if node_a not in degrees:
        degrees[node_a] = {"below_mean": 0, "above_mean": 0, "above_cutoff" : 0, "above_mid": 0, "above_high":0}

    if node_b not in degrees:
        degrees[node_b] = {"below_mean": 0, "above_mean": 0, "above_cutoff" : 0, "above_mid": 0, "above_high":0}

    if weight < mean:
        key = "below_mean"
    elif weight >= mean and weight < cutoff:
        key = "above_mean"
    elif weight >= cutoff and weight < mid_cutoff:
        key ="above_cutoff"
    elif weight >= mid_cutoff and weight < high_cutoff:
        key="above_mid"
    else:
        key="above_high"

    degrees[node_a][key] += 1
    degrees[node_b][key] += 1

print(mean)
degrees_as_list = []

for k,v in degrees.items():
    degrees_as_list.append([k, v["below_mean"], v["above_mean"], v["above_cutoff"], v["above_mid"], v["above_high"]])

degress_pd = pd.DataFrame(degrees_as_list, columns=["vertex", "<mean degree", ">mean degree", ">85% degree", ">95% degree", ">98% degree"])
degress_pd['sortval'] = degress_pd[">98% degree"]*5 + degress_pd['>95% degree']*4 + degress_pd[">85% degree"]*2 + degress_pd['>mean degree']
degress_pd = degress_pd.sort_values(by=['sortval'], ascending=False)
degress_pd = degress_pd.drop('sortval', 1)
degress_pd.describe()

71.17789048751627


  degress_pd = degress_pd.drop('sortval', 1)


Unnamed: 0,<mean degree,>mean degree,>85% degree,>95% degree,>98% degree
count,50.0,50.0,50.0,50.0,50.0
mean,21.56,20.08,4.88,1.48,1.0
std,10.339463,6.974706,3.414495,1.403203,1.228904
min,6.0,3.0,0.0,0.0,0.0
25%,13.25,19.0,2.25,0.25,0.0
50%,20.0,21.5,4.0,1.0,1.0
75%,25.75,24.0,6.75,2.0,1.0
max,45.0,30.0,14.0,6.0,6.0


In [190]:
for line in degress_pd.to_string(index=False).split('\n'):
    print(line)

vertex  <mean degree  >mean degree  >85% degree  >95% degree  >98% degree
    TF            11            20           11            1            6
    BE             6            27           10            4            2
    AE             8            22           14            3            2
    MS            13            21            7            5            3
    TK            11            21           12            2            3
    TB            11            24            7            6            1
    EB            10            25           10            2            2
    MR             8            29            9            2            1
    TI             8            29            9            2            1
    EL            19            18            6            2            4
    FI            15            21            8            4            1
    KL            10            27           10            1            1
    IF            19            19    

In [191]:
G = nx.Graph()

selected_nodes = list(nodes)[:5]

for node in selected_nodes:
    G.add_node(node)

for edge in data:
    node_a, node_b, weight = edge
    if node_a in selected_nodes and node_b in selected_nodes:
        G.add_edge(node_a, node_b, weight=weight)

mat = nx.adjacency_matrix(G)
np.set_printoptions(precision=2, threshold=np.inf)
mat.todense()
# print(mat.shape)

  mat = nx.adjacency_matrix(G)


matrix([[ 0.  , 72.3 , 68.03, 55.99, 84.89],
        [72.3 ,  0.  , 72.46, 72.45, 75.06],
        [68.03, 72.46,  0.  , 56.85, 64.19],
        [55.99, 72.45, 56.85,  0.  , 58.04],
        [84.89, 75.06, 64.19, 58.04,  0.  ]])

In [192]:
print(list(G.nodes))

['SI', 'AK', 'PL', 'FK', 'ME']
