## Lab 09 - NodeLink Diagrams

#### Used libraries

1. **pandas**  
    `pip install pandas`

1. **plotly**  
    `pip install plotly`

1. **matplotlib**   
    `pip install matplotlib`   
    for plotting static graphs  

1. **networkx**  
    `pip install networkx`  
    for creating graph and performing operations on graphs  

1. **pyvis**  
    `pip install pyvis`  
    for creating dynamic trees and graphs

In [2]:
# imports
import os
import random

import matplotlib as mpl
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import pyvis.network as pvn

The university dataset has many .csv files.

For this experiment we explore cwurData.csv and timesData.csv; shanghaiData.csv is also loaded and cleaned below, but only the first two files are used for the graphs.

In [3]:
def loadCSVData(path):
    '''
        Load a comma-separated file into a pandas DataFrame.

        path : location of the file; only names ending in ".csv" are read
               (the extension is compared case-insensitively, so "DATA.CSV"
               is accepted as well)

        Returns the DataFrame, or None when the path does not have a .csv
        extension.
    '''
    extension = os.path.splitext(path)[1]
    if extension.lower() != ".csv": # read only csv files from the dataset
        return None

    return pd.read_csv(path, delimiter=',')

#### Dataset timesData.csv for other tasks

In [4]:
# loading timesData.csv
timesUniData = loadCSVData("../world_university_ranking/timesData.csv")

# info on columns; DataFrame.info() prints its report itself and returns None,
# so it is called directly instead of being wrapped in print()
timesUniData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2603 entries, 0 to 2602
Data columns (total 14 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   world_rank              2603 non-null   object 
 1   university_name         2603 non-null   object 
 2   country                 2603 non-null   object 
 3   teaching                2603 non-null   float64
 4   international           2603 non-null   object 
 5   research                2603 non-null   float64
 6   citations               2603 non-null   float64
 7   income                  2603 non-null   object 
 8   total_score             2603 non-null   object 
 9   num_students            2544 non-null   object 
 10  student_staff_ratio     2544 non-null   float64
 11  international_students  2536 non-null   object 
 12  female_male_ratio       2370 non-null   object 
 13  year                    2603 non-null   int64  
dtypes: float64(4), int64(1), object(9)
memor

In [5]:
# number of missing values in every column of the Times frame
timesUniData.isna().sum()

world_rank                  0
university_name             0
country                     0
teaching                    0
international               0
research                    0
citations                   0
income                      0
total_score                 0
num_students               59
student_staff_ratio        59
international_students     67
female_male_ratio         233
year                        0
dtype: int64

In [6]:
# remove every row that still contains a missing value
before = timesUniData.shape[0]
timesUniData = timesUniData.dropna()

# the difference in row counts is the number of rows removed,
# not the number of NaN cells
print(f"Dropped {before - timesUniData.shape[0]} rows containing NaN values")
print(timesUniData.isnull().sum())

Dropped 241 NaN values
world_rank                0
university_name           0
country                   0
teaching                  0
international             0
research                  0
citations                 0
income                    0
total_score               0
num_students              0
student_staff_ratio       0
international_students    0
female_male_ratio         0
year                      0
dtype: int64


In [7]:
def parseNumberOfStudents(df : pd.Series) -> list[float]:
    '''
        Convert a column of student counts to a list of floats.

        For an object-dtype Series every item is expected to be a string;
        its ',' characters are removed before the float conversion.
        Any other dtype is returned unchanged as a plain list.
    '''
    if df.dtype == object:
        # Series.iteritems() was removed in pandas 2.0; the index is not
        # needed here, so the values are iterated directly
        return [float(item.replace(',', '')) for item in df]

    return list(df)

def parseGenderRatio(df : pd.Series) -> list[float]:
    '''
        Extract the first number of each "a:b" ratio string.

        For an object-dtype Series every item is split on ':' and all parts
        must parse as integers; the first part is kept. Items that cannot be
        parsed (non-strings, or parts that are not integers) become 0.
        Any other dtype is returned unchanged as a plain list.
    '''
    if df.dtype != object:
        return list(df)

    lst = []
    # Series.iteritems() was removed in pandas 2.0; the index is not needed
    for item in df:
        try:
            parts = [int(part) for part in item.split(":")]
            lst.append(parts[0])
        except (AttributeError, TypeError, ValueError):
            # only parsing failures are treated as "no data"; anything else
            # (e.g. KeyboardInterrupt) is no longer swallowed
            lst.append(0) # no data

    return lst

# international student is given in the format 27% so we can just remove the % to
# convert it into a numeric data type
def parseInternationalStudents(df : pd.Series) -> list[float]:
    '''
        Convert percentage strings such as "27%" to floats.

        For an object-dtype Series the '%' characters of every item are
        removed before the float conversion. Any other dtype is returned
        unchanged as a plain list.
    '''
    if df.dtype == object:
        # Series.iteritems() was removed in pandas 2.0; the index is not
        # needed here, so the values are iterated directly
        return [float(item.replace('%', '')) for item in df]

    return list(df)

#convert income to float 64
def parseObjects(df : pd.Series) -> list[float]:
    '''
        Convert an object-dtype column to a list of floats.

        Items that float() accepts are converted directly. An item that
        cannot be converted is replaced by a random integer drawn from the
        range [int(min), int(max)) of the values parsed before it, so the
        result is not deterministic unless numpy's random state is seeded.
        When that range is empty the smallest value seen so far is used, and
        0.0 when nothing has been parsed yet.

        Any other dtype is returned unchanged as a plain list.
    '''
    if df.dtype != object:
        return list(df)

    tmp = []
    mx, mn = -1e9, 1e9  # running maximum / minimum of the parsed values

    # Series.iteritems() was removed in pandas 2.0; the index is not needed
    for item in df:
        try:
            value = float(item)
        except (TypeError, ValueError):
            # assign a random value instead of 0
            low, high = int(mn), int(mx)
            if low < high:
                tmp.append(float(np.random.randint(low, high)))
            elif mn <= mx:
                # values were seen, but they span less than one integer step
                tmp.append(mn)
            else:
                # no valid value yet: randint would raise on an empty range
                tmp.append(0.0)
            continue

        tmp.append(value)
        if value > mx:
            mx = value
        if value < mn:
            mn = value

    return tmp


In [8]:
# num_students, female_male_ratio, international_students, income, total_score
# and international are stored as strings, so each column is parsed into a
# list of numbers and written back. The parse helpers already return numeric
# values, so no extra pd.to_numeric() call is needed (the previous calls
# discarded their result and had no effect).

timesUniData["num_students"] = parseNumberOfStudents(timesUniData["num_students"])

timesUniData["female_male_ratio"] = parseGenderRatio(timesUniData["female_male_ratio"])

timesUniData["international_students"] = parseInternationalStudents(
    timesUniData["international_students"])

timesUniData["income"] = parseObjects(timesUniData["income"])

total_score = parseObjects(timesUniData["total_score"])
timesUniData["total_score"] = total_score

timesUniData["international"] = parseObjects(timesUniData["international"])

# info() prints its report itself and returns None, so it is not wrapped in print()
timesUniData.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2362 entries, 1 to 2602
Data columns (total 14 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   world_rank              2362 non-null   object 
 1   university_name         2362 non-null   object 
 2   country                 2362 non-null   object 
 3   teaching                2362 non-null   float64
 4   international           2362 non-null   float64
 5   research                2362 non-null   float64
 6   citations               2362 non-null   float64
 7   income                  2362 non-null   float64
 8   total_score             2362 non-null   float64
 9   num_students            2362 non-null   float64
 10  student_staff_ratio     2362 non-null   float64
 11  international_students  2362 non-null   float64
 12  female_male_ratio       2362 non-null   int64  
 13  year                    2362 non-null   int64  
dtypes: float64(9), int64(2), object(3)
memor

In [9]:
# file cwurData.csv
currUniData = loadCSVData("../world_university_ranking/cwurData.csv")

# DataFrame.info() prints its report itself and returns None, so it is
# called directly instead of being wrapped in print()
currUniData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2200 entries, 0 to 2199
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   world_rank            2200 non-null   int64  
 1   institution           2200 non-null   object 
 2   country               2200 non-null   object 
 3   national_rank         2200 non-null   int64  
 4   quality_of_education  2200 non-null   int64  
 5   alumni_employment     2200 non-null   int64  
 6   quality_of_faculty    2200 non-null   int64  
 7   publications          2200 non-null   int64  
 8   influence             2200 non-null   int64  
 9   citations             2200 non-null   int64  
 10  broad_impact          2000 non-null   float64
 11  patents               2200 non-null   int64  
 12  score                 2200 non-null   float64
 13  year                  2200 non-null   int64  
dtypes: float64(2), int64(10), object(2)
memory usage: 240.8+ KB
None


In [10]:
# number of missing values in every column of the CWUR frame
currUniData.isna().sum()

world_rank                0
institution               0
country                   0
national_rank             0
quality_of_education      0
alumni_employment         0
quality_of_faculty        0
publications              0
influence                 0
citations                 0
broad_impact            200
patents                   0
score                     0
year                      0
dtype: int64

In [11]:
# dtypes and non-null counts of the frame loaded from cwurData.csv
currUniData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2200 entries, 0 to 2199
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   world_rank            2200 non-null   int64  
 1   institution           2200 non-null   object 
 2   country               2200 non-null   object 
 3   national_rank         2200 non-null   int64  
 4   quality_of_education  2200 non-null   int64  
 5   alumni_employment     2200 non-null   int64  
 6   quality_of_faculty    2200 non-null   int64  
 7   publications          2200 non-null   int64  
 8   influence             2200 non-null   int64  
 9   citations             2200 non-null   int64  
 10  broad_impact          2000 non-null   float64
 11  patents               2200 non-null   int64  
 12  score                 2200 non-null   float64
 13  year                  2200 non-null   int64  
dtypes: float64(2), int64(10), object(2)
memory usage: 240.8+ KB


In [12]:
# file shanghaiData.csv
shanghaiData = loadCSVData("../world_university_ranking/shanghaiData.csv")

# info on columns; DataFrame.info() prints its report itself and returns None,
# so it is called directly instead of being wrapped in print()
shanghaiData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4897 entries, 0 to 4896
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   world_rank       4897 non-null   object 
 1   university_name  4896 non-null   object 
 2   national_rank    4896 non-null   object 
 3   total_score      1101 non-null   float64
 4   alumni           4896 non-null   float64
 5   award            4895 non-null   float64
 6   hici             4895 non-null   float64
 7   ns               4875 non-null   float64
 8   pub              4895 non-null   float64
 9   pcp              4895 non-null   float64
 10  year             4897 non-null   int64  
dtypes: float64(7), int64(1), object(3)
memory usage: 421.0+ KB
None


In [13]:
# number of missing values in every column of the Shanghai frame
shanghaiData.isna().sum()

world_rank            0
university_name       1
national_rank         1
total_score        3796
alumni                1
award                 2
hici                  2
ns                   22
pub                   2
pcp                   2
year                  0
dtype: int64

In [14]:
# total_score is missing for most rows (see the null counts above), so the
# whole column is dropped. errors='ignore' keeps the cell re-runnable once the
# column is already gone, without hiding unrelated errors.
shanghaiData = shanghaiData.drop(columns='total_score', errors='ignore')

# remove every remaining row that contains a missing value
before = shanghaiData.shape[0]
shanghaiData = shanghaiData.dropna()

# the difference in row counts is the number of rows removed,
# not the number of NaN cells
print(f"Dropped {before - shanghaiData.shape[0]} rows containing NaN values")
print(shanghaiData.isnull().sum())

Dropped 22 NaN values
world_rank         0
university_name    0
national_rank      0
alumni             0
award              0
hici               0
ns                 0
pub                0
pcp                0
year               0
dtype: int64


In [15]:
# dtypes and non-null counts of the Shanghai frame after the cleanup above
shanghaiData.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4875 entries, 0 to 4896
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   world_rank       4875 non-null   object 
 1   university_name  4875 non-null   object 
 2   national_rank    4875 non-null   object 
 3   alumni           4875 non-null   float64
 4   award            4875 non-null   float64
 5   hici             4875 non-null   float64
 6   ns               4875 non-null   float64
 7   pub              4875 non-null   float64
 8   pcp              4875 non-null   float64
 9   year             4875 non-null   int64  
dtypes: float64(6), int64(1), object(3)
memory usage: 418.9+ KB


In [16]:
# function returns a random color
def getRandomColor():
    '''
        Build a random colour as the list [red, green, blue, alpha].

        red, green and blue are integers from 0 to 255; alpha is a random
        float rounded to one decimal and never smaller than 0.2.
    '''
    channels = [random.randint(0, 255) for _ in range(3)]

    alpha = round(random.random(), 1)
    if alpha < .2:
        alpha = .2

    return channels + [alpha]

def getRandomColorString(vals = None): 
    '''
        Format a colour as the string "rgba(r, g, b, a)".

        vals : sequence whose first four items are red, green, blue and
               alpha; when omitted a colour from getRandomColor() is used
    '''
    if vals is None:
        vals = getRandomColor()

    red, green, blue, alpha = vals[0], vals[1], vals[2], vals[3]
    return "rgba({}, {}, {}, {})".format(red, green, blue, alpha)

def createEdgeListData(groupedData, col1, col2, col3):
    '''
        Turn the rows of a DataFrame into edge and attribute lists.

        groupedData : DataFrame to walk through row by row
        col1, col2  : columns holding the two end points of an edge
        col3        : column holding the value attached to the edge

        Returns (edges, attributes): edges is a list of [col1, col2] pairs,
        attributes a list of [col3 value, random colour] pairs, both in row
        order.
    '''
    edges, attributes = [], []

    for _, record in groupedData.iterrows():
        source, target = record[col1], record[col2]
        edges.append([source, target])
        attributes.append([record[col3], getRandomColor()])

    return edges, attributes

In [17]:
# plot 1
# node-link graph: each university is linked to its country, with the international score as edge weight
%matplotlib qt 

def getNUniqueCoutnries(countryList, numC):
    '''
        Pick numC distinct country names at random.

        countryList : iterable of country names, duplicates allowed
        numC        : number of countries wanted; when fewer distinct
                      countries exist, all of them are returned

        Returns a list of distinct names in random order.
    '''
    # the distinct names are sorted so that the same sample is drawn after
    # random.seed(); the iteration order of a set of strings is not stable
    # between interpreter sessions
    cList = sorted(set(countryList), key=str)

    # random.sample raises ValueError when asked for more items than exist
    return random.sample(cList, min(numC, len(cList)))

numC = 4
year = 2013
uni2013 = timesUniData[timesUniData["year"] == year]

countries = getNUniqueCoutnries(uni2013['country'], numC)

# keep only the universities of the sampled countries
filteredData = uni2013.query('country in @countries')

# first five rows of every country, ordered country by country.
# groupby(...).head(5) keeps 'country' as an ordinary column, which
# createEdgeListData reads; applying a function to the grouping column through
# groupby(...).apply is deprecated since pandas 2.2.
groupedData = (filteredData
    .sort_values('country', kind='mergesort')  # stable: keeps the row order inside a country
    .groupby(by='country')
    .head(5))

edgeData, attribData = createEdgeListData(groupedData, 'university_name', 
        'country', 'international')

graph1 = nx.Graph()

for ed, ad in zip(edgeData, attribData):
    graph1.add_edge(ed[0], ed[1], weight=ad[0], color=ad[1])

# matplotlib expects RGBA channels in the 0-1 range, while the colours stored
# on the edges use 0-255 for red, green and blue
colors = [(r / 255, g / 255, b / 255, a)
          for r, g, b, a in nx.get_edge_attributes(graph1, 'color').values()]
weights = [w / 10 for w in nx.get_edge_attributes(graph1, 'weight').values()]


In [29]:
# plot 1
# node-link graph: each university is linked to its country, with the international score as edge weight
%matplotlib qt 

def getNUniqueCoutnries(countryList, numC):
    '''
        Pick numC distinct country names at random.

        countryList : iterable of country names, duplicates allowed
        numC        : number of countries wanted; when fewer distinct
                      countries exist, all of them are returned

        Returns a list of distinct names in random order.
    '''
    # the distinct names are sorted so that the same sample is drawn after
    # random.seed(); the iteration order of a set of strings is not stable
    # between interpreter sessions
    cList = sorted(set(countryList), key=str)

    # random.sample raises ValueError when asked for more items than exist
    return random.sample(cList, min(numC, len(cList)))

# NOTE: same preparation as the previous "plot 1" cell; running it again
# draws a new random sample of countries and rebuilds graph1
numC = 4
year = 2013
uni2013 = timesUniData[timesUniData["year"] == year]

countries = getNUniqueCoutnries(uni2013['country'], numC)

# keep only the universities of the sampled countries
filteredData = uni2013.query('country in @countries')

# first five rows of every country, ordered country by country.
# groupby(...).head(5) keeps 'country' as an ordinary column, which
# createEdgeListData reads; applying a function to the grouping column through
# groupby(...).apply is deprecated since pandas 2.2.
groupedData = (filteredData
    .sort_values('country', kind='mergesort')  # stable: keeps the row order inside a country
    .groupby(by='country')
    .head(5))

edgeData, attribData = createEdgeListData(groupedData, 'university_name', 
        'country', 'international')

graph1 = nx.Graph()

for ed, ad in zip(edgeData, attribData):
    graph1.add_edge(ed[0], ed[1], weight=ad[0], color=ad[1])

# matplotlib expects RGBA channels in the 0-1 range, while the colours stored
# on the edges use 0-255 for red, green and blue
colors = [(r / 255, g / 255, b / 255, a)
          for r, g, b, a in nx.get_edge_attributes(graph1, 'color').values()]
weights = [w / 10 for w in nx.get_edge_attributes(graph1, 'weight').values()]


In [32]:
# graph1 drawn with a force-directed (spring) layout
fig, ax = plt.subplots(figsize=(9, 7))
ax.set_title("Universities with their respective countries with\nInternational score as weight", fontsize=14)

pos = nx.spring_layout(graph1, k=.8, iterations=20)

deg = dict(graph1.degree)

# Edge styling is read from graph1 itself, so this cell does not depend on
# which cell last assigned the globals `colors` / `weights`.
# matplotlib expects RGBA channels in the 0-1 range, while the 'color' edge
# attribute stores 0-255 values for red, green and blue.
edgeColors = [(r / 255, g / 255, b / 255, a)
              for r, g, b, a in nx.get_edge_attributes(graph1, 'color').values()]
edgeWidths = [w / 10 for w in nx.get_edge_attributes(graph1, 'weight').values()]

# draw on the axes created above instead of relying on pyplot's current axes
nx.draw(graph1, pos=pos, ax=ax,
    edge_color=edgeColors, width=edgeWidths,
    node_size=[v * 100 for v in deg.values()])

# country nodes get a larger label than university nodes
for node, (x, y) in pos.items():
    sz = 18 if(node in countries) else 8
    ax.text(x, y, node, fontsize=sz, ha='center', va='center')

In [31]:
# graph1 drawn with all nodes placed on a circle
fig, ax = plt.subplots(figsize=(9, 7))
ax.set_title("Universities with their respective countries with\nInternational score as weight", fontsize=14)

pos = nx.circular_layout(graph1, scale=2)

deg = dict(graph1.degree)

# Edge styling is read from graph1 itself, so this cell does not depend on
# which cell last assigned the globals `colors` / `weights`.
# matplotlib expects RGBA channels in the 0-1 range, while the 'color' edge
# attribute stores 0-255 values for red, green and blue.
edgeColors = [(r / 255, g / 255, b / 255, a)
              for r, g, b, a in nx.get_edge_attributes(graph1, 'color').values()]
edgeWidths = [w / 10 for w in nx.get_edge_attributes(graph1, 'weight').values()]

# draw on the axes created above instead of relying on pyplot's current axes
nx.draw(graph1, pos=pos, ax=ax,
    edge_color=edgeColors, width=edgeWidths,
    node_size=[v * 100 for v in deg.values()])

# country nodes get a larger label than university nodes
for node, (x, y) in pos.items():
    sz = 18 if(node in countries) else 8
    ax.text(x, y, node, fontsize=sz, ha='center', va='center')

In [21]:
# plot 2
# graph of 3 quantitative attributes quality_of_education, quality_of_faculty, alumni_employment
%matplotlib qt 


numC = 15  # not used by this cell
year = 2012
uni2012 = currUniData[currUniData["year"] == year]
title = ("Universities with quantitative attributes Quality of Education"
    "\nQuality of Faculty and Alumni Employment")

# one edge per row: quality_of_education -- quality_of_faculty,
# weighted by alumni_employment
edgeData, attribData = createEdgeListData(uni2012, 'quality_of_education', 
        'quality_of_faculty', 'alumni_employment')

graph2 = nx.Graph()

for ed, ad in zip(edgeData, attribData):
    graph2.add_edge(ed[0], ed[1], weight=ad[0], color=ad[1])

# matplotlib expects RGBA channels in the 0-1 range, while the colours stored
# on the edges use 0-255 for red, green and blue
colors = [(r / 255, g / 255, b / 255, a)
          for r, g, b, a in nx.get_edge_attributes(graph2, 'color').values()]
weights = [w / 50 for w in nx.get_edge_attributes(graph2, 'weight').values()]


In [33]:
# graph2 drawn with a spiral layout
fig, ax = plt.subplots(figsize=(9, 7))
ax.set_title(title, fontsize=14)

pos = nx.spiral_layout(graph2, scale=4)

# Edge styling is read from graph2 itself, so this cell does not depend on
# which cell last assigned the globals `colors` / `weights`.
# matplotlib expects RGBA channels in the 0-1 range, while the 'color' edge
# attribute stores 0-255 values for red, green and blue.
edgeColors = [(r / 255, g / 255, b / 255, a)
              for r, g, b, a in nx.get_edge_attributes(graph2, 'color').values()]
edgeWidths = [w / 50 for w in nx.get_edge_attributes(graph2, 'weight').values()]

# draw on the axes created above instead of relying on pyplot's current axes
nx.draw(graph2, pos = pos, ax = ax, alpha =.7, width = edgeWidths,
    edge_color = edgeColors, with_labels = True)


In [34]:
# graph2 drawn with randomly placed nodes
fig, ax = plt.subplots(figsize=(9, 7))
ax.set_title(title, fontsize=14)

pos = nx.random_layout(graph2)

# Edge styling is read from graph2 itself, so this cell does not depend on
# which cell last assigned the globals `colors` / `weights`.
# matplotlib expects RGBA channels in the 0-1 range, while the 'color' edge
# attribute stores 0-255 values for red, green and blue.
edgeColors = [(r / 255, g / 255, b / 255, a)
              for r, g, b, a in nx.get_edge_attributes(graph2, 'color').values()]
edgeWidths = [w / 50 for w in nx.get_edge_attributes(graph2, 'weight').values()]

# draw on the axes created above instead of relying on pyplot's current axes
nx.draw(graph2, pos = pos, ax = ax, alpha =.7, width = edgeWidths,
    edge_color = edgeColors, with_labels = True)

In [24]:
# plot 1
# node-link graph: each university is linked to its country, with the international score as edge weight

def getNUniqueCoutnries(countryList, numC):
    '''
        Pick numC distinct country names at random.

        countryList : iterable of country names, duplicates allowed
        numC        : number of countries wanted; when fewer distinct
                      countries exist, all of them are returned

        Returns a list of distinct names in random order.
    '''
    # the distinct names are sorted so that the same sample is drawn after
    # random.seed(); the iteration order of a set of strings is not stable
    # between interpreter sessions
    cList = sorted(set(countryList), key=str)

    # random.sample raises ValueError when asked for more items than exist
    return random.sample(cList, min(numC, len(cList)))

# NOTE: the plot 1 data is rebuilt here with a new random sample of
# countries; the pyvis cell below reads edgeData, attribData and countries
numC = 4
year = 2013
uni2013 = timesUniData[timesUniData["year"] == year]

countries = getNUniqueCoutnries(uni2013['country'], numC)

# keep only the universities of the sampled countries
filteredData = uni2013.query('country in @countries')

# first five rows of every country, ordered country by country.
# groupby(...).head(5) keeps 'country' as an ordinary column, which
# createEdgeListData reads; applying a function to the grouping column through
# groupby(...).apply is deprecated since pandas 2.2.
groupedData = (filteredData
    .sort_values('country', kind='mergesort')  # stable: keeps the row order inside a country
    .groupby(by='country')
    .head(5))

edgeData, attribData = createEdgeListData(groupedData, 'university_name', 
        'country', 'international')

graph1 = nx.Graph()

for ed, ad in zip(edgeData, attribData):
    graph1.add_edge(ed[0], ed[1], weight=ad[0], color=ad[1])

# matplotlib expects RGBA channels in the 0-1 range, while the colours stored
# on the edges use 0-255 for red, green and blue
colors = [(r / 255, g / 255, b / 255, a)
          for r, g, b, a in nx.get_edge_attributes(graph1, 'color').values()]
weights = [w / 10 for w in nx.get_edge_attributes(graph1, 'weight').values()]

In [25]:
# interactive version of plot 1 drawn with pyvis; it reuses edgeData,
# attribData and countries prepared for the first plot
# (pyvis.network is imported as pvn in the imports cell at the top)
network = pvn.Network(width="800px", height="600px")

# one random colour per sampled country
countryColMapping = { i : getRandomColorString() for i in countries}
defEdgeColor = 'rgba(107, 135, 227, .5)'
defFont = '12 arial black'
specialFont = '20 arial black'

for ed, ad in zip(edgeData, attribData):
    # both end points get the same treatment: a country node uses its own
    # colour and the larger font, every other node the defaults
    for name in (ed[0], ed[1]):
        isCountry = name in countries
        network.add_node(name,
            color = countryColMapping[name] if isCountry else defEdgeColor,
            font = specialFont if isCountry else defFont)

    # the edge keeps the colour generated in createEdgeListData
    network.add_edge(ed[0], ed[1], value=ad[0], 
        color = getRandomColorString(ad[1]) )

network.write_html("./graph1.html")

In [26]:
# plot 2
# graph of 3 quantitative attributes Quality of Education, Quality of Faculty, Alumni Employment

numC = 15  # not used by this cell
year = 2012
uni2012 = currUniData[currUniData["year"] == year]
title = ("Universities with quantitative attributes Quality of Education"
    "\nQuality of Faculty and Alumni Employment")

# one edge per row: quality_of_education -- quality_of_faculty,
# weighted by alumni_employment
edgeData, attribData = createEdgeListData(uni2012, 'quality_of_education', 
        'quality_of_faculty', 'alumni_employment')

graph2 = nx.Graph()

for ed, ad in zip(edgeData, attribData):
    graph2.add_edge(ed[0], ed[1], weight=ad[0], color=ad[1])

# matplotlib expects RGBA channels in the 0-1 range, while the colours stored
# on the edges use 0-255 for red, green and blue
colors = [(r / 255, g / 255, b / 255, a)
          for r, g, b, a in nx.get_edge_attributes(graph2, 'color').values()]
weights = [w / 50 for w in nx.get_edge_attributes(graph2, 'weight').values()]


In [27]:
# second plot in pyvis: interactive version of plot 2, drawn from the
# edgeData / attribData built for graph2
# (pyvis.network is imported as pvn in the imports cell at the top)
network = pvn.Network(width="800px", height="600px")

for ed, ad in zip(edgeData, attribData):
    # nodes use the pyvis default colour and font
    network.add_node(ed[0])
    network.add_node(ed[1])

    # the edge keeps the colour generated in createEdgeListData
    network.add_edge(ed[0], ed[1], value=ad[0], 
        color = getRandomColorString(ad[1]) )

network.write_html("./graph2.html")