## Lab 10 - Node Edge Manipulation

#### Used libraries

1. **pandas**  
    `pip install pandas`

1. **plotly**  
    `pip install plotly`

1. **matplotlib**   
    `pip install matplotlib`   
    for plotting static graphs  

1. **networkx**  
    `pip install networkx`  
    for creating graph and performing operations on graphs  

1. **pyvis**  
    `pip install pyvis`  
    for creating dynamic trees and graphs

In [1]:
# imports
import pandas as pd 
import numpy as np
import os

import networkx as nx
import matplotlib as mpl
import matplotlib.pyplot as plt
import random

The university dataset has many .csv files.

For this experiment we will explore only cwurData.csv, timesData.csv and shanghaiData.csv.

In [2]:
def loadCSVData(path):
    '''
        Read the file at `path` into a pandas DataFrame when its extension is
        exactly ".csv"; for any other extension nothing is read and None is returned.
    '''
    _, extension = os.path.splitext(path)
    if extension != ".csv": # only csv files from the dataset are read
        return None

    return pd.read_csv(path, delimiter=',')

#### Dataset timesData.csv for other tasks

In [3]:
# loading timesData.csv
# (loadCSVData returns None for any path that does not end in ".csv")
timesUniData = loadCSVData("../world_university_ranking/timesData.csv")

# info on columns: name, non-null count and dtype of every column
print(timesUniData.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2603 entries, 0 to 2602
Data columns (total 14 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   world_rank              2603 non-null   object 
 1   university_name         2603 non-null   object 
 2   country                 2603 non-null   object 
 3   teaching                2603 non-null   float64
 4   international           2603 non-null   object 
 5   research                2603 non-null   float64
 6   citations               2603 non-null   float64
 7   income                  2603 non-null   object 
 8   total_score             2603 non-null   object 
 9   num_students            2544 non-null   object 
 10  student_staff_ratio     2544 non-null   float64
 11  international_students  2536 non-null   object 
 12  female_male_ratio       2370 non-null   object 
 13  year                    2603 non-null   int64  
dtypes: float64(4), int64(1), object(9)
memor

In [4]:
# number of missing entries in each column of the Times table
timesUniData.isnull().sum()

world_rank                  0
university_name             0
country                     0
teaching                    0
international               0
research                    0
citations                   0
income                      0
total_score                 0
num_students               59
student_staff_ratio        59
international_students     67
female_male_ratio         233
year                        0
dtype: int64

In [5]:
# remember the row count so the number of removed rows can be reported
before = timesUniData.shape[0]
# the cleaned frame replaces the loaded one under the same name
timesUniData = timesUniData.dropna()

# the figure printed here is a row count (rows before minus rows after)
print(f"Dropped {before - timesUniData.shape[0]} NaN values")
# every column should now report 0 missing entries
print(timesUniData.isnull().sum())

Dropped 241 NaN values
world_rank                0
university_name           0
country                   0
teaching                  0
international             0
research                  0
citations                 0
income                    0
total_score               0
num_students              0
student_staff_ratio       0
international_students    0
female_male_ratio         0
year                      0
dtype: int64


In [6]:
def parseNumberOfStudents(df : pd.Series) -> list[float]:
    '''
        Convert student counts written with thousands separators
        (e.g. "2,243") into floats.

        df: Series holding the counts; text columns are parsed, any other
            dtype is returned unchanged as a plain list.
        Returns a list with one float per entry of `df`, in the same order.
        Raises ValueError when a text entry is not a number once its commas
        have been removed.
    '''
    # Series.iteritems() was removed in pandas 2.0, so the values are iterated
    # directly; is_string_dtype additionally covers pandas' own string dtypes.
    if df.dtype == object or pd.api.types.is_string_dtype(df.dtype):
        return [float(item.replace(',', '')) for item in df]
    return list(df)

def parseGenderRatio(df : pd.Series) -> list[float]:
    '''
        Extract the first component of ratios written as "A : B"
        (e.g. "33 : 67" gives 33).

        df: Series of ratio strings; non-text columns are returned unchanged
            as a plain list.
        Returns a list with one integer per entry of `df`, in the same order.
        An entry is recorded as 0 when it is not a string or when any of its
        ":"-separated parts is not an integer (for example "-").
    '''
    if not (df.dtype == object or pd.api.types.is_string_dtype(df.dtype)):
        return list(df)

    lst = []
    # Series.iteritems() was removed in pandas 2.0, so the values are iterated directly
    for item in df:
        try:
            # every part must be an integer, even though only the first is kept
            parts = [int(part) for part in item.split(":")]
            lst.append(parts[0])
        except (AttributeError, ValueError):
            # AttributeError: entry is not a string; ValueError: a part is not an integer
            lst.append(0) # no data

    return lst

# the share of international students is given in the format 27%, so removing
# the % sign is enough to convert it into a numeric data type
def parseInternationalStudents(df : pd.Series) -> list[float]:
    '''
        Convert percentages written as text (e.g. "27%") into floats.

        df: Series holding the percentages; text columns are parsed, any
            other dtype is returned unchanged as a plain list.
        Returns a list with one float per entry of `df`, in the same order.
        Raises ValueError when a text entry is not a number once its "%"
        signs have been removed.
    '''
    # Series.iteritems() was removed in pandas 2.0, so the values are iterated
    # directly; is_string_dtype additionally covers pandas' own string dtypes.
    if df.dtype == object or pd.api.types.is_string_dtype(df.dtype):
        return [float(item.replace('%', '')) for item in df]
    return list(df)

#convert text columns such as income, total_score or international to floats
def parseObjects(df : pd.Series) -> list[float]:
    '''
        Parse a text column into floats, filling in entries that cannot be parsed.

        df: Series to convert; non-text columns are returned unchanged as a
            plain list.
        Returns a list with one float per entry of `df`, in the same order.
        Entries that float() rejects are replaced by a random value drawn with
        np.random.uniform between the smallest and the largest finite parsed
        value of the column; when the column has no finite parsed value they
        are replaced by 0.0.
    '''
    if not (df.dtype == object or pd.api.types.is_string_dtype(df.dtype)):
        return list(df)

    # first pass: parse what can be parsed and mark the rest with None
    # (Series.iteritems() was removed in pandas 2.0, so the values are
    # iterated directly)
    parsed = []
    for item in df:
        try:
            parsed.append(float(item))
        except (TypeError, ValueError):
            parsed.append(None)

    finite = [value for value in parsed
              if value is not None and np.isfinite(value)]
    if not finite:
        # nothing to derive a range from
        return [0.0 if value is None else value for value in parsed]

    # second pass: the bounds come from the whole column, so an unparsable
    # entry that appears before any number no longer produces an invalid range
    low, high = min(finite), max(finite)
    return [float(np.random.uniform(low, high)) if value is None else value
            for value in parsed]


In [7]:
# num_students, female_male_ratio, international_students, income, total_score
# and international are stored as strings, so each of them is parsed into a
# list of numbers and written back over the original column.
# The parse* helpers already return numbers, so no pd.to_numeric pass is
# needed afterwards (pd.to_numeric returns a new object; calling it without
# using the result changes nothing).

timesUniData["num_students"] = parseNumberOfStudents(timesUniData["num_students"])

timesUniData["female_male_ratio"] = parseGenderRatio(timesUniData["female_male_ratio"])

timesUniData["international_students"] = parseInternationalStudents(
    timesUniData["international_students"])

timesUniData["income"] = parseObjects(timesUniData["income"])

timesUniData["total_score"] = parseObjects(timesUniData["total_score"])

timesUniData["international"] = parseObjects(timesUniData["international"])

print(timesUniData.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2362 entries, 1 to 2602
Data columns (total 14 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   world_rank              2362 non-null   object 
 1   university_name         2362 non-null   object 
 2   country                 2362 non-null   object 
 3   teaching                2362 non-null   float64
 4   international           2362 non-null   float64
 5   research                2362 non-null   float64
 6   citations               2362 non-null   float64
 7   income                  2362 non-null   float64
 8   total_score             2362 non-null   float64
 9   num_students            2362 non-null   float64
 10  student_staff_ratio     2362 non-null   float64
 11  international_students  2362 non-null   float64
 12  female_male_ratio       2362 non-null   int64  
 13  year                    2362 non-null   int64  
dtypes: float64(9), int64(2), object(3)
memor

In [8]:
# file cwurData.csv
currUniData = loadCSVData("../world_university_ranking/cwurData.csv")
# info() writes the summary itself and returns None, which print() then
# shows as the trailing "None" line of the output
print(currUniData.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2200 entries, 0 to 2199
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   world_rank            2200 non-null   int64  
 1   institution           2200 non-null   object 
 2   country               2200 non-null   object 
 3   national_rank         2200 non-null   int64  
 4   quality_of_education  2200 non-null   int64  
 5   alumni_employment     2200 non-null   int64  
 6   quality_of_faculty    2200 non-null   int64  
 7   publications          2200 non-null   int64  
 8   influence             2200 non-null   int64  
 9   citations             2200 non-null   int64  
 10  broad_impact          2000 non-null   float64
 11  patents               2200 non-null   int64  
 12  score                 2200 non-null   float64
 13  year                  2200 non-null   int64  
dtypes: float64(2), int64(10), object(2)
memory usage: 240.8+ KB
None


In [9]:
# number of missing entries in each column of the CWUR table
currUniData.isnull().sum()

world_rank                0
institution               0
country                   0
national_rank             0
quality_of_education      0
alumni_employment         0
quality_of_faculty        0
publications              0
influence                 0
citations                 0
broad_impact            200
patents                   0
score                     0
year                      0
dtype: int64

In [10]:
# column summary of the CWUR table (name, non-null count and dtype per column)
currUniData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2200 entries, 0 to 2199
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   world_rank            2200 non-null   int64  
 1   institution           2200 non-null   object 
 2   country               2200 non-null   object 
 3   national_rank         2200 non-null   int64  
 4   quality_of_education  2200 non-null   int64  
 5   alumni_employment     2200 non-null   int64  
 6   quality_of_faculty    2200 non-null   int64  
 7   publications          2200 non-null   int64  
 8   influence             2200 non-null   int64  
 9   citations             2200 non-null   int64  
 10  broad_impact          2000 non-null   float64
 11  patents               2200 non-null   int64  
 12  score                 2200 non-null   float64
 13  year                  2200 non-null   int64  
dtypes: float64(2), int64(10), object(2)
memory usage: 240.8+ KB


In [11]:
# file shanghaiData.csv
shanghaiData = loadCSVData("../world_university_ranking/shanghaiData.csv")
# info on columns
# info() writes the summary itself and returns None, which print() then
# shows as the trailing "None" line of the output
print(shanghaiData.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4897 entries, 0 to 4896
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   world_rank       4897 non-null   object 
 1   university_name  4896 non-null   object 
 2   national_rank    4896 non-null   object 
 3   total_score      1101 non-null   float64
 4   alumni           4896 non-null   float64
 5   award            4895 non-null   float64
 6   hici             4895 non-null   float64
 7   ns               4875 non-null   float64
 8   pub              4895 non-null   float64
 9   pcp              4895 non-null   float64
 10  year             4897 non-null   int64  
dtypes: float64(7), int64(1), object(3)
memory usage: 421.0+ KB
None


In [12]:
# number of missing entries in each column of the Shanghai table
shanghaiData.isnull().sum()

world_rank            0
university_name       1
national_rank         1
total_score        3796
alumni                1
award                 2
hici                  2
ns                   22
pub                   2
pcp                   2
year                  0
dtype: int64

In [13]:
# total_score is missing in most rows, so the whole column is dropped
# instead of the rows. errors='ignore' keeps the cell re-runnable once the
# column is already gone, without hiding unrelated failures.
shanghaiData = shanghaiData.drop(columns='total_score', errors='ignore')

before = shanghaiData.shape[0]
shanghaiData = shanghaiData.dropna()

# the figure printed here is a row count (rows before minus rows after)
print(f"Dropped {before - shanghaiData.shape[0]} NaN values")
print(shanghaiData.isnull().sum())

Dropped 22 NaN values
world_rank         0
university_name    0
national_rank      0
alumni             0
award              0
hici               0
ns                 0
pub                0
pcp                0
year               0
dtype: int64


In [14]:
# column summary of the cleaned Shanghai table
shanghaiData.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4875 entries, 0 to 4896
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   world_rank       4875 non-null   object 
 1   university_name  4875 non-null   object 
 2   national_rank    4875 non-null   object 
 3   alumni           4875 non-null   float64
 4   award            4875 non-null   float64
 5   hici             4875 non-null   float64
 6   ns               4875 non-null   float64
 7   pub              4875 non-null   float64
 8   pcp              4875 non-null   float64
 9   year             4875 non-null   int64  
dtypes: float64(6), int64(1), object(3)
memory usage: 418.9+ KB


In [15]:
# builds one random colour as [red, green, blue, alpha]
def getRandomColor():
    # three channels, each an integer from 0 to 255
    channels = [random.randint(0, 255) for _ in range(3)]

    # alpha is a random value rounded to one decimal and never below .2
    alpha = round(random.random(), 1)
    if alpha < .2:
        alpha = .2

    return channels + [alpha]

def getRandomColorString(vals = None): 
    '''
        Format a colour as an "rgba(r, g, b, a)" string.

        vals: sequence whose first four items are used as r, g, b and a;
              when omitted a colour from getRandomColor() is used.
    '''
    vals = getRandomColor() if vals is None else vals
    return f"rgba({vals[0]}, {vals[1]}, {vals[2]}, {vals[3]})"

def createEdgeListData(groupedData, col1, col2, col3):
    '''
        Turn every row of `groupedData` into one edge plus its attributes.

        groupedData: DataFrame that is walked row by row.
        col1, col2: columns supplying the two end points of an edge.
        col3: column supplying the first attribute of that edge.
        Returns (edgeData, attribData): edgeData holds [row[col1], row[col2]]
        pairs, attribData holds [row[col3], getRandomColor()] pairs, both in
        row order.
    '''
    edgeData, attribData = [], []

    for _, record in groupedData.iterrows():
        endpoints = [record[col1], record[col2]]
        attributes = [record[col3], getRandomColor()]
        edgeData += [endpoints]
        attribData += [attributes]

    return edgeData, attribData

In [89]:
# ques a
# graph of 3 quantitative attributes quality_of_education, quality_of_faculty, alumni_employment
%matplotlib qt 

def setUpGraph(df, col1, col2, col3):
    '''
        Build an undirected graph from the rows of `df`.

        Each row contributes one edge between its col1 and col2 values; the
        col3 value is stored as the edge's 'weight' and the colour generated
        by createEdgeListData as its 'color'.
    '''
    endpoints, attributes = createEdgeListData(df, col1, col2, col3)

    graph = nx.Graph()
    for (source, target), (weight, color) in zip(endpoints, attributes):
        graph.add_edge(source, target, weight=weight, color=color)

    return graph

numC = 21
year = 2012
title = "Universities with quantitative attributes Quality of Education\
\nQuality of Faculty and Alumni Employment"

# first numC rows of the chosen year
data1 = currUniData[currUniData["year"] == year].iloc[:numC]

# nodes are quality_of_education / quality_of_faculty values,
# the edge weight is alumni_employment
graph1 = setUpGraph(data1, 'quality_of_education', 
        'quality_of_faculty', 'alumni_employment')

# the stored colours have 0-255 channels, while matplotlib expects RGBA floats
# in [0, 1]; the three colour channels are rescaled (alpha is already in range)
colors1 = [(r / 255, g / 255, b / 255, a)
    for r, g, b, a in nx.get_edge_attributes(graph1, 'color').values()]
weights1 = nx.get_edge_attributes(graph1, 'weight').values()
# scaled down so the weights can be used as line widths
weights1 = [i / 20 for i in weights1]

In [None]:
# ques b
# graph 2
# built from the 3 rows that follow the first numC rows of the same year
data2 = currUniData[currUniData["year"] == year].iloc[numC:(numC + 3), : ] 

graph2 = setUpGraph(data2, 'quality_of_education', 
        'quality_of_faculty', 'alumni_employment')

# the stored colours have 0-255 channels, while matplotlib expects RGBA floats
# in [0, 1]; the three colour channels are rescaled (alpha is already in range)
colors2 = [(r / 255, g / 255, b / 255, a)
    for r, g, b, a in nx.get_edge_attributes(graph2, 'color').values()]
weights2 = nx.get_edge_attributes(graph2, 'weight').values()
# scaled down so the weights can be used as line widths
weights2 = [i / 20 for i in weights2]

In [100]:
# plot graph 1 and graph 2
# one figure with two axes next to each other, one per graph
fig, axs = plt.subplots(figsize = (11, 8), ncols = 2)
axs = axs.flatten()

fig.suptitle(title, fontsize=14)

# graphs to draw, in axes order
graphs = [graph1, graph2]
# an independent spring layout is computed for each graph
pos = [nx.spring_layout(graph1, k = 1.5, iterations = 30), 
    nx.spring_layout(graph2, k = 1.5, iterations = 30)]

# per-graph edge colours and edge widths, in the same order as `graphs`
colorLst = [colors1, colors2]
weightLst = [weights1, weights2]

for i in range(len(axs)):
    # make this axes the current one; the edge-label call below is given no
    # ax argument, so it presumably draws on the current axes
    plt.sca(axs[i])

    # node sizes are derived from the node degree (degree * 500)
    deg = dict(graphs[i].degree)

    nx.draw_networkx(graphs[i], pos = pos[i], ax = axs[i], alpha = .7,
        edge_color = colorLst[i], width = weightLst[i],
        node_size = [v * 500 for v in deg.values()] )

    # every edge is labelled with its 'weight' attribute
    nx.draw_networkx_edge_labels(graphs[i], pos = pos[i], 
        edge_labels = nx.get_edge_attributes(graphs[i], 'weight') )
    
    axs[i].set_title(f"Graph{i + 1}")


fig.tight_layout()

In [101]:
# ques c
# merge graph1 and graph2

graph3 = nx.compose(graph1, graph2)

# to have more and longer connecting components
# each entry is (node, node, weight, colour)
extraEdges = [(8, 11, 20, getRandomColor() ), 
            (11, 22, 34, getRandomColor() ),
            (10, 12, 26, getRandomColor() ),
            (61, 34, 15, getRandomColor() ),
            (34, 89, 22, getRandomColor() ),
            (89, 62, 6, getRandomColor() ),
            (6, 62, 16, getRandomColor() ),
            (23, 24, 26, getRandomColor() ),

]

for vals in extraEdges:
    graph3.add_edge(vals[0], vals[1], 
        weight=vals[2], color=vals[3])

weights = nx.get_edge_attributes(graph3, 'weight')
# the stored colours have 0-255 channels, while matplotlib expects RGBA floats
# in [0, 1]; the three colour channels are rescaled (alpha is already in range)
colors = [(r / 255, g / 255, b / 255, a)
    for r, g, b, a in nx.get_edge_attributes(graph3, 'color').values()]
# scaled down so the weights can be used as line widths
weights = [i / 20 for i in weights.values()]

# node sizes are derived from the node degree
deg = dict(graph3.degree)

fig, ax = plt.subplots(figsize=(16, 8))

fig.suptitle(title + " - Merged Graph", fontsize=14)

pos = nx.spring_layout(graph3, k = 1.5, iterations=30)

nx.draw_networkx(graph3, pos = pos, with_labels=True, alpha = .7,
    edge_color = colors, width = weights, 
    node_size = [v * 600 for v in deg.values()]
)

# every edge is labelled with its (unscaled) 'weight' attribute
edgeLabels = nx.draw_networkx_edge_labels(graph3, pos, 
    edge_labels =  nx.get_edge_attributes(graph3, 'weight') )



In [102]:
# highlight the shortest path

srcNode = 3
destNode = 8 

pos = nx.spring_layout(graph3, k = 1.5, iterations=30)

# no weight argument is passed, so the edge 'weight' attribute is not used here
path = nx.shortest_path(graph3, srcNode, destNode)

# consecutive node pairs along the path
path_edges = list(zip(path, path[1:]))

print(path)
print(path_edges)

# recomputed here so the cell does not rely on a value left by another cell
deg = dict(graph3.degree)

# sizes must follow the order of `path`, because `path` is the nodelist
# handed to draw_networkx_nodes below
pathNodeSize = [deg[v] * 500 for v in path]

fig, ax = plt.subplots(figsize=(16, 8))

fig.suptitle(f"Shortest path from node '{srcNode}' to '{destNode}'", fontsize=14)

# whole graph first, faded
nx.draw_networkx(graph3, pos = pos, with_labels = True, 
    ax = ax, alpha = .5,
    node_size = [v * 500 for v in deg.values()])

# then the nodes of the path on top, in red
nx.draw_networkx_nodes(graph3, pos = pos, ax = ax,
    nodelist=path, node_color= 'red',
    node_size = pathNodeSize)

# and the edges of the path as a wide band
nx.draw_networkx_edges(graph3, pos = pos, ax = ax, width = 15, 
    edgelist = path_edges , edge_color = 'skyblue'
)

nx.draw_networkx_edge_labels(graph3, pos = pos, ax = ax,
        edge_labels = nx.get_edge_attributes(graph3, 'weight')
    )

print("Done")

[3, 9, 13, 24, 23, 10, 4, 22, 11, 8]
[(3, 9), (9, 13), (13, 24), (24, 23), (23, 10), (10, 4), (4, 22), (22, 11), (11, 8)]
Done


In [105]:
# pyvis implementation
import pyvis.network as pvn

network = pvn.Network(width="800px", height="600px")

# styling for elements on the shortest path ("special") and for all others ("def")
specialCol = 'rgba(212, 83, 162, 1)'
specialFont = '18px arial black'
defColor = 'rgba(107, 135, 227, .5)'
defEdgeColor = 'rgba(0, 0, 0, .7)'
specialEdgeColor = 'rgba(68, 215, 235, .8)'
defFont = '14px arial black'

# sets for membership tests on path nodes and path edges
pathSet = set(path)
pathEdgesSet = set(path_edges)

for n in graph3.nodes():
    # nodes on the path get the highlight colour and the larger font
    col, font = (specialCol, specialFont) if n in pathSet else (defColor, defFont)

    network.add_node(n, color = col, font = font)

print(pathEdgesSet)

for e in graph3.edges():
    # an edge is on the path when it appears there in either direction
    if not (e in pathEdgesSet or e[::-1] in pathEdgesSet):
        col, font, width = defEdgeColor, defFont, 2
    else:
        col, font, width = specialEdgeColor, specialFont, 8

    network.add_edge(e[0], e[1], color = col, font = font, width = width)

network.write_html("./shortestPath.html")

{(9, 13), (24, 23), (22, 11), (23, 10), (10, 4), (4, 22), (3, 9), (11, 8), (13, 24)}
