## Lab 03 - Distributional and relational visualization

In [None]:
# imports
import pandas as pd 
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches

import numpy as np
import os

import plotly.io as pio 
pio.renderers.default = "notebook+pdf"

In [17]:
# Select matplotlib's Qt backend for this kernel (figures are shown by the
# plt.show() calls in the plotting cells below).
%matplotlib qt

The university dataset has many .csv files.

For this experiment we will explore only cwurData.csv and timesData.csv.

In [None]:
def loadCSVData(path):
    '''
    Read a comma-separated file into a pandas DataFrame.

    Only paths whose extension is exactly ".csv" are read; for any other
    path nothing is read and None is returned.
    '''
    _, extension = os.path.splitext(path)
    if extension != ".csv":
        # not a csv file from the dataset -> nothing to load
        return None

    return pd.read_csv(path, sep=',')

#### Datasets cwurData.csv and timesData.csv for other tasks

In [None]:
# file cwurData.csv
currUniData = loadCSVData("../world_university_ranking/cwurData.csv")

# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the cell output.
currUniData.info()

In [None]:
# number of missing values in each column of the cwur data
currUniData.isna().sum()

In [None]:
# loading timesData.csv
timesUniData = loadCSVData("../world_university_ranking/timesData.csv")

# info on columns
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the cell output.
timesUniData.info()

In [None]:
# number of missing values in each column of the times data
timesUniData.isna().sum()

In [None]:
# Drop every row of the times data that has at least one missing value.
# NOTE: this rebinds timesUniData, so re-running the cell reports 0 dropped rows.
before = timesUniData.shape[0]
timesUniData = timesUniData.dropna()

# shape[0] counts rows, so the difference is the number of rows removed
# (not the number of individual NaN values).
print(f"Dropped {before - timesUniData.shape[0]} rows containing NaN values")
print(timesUniData.isnull().sum())

In [None]:
def parseNumberOfStudents(df : pd.Series) -> list[float]:
    '''
    Convert a column of student counts written with thousands separators
    (e.g. "19,919") into a list of floats.

    Parameters
    ----------
    df : pd.Series
        The column to convert. Text columns (object or pandas string dtype)
        are parsed; any other dtype is returned unchanged as a list.

    Returns
    -------
    list[float]
        One value per element of the input, in the same order.

    Raises
    ------
    AttributeError
        If a text column holds a non-string value (for example NaN).
    ValueError
        If a value is not a valid number once the commas are removed.
    '''
    is_text = df.dtype == object or pd.api.types.is_string_dtype(df.dtype)
    if not is_text:
        return list(df)

    # Iterate the values directly: Series.iteritems() was removed in
    # pandas 2.0 and the index labels were never used.
    return [float(value.replace(',', '')) for value in df]

def parseGenderRatio(df : pd.Series) -> list[float]:
    '''
    Extract the first component of "a : b" ratio strings.

    Parameters
    ----------
    df : pd.Series
        The column to convert. Text columns (object or pandas string dtype)
        are parsed; any other dtype is returned unchanged as a list.

    Returns
    -------
    list
        For text input, one int per element: the number before the first
        ":" (e.g. 54 for "54 : 46"). Entries that cannot be parsed --
        non-string values, or any component that is not an integer such
        as "-" -- are recorded as 0, meaning "no data".
    '''
    is_text = df.dtype == object or pd.api.types.is_string_dtype(df.dtype)
    if not is_text:
        return list(df)

    lst = []
    # Iterate the values directly: Series.iteritems() was removed in
    # pandas 2.0 and the index labels were never used.
    for item in df:
        try:
            # Every component must be an integer, not only the first one.
            parts = [int(part) for part in item.split(":")]
        except (AttributeError, TypeError, ValueError):
            # AttributeError/TypeError: value is not a str (e.g. NaN);
            # ValueError: a component is not an integer.
            lst.append(0) # no data
        else:
            lst.append(parts[0])

    return lst

# international student is given in the format 27% so we can just remove the % to
# convert it into a numeric data type
def parseInternationalStudents(df : pd.Series) -> list[float]:
    '''
    Convert a column of percentage strings (e.g. "27%") into a list of floats.

    Parameters
    ----------
    df : pd.Series
        The column to convert. Text columns (object or pandas string dtype)
        are parsed; any other dtype is returned unchanged as a list.

    Returns
    -------
    list[float]
        One value per element of the input, in the same order.

    Raises
    ------
    AttributeError
        If a text column holds a non-string value (for example NaN).
    ValueError
        If a value is not a valid number once the "%" is removed.
    '''
    is_text = df.dtype == object or pd.api.types.is_string_dtype(df.dtype)
    if not is_text:
        return list(df)

    # Iterate the values directly: Series.iteritems() was removed in
    # pandas 2.0 and the index labels were never used.
    return [float(value.replace('%', '')) for value in df]


In [None]:
# num_students, female_male_ratio and international_students are given as
# strings, so convert them into numeric columns.
#
# pd.to_numeric returns a new object instead of modifying its argument, so its
# result has to be assigned for the call to have any effect. Columns are set
# with [...] so the assignment cannot silently create a plain attribute.

num_stud = parseNumberOfStudents(timesUniData.num_students)
timesUniData["num_students"] = pd.to_numeric(num_stud)

gender_ratio = parseGenderRatio(timesUniData.female_male_ratio)
timesUniData["female_male_ratio"] = pd.to_numeric(gender_ratio)

international_students = parseInternationalStudents(
    timesUniData.international_students)
timesUniData["international_students"] = pd.to_numeric(international_students)

# DataFrame.info() prints its report itself and returns None, so it is not
# wrapped in print().
timesUniData.info()


In [31]:
# extra plotly plot
# Colour-coded plotly table of the first 18 universities listed for `year`
# in the times data.

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

year = 2015

uni2015 = timesUniData[timesUniData["year"] == year].head(18)

# One fill colour per column, shared by the header and the cells so the two
# can never drift apart.
tableColors = ['lavender', 'aqua', 'greenyellow', 'greenyellow']

tableData = go.Table(
        header=dict(values=['Country', 'Number of Students', 'Research', 'Citations'], align='left',
        line_color='darkslategray', fill_color=tableColors),
        cells=dict(values=[uni2015.country, uni2015.num_students, uni2015.research, uni2015.citations],
            fill_color=tableColors, align='left')
        )


fig = go.Figure(data=tableData, layout= go.Layout(title=f"Table coding for top universities of {year}") )

fig.show()

In [22]:
# plot 1
# Colour-coded table of categorical, quantitative and ordinal attributes for
# the first `numUni` universities listed for `year` in the times data.

year = 2014
numUni = 18

uni2014 = timesUniData[timesUniData.year == year].iloc[:numUni]

# One colour per displayed column; the legend below explains the coding.
colors = ['lavender', 'aqua', 'greenyellow', 'greenyellow']

# One row of cell values per university, in the same column order as `colors`.
# itertuples keeps each column's own value type (ints stay ints).
tableColumns = ['country', 'num_students', 'research', 'citations']
cellData = [list(row) for row in
            uni2014[tableColumns].itertuples(index=False, name=None)]

# Size the colour grid and row labels from the rows actually found: the year
# may have fewer than numUni entries and ax.table needs matching shapes.
cellColors = [colors] * len(cellData)
rowHeaders = list(range(1, len(cellData) + 1))

lavenderPatch = mpatches.Patch(color='lavender', label='Categorical data')
aquaPatch = mpatches.Patch(color='aqua', label='Quantitative data')
greenYellowPatch = mpatches.Patch(color='greenyellow', label='Ordinal data')

# Research and citations are colour-coded as ordinal: they are given as
# percentage-like scores and the data does not say how they were calculated.

# Raw strings: "\ " is mathtext spacing and is not a valid Python escape.
columnHeaders = [r'$\bf{Country}$', r'$\bf{Number\ of\ Students}$',
        r'$\bf{Research}$', r'$\bf{Citations}$']

# The figure size goes on the figure that holds the table. Calling
# plt.figure(figsize=...) after drawing would open a second, empty figure.
fig, ax = plt.subplots(figsize=(20, 10))

ax.set_axis_off()

table = ax.table(cellText= cellData,
    colLabels=columnHeaders,
    colColours= colors,
    cellColours= cellColors,
    colWidths=[.50, .25, .25 / 2, .25 / 2],
    rowLabels=rowHeaders,
    cellLoc='left',
    loc='upper left')

table.auto_set_font_size(False)
table.set_fontsize(14)

ax.legend(handles=[lavenderPatch, aquaPatch, greenYellowPatch],
    bbox_to_anchor=(1.1, 1.1), prop=dict(size = 16))

# The first column shows the country, not the university name.
ax.set_title("Table color coding with fields Country, Number of Students, Research and Citations",
     fontdict=dict(size=20))

plt.show()

In [25]:
# plot 2
# Colour-coded table of categorical, ordinal and quantitative attributes for
# the first `numUni` universities listed for `year` in the times data.

# Income is colour-coded as ordinal: it is given as a percentage-like score,
# so there is no unit (millions, hundred thousands, ...) to interpret it with.

year = 2015
numUni = 15

# NOTE: the variable keeps its historical name `uni2014` so other cells that
# read it keep working; here it holds the rows for `year`.
uni2014 = timesUniData[timesUniData.year == year].iloc[:numUni]

# One colour per displayed column; the legend below explains the coding.
colors = ['lightcoral', 'skyblue', 'turquoise']

# One row of cell values per university, in the same column order as `colors`.
# itertuples keeps each column's own value type (ints stay ints).
tableColumns = ['country', 'income', 'student_staff_ratio']
cellData = [list(row) for row in
            uni2014[tableColumns].itertuples(index=False, name=None)]

# Size the colour grid and row labels from the rows actually found: the year
# may have fewer than numUni entries and ax.table needs matching shapes.
cellColors = [colors] * len(cellData)
rowHeaders = list(range(1, len(cellData) + 1))

print(cellData)

lightcoralPatch = mpatches.Patch(color='lightcoral', label='Categorical data')
skybluePatch = mpatches.Patch(color='skyblue', label='Ordinal data')
turquoisePatch = mpatches.Patch(color='turquoise', label='Quantitative data')

lstPatches = [lightcoralPatch, skybluePatch, turquoisePatch]

# Raw strings: "\ " is mathtext spacing and is not a valid Python escape.
columnHeaders = [r'$\bf{Country}$', r'$\bf{Income}$',
        r'$\bf{Student\ Staff\ Ratio}$']

# The figure size goes on the figure that holds the table. Calling
# plt.figure(figsize=..., dpi=600) after drawing would open a second, empty
# figure. dpi=600 is not applied to the on-screen figure; pass dpi=600 to
# fig.savefig(...) when a high-resolution export is needed.
fig, ax = plt.subplots(figsize=(6, 6))

ax.set_axis_off()

table = ax.table(cellText= cellData,
    colLabels=columnHeaders,
    colColours= colors,
    cellColours= cellColors,
    rowLabels=rowHeaders,
    cellLoc='center',
    loc='upper left')

table.auto_set_font_size(False)
table.set_fontsize(14)

ax.legend(handles=lstPatches, prop=dict(size = 16),
    bbox_to_anchor=(1.1, 1.1))

ax.set_title("Table color coding with fields Country, income, student staff ratio",
         fontdict=dict(size=24))

plt.show()

[['United States of America', '89.1', 6.9], ['United Kingdom', '72.9', 11.6], ['United States of America', '63.1', 7.8], ['United Kingdom', '51.1', 11.8], ['United States of America', '95.7', 9.0], ['United States of America', '82.7', 8.4], ['United States of America', '44.8', 16.4], ['United Kingdom', '72.7', 11.7], ['United States of America', '42.0', 4.4], ['United States of America', '36.8', 6.9], ['United States of America', '-', 10.3], ['Switzerland', '73.2', 14.7], ['United States of America', '100.0', 3.6], ['United States of America', '43.0', 6.5], ['United States of America', '55.7', 9.0]]


In [27]:
# plot 3 (cwurData)
# Colour-coded table of categorical, quantitative and ordinal attributes for
# the first `numUni` universities listed for `year` in the cwur data.

# Alumni employment is colour-coded as ordinal: it is given as a score
# without any scale to interpret it with.

year = 2013
numUni = 17

# NOTE: the variable keeps its historical name `uni2014` so other cells that
# read it keep working; here it holds the rows for `year`.
uni2014 = currUniData[currUniData.year == year].iloc[:numUni]

# One colour per displayed column; the legend below explains the coding.
colors = ['wheat', 'limegreen', 'plum']

# One row of cell values per university, in the same column order as `colors`.
# itertuples keeps each column's own value type, so the integer ranks are
# not turned into floats by the float `score` column.
tableColumns = ['national_rank', 'score', 'alumni_employment']
cellData = [list(row) for row in
            uni2014[tableColumns].itertuples(index=False, name=None)]

# Size the colour grid and row labels from the rows actually found: the year
# may have fewer than numUni entries and ax.table needs matching shapes.
cellColors = [colors] * len(cellData)
rowHeaders = list(range(1, len(cellData) + 1))

print(cellData)


lstPatches = [mpatches.Patch(color=colors[0], label='Categorical data'),
            mpatches.Patch(color=colors[1], label='Quantitative data'),
            mpatches.Patch(color=colors[2], label='Ordinal data')]

# Raw strings: "\ " is mathtext spacing and is not a valid Python escape.
columnHeaders = [r'$\bf{National\ Rank}$', r'$\bf{Score}$',
        r'$\bf{Alumni\ Employment}$']

# The figure size goes on the figure that holds the table. Calling
# plt.figure(figsize=..., dpi=600) after drawing would open a second, empty
# figure. dpi=600 is not applied to the on-screen figure; pass dpi=600 to
# fig.savefig(...) when a high-resolution export is needed.
fig, ax = plt.subplots(figsize=(6, 6))

ax.set_axis_off()

table = ax.table(cellText= cellData,
    colLabels=columnHeaders,
    colColours= colors,
    cellColours= cellColors,
    rowLabels=rowHeaders,
    cellLoc='center',
    loc='upper left')

table.auto_set_font_size(False)
table.set_fontsize(14)

ax.legend(handles=lstPatches, loc='upper left', prop=dict(size = 16),
    bbox_to_anchor=(-.15, 1))

ax.set_title("Table color coding with fields National rank, Score and Alumni Employment", fontdict=dict(size=24))

plt.show()

[[1, 100.0, 1], [2, 93.94, 2], [1, 92.54, 12], [3, 91.45, 16], [2, 90.24, 15], [4, 88.21, 8], [5, 85.07, 28], [6, 82.17, 14], [7, 79.16, 19], [8, 78.83, 25], [9, 77.59, 101], [10, 77.24, 5], [11, 76.99, 23], [1, 76.23, 3], [2, 69.46, 17], [12, 67.84, 35], [13, 65.64, 96]]


In [30]:
# plot 4 (cwurData)
# Colour-coded table of categorical, quantitative and ordinal attributes for
# the first `numUni` universities listed for `year` in the cwur data.

# Influence is colour-coded as ordinal: it is given as a number without any
# unit to interpret it with.

year = 2014
numUni = 19

uni2014 = currUniData[currUniData.year == year].iloc[:numUni]

# One colour per displayed column; the legend below explains the coding.
colors = ['wheat', 'limegreen', 'plum']

# One row of cell values per university, in the same column order as `colors`.
# itertuples keeps each column's own value type (ints stay ints).
tableColumns = ['country', 'quality_of_education', 'influence']
cellData = [list(row) for row in
            uni2014[tableColumns].itertuples(index=False, name=None)]

# Size the colour grid and row labels from the rows actually found: the year
# may have fewer than numUni entries and ax.table needs matching shapes.
cellColors = [colors] * len(cellData)
rowHeaders = list(range(1, len(cellData) + 1))

print(cellData)


lstPatches = [mpatches.Patch(color=colors[0], label='Categorical data'),
            mpatches.Patch(color=colors[1], label='Quantitative data'),
            mpatches.Patch(color=colors[2], label='Ordinal data')]

# Raw strings: "\ " is mathtext spacing and is not a valid Python escape.
columnHeaders = [r'$\bf{Country}$', r'$\bf{Quality\ of\ Education}$',
        r'$\bf{Influence}$']

# The figure size goes on the figure that holds the table. Calling
# plt.figure(figsize=..., dpi=600) after drawing would open a second, empty
# figure. dpi=600 is not applied to the on-screen figure; pass dpi=600 to
# fig.savefig(...) when a high-resolution export is needed.
fig, ax = plt.subplots(figsize=(6, 6))

ax.set_axis_off()

table = ax.table(cellText= cellData,
    colLabels=columnHeaders,
    colColours= colors,
    cellColours= cellColors,
    rowLabels=rowHeaders,
    cellLoc='center',
    loc='upper left')

table.auto_set_font_size(False)
table.set_fontsize(14)

ax.legend(handles=lstPatches, loc='upper left', prop=dict(size = 16),
    bbox_to_anchor=(-.15, 1))

ax.set_title("Table color coding with fields Country, Quality of Education and Influence",
        fontdict=dict(size=24))

plt.show()

[['USA', 1, 1], ['USA', 11, 3], ['USA', 3, 2], ['United Kingdom', 2, 9], ['United Kingdom', 7, 12], ['USA', 13, 13], ['USA', 4, 4], ['USA', 10, 19], ['USA', 5, 25], ['USA', 9, 7], ['USA', 12, 15], ['USA', 6, 6], ['Japan', 17, 16], ['USA', 21, 17], ['USA', 27, 14], ['Japan', 30, 41], ['USA', 33, 29], ['Switzerland', 16, 28], ['USA', 24, 10]]
