## Lab 03 - Distributional and relational visualization

#### Used libraries

1. **pandas**  
    `pip install pandas`

1. **plotly**  
    `pip install plotly`

1. **pycountry** - used to look up country codes for the geo plots   
    `pip install pycountry`

1. **geopandas** - for working with geo plots  
    `pip install geopandas`

In [None]:
# imports
import pandas as pd 

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

import numpy as np
import os
import pycountry
# import geopandas as gpd


The university dataset has many .csv files.

For this experiment we will explore only `cwurData.csv` and `timesData.csv`.

In [None]:
def loadCSVData(path):
    '''
        Read the file at the given path into a pandas DataFrame.

        Only paths whose extension is exactly ".csv" are read; for any other
        path nothing is loaded and None is returned.
    '''
    _, extension = os.path.splitext(path)
    if extension != ".csv": # anything that is not a csv file is skipped
        return None

    return pd.read_csv(path, delimiter=',')

#### Dataset timesData.csv for other tasks

In [32]:
# loading timesData.csv
timesUniData = loadCSVData("../world_university_ranking/timesData.csv")

# info on columns
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line to the output
timesUniData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2603 entries, 0 to 2602
Data columns (total 14 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   world_rank              2603 non-null   object 
 1   university_name         2603 non-null   object 
 2   country                 2603 non-null   object 
 3   teaching                2603 non-null   float64
 4   international           2603 non-null   object 
 5   research                2603 non-null   float64
 6   citations               2603 non-null   float64
 7   income                  2603 non-null   object 
 8   total_score             2603 non-null   object 
 9   num_students            2544 non-null   object 
 10  student_staff_ratio     2544 non-null   float64
 11  international_students  2536 non-null   object 
 12  female_male_ratio       2370 non-null   object 
 13  year                    2603 non-null   int64  
dtypes: float64(4), int64(1), object(9)
memor

In [None]:
# number of missing values in each column (axis=0 sums down the rows)
timesUniData.isnull().sum(axis=0)

In [None]:
# drop every row that has at least one missing value
before = timesUniData.shape[0]
timesUniData = timesUniData.dropna()

# dropna() removes whole rows, so the difference below is a number of rows,
# not a number of individual NaN cells
print(f"Dropped {before - timesUniData.shape[0]} rows containing NaN values")
print(timesUniData.isnull().sum())

In [None]:
def parseNumberOfStudents(df : pd.Series) -> list[float]:
    '''
        Convert a column of student counts into a list of floats.

        Text values have their thousands separators removed (e.g. "2,243")
        before being converted with float(). A column that does not hold
        text is returned unchanged as a plain list.

        The values are iterated directly instead of through
        Series.iteritems(), which was removed in pandas 2.0.
    '''
    # object columns and dedicated string columns both hold text
    if df.dtype == object or pd.api.types.is_string_dtype(df.dtype):
        return [float(item.replace(',', '')) for item in df]

    return list(df)

def parseGenderRatio(df : pd.Series) -> list[float]:
    '''
        Extract the first number of each "a : b" ratio string in the column.

        Every text value is split on ":" and all parts must parse as
        integers; the first part is kept. A value that cannot be parsed
        (not a string, or not made of integers) is recorded as 0, meaning
        "no data". A column that does not hold text is returned unchanged
        as a plain list.

        The values are iterated directly instead of through
        Series.iteritems(), which was removed in pandas 2.0.
    '''
    if not (df.dtype == object or pd.api.types.is_string_dtype(df.dtype)):
        return list(df)

    lst = []
    for item in df:
        try:
            parts = [int(part) for part in item.split(":")]
        except (AttributeError, TypeError, ValueError):
            # missing value or malformed ratio -> no data
            lst.append(0)
        else:
            lst.append(parts[0])

    return lst

# international students are given as percentages such as 27%, so removing
# the % sign is enough to convert them into a numeric data type
def parseInternationalStudents(df : pd.Series) -> list[float]:
    '''
        Convert a column of percentage strings into a list of floats.

        Text values have the "%" sign removed before being converted with
        float(). A column that does not hold text is returned unchanged as
        a plain list.

        The values are iterated directly instead of through
        Series.iteritems(), which was removed in pandas 2.0.
    '''
    # object columns and dedicated string columns both hold text
    if df.dtype == object or pd.api.types.is_string_dtype(df.dtype):
        return [float(item.replace('%', '')) for item in df]

    return list(df)


In [None]:
# number of students, gender ratio and international students are stored as
# strings, so they are converted into numeric types here

# pd.to_numeric returns the converted values and leaves its argument
# untouched, so its result has to be assigned back to the column
num_stud = parseNumberOfStudents(timesUniData.num_students)
timesUniData["num_students"] = pd.to_numeric(num_stud)

gender_ratio = parseGenderRatio(timesUniData.female_male_ratio)
timesUniData["female_male_ratio"] = pd.to_numeric(gender_ratio)

international_students = parseInternationalStudents(
    timesUniData.international_students)
timesUniData["international_students"] = pd.to_numeric(international_students)

# info() prints the summary itself and returns None, so it is not wrapped
# in print()
timesUniData.info()


In [10]:
# plot 1
# Student staff ratio in the year 2015-2016
year = [2015, 2016]

uniData = timesUniData.query(f"year in {year}")
uni2015 = timesUniData[timesUniData["year"] == 2015]
uni2016 = timesUniData[timesUniData["year"] == 2016]

hist1 = go.Histogram(x = uni2015.student_staff_ratio, name='2015',
        marker = dict(color = 'rgba(119, 157, 230, 0.8)' ) )

hist2 = go.Histogram(x = uni2016.student_staff_ratio, name = '2016',
        marker = dict(color = 'rgba(220, 112, 92, 0.87)' ) )

# The histograms put student_staff_ratio on the x axis and the number of
# rows per bin on the y axis, so the ratio labels x and the count labels y
# (the two titles were swapped before). The layout is now defined once; the
# title position is the 0.3 that the later update_layout call used to apply.
layout = dict(title = 'Student staff ratio in year 2015-2016', title_x = 0.3, barmode='overlay',
        xaxis_title = 'Student-staff ratio', yaxis_title = 'Count')

fig = go.Figure(data = [hist1, hist2], layout = layout)

fig.update_layout(font_size = 16)
# both histograms are half transparent so the overlaid bars stay visible
fig.update_traces(opacity = .5)

fig.show(renderer='browser', auto_open=True)

In [11]:
# plot 2
# compare the teaching score with the world ranking for the first `num_uni`
# universities of the selected year
year = 2016
num_uni = 30

topUnis = timesUniData[timesUniData.year == year].iloc[: num_uni]

# marker colour encodes the international score, marker size the number of students
colors = [float(item) for item in topUnis.international]
num_students = topUnis.num_students

data = go.Scatter(x = topUnis.world_rank, y = topUnis.teaching,
    mode = 'markers+text',
    marker = dict(
        color = colors,
        size = num_students,
        # scales the raw student counts down to drawable marker sizes
        sizeref = (5.0 * max(num_students) ) / (25.**2),
        showscale = True,
        # the colour scale is drawn as a colorbar, not as a legend entry,
        # so its label has to be set on the colorbar to be displayed
        colorbar = dict(title = dict(text = 'International score'))
    ),
    text = topUnis.university_name,
    textfont = dict(
        family="sans serif",
        size=12,
    ))

layout = dict(xaxis_title = 'World ranking', yaxis_title='Teaching score',
        title = f"Worlds top {num_uni} univeristy with number of students(size) and international score(scale) - {year}",
        font = dict(
            family="Calibri",
            size=16,
            color="RebeccaPurple"
        ))

fig = go.Figure(data = data, layout = layout)
fig.update_layout(uniformtext_minsize=8, uniformtext_mode='hide')

fig.show(renderer='browser', auto_open=True)

In [12]:
# geo plots need the ISO alpha-3 code of every country, so a lookup from
# country name to code is built from a downloaded ISO-3166 listing
jsonUrl = 'https://raw.githubusercontent.com/lukes/ISO-3166-Countries-with-Regional-Codes/master/all/all.json'

countyDf = pd.read_json(jsonUrl)

countryCodes = {
    name: code for name, code in zip(countyDf["name"], countyDf["alpha-3"])
}

# country names that are not added by the listing above are mapped by hand
countryCodes.update({
    'United Kingdom': 'GBR',
    'Hong Kong': 'HKG',
    "South Korea": 'KOR',
    'Republic of Ireland': 'IRL',
    'Taiwan': 'TWN',
    'Czech Republic': 'CZE',
    'Iran': 'IRN',
    'Macau': 'MAC',
})

# print(countryCodes["United Kingdom"])

In [27]:
# plot 3
# geo plot of the distribution of the universities in the world for the year 2015
# `year` selects the rows that are counted below and is shown in the figure title
year = 2015

def getUniCount(df : pd.DataFrame) -> pd.DataFrame:
    '''
        Count the rows of df per country and attach the alpha-3 country code.

        Parameters:
            df - DataFrame with a "country" column; every row counts once.

        Returns:
            DataFrame with the columns "Country", "Number of university" and
            "alpha3_code", sorted by "Number of university" in descending
            order.

        Raises:
            KeyError - if a country of df is missing from the module-level
            countryCodes lookup.
    '''
    # one grouped count replaces the per-row loop; sort=False keeps the
    # countries in order of first appearance, as the loop did
    counts = df.groupby("country", sort=False).size()

    # build the whole frame at once instead of appending row by row
    countDf = pd.DataFrame({
        "Country": list(counts.index),
        "Number of university": counts.to_numpy(),
        "alpha3_code": [countryCodes[name] for name in counts.index],
    })

    return countDf.sort_values(by=["Number of university"], ascending=False) # sort by number of university in descending order

# keep only the rows of the selected year and count them per country
uniData = timesUniData.loc[timesUniData["year"] == year]
uniCountData = getUniCount(uniData)


# fig = px.scatter_geo(uniCountData, locations="alpha3_code", color="Country", hover_name="Country",
#         projection="natural earth", size="Number of university" )

# countries are located by their alpha-3 code and coloured by their count
fig = px.choropleth(
    uniCountData,
    locations="alpha3_code",
    color="Number of university",
    hover_name="Country",
    color_continuous_scale="viridis",
)

fig.update_layout(
    title = f"Geo plot of the distribution of the universities in the world for the year {year}",
    title_x = .5,
    title_font_size = 20,
)

fig.show(renderer='browser', auto_open=True)

In [46]:
# plot 4
# world rank vs citations of the top 100 universities of year 2014, 2015, 2016

# marker colour of each year; the dict order is also the trace order
markerColors = {
    2014: 'rgba(119, 157, 230, 0.8)',
    2015: 'rgba(220, 112, 92, 0.87)',
    2016: 'rgba(82, 84, 80, .8)',
}

# first 100 rows of each year
topByYear = {
    yr: timesUniData[timesUniData["year"] == yr].iloc[:100]
    for yr in markerColors
}
uni2014, uni2015, uni2016 = (topByYear[yr] for yr in (2014, 2015, 2016))

# one marker trace per year, labelled with the year and hover text of the
# university name
data = [
    go.Scatter(
        x = topByYear[yr].world_rank,
        y = topByYear[yr].citations,
        name = str(yr),
        text = topByYear[yr].university_name,
        mode = 'markers',
        marker = dict(color = color),
    )
    for yr, color in markerColors.items()
]
scatter2014, scatter2015, scatter2016 = data

layout = {
    "title": 'Citations vs world ranking for the top 100 univeristies of year 2014, 2015 and 2016',
    "legend_title": 'Year',
    "title_x": 0.5,
    "title_font_size": 20,
    "xaxis": {"title": 'World ranking', "ticklen": 30},
    "yaxis": {"title": "Citations", "ticklen": 20},
}

fig = go.Figure(data = data, layout = layout)
fig.show(renderer='browser', auto_open=True)
# fig.show()
