## Lab 07 - Visual variables (shape, size and hue)

#### Used libraries

1. **pandas**  
    `pip install pandas`

1. **plotly**  
    `pip install plotly`

1. **matplotlib**   
    `pip install matplotlib`   
    From matplotlib we use the `pyplot` and `colors` submodules.



In [2]:
# imports
import pandas as pd 

import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.colors as clr

import numpy as np
import os
# import geopandas as gpd


The university dataset has many .csv files.

For this experiment we will explore only cwurData.csv, timesData.csv and shanghaiData.csv.

In [3]:
def loadCSVData(path):
    '''
        Load a CSV file into a pandas DataFrame.

        Parameters
        ----------
        path : str or os.PathLike
            Location of the file to read.

        Returns
        -------
        pandas.DataFrame or None
            The parsed table when ``path`` ends in ``.csv``. The extension is
            compared case-insensitively, so ``DATA.CSV`` is accepted as well.
            ``None`` for any other extension; nothing is read in that case.
    '''
    extension = os.path.splitext(path)[1]
    if extension.lower() != ".csv":  # read only csv files from the dataset
        return None

    return pd.read_csv(path, delimiter=',')

#### Dataset timesData.csv for other tasks

In [4]:
# loading timesData.csv
timesUniData = loadCSVData("../world_university_ranking/timesData.csv")

# info on columns; DataFrame.info() prints the summary itself and returns None,
# so wrapping it in print() only appends a stray "None" line to the output
timesUniData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2603 entries, 0 to 2602
Data columns (total 14 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   world_rank              2603 non-null   object 
 1   university_name         2603 non-null   object 
 2   country                 2603 non-null   object 
 3   teaching                2603 non-null   float64
 4   international           2603 non-null   object 
 5   research                2603 non-null   float64
 6   citations               2603 non-null   float64
 7   income                  2603 non-null   object 
 8   total_score             2603 non-null   object 
 9   num_students            2544 non-null   object 
 10  student_staff_ratio     2544 non-null   float64
 11  international_students  2536 non-null   object 
 12  female_male_ratio       2370 non-null   object 
 13  year                    2603 non-null   int64  
dtypes: float64(4), int64(1), object(9)
memor

In [5]:
# number of missing entries in each column
timesUniData.isna().sum()

world_rank                  0
university_name             0
country                     0
teaching                    0
international               0
research                    0
citations                   0
income                      0
total_score                 0
num_students               59
student_staff_ratio        59
international_students     67
female_male_ratio         233
year                        0
dtype: int64

In [6]:
# remember the row count so the effect of the clean-up can be reported
before = len(timesUniData.index)
# discard every row that has at least one missing field
timesUniData = timesUniData.dropna(axis=0, how="any")

print(f"Dropped {before - timesUniData.shape[0]} NaN values")
print(timesUniData.isna().sum())

Dropped 241 NaN values
world_rank                0
university_name           0
country                   0
teaching                  0
international             0
research                  0
citations                 0
income                    0
total_score               0
num_students              0
student_staff_ratio       0
international_students    0
female_male_ratio         0
year                      0
dtype: int64


In [7]:
def parseNumberOfStudents(df : pd.Series) -> list[float]:
    '''
        Convert a column of student counts into a list of floats.

        Text values may use ',' as a thousands separator (e.g. "2,243"); the
        separators are removed before the conversion. A column that already has
        a numeric dtype is returned unchanged as a list.

        Raises ValueError when a text value is not a number once the commas are
        removed.
    '''
    if pd.api.types.is_numeric_dtype(df):
        return list(df)

    # Series.iteritems() was removed in pandas 2.0; iterating the Series yields
    # the values directly, which is all that is needed here.
    return [float(item.replace(',', '')) for item in df]

def parseGenderRatio(df : pd.Series) -> list[float]:
    '''
        Reduce "a : b" ratio strings to their first component.

        Each text value is split on ':' and every part is parsed as an integer;
        the first part is kept (e.g. "33 : 67" -> 33). Values that cannot be
        parsed this way (placeholders such as "-", empty strings, missing
        entries) are recorded as 0, meaning "no data". A column that already has
        a numeric dtype is returned unchanged as a list.
    '''
    if pd.api.types.is_numeric_dtype(df):
        return list(df)

    lst = []
    # Series.iteritems() was removed in pandas 2.0, so iterate the values directly.
    for item in df:
        try:
            parts = [int(part) for part in item.split(":")]
        except (AttributeError, ValueError):
            # AttributeError: item is not a string (e.g. None / NaN)
            # ValueError: one of the parts is not an integer
            lst.append(0) # no data
        else:
            lst.append(parts[0])

    return lst

# international student is given in the format 27% so we can just remove the % to
# convert it into a numeric data type
def parseInternationalStudents(df : pd.Series) -> list[float]:
    '''
        Convert percentage strings such as "27%" into floats (27.0).

        A column that already has a numeric dtype is returned unchanged as a
        list. Raises ValueError when a text value is not a number once the '%'
        sign is removed.
    '''
    if pd.api.types.is_numeric_dtype(df):
        return list(df)

    # Series.iteritems() was removed in pandas 2.0, so iterate the values directly.
    return [float(item.replace('%', '')) for item in df]

# convert a text column holding numbers (income, total_score, international) to floats
def parseObjects(df : pd.Series) -> list[float]:
    '''
        Convert a text column to floats, filling unparseable entries.

        Every value is passed through float(). Entries that cannot be converted
        (for example the "-" placeholder) are replaced by a random whole number
        drawn with np.random.randint from [int(min), int(max)) of the values
        that did parse, so call np.random.seed(...) beforehand for reproducible
        output. When that range is empty (fewer than two distinct whole numbers
        among the parsed values) the placeholder is int(min) as a float, and
        when nothing parses at all it is 0.0.

        A column that already has a numeric dtype is returned unchanged as a list.
    '''
    if pd.api.types.is_numeric_dtype(df):
        return list(df)

    # First pass: parse what can be parsed and mark the failures with None.
    # Series.iteritems() was removed in pandas 2.0, so iterate the values directly.
    parsed = []
    for item in df:
        try:
            parsed.append(float(item))
        except (TypeError, ValueError):
            parsed.append(None)

    # The bounds come from the whole column, not only from the values seen so
    # far, so an unparseable first entry no longer yields an invalid range.
    known = [value for value in parsed if value is not None and np.isfinite(value)]
    if known:
        low, high = int(min(known)), int(max(known))
    else:
        low, high = 0, 0

    def placeholder() -> float:
        # np.random.randint requires low < high
        if low < high:
            return float(np.random.randint(low, high))
        return float(low)

    return [placeholder() if value is None else value for value in parsed]


In [8]:
# here we convert the text columns that hold numbers to float64 or int64
#
# pd.to_numeric returns the converted values instead of changing anything in
# place, so its result has to be assigned back to the column. It is applied to
# the plain list returned by the parser, which gives an array that is assigned
# by position; this matters because the index has gaps after dropna().

timesUniData.num_students = pd.to_numeric(
    parseNumberOfStudents(timesUniData.num_students))

timesUniData.female_male_ratio = pd.to_numeric(
    parseGenderRatio(timesUniData.female_male_ratio))

timesUniData.international_students = pd.to_numeric(
    parseInternationalStudents(timesUniData.international_students))

timesUniData.income = pd.to_numeric(parseObjects(timesUniData.income))

total_score = parseObjects(timesUniData.total_score)
timesUniData.total_score = pd.to_numeric(total_score)

timesUniData.international = pd.to_numeric(parseObjects(timesUniData.international))

# DataFrame.info() prints the summary itself and returns None
timesUniData.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2362 entries, 1 to 2602
Data columns (total 14 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   world_rank              2362 non-null   object 
 1   university_name         2362 non-null   object 
 2   country                 2362 non-null   object 
 3   teaching                2362 non-null   float64
 4   international           2362 non-null   float64
 5   research                2362 non-null   float64
 6   citations               2362 non-null   float64
 7   income                  2362 non-null   float64
 8   total_score             2362 non-null   float64
 9   num_students            2362 non-null   float64
 10  student_staff_ratio     2362 non-null   float64
 11  international_students  2362 non-null   float64
 12  female_male_ratio       2362 non-null   int64  
 13  year                    2362 non-null   int64  
dtypes: float64(9), int64(2), object(3)
memor

In [9]:
# plot 1
# scatter plot of teching of different universities for two years
%matplotlib qt

numUni = 100

# since world rankings are not given in pure int64 format we use a cheaty way of solving
# by generating rankings list for 1 to numUni

rankingList = [i for i in range(1, numUni + 1) ]

uni2013 = timesUniData[timesUniData["year"] == 2013].iloc[:numUni, :]
uni2014 = timesUniData[timesUniData["year"] == 2014].iloc[:numUni, :]

sizes2013 = list(map(lambda x: (5 * x) / (2**2), uni2013.teaching))
sizes2014 = list(map(lambda x: (5 * x) / (2**2), uni2014.teaching))
# print(uni2013)

fig, ax = plt.subplots(figsize=(15, 7.5))

fig.suptitle(f"Citations vs world ranking for the top 100 univeristies of year 2013 and 2014",
    size = 16)

scatter2013 = ax.scatter(rankingList, list(uni2013.citations), s = sizes2013, alpha = .5,
        color = 'tab:orange', marker = 's', label='2013' )

scatter2014 = ax.scatter(rankingList, list(uni2014.citations), s= sizes2014, alpha = .5,
        color = 'tab:brown', marker = 'o', label='2014' )

ax.legend(['2013', '2014'], title="year", shadow=True, loc='upper right')

ax.set_xlabel("World ranking", size=14)
ax.set_ylabel("Citations", size=14)
ax.set_xticks(np.arange(0, numUni + 1, 10) )
ax.set_yticks(np.arange(min(uni2013.citations), max(uni2013.citations) + 10, 10) )
ax.grid(axis= 'x')


In [10]:
# plot 2
# scatter plot of research vs income of different universities over different year
# with gender ratio as color
%matplotlib qt

yStart = 2012
yEnd = 2014

colors = ['Yellow', 'Green', 'skyblue', 'blue']
cMap = mpl.colors.LinearSegmentedColormap.from_list('rainbow_color', colors)
Norm = mpl.colors.Normalize(vmin= 40, vmax = 70)

def makeScatterPlots(ax, df : pd.DataFrame, year : int, marker : str, label : str):
    tmpData = df[df["year"] == year].head(150)

    ax.scatter(list(tmpData.research), list(tmpData.income), 
        c = list(tmpData.female_male_ratio), alpha = .5, s = 100, marker = marker, 
            label = label, cmap = cMap)
        
fig, ax = plt.subplots(figsize=(15, 7.5))

fig.suptitle(f"Publication vs awards with alumni as color for the top 100 universities of year {yStart} to {yEnd}",
    size = 16)

markers = ['$2$', 4, 'p']

for i in range(yStart, yEnd + 1):
    makeScatterPlots(ax, timesUniData, i, markers[i - yStart], str(i))


ax.set_xlabel("Research", size=14)
ax.set_ylabel("Income", size=14)

ax.legend([str(i) for i in range(yStart, yEnd + 1)], 
            title="Year", shadow=True, loc='upper right')

cBar = fig.colorbar(mpl.cm.ScalarMappable(cmap = cMap, norm = Norm))
cBar.ax.tick_params(labelsize = 8)
cBar.ax.set_title("Alumni")

fig.tight_layout()

In [12]:
# file cwurData.csv
currUniData = loadCSVData("../world_university_ranking/cwurData.csv")
# DataFrame.info() prints the summary itself and returns None, so wrapping it
# in print() only appends a stray "None" line to the output
currUniData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2200 entries, 0 to 2199
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   world_rank            2200 non-null   int64  
 1   institution           2200 non-null   object 
 2   country               2200 non-null   object 
 3   national_rank         2200 non-null   int64  
 4   quality_of_education  2200 non-null   int64  
 5   alumni_employment     2200 non-null   int64  
 6   quality_of_faculty    2200 non-null   int64  
 7   publications          2200 non-null   int64  
 8   influence             2200 non-null   int64  
 9   citations             2200 non-null   int64  
 10  broad_impact          2000 non-null   float64
 11  patents               2200 non-null   int64  
 12  score                 2200 non-null   float64
 13  year                  2200 non-null   int64  
dtypes: float64(2), int64(10), object(2)
memory usage: 240.8+ KB
None


In [13]:
# number of missing entries in each column
currUniData.isna().sum()

world_rank                0
institution               0
country                   0
national_rank             0
quality_of_education      0
alumni_employment         0
quality_of_faculty        0
publications              0
influence                 0
citations                 0
broad_impact            200
patents                   0
score                     0
year                      0
dtype: int64

In [14]:
# remember the row count so the effect of the clean-up can be reported
before = len(currUniData.index)
# discard every row that has at least one missing field
currUniData = currUniData.dropna(axis=0, how="any")

print(f"Dropped {before - currUniData.shape[0]} NaN values")
print(currUniData.isna().sum())

Dropped 200 NaN values
world_rank              0
institution             0
country                 0
national_rank           0
quality_of_education    0
alumni_employment       0
quality_of_faculty      0
publications            0
influence               0
citations               0
broad_impact            0
patents                 0
score                   0
year                    0
dtype: int64


In [15]:
# column summary after the rows with missing values were dropped
currUniData.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2000 entries, 200 to 2199
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   world_rank            2000 non-null   int64  
 1   institution           2000 non-null   object 
 2   country               2000 non-null   object 
 3   national_rank         2000 non-null   int64  
 4   quality_of_education  2000 non-null   int64  
 5   alumni_employment     2000 non-null   int64  
 6   quality_of_faculty    2000 non-null   int64  
 7   publications          2000 non-null   int64  
 8   influence             2000 non-null   int64  
 9   citations             2000 non-null   int64  
 10  broad_impact          2000 non-null   float64
 11  patents               2000 non-null   int64  
 12  score                 2000 non-null   float64
 13  year                  2000 non-null   int64  
dtypes: float64(2), int64(10), object(2)
memory usage: 234.4+ KB


In [16]:
# plot 3
# scatter plot of quailty_of_education vs publication
%matplotlib qt

def makeScatterPlots(ax, df : pd.DataFrame, year : int, marker : str, color : str, label : str):
    tmpData = df[df["year"] == year].head(100)
    ax.scatter(list(tmpData.quality_of_faculty), list(tmpData.publications), 
            c = color, alpha = .5, s = 100, marker = marker, label = label)

yStart = 2014
yEnd = 2015

colors = ['orange', 'violet', 'royalblue']
        
fig, ax = plt.subplots(figsize=(15, 7.5))

fig.suptitle(f"Quality of education vs publication for year {yStart} to {yEnd}",
    size = 16)


markers = ['X', 'p', 11]

for i in range(yStart, yEnd + 1):
    makeScatterPlots(ax, currUniData, i, markers[i - yStart], colors[i - yStart], str(i))

ax.set_xlabel("Quality of faculty", size=14)
ax.set_ylabel("Publications", size=14)

ax.legend([str(i) for i in range(yStart, yEnd + 1)], 
            title="Year", shadow=True, loc='upper left')


# fig.tight_layout()

<matplotlib.legend.Legend at 0x2619c993190>

In [17]:
shanghaiData = loadCSVData("../world_university_ranking/shanghaiData.csv")
# info on columns; DataFrame.info() prints the summary itself and returns None,
# so wrapping it in print() only appends a stray "None" line to the output
shanghaiData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4897 entries, 0 to 4896
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   world_rank       4897 non-null   object 
 1   university_name  4896 non-null   object 
 2   national_rank    4896 non-null   object 
 3   total_score      1101 non-null   float64
 4   alumni           4896 non-null   float64
 5   award            4895 non-null   float64
 6   hici             4895 non-null   float64
 7   ns               4875 non-null   float64
 8   pub              4895 non-null   float64
 9   pcp              4895 non-null   float64
 10  year             4897 non-null   int64  
dtypes: float64(7), int64(1), object(3)
memory usage: 421.0+ KB
None


In [18]:
# number of missing entries in each column
shanghaiData.isna().sum()

world_rank            0
university_name       1
national_rank         1
total_score        3796
alumni                1
award                 2
hici                  2
ns                   22
pub                   2
pcp                   2
year                  0
dtype: int64

In [19]:
# total_score is missing in 3796 of the 4897 rows, so we drop the whole column.
# errors='ignore' keeps the cell re-runnable once the column is already gone,
# without hiding unrelated failures the way a blanket try/except would.
shanghaiData = shanghaiData.drop(columns='total_score', errors='ignore')

before = shanghaiData.shape[0]
shanghaiData = shanghaiData.dropna()

print(f"Dropped {before - shanghaiData.shape[0]} NaN values")
print(shanghaiData.isnull().sum())

Dropped 22 NaN values
world_rank         0
university_name    0
national_rank      0
alumni             0
award              0
hici               0
ns                 0
pub                0
pcp                0
year               0
dtype: int64


In [20]:
# column summary after total_score and the rows with missing values were dropped
shanghaiData.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4875 entries, 0 to 4896
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   world_rank       4875 non-null   object 
 1   university_name  4875 non-null   object 
 2   national_rank    4875 non-null   object 
 3   alumni           4875 non-null   float64
 4   award            4875 non-null   float64
 5   hici             4875 non-null   float64
 6   ns               4875 non-null   float64
 7   pub              4875 non-null   float64
 8   pcp              4875 non-null   float64
 9   year             4875 non-null   int64  
dtypes: float64(6), int64(1), object(3)
memory usage: 418.9+ KB


In [21]:
# plot 4
# scatter plot of publication vs awards
%matplotlib qt

def makeScatterPlots(ax, df : pd.DataFrame, year : int, marker : str, color : str, label : str):
    tmpData = df[df["year"] == year]
    # select those data whose award is not 0
    tmpData = df[df.award != 0].head(100)

    ax.scatter(list(tmpData.award), list(tmpData.ns), 
            c = color, alpha = .5, s = 100, marker = marker, label = label)

years = [2005, 2007, 2011]
colors = ['red', 'skyblue', 'purple']
        
fig, ax = plt.subplots(figsize=(15, 7.5))

fig.suptitle("Award vs publication for different years", size = 16)

markers = ['*', '8', 10, '+']

c = 0
for i in years:
    makeScatterPlots(ax, shanghaiData, i, markers[c], colors[c], str(i))
    c += 1

ax.set_xlabel("Award", size=14)
ax.set_ylabel("Publications", size=14)

ax.legend([str(i) for i in range(yStart, yEnd + 1)], 
            title="Year", shadow=True, loc='upper left')


<matplotlib.legend.Legend at 0x2619ca6b850>