## Lab 06 - Scatter and contour plots with custom colormaps (rainbow or hue)

#### Used libraries

1. **pandas**  
    `pip install pandas`

1. **plotly**  
    `pip install plotly`

1. **matplotlib**   
    `pip install matplotlib`   
    from matplotlib we use the `pyplot` and `colors` submodules



In [1]:
# imports
import pandas as pd 
import math
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.colors as clr

import numpy as np
import os


The university dataset has many .csv files.

For this experiment we explore only two of them: `cwurData.csv` and `timesData.csv`.

In [2]:
def loadCSVData(path):
    """Read a comma-separated file into a pandas DataFrame.

    Parameters
    ----------
    path : str
        Location of the file to read. Only paths whose extension is exactly
        ".csv" (case-sensitive) are read.

    Returns
    -------
    pandas.DataFrame or None
        The parsed table, or None when the path does not end in ".csv".
    """
    _, extension = os.path.splitext(path)

    # Anything that is not a .csv file is ignored rather than parsed.
    if extension != ".csv":
        return None

    return pd.read_csv(path, delimiter=',')

#### Dataset timesData.csv for other tasks

In [3]:
# loading timesData.csv
timesUniData = loadCSVData("../world_university_ranking/timesData.csv")

# Column overview. DataFrame.info() writes its report itself and returns None,
# so it is called directly; wrapping it in print() would add a stray "None" line.
timesUniData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2603 entries, 0 to 2602
Data columns (total 14 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   world_rank              2603 non-null   object 
 1   university_name         2603 non-null   object 
 2   country                 2603 non-null   object 
 3   teaching                2603 non-null   float64
 4   international           2603 non-null   object 
 5   research                2603 non-null   float64
 6   citations               2603 non-null   float64
 7   income                  2603 non-null   object 
 8   total_score             2603 non-null   object 
 9   num_students            2544 non-null   object 
 10  student_staff_ratio     2544 non-null   float64
 11  international_students  2536 non-null   object 
 12  female_male_ratio       2370 non-null   object 
 13  year                    2603 non-null   int64  
dtypes: float64(4), int64(1), object(9)
memor

In [4]:
# Count the missing entries in every column of the Times data.
timesUniData.isna().sum()

world_rank                  0
university_name             0
country                     0
teaching                    0
international               0
research                    0
citations                   0
income                      0
total_score                 0
num_students               59
student_staff_ratio        59
international_students     67
female_male_ratio         233
year                        0
dtype: int64

In [5]:
# Discard every row that has at least one missing value, then report how many
# rows were removed and confirm that no missing entries remain.
before = len(timesUniData)
timesUniData = timesUniData.dropna()
dropped = before - len(timesUniData)

print(f"Dropped {dropped} NaN values")
print(timesUniData.isnull().sum())

Dropped 241 NaN values
world_rank                0
university_name           0
country                   0
teaching                  0
international             0
research                  0
citations                 0
income                    0
total_score               0
num_students              0
student_staff_ratio       0
international_students    0
female_male_ratio         0
year                      0
dtype: int64


In [6]:
def parseNumberOfStudents(df : pd.Series) -> list[float]:
    """Convert student counts written with thousands separators into floats.

    Parameters
    ----------
    df : pd.Series
        Column of student counts. Text entries such as "11,074" have their
        commas removed before conversion.

    Returns
    -------
    list[float]
        One float per entry when the column holds text; otherwise the column's
        values as a plain list, unchanged.

    Raises
    ------
    ValueError
        If a text entry is not a number once its commas are removed.
    AttributeError
        If an entry of a text column is not a string (for example NaN).
    """
    # Text columns are `object` on older pandas and may be a dedicated string
    # dtype on newer releases; both need parsing.
    if df.dtype == object or isinstance(df.dtype, pd.StringDtype):
        # Iterating the Series directly yields its values; Series.iteritems()
        # no longer exists in pandas 2.x.
        return [float(item.replace(',', '')) for item in df]

    return list(df)

def parseGenderRatio(df : pd.Series) -> list[float]:
    """Extract the first component of "first:second" ratio strings.

    Parameters
    ----------
    df : pd.Series
        Column of ratios written like "33 : 67".

    Returns
    -------
    list
        For a text column, the integer before the colon of every entry; an
        entry that cannot be parsed (any part non-integer, or a non-string
        value such as NaN) contributes 0, meaning "no data". For any other
        column, its values as a plain list, unchanged.
    """
    if not (df.dtype == object or isinstance(df.dtype, pd.StringDtype)):
        return list(df)

    first_parts = []
    # Iterating the Series directly yields its values; Series.iteritems()
    # no longer exists in pandas 2.x.
    for item in df:
        try:
            # Every part must be an integer for the entry to count as valid.
            parts = [int(part) for part in item.split(":")]
            first_parts.append(parts[0])
        except (AttributeError, ValueError):
            # AttributeError: entry is not a string; ValueError: not an integer.
            first_parts.append(0) # no data

    return first_parts

# The share of international students is written like "27%", so removing the
# percent sign is enough to turn it into a number.
def parseInternationalStudents(df : pd.Series) -> list[float]:
    """Convert percentage strings such as "27%" into floats.

    Parameters
    ----------
    df : pd.Series
        Column of percentages, each written with a "%" sign.

    Returns
    -------
    list[float]
        One float per entry when the column holds text; otherwise the column's
        values as a plain list, unchanged.

    Raises
    ------
    ValueError
        If a text entry is not a number once the "%" is removed.
    AttributeError
        If an entry of a text column is not a string (for example NaN).
    """
    if df.dtype == object or isinstance(df.dtype, pd.StringDtype):
        # Iterating the Series directly yields its values; Series.iteritems()
        # no longer exists in pandas 2.x.
        return [float(item.replace('%', '')) for item in df]

    return list(df)

# Convert a text column of scores (e.g. income) to floats.
def parseObjects(df : pd.Series) -> list[float]:
    """Convert a text column of numbers into floats, filling unparsable entries.

    Entries that cannot be converted with float() are replaced by a random
    whole number drawn between the smallest and largest valid value seen
    *earlier* in the column (lower bound inclusive, upper bound exclusive,
    both truncated to integers). The draw uses numpy's global random state,
    so call np.random.seed(...) beforehand for reproducible output.

    Parameters
    ----------
    df : pd.Series
        Column to convert.

    Returns
    -------
    list[float]
        One float per entry when the column holds text; otherwise the column's
        values as a plain list, unchanged.
    """
    if not (df.dtype == object or isinstance(df.dtype, pd.StringDtype)):
        return list(df)

    parsed = []
    # Running bounds of the valid values seen so far; mn > mx means "none yet".
    mx, mn = -1e9, 1e9

    # Iterating the Series directly yields its values; Series.iteritems()
    # no longer exists in pandas 2.x.
    for item in df:
        try:
            value = float(item)
        except (TypeError, ValueError):
            low, high = int(mn), int(mx)
            if low < high:
                filler = float(np.random.randint(low, high))
            elif mn <= mx:
                # Valid values exist but span no whole-number range to draw from.
                filler = mn
            else:
                # Nothing valid has been seen yet, so there is no range at all.
                filler = 0.0
            parsed.append(filler)
            continue

        parsed.append(value)
        if value > mx:
            mx = value
        if value < mn:
            mn = value

    return parsed


In [7]:
# Several Times columns are stored as text (see the dtypes listed above), so
# each one is parsed into numbers and written back into the frame. The parse
# helpers already return numeric lists, so no further pd.to_numeric call is
# needed (its result was never stored anyway).

timesUniData["num_students"] = parseNumberOfStudents(timesUniData["num_students"])

timesUniData["female_male_ratio"] = parseGenderRatio(timesUniData["female_male_ratio"])

timesUniData["international_students"] = parseInternationalStudents(
    timesUniData["international_students"])

timesUniData["income"] = parseObjects(timesUniData["income"])

timesUniData["total_score"] = parseObjects(timesUniData["total_score"])

timesUniData["international"] = parseObjects(timesUniData["international"])

# DataFrame.info() prints its own report and returns None, so it is not
# wrapped in print().
timesUniData.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2362 entries, 1 to 2602
Data columns (total 14 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   world_rank              2362 non-null   object 
 1   university_name         2362 non-null   object 
 2   country                 2362 non-null   object 
 3   teaching                2362 non-null   float64
 4   international           2362 non-null   float64
 5   research                2362 non-null   float64
 6   citations               2362 non-null   float64
 7   income                  2362 non-null   float64
 8   total_score             2362 non-null   float64
 9   num_students            2362 non-null   float64
 10  student_staff_ratio     2362 non-null   float64
 11  international_students  2362 non-null   float64
 12  female_male_ratio       2362 non-null   int64  
 13  year                    2362 non-null   int64  
dtypes: float64(9), int64(2), object(3)
memor

In [8]:
# plot 1
# compare the teaching with world ranking for the top 50 universities
# with international score as continous color map and number of students as size
%matplotlib qt 

year = 2016
num_uni = 30

topUnis = timesUniData[timesUniData.year == year].iloc[: num_uni]
# print(topUnis)

colors = ['Red', 'Orange', 'Blue', 'Violet']
cmap = mpl.colors.LinearSegmentedColormap.from_list('my_colormap', colors) # create a continous color map
norm = clr.Normalize(vmin=0, vmax=100)

sizes = list(map(lambda x : x / 20, list(topUnis.num_students)) )
print(sizes)


fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(15, 7.5))

fig.suptitle(f"Worlds top {num_uni} univeristy with number of students(size) and international score(scale) - {year}",
    size = 16)

fig.subplots_adjust(top=0.88)
# plt.figure(figsize=(10, 6))

scatter1 = ax1.scatter(x = topUnis.world_rank, y = topUnis.teaching, s=sizes, alpha = .5)
ax1.set_title("Original plot")
ax1.set_xticks(ticks=np.arange(1, 35, 3))
ax1.set_yticks(ticks=np.arange(50, 100, step = 10) )
ax1.set_ylabel("Teaching", fontsize=12)
ax1.set_xlabel("World ranking", fontsize=12)

scatter2 = ax2.scatter(x = topUnis.world_rank, y = topUnis.teaching, s=sizes, alpha = .5,
            c = topUnis.teaching, cmap = cmap)
ax2.set_title("Colormapped plot")
ax2.set_xticks(ticks=np.arange(1, 35, 3))
ax2.set_yticks(ticks=np.arange(50, 100, step = 10) )
ax2.set_ylabel("Teaching")
ax2.set_xlabel("World ranking")


cBar = fig.colorbar(mpl.cm.ScalarMappable(cmap = cmap, norm = norm)) # map colormap using scalarMappable
cBar.ax.tick_params(labelsize = 8)
cBar.ax.set_title("International score", fontsize = 10)
fig.tight_layout()

[112.15, 995.95, 779.8, 940.6, 553.7, 396.45, 753.0, 908.9, 711.05, 756.4, 587.55, 1809.3, 1330.35, 1910.3, 1018.8, 1071.2, 758.6, 2089.3, 594.25, 1288.7, 916.7, 1579.6, 1069.7, 388.7, 1784.55, 2102.8, 483.3, 2201.0, 2006.4, 2507.6]


In [16]:
#plot 2
# contour plot of number of students vs income as a contour of top 10 university of 2014
%matplotlib qt 

year = 2014
num_uni = 20

uniData = timesUniData[timesUniData.year == year]

teachingData = np.array(uniData.teaching)
totalScore = np.array(uniData.total_score)

X, Y = np.meshgrid(teachingData, teachingData)
Z = 2 * (X + Y)

# print(Y)
# print(X)
# print(Z)

fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(10, 6))
fig.suptitle(f"Contour map of {num_uni} univeristies taking teaching as attribute",
    size = 16)

fig.subplots_adjust(top=0.88)

# setting up the color map
colors = ['Yellow', 'Green', 'Blue', 'Violet', 'Red', 'Orange', 'Indigo']
cMap = mpl.colors.ListedColormap(colors)
Norm = mpl.colors.Normalize(vmin=0, vmax=100)

# use tricontour to directly supply, unordered, irregularly spaced coordinates

ax1.contourf(X, Y, Z)
ax1.set_title("Default colormap", fontsize=14)

# scatter1 = ax2.plot(rankingList, teachingVals, 'o', color = 'black')
contour = ax2.contourf(X, Y, Z, cmap = cMap)
ax2.set_title("Rainbow colormap", fontsize=16)

# display colorbar
cBar = fig.colorbar(contour)
cBar.ax.tick_params(labelsize = 8)
cBar.ax.set_title("Score")

fig.tight_layout()

In [19]:
#plot 3
# Scatter plot of income against research for one year of the Times data,
# with each point coloured by its citations value.
year = 2013

uniData = timesUniData[timesUniData.year == year]

income = list(uniData.income)
research = list(uniData.research)
citations = list(uniData.citations)

colors = ['Red', 'Orange', 'Blue', 'Violet']
cmap = clr.LinearSegmentedColormap.from_list('my_colormap', colors) # create a continous color map
# The same 0-100 normalisation is given to the points and to the colour bar;
# otherwise the points are autoscaled to the data while the bar shows 0-100.
norm = clr.Normalize(vmin=0, vmax=100)

fig, ax = plt.subplots(figsize = (10, 6))

scatter1 = ax.scatter(x = income, y = research, c = citations,
        alpha = .5, cmap = cmap, norm = norm)
ax.set_title(f"Income vs research for year {year} with citation as color", fontsize=18)
ax.set_xlabel("Income", fontsize=12)
ax.set_ylabel("Research", fontsize=12)

ax.grid()

# A stand-alone ScalarMappable is not attached to any Axes, so the Axes that
# gives up space for the colour bar has to be named explicitly.
cBar = fig.colorbar(mpl.cm.ScalarMappable(cmap = cmap, norm = norm), ax = ax)
cBar.ax.set_title("Citations")
cBar.ax.tick_params(labelsize = 8)

In [11]:
# file cwurData.csv
currUniData = loadCSVData("../world_university_ranking/cwurData.csv")

# DataFrame.info() prints its own report and returns None, so it is called
# directly; wrapping it in print() adds a stray "None" line to the output.
currUniData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2200 entries, 0 to 2199
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   world_rank            2200 non-null   int64  
 1   institution           2200 non-null   object 
 2   country               2200 non-null   object 
 3   national_rank         2200 non-null   int64  
 4   quality_of_education  2200 non-null   int64  
 5   alumni_employment     2200 non-null   int64  
 6   quality_of_faculty    2200 non-null   int64  
 7   publications          2200 non-null   int64  
 8   influence             2200 non-null   int64  
 9   citations             2200 non-null   int64  
 10  broad_impact          2000 non-null   float64
 11  patents               2200 non-null   int64  
 12  score                 2200 non-null   float64
 13  year                  2200 non-null   int64  
dtypes: float64(2), int64(10), object(2)
memory usage: 240.8+ KB
None


In [12]:
# Count the missing entries in every column of the CWUR data.
currUniData.isna().sum()

world_rank                0
institution               0
country                   0
national_rank             0
quality_of_education      0
alumni_employment         0
quality_of_faculty        0
publications              0
influence                 0
citations                 0
broad_impact            200
patents                   0
score                     0
year                      0
dtype: int64

In [13]:
# Discard every row that has at least one missing value, then report how many
# rows were removed and confirm that no missing entries remain.
before = len(currUniData)
currUniData = currUniData.dropna()
dropped = before - len(currUniData)

print(f"Dropped {dropped} NaN values")
print(currUniData.isnull().sum())

Dropped 200 NaN values
world_rank              0
institution             0
country                 0
national_rank           0
quality_of_education    0
alumni_employment       0
quality_of_faculty      0
publications            0
influence               0
citations               0
broad_impact            0
patents                 0
score                   0
year                    0
dtype: int64


In [14]:
# Column overview of the CWUR data after the rows with missing values were dropped.
currUniData.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2000 entries, 200 to 2199
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   world_rank            2000 non-null   int64  
 1   institution           2000 non-null   object 
 2   country               2000 non-null   object 
 3   national_rank         2000 non-null   int64  
 4   quality_of_education  2000 non-null   int64  
 5   alumni_employment     2000 non-null   int64  
 6   quality_of_faculty    2000 non-null   int64  
 7   publications          2000 non-null   int64  
 8   influence             2000 non-null   int64  
 9   citations             2000 non-null   int64  
 10  broad_impact          2000 non-null   float64
 11  patents               2000 non-null   int64  
 12  score                 2000 non-null   float64
 13  year                  2000 non-null   int64  
dtypes: float64(2), int64(10), object(2)
memory usage: 234.4+ KB


In [21]:
# plot 4
# Scatter plot of world rank against publications for one year of the CWUR
# data, with each point coloured by its broad_impact value. The first ten rows
# of the year are skipped. Both axes are inverted.
year = 2014

uniData = currUniData[currUniData.year == year].iloc[10:,]

publications = list(uniData.publications)
world_rank = list(uniData.world_rank)
impact = list(uniData.broad_impact)

colors = ['tomato', 'yellow', 'turquoise', 'darkorchid']
cmap = clr.LinearSegmentedColormap.from_list('my_colormap', colors) # create a continous color map
# Take the colour scale from the plotted values themselves and share it between
# the points and the colour bar, so the bar's ticks match the colours drawn.
norm = clr.Normalize(vmin=min(impact), vmax=max(impact))

fig, ax = plt.subplots(figsize = (10, 7))

scatter1 = ax.scatter(x = world_rank, y = publications, c = impact, s = 14,
        alpha = .8, cmap = cmap, norm = norm)
ax.set_title(f"World rank vs publications for the {year} with impact as color", fontsize=15)
ax.set_xlabel("World rank", fontsize=13)
ax.set_ylabel("Publications", fontsize=13)
ax.invert_xaxis()
ax.invert_yaxis()

ax.grid()

# A stand-alone ScalarMappable is not attached to any Axes, so the Axes that
# gives up space for the colour bar has to be named explicitly.
cBar = fig.colorbar(mpl.cm.ScalarMappable(cmap = cmap, norm = norm), ax = ax)
cBar.ax.set_title("Impact")
cBar.ax.tick_params(labelsize = 8)