## Lab 08 - Line and area charts

#### Used libraries

1. **pandas**  
    `pip install pandas`

1. **plotly**  
    `pip install plotly`

1. **matplotlib**   
    `pip install matplotlib`   
    inside matplotlib we use pyplot and colors



In [2]:
# imports
import pandas as pd 

import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.colors as clr

import numpy as np
import os
# import geopandas as gpd


The university dataset has many .csv files.

For this experiment we will be exploring only the cwurData.csv and timesData.csv

In [3]:
def loadCSVData(path):
    '''
        Function helps load the given path name if it is a CSV file into a DataFrame using pandas
    '''
    if os.path.splitext(path)[1] == ".csv": # read only csv files from the dataset
        df = pd.read_csv(path, delimiter=',')

        return df
    return None

#### Dataset timesData.csv for other tasks

In [4]:
# loading timesData.csv
timesUniData = loadCSVData("../world_university_ranking/timesData.csv")

# info on columns
print(timesUniData.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2603 entries, 0 to 2602
Data columns (total 14 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   world_rank              2603 non-null   object 
 1   university_name         2603 non-null   object 
 2   country                 2603 non-null   object 
 3   teaching                2603 non-null   float64
 4   international           2603 non-null   object 
 5   research                2603 non-null   float64
 6   citations               2603 non-null   float64
 7   income                  2603 non-null   object 
 8   total_score             2603 non-null   object 
 9   num_students            2544 non-null   object 
 10  student_staff_ratio     2544 non-null   float64
 11  international_students  2536 non-null   object 
 12  female_male_ratio       2370 non-null   object 
 13  year                    2603 non-null   int64  
dtypes: float64(4), int64(1), object(9)
memor

In [4]:
timesUniData.isnull().sum()

world_rank                  0
university_name             0
country                     0
teaching                    0
international               0
research                    0
citations                   0
income                      0
total_score                 0
num_students               59
student_staff_ratio        59
international_students     67
female_male_ratio         233
year                        0
dtype: int64

In [5]:
before = timesUniData.shape[0]
timesUniData = timesUniData.dropna()

print(f"Dropped {before - timesUniData.shape[0]} NaN values")
print(timesUniData.isnull().sum())

Dropped 241 NaN values
world_rank                0
university_name           0
country                   0
teaching                  0
international             0
research                  0
citations                 0
income                    0
total_score               0
num_students              0
student_staff_ratio       0
international_students    0
female_male_ratio         0
year                      0
dtype: int64


In [6]:
def parseNumberOfStudents(df : pd.Series) -> list[float]:
    if df.dtype == object:
        tmp = [item.replace(',', '') for ind, item in df.iteritems() ]
        return list(map(float, tmp))
    else:
        return list(df)

def parseGenderRatio(df : pd.Series) -> list[float]:
    lst = []
    ratio = 0

    if df.dtype == object:
        for ind, item in df.iteritems():
            try:
                tmp = list(map(int, item.split(":") ) )
                lst.append(tmp[0]) 
            except:
                lst.append(0) # no data

        return lst
    else:
        return list(df)

# international student is given in the format 27% so we can just remove the % to
# convert it into a numeric data type
def parseInternationalStudents(df : pd.Series) -> list[float]:
    if df.dtype == object:
        tmp = [item.replace('%', '') for ind, item in df.iteritems() ]
        return list(map(float, tmp))
    else:
        return list(df)

#convert income to float 64
def parseObjects(df : pd.Series) -> list[float]:
    if df.dtype == object:
        tmp = []
        mx, mn = -1e9, 1e9

        for ind, item  in df.iteritems():
            try:
                tmp.append(float(item))

                if float(item) > mx:
                    mx = float(item)
                
                if float(item) < mn:
                    mn = float(item)

            except Exception as e:
                # assign random value except assign 0
                tmp.append(np.random.randint(mn, mx))
            
        return tmp
    else:
        return list(df)


In [7]:
# here since number of students and gender ratio is given in strings we need to
# convert them into numeric types

timesUniData.num_students = parseNumberOfStudents(timesUniData.num_students)
pd.to_numeric(timesUniData.num_students)

timesUniData.female_male_ratio = parseGenderRatio(timesUniData.female_male_ratio)
pd.to_numeric(timesUniData.female_male_ratio)

timesUniData.international_students = parseInternationalStudents(
    timesUniData.international_students)
pd.to_numeric(timesUniData.international_students)

timesUniData.income = parseObjects(timesUniData.income)
pd.to_numeric(timesUniData.income)

total_score = parseObjects(timesUniData.total_score)
timesUniData.total_score = total_score
pd.to_numeric(timesUniData.total_score)

timesUniData.international = parseObjects(timesUniData.international)
pd.to_numeric(timesUniData.international)

print(timesUniData.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2362 entries, 1 to 2602
Data columns (total 14 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   world_rank              2362 non-null   object 
 1   university_name         2362 non-null   object 
 2   country                 2362 non-null   object 
 3   teaching                2362 non-null   float64
 4   international           2362 non-null   float64
 5   research                2362 non-null   float64
 6   citations               2362 non-null   float64
 7   income                  2362 non-null   float64
 8   total_score             2362 non-null   float64
 9   num_students            2362 non-null   float64
 10  student_staff_ratio     2362 non-null   float64
 11  international_students  2362 non-null   float64
 12  female_male_ratio       2362 non-null   int64  
 13  year                    2362 non-null   int64  
dtypes: float64(9), int64(2), object(3)
memor

In [24]:
# plot 1
# line chart of teaching and citations vs world rank
%matplotlib qt

numUni = 100

# since world rankings are not given in pure int64 format we use a cheaty way of solving
# by generating rankings list for 1 to numUni

rankingList = [i for i in range(1, numUni + 1) ]

def plotLineChart(ax, df, label = 'plot', marksersize = 1, linewidth = 1,
        color='b', marker='o', linestyle='-', markercolor = 'r'):

    ax.plot(rankingList, df, label= label, color = color, marker = marker, linestyle = linestyle,
            markersize = marksersize, linewidth = linewidth, markerfacecolor = markercolor)
    # ax.plot(rankingList, df.citations, '-o', color = 'k', label="Citations",
    #         , markersize = 5, linewidth = 2)
    
uniData = timesUniData[timesUniData["year"] == 2013].iloc[:numUni, :]

fig, ax = plt.subplots(figsize=(15, 7.5))

color = ['green', 'blue']
markercolor = ['red', 'violet']
marker = ['o', 's']
linestyle = ['-', '--']

plotLineChart(ax, uniData.teaching, "Teaching", 5, 3, 
        color[0], marker[0], linestyle[0], markercolor[0])

plotLineChart(ax, uniData.citations, "Citations", 6, 4, 
        color[1], marker[1], linestyle[1], markercolor[1])

ax.set_title("Line chart of world rank vs teaching and citations", fontsize=16)
ax.set_xlabel("World Rank", fontsize=12)
ax.set_ylabel("Teaching or citation", fontsize=12)
ax.grid()

ax.legend(['Teaching', 'Citations'], title="Legend", shadow=True, loc='upper right')

<matplotlib.legend.Legend at 0x1a75efbbd60>

In [9]:
# file cwurData.csv
currUniData = loadCSVData("../world_university_ranking/cwurData.csv")
print(currUniData.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2200 entries, 0 to 2199
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   world_rank            2200 non-null   int64  
 1   institution           2200 non-null   object 
 2   country               2200 non-null   object 
 3   national_rank         2200 non-null   int64  
 4   quality_of_education  2200 non-null   int64  
 5   alumni_employment     2200 non-null   int64  
 6   quality_of_faculty    2200 non-null   int64  
 7   publications          2200 non-null   int64  
 8   influence             2200 non-null   int64  
 9   citations             2200 non-null   int64  
 10  broad_impact          2000 non-null   float64
 11  patents               2200 non-null   int64  
 12  score                 2200 non-null   float64
 13  year                  2200 non-null   int64  
dtypes: float64(2), int64(10), object(2)
memory usage: 240.8+ KB
None


In [10]:
currUniData.isnull().sum()

world_rank                0
institution               0
country                   0
national_rank             0
quality_of_education      0
alumni_employment         0
quality_of_faculty        0
publications              0
influence                 0
citations                 0
broad_impact            200
patents                   0
score                     0
year                      0
dtype: int64

In [11]:
currUniData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2200 entries, 0 to 2199
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   world_rank            2200 non-null   int64  
 1   institution           2200 non-null   object 
 2   country               2200 non-null   object 
 3   national_rank         2200 non-null   int64  
 4   quality_of_education  2200 non-null   int64  
 5   alumni_employment     2200 non-null   int64  
 6   quality_of_faculty    2200 non-null   int64  
 7   publications          2200 non-null   int64  
 8   influence             2200 non-null   int64  
 9   citations             2200 non-null   int64  
 10  broad_impact          2000 non-null   float64
 11  patents               2200 non-null   int64  
 12  score                 2200 non-null   float64
 13  year                  2200 non-null   int64  
dtypes: float64(2), int64(10), object(2)
memory usage: 240.8+ KB


In [25]:
#plot 2
# here we use cwurData.csv dataset
# stacked chart comparing quality of education of the top 5 university for the year 2012 - 2015

year_s = 2012
year_e = 2015
num_uni = 5

yearList = [i for i in range(year_s, year_e + 1) ]

def getIndivisualData(df : pd.DataFrame, uniNames : list[str], year : int) -> pd.DataFrame :
    dfY = df[df["year"] == year]
    dfUnis = dfY.query('institution in @uniNames')
    return dfUnis

def getQualityOfEducationEachYear(df, university):
    filter = df[df["institution"] == university]
    return list(filter.quality_of_education)

def plotLineChart(ax, df, label = 'plot', marksersize = 1, linewidth = 1,
        marker='o', linestyle='-'):

    ax.plot(yearList, df, label= label, marker = marker, linestyle = linestyle,
            markersize = marksersize, linewidth = linewidth)
            

uniData = currUniData[currUniData["year"] == year_s]["institution"]

# we are excluding havard as its data is all ones when compared to others
uni_names = list(uniData.iloc[1 : (num_uni + 1)] ) 

dataStore = currUniData[0:0]
# print(dataStore)


for i in range(year_s, year_e + 1):
    dataStore = dataStore.append(
            getIndivisualData(currUniData, uni_names, i))

plt.figure(figsize=(15, 7.5))

linestyle = ['-', '-.', '--', ':', '-']

c = 0

for i in uni_names:
    tmp = getQualityOfEducationEachYear(dataStore, i)

    plotLineChart(plt, tmp, i, 5, 3, linestyle = linestyle[c])
    plt.fill_between(yearList, tmp, alpha = .5)

    c += 1

plt.xlabel("Year", fontsize=12)
plt.ylabel("Quality of education", fontsize=12)
plt.title(f"Quality of education of top 5 universities for the year {year_s} - {year_e}", fontsize=16)
plt.legend(uni_names, title="Legend", shadow=True, loc='upper right')
plt.grid()
plt.show()


In [13]:
shanghaiData = loadCSVData("../world_university_ranking/shanghaiData.csv")
# info on columns
print(shanghaiData.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4897 entries, 0 to 4896
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   world_rank       4897 non-null   object 
 1   university_name  4896 non-null   object 
 2   national_rank    4896 non-null   object 
 3   total_score      1101 non-null   float64
 4   alumni           4896 non-null   float64
 5   award            4895 non-null   float64
 6   hici             4895 non-null   float64
 7   ns               4875 non-null   float64
 8   pub              4895 non-null   float64
 9   pcp              4895 non-null   float64
 10  year             4897 non-null   int64  
dtypes: float64(7), int64(1), object(3)
memory usage: 421.0+ KB
None


In [14]:
shanghaiData.isnull().sum()

world_rank            0
university_name       1
national_rank         1
total_score        3796
alumni                1
award                 2
hici                  2
ns                   22
pub                   2
pcp                   2
year                  0
dtype: int64

In [15]:
# total score has 3000 misssing values so we just drop the table
try:
    shanghaiData = shanghaiData.drop(columns='total_score')
except Exception as e:
    pass

before = shanghaiData.shape[0]
shanghaiData = shanghaiData.dropna()

print(f"Dropped {before - shanghaiData.shape[0]} NaN values")
print(shanghaiData.isnull().sum())

Dropped 22 NaN values
world_rank         0
university_name    0
national_rank      0
alumni             0
award              0
hici               0
ns                 0
pub                0
pcp                0
year               0
dtype: int64


In [16]:
shanghaiData.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4875 entries, 0 to 4896
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   world_rank       4875 non-null   object 
 1   university_name  4875 non-null   object 
 2   national_rank    4875 non-null   object 
 3   alumni           4875 non-null   float64
 4   award            4875 non-null   float64
 5   hici             4875 non-null   float64
 6   ns               4875 non-null   float64
 7   pub              4875 non-null   float64
 8   pcp              4875 non-null   float64
 9   year             4875 non-null   int64  
dtypes: float64(6), int64(1), object(3)
memory usage: 418.9+ KB


In [27]:
# plot 3
# line chart of publications of top 100 university for a year
year = 2010

num_uni = 100

rankingList = [i for i in range(0, num_uni)]

def plotLineChart(ax, df, label = 'plot', marksersize = 1, linewidth = 1,
        marker='o', linestyle='-',  markerfacecolor = 'tab:orange'):

    ax.plot(rankingList, df, label= label, marker = marker, linestyle = linestyle, alpha = .8,
            markersize = marksersize, linewidth = linewidth, markerfacecolor = markerfacecolor)
            

uniData = shanghaiData[shanghaiData["year"] == year].head(num_uni)

plotLineChart(plt, uniData.pub, "Publications", 7, 7, marker = 's')

plt.xlabel("University Ranking", fontsize=12)
plt.ylabel("Publications", fontsize=12)
plt.title(f"Publications of top 100 universities for the year {year}", fontsize=16)

plt.legend("Publications", title="Legend", shadow=True, loc='upper right')
plt.grid()
plt.show()

In [10]:
# plot 4
# here we use cwurData.csv dataset
# stacked chart comparing score of top 7 university for the year 2012 - 2015
%matplotlib qt

year_s = 2011
year_e = 2014
num_uni = 4

yearList = [i for i in range(year_s, year_e + 1) ]

def getIndivisualData(df : pd.DataFrame, uniNames : list[str], year : int) -> pd.DataFrame :
    dfY = df[df["year"] == year]
    dfUnis = dfY.query('university_name in @uniNames')
    return dfUnis

def getSeries(df, university):
    filter = df[df['university_name'] == university]
    return list(filter.income)

def plotLineChart(ax, df, label = 'plot', markersize = 1, linewidth = 1,
        marker='o', linestyle='-'):

    ax.plot(yearList, df, label= label, marker = marker, linestyle = linestyle,
            markersize = markersize, linewidth = linewidth)
            
uniData = timesUniData[timesUniData["year"] == year_s]["university_name"]

# we are excluding havard as its data is all ones when compared to others
uni_names = list(uniData.iloc[: num_uni]) 

dataStore = timesUniData[0:0]
# print(dataStore)

for i in range(year_s, year_e + 1):
    dataStore = dataStore.append(
            getIndivisualData(timesUniData, uni_names, i))

plt.figure(figsize=(15, 7.5))

linestyle = ['-', '-.', '--', ':', '-']
c = 0

for i in uni_names:
    tmp = getSeries(dataStore, i)
    print(tmp)
    plotLineChart(plt, tmp, i, 5, 3, linestyle = linestyle[c])
    plt.fill_between(yearList, tmp, alpha = .5)
    c += 1

plt.xlabel("Year", fontsize=12)
plt.ylabel("Score", fontsize=12)
plt.title(f"Score of top 5 universities for the year {year_s} - {year_e}", fontsize=16)
plt.legend(uni_names, title="Legend", shadow=True, loc='upper left')
plt.grid()
plt.show()


['34.5', '35.9', '39.9', '40.6']
['83.7', '97.0', '95.6', '91.2']
['87.5', '94.4', '92.9', '94.3']
['64.3', '63.8', '62.4', '61.3']
