# This script contains the following:
1. Importing Visualization Libraries and Data
2. Exploring Relationships
 Correlations
 Scatterplots
 Pair Plots
 Categorical Plots

## 1. Importing Visualization Libraries and Data

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib
import os

In [None]:
# Render matplotlib charts inline in the notebook output, so they are displayed
# without the need to "call" them specifically.

%matplotlib inline

In [None]:
# Project root folder; data and output file locations are built from it with os.path.join.
path = r"/Users/frankamoboateng/Downloads/19.08.23 World University Rankings"

In [None]:
# Load the prepared university rankings file from the project folder.
# index_col=False tells pandas not to use the first CSV column as the index.
csv_location = os.path.join(path, '02. DATA', 'PREPARED DATA', 'uni_ranks.csv')
df_uni = pd.read_csv(csv_location, index_col=False)

## 2. EXPLORING RELATIONSHIPS

In [None]:
# Preview the first rows of the data set.
df_uni.head()

In [None]:
# Dimensions of the data set as (number of rows, number of columns).
df_uni.shape

## Correlations:

In [None]:
# Create a correlation matrix using pandas.
# numeric_only=True limits the calculation to the numeric columns. Without it,
# the object-typed columns trigger a warning on pandas 1.5 and an error on
# pandas >= 2.0.

df_uni.corr(numeric_only=True)

## Any warning shown here is caused by the object-typed (text) columns in our data set, which cannot be included in a correlation.

In [None]:
# Create a correlation heatmap using matplotlib.
# The matrix is restricted to the numeric columns (numeric_only=True) so the
# object-typed columns neither trigger a warning nor, on pandas >= 2.0, an error.

plt.matshow(df_uni.corr(numeric_only=True))
plt.show()

In [None]:
# Draw the correlation heatmap again and save it as "out.png".
# The relative file name means the image is written to the current working directory.
# numeric_only=True keeps the object-typed columns out of the correlation, which
# would otherwise warn (pandas 1.5) or fail (pandas >= 2.0).
plt.matshow(df_uni.corr(numeric_only=True))
plt.savefig("out.png")

In [None]:
# Current working directory: the folder that relative file names such as "out.png" resolve to.
cwd = os.getcwd()
cwd

In [None]:
# Add labels, a colour legend, and change the size of the heatmap.
# The correlation matrix is computed once from the numeric columns only, and the
# tick positions and labels are taken from that matrix. Labelling with
# df_uni.columns would also count the non-numeric columns, which are not part of
# the matrix, so the labels would not line up with the cells they describe.

corr_matrix = df_uni.corr(numeric_only=True)
tick_positions = range(len(corr_matrix.columns))

f = plt.figure(figsize=(8, 8)) # figure size
plt.matshow(corr_matrix, fignum=f.number) # type of plot, drawn on the figure created above
plt.xticks(tick_positions, corr_matrix.columns, fontsize=14, rotation=45) # x axis labels
plt.yticks(tick_positions, corr_matrix.columns, fontsize=14) # y axis labels
cb = plt.colorbar() # add a colour legend (called colorbar)
cb.ax.tick_params(labelsize=11) # font size of the colorbar tick labels
plt.title('Correlation Matrix', fontsize=8) # add title

## Create a correlation heatmap using seaborn:

In [None]:
# List the column names of the data set.
df_uni.columns

In [None]:
# Subset holding only the rank and score variables listed below; columns such as
# "Unnamed:0", "Country" and "year" are left out.
selected_columns = [
    'world_rank', 'national_rank', 'quality_of_education', 'alumni_employment',
    'quality_of_faculty', 'publications', 'influence', 'citations',
    'broad_impact', 'patents', 'score',
]
sub = df_uni[selected_columns]

In [None]:
# Display the subset.
sub

In [None]:
# Set up a 10 x 10 inch matplotlib figure and axes for the heatmap.
f, ax = plt.subplots(figsize=(10, 10))

# Compute the correlation matrix of the subset and draw it as a seaborn heatmap
# on the axes defined above. annot=True writes each correlation coefficient
# into its cell. `corr` keeps the object returned by sns.heatmap.
sub_corr = sub.corr()
corr = sns.heatmap(sub_corr, annot=True, ax=ax)

## We see a positive correlation between the world rank and all the other variables. Variables such as broad impact and publications are strongly correlated with the world rank, while the national rank is only weakly correlated with it. This is plausible, as the rank of a university within its own country does not necessarily determine its global standing.

## Scatterplots:
## The relationship between quality of faculty and the world rank is investigated further because its correlation coefficient of 0.57 is the lowest among the stronger correlations with the world rank.

In [None]:
# Display the object returned by sns.heatmap.
corr

In [None]:
# Save the figure that holds the seaborn heatmap into the project's visualizations folder.
corr_png = os.path.join(path, '04.ANALYSIS', 'VISUALIZATIONS', 'corr.png')
corr.figure.savefig(corr_png)

In [None]:
# Plot "world_rank" against "quality_of_faculty" from the subset using seaborn's lmplot.
sns.lmplot(data=sub, x='quality_of_faculty', y='world_rank')

## This plot shows a somewhat positive correlation between the two variables

## Pair Plots:

In [None]:
# Narrow the subset down to the variables that are shown in the pair plot.
pair_columns = ['world_rank', 'publications', 'patents', 'quality_of_education']
sub_2 = sub[pair_columns]

In [None]:
# Draw a pair plot of the variables in sub_2 and keep the returned grid in g.
g = sns.pairplot(data=sub_2)

## This pair plot further affirms the relationships between the variables. A closer look at the world rank against publications, patents, and quality of education shows a strong positive correlation in each case.

## Categorical Plots:



## Creating a categorical variable that splits the "patents" column into categories

In [None]:
# Save the figure held by g into the project's visualizations folder.
pairplot_png = os.path.join(path, '04.ANALYSIS', 'VISUALIZATIONS', 'g.png')
g.figure.savefig(pairplot_png)

In [None]:
# Distribution of the "patents" column: a 20-bin histogram with a KDE curve overlaid.
patents = df_uni['patents']
sns.histplot(patents, bins=20, kde=True)

In [None]:
# Split "patents" into three categories in a single step:
#   below 300                -> 'Low patents'
#   300 up to (excl.) 500    -> 'Middle patents'
#   500 and above            -> 'High patents'
# right=False makes every bin include its lower edge and exclude its upper edge,
# which reproduces the conditions "< 300", ">= 300 and < 500" and ">= 500".
# Rows with a missing "patents" value are left without a category (NaN).
# The third label is spelled 'High patents' so that it is consistent with the
# other two labels. pd.cut returns an ordered categorical column (Low < Middle < High).
patent_bins = [-np.inf, 300, 500, np.inf]
patent_labels = ['Low patents', 'Middle patents', 'High patents']
df_uni['Patent category'] = pd.cut(df_uni['patents'], bins=patent_bins, right=False, labels=patent_labels)

In [None]:
# Count the rows in each patent category; dropna=False also counts rows without a category.
df_uni['Patent category'].value_counts(dropna = False)

In [None]:
# List the column names again; "Patent category" should now be among them.
df_uni.columns

In [None]:
# Categorical plot of "patents" against "world_rank", coloured by the patent
# categories created above. The "ticks" seaborn style is applied first.
sns.set(style="ticks")
g = sns.catplot(data=df_uni, x="world_rank", y="patents", hue="Patent category")

## From the chart above we can infer a positive correlation between the world ranking of the universities and the number of patents the universities have. 

In [None]:
# Save the categorical plot under its own file name. "out.png" is already used
# for the matplotlib correlation heatmap saved earlier in this notebook, so
# reusing that name here would overwrite the heatmap image.
g.savefig("patent_catplot.png")

# Again, the image will be saved in the working directory.