# Welcome to Part 3: data visualisation

### Let's do some plotting!

In [None]:
# Load libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the UN dataset from its CSV file into a dataframe
un_data_path = 'data/UN_modified.csv'
un_data = pd.read_csv(un_data_path)

### Histograms

In [None]:
# Plot histograms of the numerical columns of un_data (no column is named, so all of
# them are considered); whatever DataFrame.hist returns is kept in `hist`
hist = un_data.hist()

In [None]:
# Plot histograms of one specific column, with the data grouped by region
hist_by_region = un_data.hist(by='region', column='infantMortality')

### Box plots

In [None]:
# Plot box plots of one specific column, with the data grouped by region
boxplot = un_data.boxplot(by='region', column='infantMortality')

### Scatter plots

In [None]:
# Draw a scatter plot from two columns of the dataframe,
# then label both axes on the axes object that the plot call returned
scatter = un_data.plot.scatter(x='GDPpp', y='infantMortality')
scatter.set_xlabel('GDP per capita')
scatter.set_ylabel('Infant mortality rate')

In [None]:
# Using the seaborn module we have loaded (as sns) we can make a nicer plot,
# showing different regions in different colours.
# sns.scatterplot requires seaborn v0.9.0 or later.
# You can check your version with sns.__version__
# If it is too old, run "%pip install --upgrade seaborn" in its own cell and then
# restart the kernel. A plain "pip install seaborn" will NOT upgrade a copy that is
# already installed, and %pip (rather than !pip) installs into the kernel's environment.

scatter = sns.scatterplot(data=un_data, x="GDPpp", y="infantMortality", hue="region")

# Put the x axis on a logarithmic scale. Calling set_xscale directly (instead of
# scatter.set(xscale="log")) also avoids echoing a "[None]" result under the cell.
scatter.set_xscale("log")

#### Investigate adapting this scatter plot

You can use different colours, vary size and style of points, change axis titles, look at different variables.

There's a lot to do here, so spend time investigating

For help, there are many examples here:
https://seaborn.pydata.org/generated/seaborn.scatterplot.html

### Saving plots

In [None]:
# Create a directory to save our plots into.
# exist_ok=True makes the call do nothing when the directory is already there, so the
# cell can be re-run safely without a separate "does it exist?" check. If something
# named 'plots' exists but is not a directory, an error is raised here rather than
# later when a plot is saved.
import os
os.makedirs('plots', exist_ok=True)

In [None]:
# Draw the region-coloured scatter plot again, with a logarithmic x axis
scatter = sns.scatterplot(data=un_data, x="GDPpp", y="infantMortality", hue="region")
scatter.set_xscale("log")

# Save the current plot once per file name; the file extension selects the file type
for plot_file in ('plots/myplot.png', 'plots/myplot.pdf'):
    plt.savefig(plot_file)

### Visualising patterns in the data

In [None]:
# Plot one column against the other using seaborn's regression plot
regression = sns.regplot(data=un_data, x="lifeExpectancy", y="infantMortality")

#### Investigate more about visualising patterns in the data

https://seaborn.pydata.org/tutorial/regression.html