# Research Computing Boot Camp
# Boston University 

Website: [rcs.bu.edu](http://www.bu.edu/tech/support/research/) <br>
Tutorial materials: [https://github.com/bu-rcs/bu-rcs.github.io/tree/main/Bootcamp](https://github.com/bu-rcs/bu-rcs.github.io/tree/main/Bootcamp)



# Data Visualization in Python

### Matplotlib
Matplotlib is a comprehensive library for creating static, animated, and interactive visualizations in Python.

https://matplotlib.org/

### Pandas plot
https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.plot.html

### Seaborn
Seaborn is a Python data visualization library based on matplotlib. It provides a high-level interface for drawing attractive and informative statistical graphics.

https://seaborn.pydata.org/

## Matplotlib

In [None]:
import matplotlib as mpl

## If using Jupyter notebook (NOT JupyterLab) this line
## will enable interactive plotting which we'll make use of here.
%matplotlib notebook 
# JupyterLab supports interactive plotting with the command:
# %matplotlib widget
# but it's not a standard part of JupyterLab yet.
# Also note that plain Python does not allow a command that starts
# with % - these are special ("magic") commands added by the IPython
# interpreter.
print(mpl.__version__)

Matplotlib is configured to render in the browser.

This configuration is called a backend (backend layer).

In [None]:
mpl.get_backend()

In [None]:
# This sets the size of the figure as it is displayed in the notebook
mpl.rcParams['figure.figsize'] = (4.0, 3.0)

The sub-library pyplot is for plotting data (scripting layer).

https://matplotlib.org/api/_as_gen/matplotlib.pyplot.html#module-matplotlib.pyplot

In [None]:
# pyplot is for plotting. Import & rename as plt.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [None]:
# What color styles are available for plotting?
plt.style.available

In [None]:
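# To set a style for the rest of the notebook, call this function with one of
# the names listed by plt.style.available. (Matplotlib 3.6 and later list the
# seaborn styles under a 'seaborn-v0_8-' prefix, e.g. 'seaborn-v0_8-colorblind'.)
#plt.style.use('seaborn-colorblind')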

Examples:

    Line plotting (plt.plot)
    Bar chart (plt.bar)
    Subplot (plt.subplot)
    Histograms (hist)
    Heatmap (hist2d)

### Line plotting

In [None]:
# A trailing ? acts the same as help(plt.plot).  This is an IPython feature -
# it works in JupyterLab and Notebook but not in "plain" Python.
plt.plot?

Simply plot a dot in a figure

In [None]:
# plt.figure() creates a new plot window. 
plt.figure()
plt.plot(3,2,'.');

In [None]:
# Calling plt.figure() creates a new figure window.
plt.figure()
# plot a point
plt.plot(1.5, 1.5, 'o');
# Subsequent plot calls will be on the same figure.
plt.plot(2,2,'o');
plt.plot(2.5, 2.5, 'o');

## Plot lines for lists of data points

In [None]:
# Build seven sample points and their squares, three equivalent ways.
# Each approach rebinds the same two names, so the numpy arrays created
# last are the ones left in scope when this cell finishes.

# 1) Plain Python: a list of points plus a "list comprehension".
linear_data = [1, 2, 3, 4, 5, 6, 7]
quadratic_data = [value * value for value in linear_data]

# 2) A Pandas Series: the power is applied to every element at once.
linear_data = pd.Series([1, 2, 3, 4, 5, 6, 7])
quadratic_data = linear_data ** 2

# 3) A numpy ndarray: squaring is likewise applied to every element
#    in one command.
linear_data = np.array([1, 2, 3, 4, 5, 6, 7])
quadratic_data = linear_data ** 2


# Three ways - same numeric results. The plot() command will accept anything
# that is list-like, i.e. can be indexed or looped over for its x/y values.

# The x and y collections can be of different types - if your y values ended
# up in a Python list and your x values are in a column of a Pandas dataframe,
# that's fine.

In [None]:
linear_data

In [None]:
quadratic_data

In [None]:
# Let's plot these together
plt.figure()
plt.plot(linear_data,quadratic_data);
# Try adding some markers:
# plt.plot(linear_data,quadratic_data,'d');
# Or changing the linestyle
# plt.plot(linear_data,quadratic_data,'-.');
# Or the color:
#  plt.plot(linear_data,quadratic_data,'r');
# Or all 3 at once:
#plt.plot(linear_data,quadratic_data,'rd-.');

In [None]:
plt.figure();
# Providing just 1 collection implies you are providing y-values
# and that the x-values go from 0 to the len(y_pts)-1.
# Plot the linear data then the quadratic data
plt.plot(linear_data, '-o');
plt.plot(quadratic_data, '--o');

In [None]:
# Same thing, but now let's add labels to the axes, a title, and a legend.
plt.figure();
plt.plot(linear_data, '-o');
plt.plot(quadratic_data, '--o');
plt.xlabel('Your x data');
plt.ylabel('Your y data');
plt.title('A title');

In [None]:
plt.legend(['Baseline', 'Competition']);

In [None]:
plt.plot([10, 15, 25], '--s');

In [None]:
plt.legend(['Baseline', 'Competition', 'us']);

In [None]:
plt.fill_between(range(len(linear_data)), 
                      linear_data, quadratic_data,
                      facecolor='blue',
                      alpha=0.1);

In [None]:
# plt.gca -->  "get current axes".  The axes refers to the x/y axes shown in the plot.
ax = plt.gca()
ax.axis([0,6,0,20])

In [None]:
ax.relim() 
ax.autoscale()

### Bar chart

In [None]:
plt.figure()

xvals = range(len(linear_data))
print(xvals)

plt.bar(xvals,linear_data, width=0.3);

In [None]:
# Shift every x position right by one bar width (0.3) so that bars drawn
# at the new positions sit beside, not on top of, bars drawn at xvals.
new_xvals = [x + 0.3 for x in xvals]

plt.bar(new_xvals, quadratic_data, width=0.3, color='red')

In [None]:
plt.figure();

plt.bar(xvals, linear_data, width=0.3, color='b')

plt.bar(xvals, quadratic_data, width=0.3, bottom=linear_data, color='r')

In [None]:
plt.figure();

plt.barh(xvals, linear_data, height=0.3, color='b')

plt.barh(xvals, quadratic_data, height=0.3, left=linear_data, color='r')

### Subplot

In [None]:
plt.figure()

# subplot with 1 row, 2 columns, and current axis is 1st subplot axes
plt.subplot(1, 2, 1)

plt.plot(linear_data, '-o')

In [None]:
plt.subplot(1,2,2);

plt.plot(quadratic_data, '-o');

In [None]:
plt.figure();

# subplot: (nrows, ncols, index)
# The index runs left to right across rows. It starts at 1.
ax1 = plt.subplot(1, 2, 1);

plt.plot(linear_data, '-o');

# pass sharey=ax1 to ensure the two subplots share the same y axis
ax2 = plt.subplot(1, 2, 2, sharey=ax1);

plt.plot(quadratic_data, '-x');

In [None]:
fig, ((ax1,ax2,ax3), (ax4,ax5,ax6), (ax7,ax8,ax9)) = plt.subplots(3, 3, sharex=True, sharey=True)


In [None]:
ax5.plot(linear_data, '-');

In [None]:
ax3.plot(quadratic_data, '-');

### Histograms

In [None]:
# Lay out a 2x2 grid of axes that share one x axis.
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, sharex=True)
axs = [ax1, ax2, ax3, ax4]

# Draw 10, 100, 1000 and 10000 samples from the normal distribution
# (mean 0, standard deviation 1) and show one histogram per axes.
for n, panel in enumerate(axs):
    # the sample size grows by a factor of ten for each panel
    sample_size = 10 ** (n + 1)
    sample = np.random.normal(loc=0.0, scale=1.0, size=sample_size)

    # histogram of this sample, titled with its size
    panel.hist(sample, bins=100)
    panel.set_title('n={}'.format(sample_size))

### Heatmaps (2d-histograms)

In [None]:
plt.figure()

X = np.random.random(size=10000)

Y = np.random.normal(loc=0.0,scale=1.0,size=10000)

plt.hist2d(X, Y, bins=100, cmap='rainbow');
## cmap: color map options
## https://matplotlib.org/3.2.1/tutorials/colors/colormaps.html

In [None]:
plt.colorbar();

### Animations
https://matplotlib.org/3.2.1/api/animation_api.html#animation

## Empty figure 
fig, ax = plt.subplots()
## Empty data
xdata, ydata = [], []
## Empty plot
ln, = plt.plot([], [], 'ro')

## you can preset the axis limits
#ax.set_xlim(0, 2*np.pi)
#ax.set_ylim(-1, 1)

def update(frame):
    """Animation callback: add one (frame, sin(frame)) point and redraw.

    Appends to the module-level ``xdata``/``ydata`` lists, passes the
    accumulated points to the line artist ``ln``, and rescales ``ax``.

    Returns a 1-tuple holding the updated line artist.
    """
    ## grow the data by exactly one point for this frame
    xdata.append(frame)
    ydata.append(np.sin(frame))

    ## hand all points collected so far to the line artist
    ln.set_data(xdata, ydata)

    ## recompute the data limits and rescale the axes to fit them
    ax.relim()
    ax.autoscale()
    return (ln,)

from matplotlib.animation import FuncAnimation

ani = FuncAnimation(fig, update, frames=np.linspace(0, 4*np.pi, 64))

ani.save('myAnimation.gif', writer='imagemagick', fps=30)

## Pandas plotting

https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.plot.html

Iris flower data set https://en.wikipedia.org/wiki/Iris_flower_data_set

In [None]:
import pandas as pd
# Load the US-wide health data.
big_df = pd.read_csv('https://raw.githubusercontent.com/bu-rcs/bu-rcs.github.io/main/Bootcamp/Data/USA_HealthData.csv')
# Discard the rows whose County value is NaN, as before.
big_df.dropna(subset=['County'], inplace=True)
# Remove the FIPS column for plotting convenience.
big_df.drop(columns='FIPS', inplace=True)
# Also as before, bring in the census regions and attach them by State.
reg_df = pd.read_csv('https://raw.githubusercontent.com/bu-rcs/bu-rcs.github.io/main/Bootcamp/Data/us_states_census_regions.csv')
big_df = pd.merge(left=big_df, right=reg_df, on='State')
big_df

### DataFrame.plot()

kind : str
    - 'line' : line plot (default)
    - 'bar' : vertical bar plot
    - 'barh' : horizontal bar plot
    - 'hist' : histogram
    - 'box' : boxplot
    - 'kde' : Kernel Density Estimation plot
    - 'density' : same as 'kde'
    - 'area' : area plot
    - 'pie' : pie plot
    - 'scatter' : scatter plot
    - 'hexbin' : hexbin plot

In [None]:
# x defaults to the indices 0 thru 3141
big_df.plot(y='% Fair or Poor Health')

In [None]:
big_df.plot?

In [None]:
big_df.plot();

In [None]:
# That's too many!  Filter down to a few:
big_df[['% Fair or Poor Health','% Severe Housing Problems','% Physically Inactive']].plot()

In [None]:
# Do a box plot of a few columns
cols=['% Fair or Poor Health','% Physically Inactive']
# Filter then call boxplot:
fig = big_df[cols].boxplot()
# OR
#fig = big_df.boxplot(column=cols)

# The extra circles are outlier points.

In [None]:
#dir(fig)

In [None]:
fig.set_xlabel('Health Statistics')
fig.set_ylabel('Percent')

In [None]:
#  Histogram plot
big_df.hist(column='% Smokers')

In [None]:
#  Histogram plot, grouped by states, after filtering for New England
new_eng = ['Massachusetts','New Hampshire','Connecticut','Rhode Island','Vermont','Maine']
big_df[big_df['State'].isin(new_eng)].hist(column='% Smokers', by='State')


In [None]:
# KDE plot - "Generate Kernel Density Estimate plot using Gaussian kernels."
# more detail: https://en.wikipedia.org/wiki/Kernel_density_estimation 

big_df['% Smokers'].plot.kde()

#### Plot results from data manipulation

In [None]:
big_df.head()

In [None]:
# Mean of some columns after grouping by region.
big_df.groupby('Region')[['% Excessive Drinking','% Adults with Obesity']].mean()

In [None]:
fig, ax = plt.subplots() # get a figure and an axes
big_df.groupby('Region')[['% Excessive Drinking','% Adults with Obesity']].mean().plot(kind='bar', ax=ax, rot=0);
ax.set_ylabel('%')
# Uncomment and re-run to fix those overlapping x labels
# fig.autofmt_xdate()

In [None]:
# Not all plotting functions are packed into a dataframe.  Let's plot a 
# scatter matrix for some of the columns
# First some random data just to show the plot
df = pd.DataFrame(np.random.randn(1000, 4), columns=['A','B','C','D'])
pd.plotting.scatter_matrix(df, alpha=0.2)

In [None]:
# Now let's try our data...
cols = ['% Excessive Drinking','% Adults with Obesity','% With Access to Exercise Opportunities','% Physically Inactive']
pd.plotting.scatter_matrix(big_df[cols]);

In [None]:
# There's a label formatting problem.  How do we fix it?
# By manipulating the axes AND by renaming the columns
new_cols = {'% Excessive Drinking': '%ed',
            '% Adults with Obesity': '%awo',
            '% With Access to Exercise Opportunities': '%watec',
            '% Physically Inactive': '%pi'}
sm = pd.plotting.scatter_matrix(big_df[cols].rename(columns=new_cols))

# sm is an array of axes; flatten it and tidy each one in turn.  A plain
# loop is used (rather than list comprehensions) because these calls are
# made only for their side effects, and it keeps the cell from echoing a
# list of return values.
for sub_ax in sm.reshape(-1):
    # Change label rotation
    sub_ax.xaxis.label.set_rotation(45)
    sub_ax.yaxis.label.set_rotation(90)
    # Hide all ticks
    sub_ax.set_xticks(())
    sub_ax.set_yticks(())

## Seaborn
https://seaborn.pydata.org/

In [None]:
import seaborn as sns

In [None]:
sns.pairplot(big_df[cols],diag_kind='kde', height=1.5);

### x vs y by seaborn

In [None]:
np.random.seed(1234)

v1 = pd.Series(np.random.normal(0,10,1000), name='v1')
v2 = pd.Series(2*v1 + np.random.normal(60,15,1000), name='v2')

In [None]:
plt.figure()
plt.hist(v1, alpha=0.7, bins=np.arange(-50,150,5), label='v1');
plt.hist(v2, alpha=0.7, bins=np.arange(-50,150,5), label='v2');
plt.legend();

In [None]:
plt.figure()
plt.scatter(v1,v2)

In [None]:
# Pass x and y by keyword: recent seaborn releases accept only `data`
# as a positional argument.
sns.jointplot(x=v1, y=v2, alpha=0.4);

In [None]:
# Pass x and y by keyword: recent seaborn releases accept only `data`
# as a positional argument.
sns.jointplot(x=v1, y=v2, kind='hex');

In [None]:
# set the seaborn style for all the following plots
sns.set_style('white')

# Pass x and y by keyword: recent seaborn releases accept only `data`
# as a positional argument.
sns.jointplot(x=v1, y=v2, kind='kde', space=0);

### Some Health Data Plots

In [None]:
fig, ax = plt.subplots()
# The Groupby makes the Region into the index values. reset_index() replaces it with
# numbers, makes Region into a column, and will let this plot.
sub_df = big_df.groupby(['Region'])[['% Smokers']].mean().reset_index()
sns.barplot(y='Region',x='% Smokers',data=sub_df, ax=ax)

In [None]:
# Bring the demographics data into our dataframe.
demo_df = pd.read_csv('https://raw.githubusercontent.com/bu-rcs/bu-rcs.github.io/main/Bootcamp/Data/USA_DemographicsData.csv')
# Discard the rows whose County value is NaN.
demo_df.dropna(subset=['County'], inplace=True)

# NOTE(review): the join key is the County name alone. If the same county
# name occurs in more than one state, this pairs those rows with each other;
# if demo_df also has a State column, merging on ['State', 'County'] is
# probably what is wanted - confirm against the CSV.
big_df = pd.merge(left=big_df, right=demo_df, on='County')
big_df.columns

In [None]:
# Seaborn histogram of life expectancy
plt.figure()
sns.histplot(big_df['Life Expectancy'])

In [None]:
# Plot again, this time with the median marked and labeled.
fig, ax = plt.subplots()
# Uncomment this line and comment the sns line to compare plotting styles
#big_df.hist('Life Expectancy', ax=ax, bins=np.arange(60, 100, 0.5))
sns.histplot(big_df['Life Expectancy'], ax=ax, bins=np.arange(60, 100, 0.5))
med_age = big_df['Life Expectancy'].median()
# How high to plot the median line?  Let's query the axes for
# the y limits, returned as a (bottom, top) pair:
ylim = ax.get_ylim()
# Attach the label to the line itself. Passing only a list of labels to
# ax.legend() matches them to the artists on the axes in order, so the
# label would land on the histogram instead of this line.
ax.plot([med_age, med_age], [0, ylim[1]], 'r--', label='Median age')
ax.set_xlabel('age')
ax.set_ylabel('Count of counties')
ax.set_title('Life Expectancy')
# Set the y axis to the range it was at before we added the median line
ax.set_ylim(ylim)
ax.set_xlim([60, 100])
# With no arguments, the legend shows only artists that carry a label.
ax.legend()


In [None]:
ax.get_ylim()