In [1]:
import copy

import datashader as ds
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
from IPython.display import HTML, IFrame, Image
from matplotlib import rcParams, cm
from scipy import stats

%matplotlib inline

rcParams['grid.linewidth'] = 0

# Visualizing Big Datasets: Tools, Pitfalls, Experimental Example

### Importance of Data Visualization - NYC Taxi pick-up data

<img src="./images/nyc_pickups_vs_dropoffs.jpg"  width="900" height="900"/>

# Plotting very large datasets meaningfully



- to provide clear understanding
- to aid decision making 

### We can always look at statistics, but with a big dataset a visualization gives us a good feeling for the data!

When working with large datasets, visualizations are often the only way available to understand the properties of that dataset -- there are simply too many data points to examine each one!  Thus it is very important to be aware of some common plotting problems that are minor inconveniences with small datasets but very serious problems with larger ones.

<img src="./images/6-blind-men-hans.jpg"  width="1100" height="1100"/>

## Visualization Bias: Same Data, Different Visualization
<img src="./images/chart_comb.png"  width="1000" height="1000"/>
<img src="./images/chart_comb2.png"  width="1000" height="1000"/>

<img src="./images/index.jpeg"  width="700" height="700"/> 
<img src="./images/python-graph-gallery1-609.jpg"  width="1000" height="1000"/>
https://python-graph-gallery.com/

# Presentation Outline:
- Tools introduction
- Ratcave VR Acuity introduction
- Pitfalls of Large Dataset Visualization with a Real Data Example

# Tools: Pandas, Seaborn, Datashader

![Image](./images/pandas.png)

Python library providing high-performance data manipulation and analysis tools using powerful data structures

- DataFrame objects, with customized and default indexing
- Loading data into in-memory data objects
- Data alignment and handling of missing data
- Reshaping, operations on columns
- Grouping, merging and joining of data
- Time Series functionality.

In [None]:
import pandas as pd

### Creating structured dataset

In [None]:
# Draw from a dedicated, seeded generator so the demo frame (and every plot
# built from it) is identical on each "Restart & Run All".
RANDOM_SEED = 0
rng = np.random.RandomState(RANDOM_SEED)

# 500 rows of standard-normal samples in three columns named X, Y and Z.
data = pd.DataFrame(rng.randn(500, 3), columns=list('XYZ'))
data.head()

###  Operations on columns

In [None]:
# Element-wise scaling of the X column by a factor of 20.
# The column is overwritten in place, so re-running this cell scales it again.
data['X'] = data['X'] * 20
data.head()

### Adding new column: label

In [None]:
# Five class ids. np.repeat without an `axis` flattens its input and repeats
# each element 100 times in turn, giving 5 * 100 = 500 labels.
x = np.array([[1, 2, 3, 4, 5]])
labels = np.repeat(x, 100)
data['label'] = labels

data.head()


<img src='./images/seaborn.png'  width="1000" height="1000"/>


Python visualization library based on matplotlib. It provides a high-level interface for drawing attractive statistical graphics.

- Visualizing univariate and bivariate distributions and comparing them between subsets of data
- Tools that fit and visualize linear regression models
- Functions that visualize matrices of data
- Several built-in themes for styling matplotlib graphics
- Tools for choosing color palettes - to reveal patterns in your data

In [None]:
import seaborn as sns

In [None]:
# `size` was renamed to `height` in seaborn 0.9 and is no longer accepted by
# current releases; `height` is the height of each facet in inches.
g = sns.FacetGrid(data=data, height=8)
g.map(plt.scatter, 'X', 'Z');

In [None]:
# Grid of pairwise plots for the X, Y and Z columns; the trailing ';'
# suppresses the text repr of the returned grid object.
sns.pairplot(data=data, vars=['X', 'Y', 'Z'] );

In [None]:
# Adding hue: the same pairwise grid, with the points grouped by the
# `label` column through the `hue` argument.
sns.pairplot(data=data, vars=['X', 'Y', 'Z'], hue='label');

<img src='./images/datashader.png'  width="700" height="700"/>

# Datashader
Datashader is a Python library for analyzing and visualizing large datasets. It is designed to **"rasterize"** or **"aggregate"** datasets into regular grids that can be viewed as images.
Datashader breaks the creation of images of data into 3 main steps:

1. Projection - Each record is projected into zero or more bins of a regular grid.

2. Aggregation - Reductions are computed for each bin, compressing the potentially large dataset into a much smaller aggregate array.

3. Transformation - These aggregates are then further processed, eventually creating an image.


## Ratcave Virtual Reality: Acuity Measurement Project
<img src='./images/rat1.gif'  width="900" height="900" align="center"/>

# Dataset explanation: position and orientation
<img src='./images/position.png'  width="400" height="500" align="center"/>
<img src="./images/spherical_coordinates.png"  width="500" height="500" align="center"/>

## What are we looking for: Stimuli Related Behavior
<img src='./images/ratSRB.gif'  width="900" height="900" align="center"/>

# Pitfalls of Large Dataset Visualization
## Goal: Reveal the Underlying Dataset

## Loading The Dataset
Another feature of pandas in practice. 

In [None]:
# Directory prefix for the HDF5 files. It stays a plain string ending in '/'
# because file names are built by string concatenation (`path + ...`).
path = 'data/'
database_file = path + 'relationalDatabase.h5'

# Read two tables from the same file, one per key, and promote each table's
# 'index' column to the row index.
dfrat = pd.read_hdf(database_file, 'Rat_Behavior').set_index('index')
dfevent = pd.read_hdf(database_file, 'Events').set_index('index')

dfrat.head(5)

In [None]:
# Summary statistics of the columns of the behaviour table.
dfrat.describe()

## Taking the subsection - for the sake of plotting speed

In [None]:
# Boolean row selector: True where the sample belongs to session 0.
mask = dfrat['session_id'].eq(0)
rat = dfrat.loc[mask]
rat.head()

In [None]:
# Boolean row selector: True where the sample belongs to session 1 or 2.
mask2 = dfrat['session_id'].isin([1, 2])
rat2 = dfrat.loc[mask2]

rat2.head(2)

In [None]:
# Plain scatter of the session-0 positions (X_Pos against Z_Pos).
# `height` replaces the `size` keyword, which seaborn renamed in 0.9 and
# no longer accepts in current releases.
g = sns.FacetGrid(data=rat, height=8)
g.map(plt.scatter, 'X_Pos', 'Z_Pos');

## Overplotting 

- 2 x 2D subsets - two separate experiments, plotted together

In [None]:
# Both sessions in one axes, one colour per session_id.
# `height` replaces the `size` keyword, which seaborn renamed in 0.9 and
# no longer accepts in current releases.
g = sns.FacetGrid(data=rat2, height=8, hue='session_id')
g.map(plt.scatter, 'X_Pos', 'Z_Pos');
plt.legend();

In [None]:
# Same data with the drawing order reversed: session 2 is plotted first and
# session 1 on top of it.
# The colours are passed through `palette` (one entry per hue level, in
# `hue_order`). The original used `col_order`, which orders column facets and
# is ignored when no `col` variable is given, so the colours never applied.
# `height` replaces the `size` keyword renamed in seaborn 0.9.
g = sns.FacetGrid(data=rat2, height=8, hue='session_id', hue_order=[2, 1],
                  palette=['orange', 'blue'])
g.map(plt.scatter, 'X_Pos', 'Z_Pos');
plt.legend();

## Appearance depends on which one is plotted first!
### Occlusion of Data - points plotted on top of one another

# Oversaturation 

Reduce the problem of overplotting by using the transparency parameter.

**Alpha** - if we set alpha=0.1, we need ten overlapping points for the color to saturate

In [None]:
# Session-0 scatter with 10% opacity per point.
# `height` replaces the `size` keyword, which seaborn renamed in 0.9 and
# no longer accepts in current releases.
g = sns.FacetGrid(data=rat, height=8)
g.map(plt.scatter, 'X_Pos', 'Z_Pos', alpha=0.1);

In [None]:
# Session-0 scatter with 1% opacity per point.
# `height` replaces the `size` keyword, which seaborn renamed in 0.9 and
# no longer accepts in current releases.
g = sns.FacetGrid(data=rat, height=8)
g.map(plt.scatter, 'X_Pos', 'Z_Pos', alpha=0.01);

+ reduced the effect of overlapping points
- but it is now harder to see the individual points
- we can still see the oversaturation problem in every spot where more than 1/alpha points overlap

## What else can we do?

## Add another Hyperparameter:    point size 
### And then fine tune both transparency and point size!

In [None]:
# Random subset of 20000 session-0 rows, drawn with small, half-transparent
# markers. `random_state` pins the subset so the figure is the same on every run.
# `height` replaces the `size` keyword renamed in seaborn 0.9.
g = sns.FacetGrid(data=rat.sample(20000, random_state=0), height=8)
g.map(plt.scatter, 'X_Pos', 'Z_Pos', s=0.9, alpha=0.5);

In [None]:
# Both sessions, coloured by session_id, with small markers at 10% opacity.
# `height` replaces the `size` keyword, which seaborn renamed in 0.9 and
# no longer accepts in current releases.
g = sns.FacetGrid(data=rat2, height=8, hue='session_id')
g.map(plt.scatter, 'X_Pos', 'Z_Pos', s=0.9, alpha=0.1);
plt.legend();

## Undersampling

In [None]:
# Same plot on a random subset of 40000 rows from both sessions.
# `random_state` pins the subset so the figure is the same on every run.
# `height` replaces the `size` keyword renamed in seaborn 0.9.
g = sns.FacetGrid(data=rat2.sample(40000, random_state=0), height=8, hue='session_id')
g.map(plt.scatter, 'X_Pos', 'Z_Pos', s=0.9, alpha=0.1);
plt.legend();

## We got better results, but the choice of the two hyperparameters strongly depends on the dataset!

# Heatmaps: Undersaturation


In [None]:
from matplotlib import colors

def make_heatmap(x, y, bins=20, cmap='viridis', norm=None, **kwargs):
    """Draw a 2D histogram of ``(x, y)`` as an image on the current axes.

    Parameters
    ----------
    x, y : array-like
        Coordinates to bin. ``x`` is shown along the horizontal axis and
        ``y`` along the vertical axis.
    bins : int or sequence, default 20
        Passed straight to ``np.histogram2d``.
    cmap : str or Colormap, default 'viridis'
        Colormap handed to ``plt.imshow``.
    norm : matplotlib.colors.Normalize, optional
        Colour normalisation. When omitted, a fresh ``colors.Normalize()`` is
        created for every call. A single instance shared through the default
        argument would autoscale on the first image only and then reuse that
        vmin/vmax for every later heatmap.
    **kwargs
        Accepted and ignored, so that callers which forward extra keywords
        (e.g. the ones ``FacetGrid.map`` adds) do not raise.

    Returns
    -------
    The image object returned by ``plt.imshow``.
    """
    if norm is None:
        norm = colors.Normalize()
    hist = np.histogram2d(x, y, bins=bins)[0]
    # histogram2d bins x along the first (row) axis, while imshow draws rows
    # vertically; transpose so x runs horizontally, matching the axis labels.
    return plt.imshow(hist.T, cmap=cmap, origin='lower', norm=norm)

In [None]:
# Coarse 10-bin heatmap of all recorded positions.
# `height` replaces the `size` keyword, which seaborn renamed in 0.9 and
# no longer accepts in current releases.
g = sns.FacetGrid(data=dfrat, height=8)
g.map(make_heatmap, 'X_Pos', 'Z_Pos', bins=10);

In [None]:
# 20-bin heatmap of the X and Z orientation columns.
# `height` replaces the `size` keyword, which seaborn renamed in 0.9 and
# no longer accepts in current releases.
g = sns.FacetGrid(data=dfrat, height=8)
g.map(make_heatmap, 'X_Ori', 'Z_Ori', bins=20);

# Color Palette
https://seaborn.pydata.org/tutorial/color_palettes.html
<img src="./images/fire_rainbow.png"  width="500" height="500"/>


In [None]:
# Orientation heatmap with an explicit colormap.
# Presumably alternative `cmap` values to try here: viridis_r, rainbow, icefire_r
# `height` replaces the `size` keyword, which seaborn renamed in 0.9 and
# no longer accepts in current releases.
g = sns.FacetGrid(data=dfrat, height=8);
g.map(make_heatmap, 'X_Ori', 'Z_Ori', cmap='viridis', bins=20);

# Taking it a step further!

In [None]:
# Rasterise onto a 900 x 900 grid: for every pixel, take the mean of the
# Z_Ori / X_Ori values of the samples whose (X_Pos, Z_Pos) falls into it.
cvs = ds.Canvas(plot_height=900, plot_width=900)
agg_z = cvs.points(source=dfrat, x='X_Pos', y='Z_Pos', agg=ds.mean('Z_Ori'))
agg_x = cvs.points(source=dfrat, x='X_Pos', y='Z_Pos', agg=ds.mean('X_Ori'))

# Scale the mean (x, z) vector of each pixel to unit length; the length is
# computed once and reused for both components.
norm_xz = np.sqrt(agg_x ** 2 + agg_z ** 2)
agg_xn = agg_x / norm_xz
agg_zn = agg_z / norm_xz

# Angle of the mean orientation vector, in radians within [-pi, pi].
theta = np.arctan2(agg_zn, agg_xn)

# This figure uses matplotlib's default colormap. The former
# `cm.hsv.set_bad('black')` call was dropped because it only mutated the global
# hsv colormap, which is not used for this image.
# NOTE(review): imshow's default origin is 'upper'; confirm that the vertical
# orientation of the image matches the arena.
plt.figure(figsize=(10, 10))
plt.imshow(theta)
plt.xticks([0, 10])
plt.colorbar();

In [None]:
# Rasterise onto a 900 x 900 grid: for every pixel, take the mean of the
# Z_Ori / X_Ori values of the samples whose (X_Pos, Z_Pos) falls into it.
cvs = ds.Canvas(plot_height=900, plot_width=900)
agg_z = cvs.points(source=dfrat, x='X_Pos', y='Z_Pos', agg=ds.mean('Z_Ori'))
agg_x = cvs.points(source=dfrat, x='X_Pos', y='Z_Pos', agg=ds.mean('X_Ori'))

# Scale the mean (x, z) vector of each pixel to unit length; the length is
# computed once and reused for both components.
norm_xz = np.sqrt(agg_x ** 2 + agg_z ** 2)
agg_xn = agg_x / norm_xz
agg_zn = agg_z / norm_xz

# Angle of the mean orientation vector, in radians within [-pi, pi].
theta = np.arctan2(agg_zn, agg_xn)

# Work on a copy of hsv so the globally registered colormap is left untouched;
# masked / NaN pixels are drawn in black.
hsv_cmap = copy.copy(cm.hsv)
hsv_cmap.set_bad('black')

# Draw the image once. The earlier extra `plt.imshow(theta)` with the default
# colormap was fully overpainted by this call and has been removed.
# NOTE(review): imshow's default origin is 'upper'; confirm that the vertical
# orientation of the image matches the arena.
plt.figure(figsize=(10, 10))
plt.imshow(theta, cmap=hsv_cmap)
plt.xticks([0, 10])
plt.colorbar();

<img src='./images/1.png'  width="1000" height="1000" align="center"/>

In [None]:
# Load the 'Rat_Behavior' table from SRB.h5 and aggregate it on the canvas
# `cvs` created in an earlier cell.
SRB = pd.read_hdf(path + 'SRB.h5', 'Rat_Behavior').set_index('index')
agg_z = cvs.points(source=SRB, x='X_Pos', y='Z_Pos', agg=ds.mean('Z_Ori'))
agg_x = cvs.points(source=SRB, x='X_Pos', y='Z_Pos', agg=ds.mean('X_Ori'))

# Scale the mean (x, z) vector of each pixel to unit length; the length is
# computed once and reused for both components.
norm_xz = np.sqrt(agg_x ** 2 + agg_z ** 2)
agg_xn = agg_x / norm_xz
agg_zn = agg_z / norm_xz

# Angle of the mean orientation vector, in radians within [-pi, pi].
theta = np.arctan2(agg_zn, agg_xn)

# Work on a copy of hsv so the globally registered colormap is left untouched;
# masked / NaN pixels are drawn in black.
hsv_cmap = copy.copy(cm.hsv)
hsv_cmap.set_bad('black')

# NOTE(review): imshow's default origin is 'upper'; confirm that the vertical
# orientation of the image matches the arena.
plt.figure(figsize=(10, 10))
plt.imshow(theta, cmap=hsv_cmap)
plt.xticks([0, 10])
plt.colorbar();

# Summary:
## What should we keep in mind, when working with big datasets:
### Scatter plots - 3 hyperparameters:
- overplotting  - avoid obscuring the data
- saturation    - with alpha=0.1, spots with 10, 100 or 6000 overlapping points all look the same once the 10-point limit is reached
- undersampling - taking a subset might not be the answer

### Heatmaps - 1 hyperparameter:
- undersaturation 
- pick the color map in accordance with the kind of data being shown (e.g. a cyclic map such as hsv for angles)


# Extra Examples

## Ratcave Arena
<img src='./images/rat_control.png'  width="900" height="900" align="center"/>

## Color Rearing
<img src='./images/color_rearing.png'  width="1000" height="1000" align="center"/>

# Thank you for your attention!

<img src="./images/pydata.png"  width="600" height="600"/>

<img src="./images/lmulogo3.png"  width="500" height="500"/>