# Lecture 10
# Section 2: Data Visualization with Pandas ... and Seaborn

## Styles

In [None]:
import matplotlib.pyplot as plt

# Show every style sheet known to this matplotlib installation.
print(plt.style.available)

# matplotlib 3.6 renamed its bundled seaborn styles with a "seaborn-v0_8-"
# prefix and 3.8 removed the old names, so a hard-coded 'seaborn-whitegrid'
# raises OSError on current releases. Use whichever spelling is installed,
# preferring the new one; if neither exists the default style stays active.
for style_name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid'):
    if style_name in plt.style.available:
        plt.style.use(style_name)
        break

# Default (width, height) in inches for figures created from here on.
plt.rcParams["figure.figsize"] = (12,6)

## Basic plot

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Seed the global NumPy RNG so the random walks below are reproducible.
np.random.seed(123)

# One row per day: 365 consecutive dates starting on 27 April 2017.
dates = pd.date_range('04/27/2017', periods=365)

# Three random walks (cumulative sums of standard-normal steps), drawn in the
# order A, B, C because the draw order determines the values. B and C are
# shifted by +20 and -20.
walk_a = np.random.randn(365).cumsum(0)
walk_b = np.random.randn(365).cumsum(0) + 20
walk_c = np.random.randn(365).cumsum(0) - 20
df = pd.DataFrame({'A': walk_a, 'B': walk_b, 'C': walk_c}, index=dates)

# Called without arguments, plot() draws every column as a line over the index.
df.plot()

### Basic plot: Scatter

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

np.random.seed(123)  # fixed seed, so the same points appear on every run

# Three 365-step random walks. The dict literal is evaluated A, then B, then C,
# which fixes the order in which random numbers are consumed.
columns = {
    'A': np.cumsum(np.random.randn(365), 0),
    'B': np.cumsum(np.random.randn(365), 0) + 20,
    'C': np.cumsum(np.random.randn(365), 0) - 20,
}
df = pd.DataFrame(columns, index=pd.date_range('04/27/2020', periods=365))

# Scatter plot with column A on the x axis and column B on the y axis.
df.plot(x='A', y='B', kind='scatter')

## Scatterplot

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

np.random.seed(123)

# Random-walk measurements, drawn in the order Length, Width, Height.
length = np.random.randn(365).cumsum(0)
width = np.random.randn(365).cumsum(0) + 20
height = np.random.randn(365).cumsum(0) - 20
df = pd.DataFrame(
    {'Length': length, 'Width': width, 'Height': height},
    index=pd.date_range('04/27/2020', periods=365),
)

# Length vs. Height, with each point coloured by its Width (viridis colormap).
ax = df.plot.scatter(x='Length', y='Height', c='Width', colormap='viridis')
# Use the same scale on both axes so distances are not distorted.
ax.set_aspect("equal")
plt.show()

## 3D Scatterplot

* Same data, but as a 3D plot
* We had to go back to using `matplotlib` directly for this
* `pandas` does not allow us to create 3D scatterplots directly

In [None]:
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd  # this cell builds a DataFrame, so import pandas here too

np.random.seed(123)
df = pd.DataFrame({'Length': np.random.randn(365).cumsum(0), 
                   'Width': np.random.randn(365).cumsum(0) + 20,
                   'Height': np.random.randn(365).cumsum(0) - 20}, 
                  index=pd.date_range('04/27/2020', periods=365))

fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')

# Call scatter on the 3D axes itself. pyplot.scatter's third positional
# parameter is the marker size `s`, so going through plt.scatter would use
# Width as marker sizes and leave every point at the default z=0.
ax.scatter(df["Length"].values, df["Height"].values, df["Width"].values,
           c=df["Width"].values, cmap='viridis')

# Axis labels match the argument order above: x=Length, y=Height, z=Width.
ax.set_xlabel('Length')
ax.set_ylabel('Height')
ax.set_zlabel('Width')
plt.show()

## Boxplot

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

np.random.seed(123)

# Build the columns one at a time; insertion order (Length, Width, Height)
# is also the order in which the random numbers are drawn.
measurements = {}
measurements['Length'] = np.random.randn(365).cumsum(0)
measurements['Width'] = np.random.randn(365).cumsum(0) + 20
measurements['Height'] = np.random.randn(365).cumsum(0) - 20
df = pd.DataFrame(measurements, index=pd.date_range('04/27/2020', periods=365))

# One box per column.
df.plot(kind='box')
plt.show()

# Seaborn
* powerful visualization package
* builds upon Matplotlib but is at a higher level
* works nicely with Pandas
* has _endless_ capabilities
* https://seaborn.pydata.org/

## Scatterplot

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

np.random.seed(123)

# Daily index for one year starting 27 April 2020.
days = pd.date_range('04/27/2020', periods=365)

# Random-walk columns; the dict literal is evaluated top to bottom, so the
# RNG is consumed for Length, then Width, then Height.
df = pd.DataFrame(
    {
        'Length': np.random.randn(365).cumsum(0),
        'Width': np.random.randn(365).cumsum(0) + 20,
        'Height': np.random.randn(365).cumsum(0) - 20,
    },
    index=days,
)

# Seaborn scatter plot of Width against Length.
x_values = df["Length"]
y_values = df["Width"]
sns.scatterplot(x=x_values, y=y_values)
plt.show()

## KDE plot

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

np.random.seed(123)

# Three random walks, drawn in the order Length, Width, Height.
length = np.random.randn(365).cumsum(0)
width = np.random.randn(365).cumsum(0) + 20
height = np.random.randn(365).cumsum(0) - 20
df = pd.DataFrame({'Length': length, 'Width': width, 'Height': height},
                  index=pd.date_range('04/27/2020', periods=365))

# Appearance options for the density curves: filled, semi-transparent,
# no outline, colours from the "crest" palette, and no common normalisation
# across the columns.
kde_style = dict(fill=True, common_norm=False, palette="crest",
                 alpha=.5, linewidth=0)

# Passing the whole frame plots one density estimate per column.
sns.kdeplot(data=df, **kde_style)
plt.show()

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

np.random.seed(123)

# Random-walk columns, created in the order Length, Width, Height.
walks = {
    'Length': np.random.randn(365).cumsum(0),
    'Width': np.random.randn(365).cumsum(0) + 20,
    'Height': np.random.randn(365).cumsum(0) - 20,
}
df = pd.DataFrame(walks, index=pd.date_range('04/27/2020', periods=365))

# Bivariate density of Length vs. Width, drawn with seaborn's default options.
# Options to try for a different look (left disabled here):
#   fill=True, thresh=0, levels=100, cmap="mako"
length_series = df["Length"]
width_series = df["Width"]
sns.kdeplot(x=length_series, y=width_series)
plt.show()

## Scatterplot with regression curve

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

np.random.seed(123)

# Same synthetic random walks as before; draw order is Length, Width, Height.
length = np.random.randn(365).cumsum(0)
width = np.random.randn(365).cumsum(0) + 20
height = np.random.randn(365).cumsum(0) - 20
df = pd.DataFrame(
    {'Length': length, 'Width': width, 'Height': height},
    index=pd.date_range('04/27/2020', periods=365),
)

# Start a fresh figure, then draw the scatter with a 3rd-order polynomial fit.
plt.figure()
sns.regplot(x=df["Length"], y=df["Width"], order=3)

# Write the figure to disk before showing it, as a PNG at 800 dpi.
plt.savefig("regplot.png", format='png', dpi=800)
plt.show()

## Jointplot

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

np.random.seed(123)

# Daily index and three random-walk columns (RNG consumed in dict order).
year_of_days = pd.date_range('04/27/2020', periods=365)
df = pd.DataFrame(
    {
        'Length': np.random.randn(365).cumsum(0),
        'Width': np.random.randn(365).cumsum(0) + 20,
        'Height': np.random.randn(365).cumsum(0) - 20,
    },
    index=year_of_days,
)

# Joint plot of Length vs. Width using the regression kind, with a
# 3rd-order polynomial fit.
x_col = df["Length"]
y_col = df["Width"]
sns.jointplot(x=x_col, y=y_col, kind='reg', order=3)
plt.show()

## Large scatterplot with regression curve

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Note the different seed (103) and the much larger sample of 10,000 steps.
np.random.seed(103)
length_walk = np.random.randn(10000).cumsum(0)
width_walk = np.random.randn(10000).cumsum(0)
df = pd.DataFrame({'Length': length_walk, 'Width': width_walk})

# Small, faint black markers keep the dense point cloud readable.
marker_style = {'s':2,'color':'black', 'alpha':0.1}

# Scatter plus a 5th-order polynomial fit.
sns.regplot(x=df["Length"], y=df["Width"], order=5, scatter_kws=marker_style)

# Save as an 800 dpi PNG before displaying.
plt.savefig("regplot2.png", format='png', dpi=800)
plt.show()

## Large jointplot

In [None]:
# This cell defines nothing itself: it uses `df`, `sns` and `plt` as left in
# the kernel by an earlier cell.
# Very small black markers for the scatter part of the joint plot.
dot_style = {'s':0.1,'color':'black'}
sns.jointplot(x=df["Length"], y=df["Width"], kind='reg', order=5,
              scatter_kws=dot_style)
plt.show()

## Boxplot

In [None]:
# plt.show() is called below, so pyplot must be imported in this cell; with
# the import commented out the cell only worked through state left behind by
# earlier cells.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

np.random.seed(123)
df = pd.DataFrame({'Length': np.random.randn(365).cumsum(0), 
                   'Width': np.random.randn(365).cumsum(0) + 20,
                   'Height': np.random.randn(365).cumsum(0) - 20}, 
                  index=pd.date_range('04/27/2020', periods=365))

# Passing the whole frame draws one box per column.
sns.boxplot(data=df)
plt.show()

## What about some regression analysis (beyond just visualizing data)?

In [None]:
import numpy as np
import pandas as pd
import statsmodels.api as sm

np.random.seed(123)

# Synthetic random-walk data; values are drawn in the order Length, Width, Height.
length = np.random.randn(365).cumsum(0)
width = np.random.randn(365).cumsum(0) + 20
height = np.random.randn(365).cumsum(0) - 20
df = pd.DataFrame({'Length': length, 'Width': width, 'Height': height},
                  index=pd.date_range('04/27/2020', periods=365))

# Height is modelled with linear and squared terms in Length and Width,
# plus the Length-by-Width interaction.
formula = 'Height ~ Length + I(Length**2) + Width + I(Width**2) + Length:Width'
model = sm.OLS.from_formula(formula, df)
res = model.fit()

# Print the fitted model's summary.
print(res.summary())