# 第八讲  绘图和可视化

In [None]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt

# Notebook-wide defaults: remember pandas' current row limit, shorten
# DataFrame output to 20 rows, and seed NumPy's global random generator.
PREVIOUS_MAX_ROWS = pd.get_option('display.max_rows')
pd.set_option('display.max_rows', 20)
np.random.seed(12345)
np.set_printoptions(precision=10, suppress=True)

# Default figure size (inches) and line appearance for the plots that follow.
plt.rc('figure', figsize=(20, 10))
plt.rc('lines', linewidth=2, color='red')


In [None]:
%matplotlib inline

##  1. matplotlib API 入门

* matplotlib是一个用于生成出版级质量图表（通常是二维）的桌面绘图包
* 在Python环境下进行Matlab风格的绘图

从一个简单的例子开始：

In [None]:
# 50 random integers drawn from the half-open range [0, 100).
data = np.random.randint(low=0, high=100, size=50)
data

In [None]:
# Solid green line with circle markers.
# The line width is a number (the documented type) rather than the string '2'.
plt.plot(data, lw=2, color='g', marker='o', linestyle='solid')

### (1) 图与子图

* matplotlib所绘制的图位于Figure对象中。

* 可以使用plt.figure生成一个新的图片

* 可以使用add_subplot创建一个或多个子图. 
* add_subplot返回的对象是AxesSubplot对象。使用这些对象，可以直接在子图上调用对象的实例方法进行绘图

In [None]:
# Create an empty figure and lay out a 2x2 grid of subplots on it.
fig = plt.figure()

ax1 = fig.add_subplot(2, 2, 1)
ax2 = fig.add_subplot(2, 2, 2)
ax3 = fig.add_subplot(2, 2, 3)
# Bound to ax4: the original rebound ax3 here and lost the handle to subplot 3.
ax4 = fig.add_subplot(2, 2, 4)

* 【使用Jupyter Notebook时，因为每个cell运行后，图表会被重置，所以应将绘图命令放在同一个cell中】

In [None]:
fig = plt.figure()

In [None]:
ax1 = fig.add_subplot(2, 2, 1)

In [None]:
# Drawn from a different cell than the one that created the figure.
# The line width is a number rather than the string '2'.
ax1.plot(data, lw=2, color='g', marker='o', linestyle='solid')

将所有代码放在同一个cell中

In [None]:
# Build the whole figure inside one cell so every drawing call targets it.
plt.close('all')
fig = plt.figure()

# 2x2 grid of subplots, addressed by 1-based position.
ax1 = fig.add_subplot(2, 2, 1)
ax2 = fig.add_subplot(2, 2, 2)
ax3 = fig.add_subplot(2, 2, 3)
ax4 = fig.add_subplot(2, 2, 4)

gaussian = np.random.randn(100000)

# plt.plot draws on the most recently created subplot (ax4 here).
plt.plot(gaussian.cumsum(), 'k--')

# Bind the return value to _ so hist's arrays are not echoed as cell output.
_ = ax1.hist(gaussian, bins=50, color='r', alpha=0.3)
ax2.scatter(np.arange(30), np.arange(30) + 3 * np.random.randn(30))
# The line width is a number rather than the string '2'.
ax3.plot(data, lw=2, color='g', marker='o', linestyle='solid')

In [None]:
plt.close('all')

* subplots生成子图对象的NumPy数组

* axes可以像二维数组那样进行索引，例如， axes[0,1]

In [None]:
# 2x3 grid of subplots sharing both axes; axes is indexed like a 2-D array.
fig, axes = plt.subplots(2, 3, sharey=True, sharex=True)
# Draw on the top-right subplot. The line width is a number rather than the string '2'.
axes[0, 2].plot(data, lw=2, color='g', marker='o', linestyle='solid')

#### 调整子图周围的间距

* 使用subplots_adjust方法来更改间距

subplots_adjust(left=None, bottom=None, right=None, top=None,
                wspace=None, hspace=None)

In [None]:
fig, axes = plt.subplots(2, 2, sharex=True, sharey=True)
# axes.flat visits the 2x2 grid row by row, the same order as nested i/j loops.
for panel in axes.flat:
    panel.hist(np.random.randn(500), bins=50, color='r', alpha=0.5)
# Shrink the horizontal gap and remove the vertical gap between subplots.
plt.subplots_adjust(wspace=0.05, hspace=0)

### （2）颜色、标记和线型

* plot函数接收一些可选字符串缩写来指明颜色和线类型

ax.plot(x, y, 'g--')

* 也可以更显式地指定

ax.plot(x, y, linestyle='--', color='g')

* 很多字母缩写用于常用颜色。当然，也可以使用十六进制的颜色代码（'#CE2B1A'）

In [None]:
# Colour given by its one-letter abbreviation.
# The line width is a number rather than the string '2'.
plt.plot(data, lw=2, color='g', marker='o', linestyle='solid')

In [None]:
plt.plot(data,'y--')

In [None]:
# Colour given as a hexadecimal code.
# The line width is a number rather than the string '2'.
plt.plot(data, lw=2, color='#CE2B1A', marker='o', linestyle='solid')

*  同时使用点和线

In [None]:
from numpy.random import randn
plt.plot(randn(30).cumsum(), 'bo--')

In [None]:
from numpy.random import randn
plt.plot(randn(30).cumsum(), color='#CE2B1A', marker='o', linestyle='dashed')


* 折线的插值 （通过drawstyle指定）

In [None]:
plt.close('all')

In [None]:
# One random walk drawn twice: with the default interpolation and as
# a step function.
data = np.cumsum(np.random.randn(30))
plt.plot(data, color='b', linestyle='--', label='Default')
plt.plot(data, color='g', linestyle='-', drawstyle='steps-post',
         label='steps-post')
plt.legend(loc='best')

### （3）刻度、标签和图例

In [None]:
# Single-axes figure showing a 1000-step random walk with default ticks.
fig, ax = plt.subplots()
ax.plot(np.random.randn(1000).cumsum())

* 使用set_xticks改变刻度

In [None]:
fig, ax = plt.subplots()
ax.plot(np.cumsum(np.random.randn(1000)))
# Place x ticks every 250 steps instead of using the automatic locations.
ticks = ax.set_xticks([0, 250, 500, 750, 1000])


* 使用set_xticklabels为标签设置名称

In [None]:
fig, ax = plt.subplots()
ax.plot(np.cumsum(np.random.randn(1000)))
ticks = ax.set_xticks([0, 250, 500, 750, 1000])
# One text label per tick position, rotated 30 degrees and in a small font.
labels = ax.set_xticklabels(
    ['one', 'two', 'three', 'four', 'five'],
    rotation=30,
    fontsize='small',
)

* 设置标题和轴标签

In [None]:
fig, ax = plt.subplots()
ax.plot(np.cumsum(np.random.randn(1000)))
ticks = ax.set_xticks([0, 250, 500, 750, 1000])

# Title and axis labels are set one call at a time.
ax.set_title('Random walking')
ax.set_xlabel('step')
ax.set_ylabel('distance')

* 也可以用set方法批量设置绘图属性

In [None]:
fig, ax = plt.subplots()
ax.plot(np.cumsum(np.random.randn(1000)))
ticks = ax.set_xticks([0, 250, 500, 750, 1000])

# Collect several axes properties and apply them in a single set() call.
props = dict(title='Random walking', xlabel='step', ylabel='distance')

ax.set(**props)

#### 添加图例

* 使用label属性和legend

In [None]:
from numpy.random import randn

fig, ax = plt.subplots()
# Three random walks, each with its own format string and legend label.
for fmt, name in [('r.-', 'one'), ('go--', 'two'), ('b*-', 'three')]:
    ax.plot(randn(50).cumsum(), fmt, label=name)
ax.legend(loc='right')

In [None]:
from numpy.random import randn

fig, ax = plt.subplots()
# Same three series as before; loc='best' leaves the legend position to matplotlib.
for fmt, name in [('r.-', 'one'), ('go--', 'two'), ('b*-', 'three')]:
    ax.plot(randn(50).cumsum(), fmt, label=name)
ax.legend(loc='best')

### （4）注释与子图加工

* annotate方法

In [None]:
from datetime import datetime

fig, ax = plt.subplots(figsize=(15, 6))

# The first CSV column becomes a parsed date index; only the SPX column is plotted.
data = pd.read_csv('examples/spx.csv', index_col=0, parse_dates=True)
spx = data['SPX']

spx.plot(ax=ax, style='g-')

crisis_data = [
    (datetime(2007, 10, 11), 'Peak of bull market'),
    (datetime(2008, 3, 12), 'Bear Stearns Fails'),
    (datetime(2008, 9, 15), 'Lehman Bankruptcy'),
]

for date, label in crisis_data:
    # Most recent SPX value at or before the event date.
    level = spx.asof(date)
    ax.annotate(
        label,
        xy=(date, level + 75),        # arrow tip
        xytext=(date, level + 225),   # label position, above the arrow tip
        arrowprops={'facecolor': 'r', 'headwidth': 4, 'width': 2,
                    'headlength': 4},
        horizontalalignment='left',
        verticalalignment='top',
    )

# Zoom in on 2007-2010
ax.set_xlim(['1/1/2007', '1/1/2011'])
ax.set_ylim([600, 1800])

ax.set_title('Important dates in the 2008-2009 financial crisis')

* patches

In [None]:
fig, ax = plt.subplots(figsize=(12, 6))

# Build three shapes; they are not drawn until they are added to the axes.
rect = plt.Rectangle((0.2, 0.75), width=0.4, height=0.15, color='k', alpha=0.3)
circ = plt.Circle((0.7, 0.2), radius=0.15, color='b', alpha=0.3)
pgon = plt.Polygon([[0.15, 0.15], [0.35, 0.4], [0.2, 0.6]],
                   color='g', alpha=0.5)

ax.add_patch(rect)
ax.add_patch(circ)
ax.add_patch(pgon)

### （6）保存图像

In [None]:
# First figure: save through the Figure object; the .svg suffix selects the format.
fig, ax = plt.subplots(figsize=(12, 6))
rect = plt.Rectangle((0.2, 0.75), width=0.4, height=0.15, color='k', alpha=0.3)
circ = plt.Circle((0.7, 0.2), radius=0.15, color='b', alpha=0.3)
pgon = plt.Polygon([[0.15, 0.15], [0.35, 0.4], [0.2, 0.6]],
                   color='g', alpha=0.5)
for shape in (rect, circ, pgon):
    ax.add_patch(shape)
fig.savefig('patches.svg')

# Second figure: plt.savefig writes the current figure, here as a 400-dpi PNG
# with the surrounding whitespace trimmed.
fig, ax = plt.subplots(figsize=(12, 6))
rect = plt.Rectangle((0.2, 0.75), width=0.4, height=0.15, color='k', alpha=0.3)
circ = plt.Circle((0.7, 0.2), radius=0.15, color='b', alpha=0.3)
pgon = plt.Polygon([[0.15, 0.15], [0.35, 0.4], [0.2, 0.6]],
                   color='g', alpha=0.5)
for shape in (rect, circ, pgon):
    ax.add_patch(shape)
plt.savefig('figpath.png', dpi=400, bbox_inches='tight')

In [None]:
fig, ax = plt.subplots(figsize=(12, 6))
rect = plt.Rectangle((0.2, 0.75), width=0.4, height=0.15, color='k', alpha=0.3)
circ = plt.Circle((0.7, 0.2), radius=0.15, color='b', alpha=0.3)
pgon = plt.Polygon([[0.15, 0.15], [0.35, 0.4], [0.2, 0.6]],
                   color='g', alpha=0.5)
for shape in (rect, circ, pgon):
    ax.add_patch(shape)
# Save to disk as a 600-dpi PNG with the surrounding whitespace trimmed.
fig.savefig('patches.png', dpi=600, bbox_inches='tight')

# savefig also accepts a file-like object: write the figure into an
# in-memory buffer and take the raw bytes from it.
from io import BytesIO
buffer = BytesIO()
plt.savefig(buffer)
plot_data = buffer.getvalue()

### （7）matplotlib 设置

* 使用rc方法是修改matplotlib的配置的方法之一

In [None]:
plt.rc('figure', figsize=(10, 10))

In [None]:
# Font settings gathered in one mapping and applied to the 'font' rc group.
font_options = dict(family='monospace', weight='bold', size='small')
plt.rc('font', **font_options)

* 查看配置

In [None]:
# matplotlib.rc_params() rebuilds the settings from the matplotlibrc file, so it
# does not show changes made with plt.rc; matplotlib.rcParams is the live
# configuration currently in effect.
print(matplotlib.rcParams)

## 2、使用 pandas 和 seaborn 绘图

* matplotlib是一个相当底层的绘图工具

* pandas自身有很多内建方法可以简化从DataFrame和Series对象生成可视化的过程

### （1）折线图

In [None]:
plt.close('all')

In [None]:
# 10-step random walk indexed 0, 10, ..., 90; the index is used for the x axis.
s = pd.Series(np.cumsum(np.random.randn(10)), index=np.arange(0, 100, step=10))
s.plot(color='g', marker='*', rot=30, alpha=0.8)

In [None]:
s = pd.Series(np.cumsum(np.random.randn(10)), index=np.arange(0, 100, step=10))
# Format string plus title, axis labels and grid passed straight to Series.plot.
s.plot(
    style='g*--',
    title='Random walking',
    xlabel='step',
    ylabel='distance',
    grid=True,
)

In [None]:
s.plot?

In [None]:
# Four independent random walks (columns A-D), accumulated down the rows.
df = pd.DataFrame(
    np.random.randn(10, 4).cumsum(axis=0),
    index=np.arange(0, 100, 10),
    columns=list('ABCD'),
)
df

In [None]:
df.plot()

In [None]:
df.plot(subplots=True, figsize=(10,10), legend=True)

### （2）柱状图

* bar: 垂直
* barh: 水平

In [None]:
# 16 uniform random values in [0, 1), labelled with the letters a through p.
data = pd.Series(data=np.random.rand(16), index=list('abcdefghijklmnop'))
data

In [None]:
# Size the figure once, when it is created. The original passed a different
# figsize to each plot call although both draw onto the same figure, so the
# sizes conflicted and only the last one, (10, 10), ended up applied.
fig, axes = plt.subplots(2, 1, figsize=(10, 10))

# Vertical bars on the top axes, horizontal bars on the bottom axes.
data.plot.bar(ax=axes[0], color='r', alpha=0.7)
data.plot.barh(ax=axes[1], color='g', alpha=0.7)

* DataFrame的柱状图会将每一行的值分为一组，组内各列的柱子并排显示

In [None]:
np.random.seed(12348)

In [None]:
# 6x4 table of uniform random values; the column index is named 'Genus'.
genus_columns = pd.Index(['A', 'B', 'C', 'D'], name='Genus')
df = pd.DataFrame(
    np.random.rand(6, 4),
    columns=genus_columns,
    index=['one', 'two', 'three', 'four', 'five', 'six'],
)
df


In [None]:
df.plot.bar(figsize=(10,5))

* 通过stacked使每一行的值堆积在一起

In [None]:
df.plot.barh(stacked=True, alpha=0.5, figsize=(10,5))

* 实例

In [None]:
plt.close('all')

In [None]:
tips = pd.read_csv('examples/tips.csv')
tips

In [None]:
party_counts = pd.crosstab(tips['day'], tips['size'])
party_counts


In [None]:
# Not many 1- and 6-person parties
# Keep only the party-size columns labelled 2 through 5 (.loc label slices
# include both endpoints).
party_counts = party_counts.loc[:, 2:5]

In [None]:
# Divide every row by its own total so that each row sums to 1.
party_pcts = party_counts.div(party_counts.sum(axis=1), axis='index')
party_pcts


In [None]:
party_pcts.plot.bar()

In [None]:
plt.close('all')

* 对于在绘图前需要聚合和汇总的数据，使用seaborn包会使工作更为简单

In [None]:
import seaborn as sns
# Tip expressed as a fraction of the bill with the tip itself subtracted out.
tips['tip_pct'] = tips['tip'] / (tips['total_bill'] - tips['tip'])
# NOTE(review): this is not the last expression in the cell, so the preview is
# presumably not displayed — confirm whether it is still wanted.
tips.head()
# Horizontal bar plot summarising tip_pct for each day.
sns.barplot(x='tip_pct', y='day', data=tips, orient='h')

In [None]:
plt.close('all')

In [None]:
sns.barplot(x='tip_pct', y='day', hue='time', data=tips, orient='h')

In [None]:
plt.close('all')

In [None]:
sns.set(style="whitegrid")

### （3）直方图和密度图

In [None]:
plt.figure()

In [None]:
tips['tip_pct'].plot.hist(bins=50)

In [None]:
plt.figure()

* 密度图

In [None]:
tips['tip_pct'].plot.density()

In [None]:
plt.figure()

In [None]:
# Bimodal sample: 200 draws with mean 0 / std 1 and 200 with mean 10 / std 2.
comp1 = np.random.normal(0, 1, size=200)
comp2 = np.random.normal(10, 2, size=200)
values = pd.Series(np.concatenate([comp1, comp2]))
# sns.distplot is deprecated. histplot with kde=True and density scaling
# draws the same histogram-plus-density view (needs seaborn >= 0.11).
sns.histplot(values, bins=100, color='k', kde=True, stat='density',
             kde_kws={'cut': 3})

### （4）散点图和点图

* 点图或散点图可以用于检验两个一维数据序列之间的关系

In [None]:
macro = pd.read_csv('examples/macrodata.csv')
data = macro[['cpi', 'm1', 'tbilrate', 'unemp']]
# Log-transform, difference consecutive rows, then drop any rows containing NaN.
log_levels = np.log(data)
trans_data = log_levels.diff().dropna()
# Show the last five rows.
trans_data.tail(5)

In [None]:
plt.figure()

使用seaborn的regplot方法，绘制散点图并拟合出一条线性回归线

In [None]:
# Pass x and y by keyword: recent seaborn releases no longer accept them
# positionally, while keyword arguments work on old and new versions alike.
sns.regplot(x='m1', y='unemp', data=trans_data)
plt.title('Changes in log %s versus log %s' % ('m1', 'unemp'))

* pairplot函数，支持在对角线上放置每个变量的直方图或密度估计值

In [None]:
sns.pairplot(trans_data, diag_kind='kde', plot_kws={'alpha': 0.2})

### （5）分面网格和分类数据

使用分面网格是利用多种分组变量对数据进行可视化的方式

In [None]:
# Only rows with tip_pct below 1 are plotted.
# One panel per smoker value; bars within each day are coloured by time.
sns.catplot(
    data=tips[tips['tip_pct'] < 1],
    kind='bar',
    x='day',
    y='tip_pct',
    hue='time',
    col='smoker',
)

In [None]:
# Facet grid: one row of panels per time value, one column per smoker value.
sns.catplot(
    data=tips[tips['tip_pct'] < 1],
    kind='bar',
    x='day',
    y='tip_pct',
    row='time',
    col='smoker',
)

In [None]:
# Box plot of tip_pct for each day, limited to rows with tip_pct below 0.5.
sns.catplot(
    data=tips[tips['tip_pct'] < 0.5],
    kind='box',
    x='tip_pct',
    y='day',
)

## 3、其他可视化工具

### Bokeh

### Plotly