In [None]:
%config InlineBackend.figure_format = 'svg'

## 4.1 Retailer data

In [None]:
import pandas as pd
# Load the customer data set from a remote CSV (requires network access).
# NOTE(review): section 4.1.1 below rebinds `cust_df` to freshly simulated
# data, so this download only feeds the preview cells that follow.
cust_df = pd.read_csv('http://bit.ly/PMR-ch4')

In [None]:
# Preview the first rows of the customer data.
cust_df.head()

In [None]:
# Histogram of the `age` column.
cust_df.age.hist()

In [None]:
# Smallest value in the `age` column.
cust_df.age.min()

### 4.1.1 Simulating the data

In [None]:
# Libraries used throughout the simulation
import pandas as pd
import numpy as np

# Seed NumPy's global generator so every simulated column is reproducible
np.random.seed(21821)

# Number of simulated customers; each one gets a categorical id 0..n_cust-1
n_cust = 1000
customer_ids = pd.Categorical(range(n_cust))
cust_df = pd.DataFrame({'cust_id': customer_ids})

In [None]:
# The draws below consume the seeded global generator in a fixed order
# (age, credit score, email, distance); keep that order to reproduce the data.
cust_df['age'] = np.random.normal(loc=35, scale=5, size=n_cust)

# Credit score is centred on 3 * age + 620 with a standard deviation of 50
credit_mean = 3 * cust_df.age + 620
cust_df['credit_score'] = np.random.normal(loc=credit_mean,
                                           scale=50,
                                           size=n_cust)

# Email flag: 'yes' with probability 0.8, 'no' with probability 0.2
email_flags = np.random.choice(a=['yes', 'no'], p=[0.8, 0.2], size=n_cust)
cust_df['email'] = pd.Categorical(email_flags)

# Distance is the exponential of a normal draw (mean 2, sd 1.2)
log_distance = np.random.normal(loc=2, scale=1.2, size=n_cust)
cust_df['distance_to_store'] = np.exp(log_distance)

In [None]:
# Summary statistics for every column, including the categorical ones.
cust_df.describe(include='all')

### 4.1.2 Simulating online and in-store sales data

In [None]:
# Expected number of online visits per customer: a baseline of 15, plus 15
# for customers with email == 'yes', minus 0.7 for every year of age above
# the median age.
mu = 15 + ((cust_df.email == 'yes') * 15 -
           0.7 * (cust_df.age - cust_df.age.median()))
# Negative-binomial size parameter. With p = n / (n + mu) the distribution
# has mean mu, which is how the per-customer mean is passed to NumPy.
n = 0.3
prob = n / (n + mu)
# Use the `n` defined above rather than repeating the literal, so the size
# parameter and the probability derived from it cannot drift apart.
cust_df['online_visits'] = np.random.negative_binomial(n=n,
                                                       p=prob,
                                                       size=n_cust)

In [None]:
# Each online visit turns into a transaction with probability 0.3
visits = cust_df.online_visits
cust_df['online_trans'] = np.random.binomial(n=visits, p=0.3, size=n_cust)

# Spend per transaction is exp(normal(3, 0.1)); total spend scales that
# amount by the number of transactions
spend_per_trans = np.exp(np.random.normal(loc=3, scale=0.1, size=n_cust))
cust_df['online_spend'] = spend_per_trans * cust_df.online_trans

In [None]:
# Mean in-store transactions fall with the square root of distance to store
mu = 3 / np.sqrt(cust_df.distance_to_store)
n = 5
# Success probability that gives the negative binomial a mean of mu
prob = n / (n + mu)
cust_df['store_trans'] = np.random.negative_binomial(n=n, p=prob, size=n_cust)

# Spend per transaction is exp(normal(3.5, 0.4)), scaled by transaction count
store_spend_per_trans = np.exp(
    np.random.normal(loc=3.5, scale=0.4, size=n_cust))
cust_df['store_spend'] = store_spend_per_trans * cust_df.store_trans

In [None]:
# Summary statistics after adding the simulated sales columns.
cust_df.describe()

### 4.1.3 Simulating satisfaction survey responses

In [None]:
# Overall satisfaction score per customer: normal draws with mean 3.1, sd 0.8
overall_draws = np.random.normal(loc=3.1, scale=0.8, size=n_cust)
sat_overall = pd.Series(overall_draws)
sat_overall.describe()

In [None]:
# Item ratings are the overall score plus item-specific noise, floored to
# whole numbers. Service is drawn before selection to keep the random stream
# in its original order.
service_noise = np.random.normal(loc=0.5, scale=0.7, size=n_cust)
sat_service = np.floor(sat_overall + service_noise)

selection_noise = np.random.normal(loc=-0.2, scale=0.6, size=n_cust)
sat_selection = np.floor(sat_overall + selection_noise)

sat_service.describe()

In [None]:
# Clamp both ratings to the 1..5 range
sat_service = sat_service.clip(lower=1, upper=5)
sat_selection = sat_selection.clip(lower=1, upper=5)
sat_service.describe()

### 4.1.4 Simulating non-response data

In [None]:
# One Bernoulli draw per customer with success probability age / 100;
# a success marks the customer as a non-respondent
no_response = np.random.binomial(n=1, p=cust_df.age / 100,
                                 size=n_cust).astype(bool)

# Blank out both ratings for the non-respondents
for ratings in (sat_service, sat_selection):
    ratings[no_response] = np.nan

sat_service.describe()

In [None]:
# Attach the two survey ratings to the customer table, service first
survey_columns = (('sat_service', sat_service),
                  ('sat_selection', sat_selection))
for column_name, ratings in survey_columns:
    cust_df[column_name] = ratings
cust_df.describe()

## 4.2 Exploring associations between variables with scatterplots

In [None]:
# Preview the first rows of the customer data before plotting.
cust_df.head()

In [None]:
# Column data types of the customer table.
cust_df.dtypes

In [None]:
# Basic scatterplot of credit score against age with default styling
cust_df.plot.scatter(x='age', y='credit_score')

In [None]:
import matplotlib.pyplot as plt

# Matplotlib 3.6 renamed its bundled seaborn style sheets to
# 'seaborn-v0_8-<name>' and later releases removed the old 'seaborn-<name>'
# aliases, so use whichever spelling this installation provides.
for style_suffix in ('notebook', 'white', 'ticks'):
    renamed_style = 'seaborn-v0_8-' + style_suffix
    legacy_style = 'seaborn-' + style_suffix
    plt.style.use(renamed_style if renamed_style in plt.style.available
                  else legacy_style)

# Same scatterplot as before, now drawn with the selected styles applied
cust_df.plot(kind='scatter', x='age', y='credit_score')

In [None]:
# Axis limits are reused as the end points of the two reference lines
age_limits = [15, 55]
score_limits = [500, 900]
mean_age = cust_df.age.mean()
mean_score = cust_df.credit_score.mean()

# Hollow dark-blue markers for credit score against age
cust_df.plot(kind='scatter', x='age', y='credit_score',
             c='none', edgecolor='darkblue',
             xlim=age_limits, ylim=score_limits)

# Dotted black lines at the mean credit score (horizontal) and at the mean
# age (vertical)
plt.plot(age_limits, [mean_score, mean_score], 'k:')
plt.plot([mean_age, mean_age], score_limits, 'k:')

plt.title('Active customers as of January 2019')
plt.xlabel('Customer age (years)')
plt.ylabel('Customer credit score')

In [None]:
# Online spend against in-store spend, drawn with small hollow markers
hollow_marker = dict(c='none', edgecolor='darkblue', s=8)
cust_df.plot(kind='scatter', x='store_spend', y='online_spend',
             **hollow_marker)
plt.title('Customers as of January 2019')
plt.xlabel('Prior 12 months in-store sales ($)')
plt.ylabel('Prior 12 months online sales ($)')

In [None]:
# Histogram of in-store spend using 100 bins and unfilled black-outlined bars
cust_df.store_spend.hist(bins=100,
                         edgecolor='k',
                         facecolor='none',
                         linewidth=1.2)
plt.title('Customers as of January 2019')
# The plotted column is store_spend, so the axis is labelled as in-store
# sales (the label previously read "online sales").
plt.xlabel('Prior 12 months in-store sales ($)')
plt.ylabel('Count of customers')

### 4.2.2 Color-coding points on a scatterplot

In [None]:
# Marker styling per email value: 'yes' gets a hollow green marker,
# 'no' a solid black one
edge_mapper = dict(yes='g', no='k')
fill_mapper = dict(yes='none', no='k')

fig, ax = plt.subplots()
# One scatter call per email group so that each group gets a legend entry
for level, members in cust_df.groupby('email'):
    ax.scatter(x=members.store_spend,
               y=members.online_spend,
               edgecolor=edge_mapper[level],
               c=fill_mapper[level],
               s=8,
               label=level)
ax.legend(title='email')
ax.set_title('Customers as of January 2019')
ax.set_xlabel('Prior 12 months in-store sales ($)')
ax.set_ylabel('Prior 12 months online sales ($)')

In [None]:
fig, ax = plt.subplots()
# Both spends are shifted by 1 so that zero values can be shown on log axes
for level, members in cust_df.groupby('email'):
    ax.scatter(x=members.store_spend + 1,
               y=members.online_spend + 1,
               edgecolor=edge_mapper[level],
               c=fill_mapper[level],
               s=8,
               label=level)
ax.legend(title='email')
ax.set_title('Customers as of January 2019')
ax.set_xlabel('Prior 12 months in-store sales (log $)')
ax.set_ylabel('Prior 12 months online sales (log $)')
ax.set_xscale('log')
ax.set_yscale('log')

## 4.3 Combining plots in a single graphics object

In [None]:
# One entry per panel, listed in drawing order:
# (subplot position, spend column, title, x label, y label, log axes?)
# A label of None means the panel leaves that axis unlabelled.
panels = [
    (221, 'store_spend', 'store',
     None, 'Prior 12 months in-store sales ($)', False),
    (223, 'online_spend', 'online',
     'Distance to store', 'Prior 12 months online sales ($)', False),
    (222, 'store_spend', 'store, log',
     None, None, True),
    (224, 'online_spend', 'online, log',
     'Distance to store', None, True),
]

for position, column, title, x_label, y_label, log_axes in panels:
    plt.subplot(position)
    spend = cust_df[column]
    # The log panels shift spend by 1 so zero values remain plottable
    plt.scatter(x=cust_df.distance_to_store,
                y=(spend + 1) if log_axes else spend,
                c='none',
                edgecolor='darkblue',
                s=8)
    plt.title(title)
    if x_label is not None:
        plt.xlabel(x_label)
    if y_label is not None:
        plt.ylabel(y_label)
    if log_axes:
        plt.xscale('log')
        plt.yscale('log')

plt.tight_layout()

## 4.4 Scatterplot matrices

### 4.4.1 scatter_matrix()

In [None]:
# Scatterplot matrix over the whole customer table with hollow markers;
# the result is bound to `_` so the returned axes array is not echoed
hollow_marker = dict(c='none', edgecolor='darkblue')
_ = pd.plotting.scatter_matrix(cust_df, figsize=(12, 12), **hollow_marker)

In [None]:
# Scatterplot matrix restricted to three columns of interest
subset_columns = ['age', 'distance_to_store', 'store_spend']
_ = pd.plotting.scatter_matrix(cust_df[subset_columns],
                               c='none',
                               edgecolor='darkblue')

### 4.4.2 PairGrid()

In [None]:
import seaborn as sns

# Pairwise grid of three numeric columns, coloured and marked by email value.
# `height` sets the size of each facet in inches; it replaced the `size`
# keyword in seaborn 0.9, and the old spelling is rejected by current
# releases.
g = sns.PairGrid(cust_df[['age', 'distance_to_store',
                          'store_spend', 'email']],
                 height=2.5,
                 hue='email', palette='Set2',
                 hue_kws={"marker": ['o', 's']})
# Scatterplots off the diagonal, histograms on it
_ = g.map_offdiag(plt.scatter, s=20, alpha=0.5)
_ = g.map_diag(plt.hist, bins=20)
_ = g.add_legend()

## 4.5 Correlation coefficients

In [None]:
# 2x2 covariance matrix of age and credit score.
np.cov(cust_df.age, cust_df.credit_score)

In [None]:
# 2x2 correlation matrix of age and credit score.
np.corrcoef(cust_df.age, cust_df.credit_score)

In [None]:
# Correlation computed by hand: the covariance of the two variables divided
# by the product of their standard deviations
cov_age_credit = np.cov(cust_df.age, cust_df.credit_score)[0, 1]
sd_product = cust_df.age.std() * cust_df.credit_score.std()
cov_age_credit / sd_product

### 4.5.1 Correlation tests

In [None]:
from scipy import stats

# Pearson correlation test between age and credit score.
stats.pearsonr(cust_df.age, cust_df.credit_score)

### 4.5.2 Correlation matrices

In [None]:
# Correlation matrix of the numeric columns. The categorical columns
# (cust_id, email) are excluded explicitly because pandas >= 2.0 no longer
# drops non-numeric columns silently and raises instead.
cust_df.select_dtypes(include='number').corr()

In [None]:
# Draw the correlation matrix of the numeric columns as an image. The
# categorical columns are excluded explicitly because pandas >= 2.0 raises
# on non-numeric columns rather than dropping them.
plt.imshow(cust_df.select_dtypes(include='number').corr())
plt.colorbar()

In [None]:
# Heatmap of the numeric-column correlations with the colour map centred on
# zero. The categorical columns are excluded explicitly because pandas >= 2.0
# raises on non-numeric columns rather than dropping them.
sns.heatmap(cust_df.select_dtypes(include='number').corr(), center=0)

In [None]:
# Compute the correlation matrix once, over the numeric columns only.
# pandas >= 2.0 raises on the categorical columns instead of dropping them.
corr_matrix = cust_df.select_dtypes(include='number').corr()

# np.tri(..., k=-1) is True strictly below the diagonal; inverting it masks
# the diagonal and the upper triangle, so each pair is annotated only once.
upper_mask = ~np.tri(corr_matrix.shape[1], k=-1, dtype=bool)

sns.heatmap(corr_matrix,
            vmin=-0.3,
            vmax=0.6,
            center=0,
            annot=True,
            fmt='.2f',
            mask=upper_mask,
            cbar=False)

### 4.5.3 Transforming variables before computing correlations

In [None]:
# 1000 uniform draws on [-10, 10), then the linear correlation between the
# draws and their squares
x = np.random.uniform(low=-10, high=10, size=1000)
x_squared = x ** 2
np.corrcoef(x, x_squared)

In [None]:
# Correlation between raw distance to store and in-store spend.
np.corrcoef(cust_df.distance_to_store, cust_df.store_spend)

In [None]:
# Same correlation after transforming distance to its reciprocal.
np.corrcoef(1/cust_df.distance_to_store, cust_df.store_spend)

In [None]:
# Same correlation using 1/sqrt(distance), the functional form used to
# simulate store transactions in section 4.1.2.
np.corrcoef(1/np.sqrt(cust_df.distance_to_store),
            cust_df.store_spend)

In [None]:
# In-store spend against raw distance to store
plt.scatter(cust_df.distance_to_store, cust_df.store_spend)
plt.xlabel('Distance to store')
# Label the y axis too, so the figure can be read on its own
plt.ylabel('Prior 12 months in-store sales ($)')

In [None]:
# In-store spend against the transformed distance 1/sqrt(distance)
plt.scatter(1/np.sqrt(cust_df.distance_to_store), cust_df.store_spend)
plt.xlabel('1/sqrt(distance_to_store)')
# Label the y axis too, so the figure can be read on its own
plt.ylabel('Prior 12 months in-store sales ($)')

### 4.5.5 Box-Cox transformations

In [None]:
# Box-Cox transform of distance to store. With no lambda supplied,
# stats.boxcox returns the transformed values together with the lambda it
# fitted.
dts_bc, lmda = stats.boxcox(cust_df.distance_to_store)
lmda

In [None]:
plt.figure(figsize=(8, 4))

# Both histograms share the same unfilled, black-outlined styling
outline_style = dict(bins=20, edgecolor='k', facecolor='none', linewidth=1.2)

# Left panel: raw distances; right panel: their Box-Cox transform
panels = [(cust_df.distance_to_store, 'Distance to nearest store'),
          (dts_bc, 'Box-Cox transform of distance')]

for position, (values, x_label) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    plt.hist(values, **outline_style)
    plt.xlabel(x_label)
    plt.ylabel('Count of customers')
    # Drop the axes frame
    plt.box(False)

In [None]:
# Box-Cox transform of age. The transformed values are bound to `age_bc`
# rather than `sspend_bc`, so they cannot be mistaken for the store-spend
# transform that the later correlation cell relies on.
age_bc, lmda_age = stats.boxcox(cust_df.age)
lmda_age

In [None]:
# Box-Cox transform of in-store spend. The small constant .001 is added
# first; stats.boxcox requires strictly positive input, and store_spend is
# presumably zero for customers with no store transactions.
sspend_bc, lmda_sspend = stats.boxcox(cust_df.store_spend+.001)
lmda_sspend

In [None]:
# Correlation of the untransformed variables, for comparison.
np.corrcoef(cust_df.distance_to_store, cust_df.store_spend)

In [None]:
# Correlation between the Box-Cox-transformed distance and store spend.
np.corrcoef(dts_bc, sspend_bc)

## 4.6 Exploring associations in survey responses

In [None]:
# Raw survey ratings: selection satisfaction against service satisfaction,
# drawn with hollow dark-blue markers
hollow_marker = dict(c='none', edgecolor='darkblue')
plt.scatter(x=cust_df.sat_service, y=cust_df.sat_selection, **hollow_marker)
plt.xlabel('Customer satisfaction with service')
plt.ylabel('Customer satisfaction with selection')

In [None]:
# Add a small amount of normal noise (sd 0.1) to each rating so that
# identical answers do not plot on top of each other. The service jitter is
# drawn before the selection jitter.
service_jittered = cust_df.sat_service + np.random.normal(scale=0.1,
                                                          size=n_cust)
selection_jittered = cust_df.sat_selection + np.random.normal(scale=0.1,
                                                              size=n_cust)
plt.scatter(x=service_jittered,
            y=selection_jittered,
            c='none',
            edgecolor='darkblue')
plt.xlabel('Customer satisfaction with service')
plt.ylabel('Customer satisfaction with selection')

In [None]:
# Print the Python version for reproducibility.
# NOTE(review): `!python` is resolved through the shell PATH and may not be
# the interpreter running this kernel; confirm, or read sys.version instead.
!python --version