## 9.1 Consumer Brand Rating Data

In [None]:
# Notebook-wide display setup: SVG inline figures and generous pandas
# row/column limits with a 70-character display width.
%config InlineBackend.figure_format = 'svg'
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 70)

### Load data

In [None]:
import pandas as pd
# Read the consumer brand rating data from a shortened URL and preview it.
# NOTE(review): the TODO below does not say what it should match -- clarify
# or remove it.
brand_ratings = pd.read_csv('http://bit.ly/PMR-ch9') # TODO REFORMAT THIS TO MATCH
brand_ratings.head()

In [None]:
# Preview the last rows of the ratings table.
brand_ratings.tail()

In [None]:
# Summary statistics for the ratings, rounded to two decimals.
brand_ratings.describe().round(2)

In [None]:
# Box plot of the raw (unscaled) ratings.
brand_ratings.plot.box()

### 9.1.1 Rescaling the Data

In [None]:
import numpy as np
# Standardise a simple 0..999 ramp by hand (subtract the mean, divide by the
# standard deviation) and report the location statistics of the result.
x = np.arange(1000)
x_sc = (x - np.mean(x)) / np.std(x)
print('mean: {}\nmedian: {}\nmax: {}\nmin: {}'.format(
    *(stat(x_sc) for stat in (np.mean, np.median, np.max, np.min))))

In [None]:
from sklearn.preprocessing import scale
# Standardise every column except the last one (presumably the 'brand' label
# -- confirm) on a copy, so the raw frame stays untouched.
brand_ratings_sc = brand_ratings.copy()
# Assign by column label so the columns are replaced outright by the float
# output of scale(); writing floats through .iloc into integer-typed columns
# is a deprecated dtype-changing setitem in recent pandas.
brand_ratings_sc[brand_ratings_sc.columns[:-1]] = scale(
    brand_ratings_sc.iloc[:, :-1])
brand_ratings_sc.describe().round(2)

In [None]:
# Box plot of the standardised ratings.
brand_ratings_sc.plot.box()

In [None]:
# Preview the standardised ratings.
brand_ratings_sc.head()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Clustered heatmap of the correlations between rating columns.
# 'brand' is a string label, so drop it explicitly: DataFrame.corr() no longer
# silently skips non-numeric columns in pandas >= 2.0 and would raise.
sns.clustermap(brand_ratings.drop(columns='brand').corr(),
               annot=True, fmt=".2f",
               center=0, cmap=plt.cm.bwr)

### 9.1.2 Aggregate Mean Ratings by Brand

In [None]:
# Average every rating column within each brand; the result is indexed by brand.
brand_means = brand_ratings.groupby('brand').mean()
brand_means

In [None]:
from matplotlib import cm

# Heatmap of the brand means with the rating columns in an explicit,
# hand-picked order.
sns.heatmap(brand_means[['fun', 'latest', 'trendy', 'perform',
                         'leader', 'serious', 'rebuy', 'bargain',
                         'value']], cmap=cm.BrBG)

In [None]:
# Same brand means, but with rows and columns reordered by clustering.
sns.clustermap(brand_means, cmap=cm.BrBG)

## 9.2 Principal Component Analysis and Perceptual Maps

### 9.2.1 PCA Example

In [None]:
# Build three partly-overlapping integer variables for the PCA demo.
# The order of the np.random calls matters: it determines which values each
# variable receives after seeding.
np.random.seed(98286)
xvar = np.random.randint(low=0, high=10, size=100)
# yvar starts as a copy of xvar; its first 50 entries are then redrawn.
yvar = xvar.copy()
yvar[:50] = np.random.randint(low=0, high=10, size=50)
# zvar starts as a copy of yvar; entries 25..74 are then redrawn.
zvar = yvar.copy()
zvar[25:75] = np.random.randint(low=0, high=10, size=50)
# Rows are variables, columns are observations (3 x 100).
myvars = np.array([xvar, yvar, zvar])

In [None]:
# Plot yvar against xvar as a swarm plot.
sns.swarmplot(x=xvar, y=yvar, color='k')

In [None]:
# Pairwise correlations between the three variables (one variable per row).
np.corrcoef(myvars)

In [None]:
from sklearn import decomposition
# Transpose so that rows are observations, then fit a PCA with default settings.
my_pca = decomposition.PCA().fit(myvars.T)

In [None]:
def pca_summary(pca, round_dig=3):
  '''Return a table summarising the variance captured by a fitted PCA.

  pca: object exposing explained_variance_ and explained_variance_ratio_.
  round_dig: number of decimals the table is rounded to.

  The result is a DataFrame with one column per component ('pc1', 'pc2', ...)
  and three rows: the variance, the proportion of variance explained, and
  the running total of that proportion.
  '''
  variances = pca.explained_variance_
  ratios = pca.explained_variance_ratio_
  component_labels = ['pc{}'.format(k + 1) for k in range(len(variances))]
  row_labels = ['variance', 'proportion of variance explained',
                'cumulative proportion']
  summary = pd.DataFrame([variances, ratios, np.cumsum(ratios)],
                         columns=component_labels,
                         index=row_labels)
  return summary.round(round_dig)
# Variance summary for the toy three-variable PCA.
pca_summary(my_pca)

In [None]:
def pca_components(pca, variable_names):
  '''Tabulate how each variable loads on each fitted component.

  pca: fitted object exposing components_ (one row per component).
  variable_names: labels for the columns of components_.

  Returns a DataFrame with one row per variable and one column per
  component, named 'pc1', 'pc2', ...
  '''
  component_labels = ['pc{}'.format(number)
                      for number, _ in enumerate(pca.components_, start=1)]
  by_component = pd.DataFrame(pca.components_,
                              index=component_labels,
                              columns=variable_names)
  return by_component.T
# Loadings of xvar/yvar/zvar on each component, one row per variable.
my_pca_components = pca_components(my_pca, ['xvar', 'yvar', 'zvar'])
my_pca_components.round(3)

In [None]:
# Project the observations onto the fitted components, then inspect the
# correlations between the resulting component scores.
myvars_transformed = my_pca.transform(myvars.T)
np.corrcoef(myvars_transformed.T)

### 9.2.2 Visualizing PCA

In [None]:
import matplotlib.pyplot as plt
# Scatter the observations on their first two component scores.
plt.scatter(x=myvars_transformed[:,0],
            y=myvars_transformed[:,1],
            color='k')
plt.xlabel('PC1')
plt.ylabel('PC2')

In [None]:
def plot_arrow_component(pca_components, variable, scale=1):
  '''Draw one variable's loading as a labelled red arrow from the origin.

  pca_components: frame indexed by variable with 'pc1' and 'pc2' columns.
  variable: row label of the variable to draw.
  scale: factor applied to both loadings before plotting.
  '''
  loading = pca_components.loc[variable]
  # The arrow and its text label share the same end point.
  tip_x = loading['pc1'] * scale
  tip_y = loading['pc2'] * scale
  plt.arrow(x=0, y=0, dx=tip_x, dy=tip_y,
            color='r', head_width=.5, overhang=1)
  plt.text(x=tip_x, y=tip_y, s=variable, color='r', fontsize=16)
# Redraw the PC1/PC2 scatter and overlay one loading arrow per variable,
# with the loadings stretched by a factor of 8.
plt.scatter(x=myvars_transformed[:,0],
              y=myvars_transformed[:,1],
              color='k')

for v in my_pca_components.index:
  plot_arrow_component(my_pca_components, v, 8)

In [None]:
def biplot(values_transformed, pca_components, label=None):
  '''Create a biplot: a scatterplot of points in PCA space with arrows
  representing the loadings of each variable.

  values_transformed: 2-D array of component scores; column 0 is plotted
    on the x axis and column 1 on the y axis.
  pca_components: frame indexed by variable with 'pc1' and 'pc2' columns.
  label: optional sequence with one entry per row of values_transformed.
    Points are labelled only when its length matches the number of rows;
    otherwise (or when omitted) no labels are drawn.
  '''
  # Stretch the arrows relative to the largest PC2 score.
  scale = 1.2* np.max(values_transformed[:,1])
  plt.figure(figsize=(10, 10))
  for v in pca_components.index:
    plot_arrow_component(pca_components, v, scale)
  plt.scatter(x=values_transformed[:,0],
              y=values_transformed[:,1],
              color='gray', s=4)
  # None replaces the former mutable default ([]); an empty or mismatched
  # label sequence is still ignored, exactly as before.
  if label is not None and len(label) == values_transformed.shape[0]:
    for i, txt in enumerate(label):
      # Nudge each label slightly away from its point.
      plt.text(s=txt,
               x=values_transformed[i,0]+.01*scale,
               y=values_transformed[i,1]+.01*scale,
               fontsize=14)
  plt.xlabel('PC1')
  plt.ylabel('PC2')

In [None]:
# Biplot of the toy data, labelling each point with its observation number.
biplot(myvars_transformed, my_pca_components,
       label=range(myvars.shape[1]))

### 9.2.3 PCA for Brand Ratings

In [None]:
# Fit a PCA on every column of the scaled ratings except the last one.
# NOTE(review): assumes the last column is the 'brand' label -- confirm.
brand_rating_names = brand_ratings_sc.columns[:-1]
brand_ratings_sc_vals = brand_ratings_sc[brand_rating_names]
brand_pca = decomposition.PCA().fit(brand_ratings_sc_vals)

In [None]:
# Variance summary for the brand-rating PCA.
pca_summary(brand_pca)

In [None]:
# Scree plot: variance of each component against its 1-based component number.
plt.plot(1+np.arange(len(brand_pca.explained_variance_)),
         brand_pca.explained_variance_, 'o-')
plt.xlabel('Component')
plt.ylabel('Variance')

In [None]:
# Project every rating row onto the components and draw an unlabelled biplot.
brand_ratings_sc_trans = brand_pca.transform(brand_ratings_sc_vals)
brand_pca_components = pca_components(brand_pca, brand_rating_names)
biplot(brand_ratings_sc_trans, brand_pca_components)

### 9.2.4 Perceptual Map of the Brands

In [None]:
# Per-brand averages of the standardised ratings.
brand_means_sc = brand_ratings_sc.groupby('brand').mean()
brand_means_sc.head()

In [None]:
# Re-standardise each column of the brand means (subtract the column mean,
# divide by the column standard deviation). The name is rebound in place, so
# rerun the previous cell first if the unstandardised means are needed.
brand_means_sc = (
    ((brand_means_sc - brand_means_sc.mean()) / brand_means_sc.std()))
# Project the brand-level means onto the components fitted on the
# individual ratings, and label each point with its brand.
brand_means_sc_transformed = brand_pca.transform(brand_means_sc)
biplot(brand_means_sc_transformed, brand_pca_components,
       label=brand_means.index)

In [None]:
# Per-rating difference between brand 'c' and brand 'e'.
brand_means_sc.loc['c'] - brand_means_sc.loc['e']

In [None]:
# Per-rating difference between the average of brands b, c, f, g and brand 'e'.
brand_means_sc.loc[['b','c','f','g']].mean(axis=0) - brand_means_sc.loc['e']

## 9.3 Exploratory Factor Analysis

### 9.3.2 Finding an EFA Solution

In [None]:
# Eigenvalues ([0] of the eig result) of the correlation matrix of the
# rating columns.
np.linalg.eig(np.corrcoef(brand_ratings_sc_vals.T))[0]

In [None]:
# Fit a two-component scikit-learn FactorAnalysis on the scaled ratings.
brand_2fa = decomposition.FactorAnalysis(n_components=2)
brand_2fa.fit(brand_ratings_sc_vals)

In [None]:
# Reuse the PCA helper to tabulate the factor-analysis components; the
# columns are therefore labelled 'pc1'/'pc2' even though they are factors.
pca_components(brand_2fa, brand_rating_names)


In [None]:
# List the column names of the scaled ratings frame.
brand_ratings_sc.columns

In [None]:
!pip install factor_analyzer

In [None]:
import factor_analyzer

# Two-factor solution with varimax rotation; show the loadings per rating.
fa = factor_analyzer.FactorAnalyzer(n_factors=2, rotation='varimax')
fa.fit(brand_ratings_sc_vals)
pd.DataFrame(fa.loadings_, index=brand_rating_names).round(2)

In [None]:
# Three-factor solution with varimax rotation; `fa` is rebound to the new fit.
fa = factor_analyzer.FactorAnalyzer(n_factors=3, rotation='varimax')
fa.fit(brand_ratings_sc_vals)
pd.DataFrame(fa.loadings_, index=brand_rating_names).round(2)

In [None]:
# Three-factor solution with oblimin rotation; keep the loadings in a frame
# for the later heatmap.
fa = factor_analyzer.FactorAnalyzer(n_factors=3, rotation='oblimin')
fa.fit(brand_ratings_sc_vals)
fa_loadings_df = pd.DataFrame(fa.loadings_,
                              index=brand_rating_names)
fa_loadings_df.round(2)

In [None]:
# Correlations between the factor scores of the most recent `fa` fit.
np.corrcoef(fa.transform(brand_ratings_sc_vals).T)

In [None]:
# Clustered heatmap of the factor loadings, with the colour scale centred on zero.
sns.clustermap(fa_loadings_df, cmap=cm.BrBG, center=0)

### 9.3.4 Using Factor Scores for Brands

In [None]:
# Refit the three-factor oblimin model and keep the per-row factor scores,
# then attach each row's brand label.
fa = factor_analyzer.FactorAnalyzer(n_factors=3, rotation='oblimin')
brand_ratings_fa_trans = fa.fit_transform(brand_ratings_sc_vals)
brand_rating_fa_scores = pd.DataFrame(brand_ratings_fa_trans)
brand_rating_fa_scores['brand'] = brand_ratings_sc.brand
brand_rating_fa_scores.head()

In [None]:
# Average factor score per brand.
brand_rating_fa_mean = brand_rating_fa_scores.groupby('brand').mean()
# NOTE(review): these names are assigned by position; confirm the factor
# order of the fit really is Value, Leader, Latest.
brand_rating_fa_mean.columns = ['Value', 'Leader', 'Latest']
brand_rating_fa_mean.round(3)

In [None]:
# Clustered heatmap of the per-brand factor means, centred on zero.
sns.clustermap(brand_rating_fa_mean, cmap=cm.BrBG, center=0)

## 9.4 Manifold dimensionality reduction techniques

### 9.4.1 Multidimensional Scaling

In [None]:
# Redisplay the per-brand mean ratings used as the MDS input.
brand_means

In [None]:
from sklearn import manifold

# Seed NumPy's global RNG before fitting, then embed the brand means with
# MDS using default settings.
np.random.seed(889783)
brand_mds = manifold.MDS().fit_transform(brand_means)
brand_mds

In [None]:
# Scatter the MDS coordinates and label each point with its brand.
plt.scatter(x=brand_mds[:,0],
            y=brand_mds[:,1],
            color='grey')
for i,p in enumerate(brand_mds):
  # Pass the label positionally: the `s=` keyword of annotate was renamed to
  # `text` and is rejected by current matplotlib. Offset the label by 0.1.
  plt.annotate(brand_means.index[i], xy=p+.1)

### 9.4.2 Non-metric MDS

In [None]:
# Redisplay the per-brand mean ratings before converting them to ranks.
brand_means

In [None]:
# Replace each column's values with their 0-based rank within that column;
# applying argsort twice turns sort positions into ranks.
brand_ranks = brand_means.apply(lambda col: col.argsort().argsort())
brand_ranks

In [None]:
# Non-metric MDS on the rank-transformed brand means, plotted with brand labels.
brand_mds_nonmetric = manifold.MDS(metric=False)\
  .fit_transform(brand_ranks)
plt.scatter(x=brand_mds_nonmetric[:,0],
            y=brand_mds_nonmetric[:,1],
            color='grey')
for i,p in enumerate(brand_mds_nonmetric):
  # Pass the label positionally: the `s=` keyword of annotate was renamed to
  # `text` and is rejected by current matplotlib. Offset the label by 0.01.
  plt.annotate(brand_means.index[i], xy=p+.01)

### t-distributed Stochastic Neighbor Embedding (t-SNE)

In [None]:
# Embed the scaled ratings with t-SNE (default settings), store the two
# output columns as x/y, and attach each row's brand for colouring.
brand_tsne = manifold.TSNE().fit_transform(brand_ratings_sc_vals)
brand_tsne_df = pd.DataFrame(brand_tsne, columns=['x', 'y'])
brand_tsne_df['brand'] = brand_ratings_sc.brand

In [None]:
# Scatter of the t-SNE embedding, coloured by brand.
# `height` is the current name of pairplot's deprecated `size` argument.
sns.pairplot(brand_tsne_df, x_vars=['x'], y_vars=['y'],
             hue='brand', height=10,
             palette=sns.color_palette('Paired', n_colors=10))
plt.title('t-SNE')

In [None]:
import umap

# Embed the scaled ratings with UMAP (default settings), store the two
# output columns as x/y, and attach each row's brand for colouring.
brand_embedding = umap.UMAP().fit_transform(brand_ratings_sc_vals)
brand_umap_df = pd.DataFrame(brand_embedding, columns=['x', 'y'])
brand_umap_df['brand'] = brand_ratings_sc.brand

In [None]:
# Scatter of the UMAP embedding, coloured by brand.
# `height` is the current name of pairplot's deprecated `size` argument.
sns.pairplot(brand_umap_df, x_vars=['x'], y_vars=['y'],
             hue='brand', height=10,
             palette=sns.color_palette('Paired', n_colors=10))
plt.title('UMAP')

In [None]:
# First two principal-component scores per row, coloured by brand.
df = pd.DataFrame(brand_ratings_sc_trans[:,:2], columns=['x', 'y'])
df['brand'] = brand_ratings.brand
# `height` is the current name of pairplot's deprecated `size` argument.
sns.pairplot(df, x_vars=['x'], y_vars=['y'],
             hue='brand', height=10,
             palette=sns.color_palette('Paired', n_colors=10))
plt.title('PCA')

In [None]:
# First two factor-score columns (integer column labels 0 and 1) per row,
# coloured by brand.
df = pd.DataFrame(brand_rating_fa_scores.iloc[:,:2])
df['brand'] = brand_ratings.brand
# `height` is the current name of pairplot's deprecated `size` argument.
sns.pairplot(df, x_vars=[0], y_vars=[1],
             hue='brand', height=10,
             palette=sns.color_palette('Paired', n_colors=10))
# Replace the numeric axis labels with the factor names used earlier.
plt.xlabel('Value')
plt.ylabel('Leader')
plt.title('EFA')

In [None]:
# MDS (default settings) on the individual scaled ratings rather than the
# brand means, coloured by brand.
mds = manifold.MDS().fit_transform(brand_ratings_sc_vals)
df = pd.DataFrame(mds, columns=['x', 'y'])
df['brand'] = brand_ratings.brand
# `height` is the current name of pairplot's deprecated `size` argument.
sns.pairplot(df, x_vars=['x'], y_vars=['y'],
             hue='brand', height=10,
             palette=sns.color_palette('Paired', n_colors=10))
plt.title('MDS')