In [None]:
# Demonstrates principal component analysis on a simple data set.
# See
# https://www.statsmodels.org/devel/datasets/generated/statecrime.html
# for more information on the data set.

import numpy as np
from matplotlib import pyplot as plt

import statsmodels.api as sm

# Load the state crime data set through the statsmodels datasets API;
# load_pandas() is asked for the pandas flavour and `.data` is the table
# used for the rest of the notebook.
data = (
    sm.datasets.statecrime
    .load_pandas()
    .data
)

In [None]:
## Description of data

# After the transpose below, each column of X corresponds to one state and
# each row to one variable. Per the statsmodels statecrime documentation the
# variables appear in this order (NOTE: `single` comes before `white`):
# Row 0: violent - violent crime rate per 100k
# Row 1: murder  - murder rate per 100k
# Row 2: hs_grad - percentage with high school education
# Row 3: poverty - percentage below poverty line
# Row 4: single  - single households ratio:
#        (# Lone male head of households + # Lone female head of
#        households) / (# family households)
# Row 5: white   - percentage that are single-race caucasian
# Row 6: urban   - percentage of population in urban areas
X = np.array(data).T

# Read the row labels from the data itself instead of hard-coding them, so
# the names used for plotting always match the actual row order of X.
rows = list(data.columns)
# State names, in the same order as the columns of X.
states = list(data.index)

In [None]:
## Preprocessing of data

# Center the data: subtract from every row (variable) its mean over all
# columns (states). keepdims=True keeps the means as a column vector so they
# broadcast across the columns of X without an explicit tile/reshape.
Xc = X - np.mean(X, axis=1, keepdims=True)

# Per-variable scale factors, one entry per row of X: row 0 is multiplied by
# 1/100 and row 6 by 1/10, all other rows are left unscaled.
weights = np.array([1/100., 1., 1., 1., 1., 1., 1/10.])

# Row indices of the variables that enter the analysis (row 5 is left out)
variables = [0, 1, 2, 3, 4, 6]

# Scale each selected, centered row by its weight. Broadcasting the weights
# as a column vector gives the same values as left-multiplying by
# np.diag(weights[variables]), without building the dense diagonal matrix.
# (The SVD of this scaled matrix is computed in the next cell.)
Xs = weights[variables][:, np.newaxis] * Xc[variables, :]


In [None]:
## PCA

# Full singular value decomposition of the scaled data, Xs = U S V, where S
# carries the singular values s on its diagonal. NumPy returns s sorted in
# descending order and the right singular vectors as the rows of V.
U, s, V = np.linalg.svd(Xs, full_matrices=True)
# Equivalently, the columns of U are eigenvectors of Xs Xs^T with
# eigenvalues s**2, which is what
#     ss, UU = np.linalg.eigh(np.dot(Xs, Xs.T))
# computes -- except that eigh orders the eigenvalues ascending and the
# eigenvectors are only determined up to sign.

In [None]:
## Visualization

# One figure with two side-by-side panels, addressed through explicit axes
# objects rather than the pyplot "current axes" state.
fig, (ax_scores, ax_loadings) = plt.subplots(1, 2)

# Left panel: coordinates of every state along the first two principal
# directions (the first two columns of U), one labelled dot per state.
Y = U[:, :2].T.dot(Xs)
ax_scores.plot(Y[0, :], Y[1, :], 'b.')
ax_scores.set_xlabel('$u_1$')
ax_scores.set_ylabel('$u_2$')
ax_scores.set_title('Data projected onto first two PC directions')
for state, coord_1, coord_2 in zip(states, Y[0, :], Y[1, :]):
    ax_scores.annotate(state, (coord_1, coord_2))

# Right panel: grouped bars showing how much each selected variable
# contributes to u_1 and u_2; tick labels are the selected variables' names.
bar_width = 0.35
positions = np.arange(Xs.shape[0])
ax_loadings.bar(positions - bar_width / 2, U[:, 0], width=bar_width, label='$u_1$')
ax_loadings.bar(positions + bar_width / 2, U[:, 1], width=bar_width, label='$u_2$')
ax_loadings.set_xticks(positions)
ax_loadings.set_xticklabels([rows[i] for i in variables])
ax_loadings.legend()
ax_loadings.set_title('Components of first two PC directions')

# Resize last: set_size_inches returns None, so the cell shows no stray repr.
fig.set_size_inches(40, 12)