Skip to content

Commit

Permalink
BUG: Correct handling of missing values in PCA
Browse files Browse the repository at this point in the history
Fix PCA to correctly handle missing values when the input is a pandas DataFrame
Add tests for alternative drop methods

closes statsmodels#3347
  • Loading branch information
bashtage committed Feb 2, 2017
1 parent 67dcd43 commit c92299b
Show file tree
Hide file tree
Showing 2 changed files with 51 additions and 7 deletions.
16 changes: 11 additions & 5 deletions statsmodels/multivariate/pca.py
Expand Up @@ -335,6 +335,10 @@ def keep_row(x):
else:
raise ValueError('missing method is not known.')

if self._index is not None:
self._columns = self._columns[self.cols]
self._index = self._index[self.rows]

# Check adjusted data size
if self._adjusted_data.size == 0:
raise ValueError('Removal of missing values has eliminated all data.')
Expand Down Expand Up @@ -471,7 +475,7 @@ def _fill_missing_em(self):
return self.data

# 1. Standardized data as needed
data = self.transformed_data = self._prepare_data()
data = self.transformed_data = np.asarray(self._prepare_data())

ncomp = self._ncomp

Expand Down Expand Up @@ -503,15 +507,15 @@ def _fill_missing_em(self):
self._compute_eig()
# Call function to compute factors and projection
self._compute_pca_from_eig()
projection = self.project(transform=False, unweight=False)
projection = np.asarray(self.project(transform=False, unweight=False))
projection_masked = projection[mask]
data[mask] = projection_masked
delta = last_projection_masked - projection_masked
diff = _norm(delta) / _norm(projection_masked)
_iter += 1
# Must copy to avoid overwriting original data since replacing values
data = self._adjusted_data + 0.0
projection = self.project()
projection = np.asarray(self.project())
data[mask] = projection[mask]

return data
Expand Down Expand Up @@ -656,10 +660,12 @@ def _to_pandas(self):
index=index)
self.projection = df
# Weights
df = pd.DataFrame(self.coeff, index=cols, columns=self._columns)
df = pd.DataFrame(self.coeff, index=cols,
columns=self._columns)
self.coeff = df
# Loadings
df = pd.DataFrame(self.loadings, index=self._columns, columns=cols)
df = pd.DataFrame(self.loadings,
index=self._columns, columns=cols)
self.loadings = df
# eigenvals
self.eigenvals = pd.Series(self.eigenvals)
Expand Down
42 changes: 40 additions & 2 deletions statsmodels/multivariate/tests/test_pca.py
Expand Up @@ -2,13 +2,14 @@

import os
import sys
from unittest import TestCase
import warnings
from unittest import TestCase

import numpy as np
import pandas as pd
from nose.tools import assert_true
from numpy.testing import assert_allclose, assert_equal, assert_raises
from numpy.testing.decorators import skipif
import pandas as pd

try:
import matplotlib.pyplot as plt
Expand Down Expand Up @@ -381,3 +382,40 @@ def test_rsquare(self):
errors = x - pc.project(i, transform=False, unweight=False)
rsquare[i] = 1.0 - np.sum(errors ** 2) / tss
assert_allclose(rsquare, pc.rsquare)

def test_missing_dataframe(self):
    """Check that PCA treats NaN-laden ndarray and DataFrame input alike.

    The same data, with NaN written into every 5th row of every 7th
    column, is passed to PCA once as an ndarray and once as a DataFrame.
    For ``missing='fill-em'`` and for each of the drop methods the
    resulting ``coeff`` and ``factors`` must agree numerically.  For
    ``fill-em`` the result attributes must also have the same types as
    those produced from a DataFrame that has no missing values.
    """
    # --- fill-em: ndarray input versus the same values in a DataFrame ---
    arr = self.x.copy()
    arr[::5, ::7] = np.nan
    pc = PCA(arr, ncomp=3, missing='fill-em')

    frame = pd.DataFrame(arr)
    pc_df = PCA(frame, ncomp=3, missing='fill-em')
    assert_allclose(pc.coeff, pc_df.coeff)
    assert_allclose(pc.factors, pc_df.factors)

    # Attribute types must match the no-missing DataFrame case.
    pc_df_nomissing = PCA(pd.DataFrame(self.x.copy()), ncomp=3)
    for attr in ('coeff', 'data', 'eigenvals', 'eigenvecs'):
        filled = getattr(pc_df, attr)
        reference = getattr(pc_df_nomissing, attr)
        assert_true(isinstance(filled, type(reference)))

    # --- drop methods: start again from a fresh copy of the fixture ---
    arr = self.x.copy()
    arr[::5, ::7] = np.nan
    frame = pd.DataFrame(arr)
    for method in ('drop-row', 'drop-col', 'drop-min'):
        pc = PCA(arr, missing=method)
        pc_df = PCA(frame, missing=method)
        assert_allclose(pc.coeff, pc_df.coeff)
        assert_allclose(pc.factors, pc_df.factors)

0 comments on commit c92299b

Please sign in to comment.