In [3]:
import numpy as np

In [4]:
import numpy as np

class PCA():
	"""
	Runs PCA & prints variables & importances for each component you ask for.
	"""
	def __init__(self):
		self.input = self.X = None
		self.y = None
		self.dim = None
		self.K = None
		self.eigenmatrix = None
		self.names = None

	def _subtract_means(self):
		"""
		Normalize input matrix by subtracting column wise means
		"""
		self.X = self.X - self.X.mean(axis = 0)

	def _create_cov_matrix(self):
		self.K = np.cov(self.X.T)
		# # self written code:
		# X = self.X
		# K = np.zeros((self.dim, self.dim))
		# for i in xrange(self.dim):
		# 	for j in xrange(i, self.dim):
		# 		if i != j:
		# 			K[i, j] = K[j, i] = np.cov(X[:,i], X[:,j])
		# 		else:
		# 			K[i, j] = np.var(X[:,i])
		# self.K = K

	def _get_eigenvec(self):
		self.eig_vals, self.eig_vecs = np.linalg.eig(self.K)
		order = self.eig_vals.argsort()[::-1] # order eigenvalues largest to smallest
		self.eigenmatrix = self.eig_vecs[:,order]
		self.order = order

	def _print_importances(self, n_components, n_variables):
		feature_M = self.eigenmatrix[:,:n_components]
		for component_num in xrange(n_components):
			column = feature_M[:,component_num]
			most_important = abs(column).argsort()[::-1][:n_variables] # get n_variables most important Eig Vec components
			top_variables = column[most_important]
			print "COMPONENT {}:".format(component_num + 1)
			for index, value in zip(most_important, top_variables):
				variable_name = self.names[index]
				print "{}: {}".format(variable_name, round(value, 4))
			print "--------"

	def add_names(self, original_order_names):
		self.names = original_order_names

	def fit(self, X, y):
		self.input = self.X = X
		self.y = y
		self.dim = X.shape[1]

	def predict(self, n_components, n_variables):
		if not self.names:
			self._subtract_means()
			self._create_cov_matrix()
			self._get_eigenvec()
			self._print_importances(n_components, n_variables)
		else:
			return "Please input a list of [ {} ] variable names".format(self.dim)




In [5]:
pca = PCA()

In [6]:
X = np.random.rand(100, 10)
X[:,3] = X[:,3] + 80*np.random.uniform()
X[:,7] = X[:,7] + 80*np.random.uniform()

In [153]:
y = np.array(list('0'*70 + '1'*30))
np.random.shuffle(y)

In [154]:
pca.add_names(['1','2','3','4','5','6','7','8','9','10'])

In [155]:
pca.fit(X, y)

In [156]:
pca._subtract_means()

In [157]:
pca._create_cov_matrix()

In [158]:
pca._get_eigenvec()

In [159]:
pca.eig_vals.argsort()[::-1]

array([0, 3, 5, 4, 8, 9, 7, 6, 2, 1])

In [162]:
pca._print_importances(4,10)

COMPONENT 1:
7: -0.6229
6: 0.5108
10: -0.4232
5: 0.2322
9: 0.2209
4: -0.1887
1: 0.158
3: 0.0916
8: 0.0168
2: 0.0081
--------
COMPONENT 2:
2: -0.5519
9: -0.4048
1: 0.3934
5: 0.3714
8: 0.334
10: 0.2662
6: 0.2074
7: 0.1032
4: -0.0455
3: 0.0249
--------
COMPONENT 3:
4: -0.7126
7: 0.3775
10: -0.3409
5: -0.3058
3: 0.2385
9: -0.2089
8: 0.1474
2: 0.088
6: 0.0741
1: 0.067
--------
COMPONENT 4:
6: -0.5088
1: 0.4842
2: -0.4196
8: -0.3336
10: -0.2849
5: -0.2836
7: -0.2193
3: 0.0888
4: 0.0422
9: 0.0039
--------


In [1]:
# Testing CDF & Percentiles

In [8]:
import pandas as pd

In [10]:
X = pd.DataFrame(X)

In [111]:
def series_to_percentiles(series):
    """
    Still requires:
        1. CDF function
        2. floor function
    """
    sorted_array = np.sort(series)
    def get_percentile(array, element):
        cdf = round(100 * np.mean(array < element), 2)
        return my_floor(cdf, 10)
    return map(lambda x: get_percentile(sorted_array, x), np.array(series))

def df_to_percentiles(df):
    df_new = df.copy()
    for i in xrange(df.shape[1]):
        df_new.iloc[:,i] = series_to_percentiles(df.iloc[:,i])
    return df_new

def floor(num, nearest):
    if nearest == 0:
        return num
    elif nearest > num:
        if num > nearest / 2.0:
            return nearest
        else:
            return 0
    else:
        divider = int(num) / nearest
        remainder = num % nearest
        if remainder >= nearest/2.0:
            return (divider + 1) * nearest
        else:
            return divider * nearest

def my_cdf(array, element):
    return np.mean(np.sort(array) < element)

In [114]:
s = np.random.randn(30)

In [119]:
s

array([ 0.43984266,  0.44143895, -1.07709079, -0.99697379,  1.27749563,
       -0.78713738, -0.36990174, -0.5385754 , -0.75535568,  0.91242569,
       -0.01504391,  1.0184284 ,  1.27728864, -0.65282002, -1.35449506,
       -0.02654897, -2.79780459, -0.01186213,  0.99568042,  0.90381793,
       -0.60438087,  3.19195076,  2.20095288,  0.58441777, -1.31838029,
       -1.35702871, -1.59528166, -0.56415687, -1.22822781, -0.21242765])

In [115]:
series_to_percentiles(s)

[60,
 60,
 20,
 20,
 90,
 20,
 40,
 40,
 30,
 70,
 50,
 80,
 80,
 30,
 10,
 50,
 0.0,
 60,
 80,
 70,
 30,
 90,
 90,
 70,
 10,
 6.67,
 3.33,
 40,
 10,
 50]