In [1]:
# show generalized eigs and noise

In [2]:
import sys
sys.path.append('..')

%matplotlib inline
import numpy as np

from fa_kit import FactorAnalysis
from fa_kit import plotting as fa_plotting

In [3]:
import pandas as pd
import string

def make_random_data(n_samp=10000, n_feat=100, signal_width=10,
                     signal_overlap=2, signal_scale=0.3, seed=None):
    """
    Make some random data with correlated features.

    Starts from standard-normal noise of shape (n_samp, n_feat). Then,
    walking across the columns in steps of
    ``signal_width - signal_overlap``, one shared random column (standard
    normal scaled by ``signal_scale``) is added to every column in the
    window of ``signal_width`` columns that begins at that position.

    Parameters
    ----------
    n_samp : int
        Number of rows (samples) to generate.
    n_feat : int
        Number of columns (features) to generate.
    signal_width : int
        Number of adjacent columns that receive the same shared signal.
    signal_overlap : int
        Number of columns by which consecutive windows overlap. Must be
        smaller than ``signal_width``.
    signal_scale : float
        Multiplier applied to each shared signal before it is added.
    seed : int or None
        If given, all draws come from a private
        ``np.random.RandomState(seed)`` so the result is reproducible.
        If None (the default), the global ``np.random`` state is used,
        which is the behaviour this function had before the parameter
        existed.

    Returns
    -------
    numpy.ndarray
        Array of shape (n_samp, n_feat).

    Raises
    ------
    ValueError
        If ``signal_overlap >= signal_width``, because the window step
        would then be zero or negative.
    """
    step_size = signal_width - signal_overlap
    if step_size <= 0:
        raise ValueError(
            "signal_overlap ({}) must be smaller than signal_width ({})".format(
                signal_overlap, signal_width))

    # The np.random module and a RandomState instance both expose randn(),
    # so either can serve as the source of draws.
    rng = np.random if seed is None else np.random.RandomState(seed)

    data = rng.randn(n_samp, n_feat)

    for i in range(0, data.shape[1], step_size):
        shared_signal = signal_scale * rng.randn(n_samp, 1)
        # The (n_samp, 1) column broadcasts across the window; a window that
        # runs past the last column is simply truncated by the slice.
        data[:, i:(i + signal_width)] += shared_signal
    return data

# Generate the synthetic data set used by the rest of the notebook.
data = make_random_data()

# Name each column "<index><letter>", cycling through a-z.
# string.ascii_lowercase is used instead of string.lowercase: the latter is
# locale-dependent on Python 2 and does not exist on Python 3.
column_names = [
    "{}{}".format(i, string.ascii_lowercase[i % len(string.ascii_lowercase)])
    for i in range(data.shape[1])]

df_data = pd.DataFrame(
    data,
    columns=column_names
    )

# Show the first few rows as the cell output.
df_data.head()

Unnamed: 0,0a,1b,2c,3d,4e,5f,6g,7h,8i,9j,...,90m,91n,92o,93p,94q,95r,96s,97t,98u,99v
0,1.055599,1.476581,2.000307,1.44066,0.992155,-0.943282,-0.210476,0.817368,0.092183,1.351513,...,-0.023534,0.618383,0.828507,-0.954152,-0.444514,-1.452409,-2.038335,0.799201,0.179439,-0.554806
1,-0.355314,1.842766,-0.647995,-0.864052,-0.377402,-0.017221,0.12827,1.599627,0.596051,0.106951,...,-0.689752,-1.015005,0.256802,0.438005,1.898118,1.418766,0.624838,-0.485969,1.246627,1.321106
2,0.326138,-2.264428,1.429309,2.61003,-1.537242,-0.488147,-1.894466,-1.198952,0.055827,1.540722,...,1.053812,-1.272244,-0.381188,0.36095,0.00722,-1.794968,1.022982,-0.052683,0.867375,0.067846
3,0.785828,1.954073,1.502395,1.253371,0.375129,0.75509,-0.302583,-0.375817,1.412537,0.819658,...,-2.198043,-1.475108,-0.832671,-0.702026,0.105928,0.712295,-1.175212,-2.084471,-0.696595,0.758741
4,-0.823434,0.067682,-1.086434,1.029577,-1.974218,0.110234,0.148997,-0.063679,-0.028054,-0.052532,...,-1.25513,-0.158838,0.856558,-0.992505,0.770106,0.418213,-0.591785,-0.866542,-0.067485,0.258969


In [4]:
def run_pipeline(data, retain_method='broken_stick',
    rotation_method='varimax', **kwargs):
    """
    Run the factor-analysis steps on `data` and return the resulting
    FactorAnalysis object.

    Steps, in order: load the data with a 'corr' association matrix,
    extract components, decide how many components to retain, re-extract
    using PAF, and rotate the components.

    data: input handed to FactorAnalysis.load_data
    retain_method: passed as `method` to find_comps_to_retain; one of
        'top_n', 'top_pct', 'kaiser', 'broken_stick'
    rotation_method: passed as `method` to rotate_components; currently
        'varimax' or 'quartimax'
    **kwargs: extra keyword arguments forwarded to find_comps_to_retain
    """
    # Build the analysis object; the association matrix is computed from
    # the input data using the 'corr' method.
    analysis = FactorAnalysis.load_data(data, assoc_method='corr')

    # Initial component extraction.
    analysis.extract_components()

    # Choose the number of components to keep.
    analysis.find_comps_to_retain(method=retain_method, **kwargs)

    # With the retained count fixed, extract again with PAF.
    analysis.reextract_using_paf()

    # Finally, rotate the retained components.
    analysis.rotate_components(method=rotation_method)

    return analysis

In [5]:
# Run the pipeline on the synthetic data frame, using broken-stick
# retention and varimax rotation.
fa = run_pipeline(
    df_data,
    retain_method='broken_stick',
    rotation_method='varimax',
)

In [8]:
# Print a text summary of the fitted analysis, limited to the top 4 items
# per component.
fa_plotting.text_summary(
    fa,
    top_n_items=4,
)

COMPONENT 0 (index 0)
	34.8: 70s
	34.7: 66o
	32.6: 65n
	31.9: 69r
COMPONENT 1 (index 1)
	34.0: 83f
	33.7: 84g
	32.4: 87j
	31.9: 86i
COMPONENT 2 (index 2)
	36.4: 28c
	32.2: 31f
	31.4: 26a
	31.3: 29d
COMPONENT 3 (index 3)
	34.0: 10k
	33.3: 8i
	32.7: 15p
	32.0: 11l
COMPONENT 4 (index 4)
	33.5: 2c
	33.1: 5f
	32.8: 4e
	32.4: 1b
COMPONENT 5 (index 5)
	34.3: 61j
	34.2: 57f
	33.7: 60i
	32.0: 59h
COMPONENT 6 (index 6)
	38.8: 97t
	37.9: 96s
	32.3: 95r
	31.6: 92o
COMPONENT 7 (index 7)
	35.1: 42q
	34.1: 43r
	31.8: 44s
	31.8: 47v
COMPONENT 8 (index 8)
	34.6: 22w
	33.3: 23x
	33.3: 24y
	32.5: 20u
COMPONENT 9 (index 9)
	35.3: 50y
	33.5: 49x
	33.5: 52a
	32.2: 51z
COMPONENT 10 (index 10)
	35.6: 76y
	34.5: 73v
	33.9: 79b
	32.7: 77z
COMPONENT 11 (index 11)
	36.0: 39n
	33.6: 34i
	32.0: 37l
	31.7: 41p
COMPONENT 12 (index 12)
	81.2: 98u
