# PCA for 5 Factor Solution (PopCensus)

In [1]:
import pandas as pd
import numpy as np
from pca_support import perform_pca

n_factors = 5
ipsatized_data = pd.read_csv('data/popc_ipsatised_results.csv', index_col=0)
results_file = perform_pca('data/popc_ipsatised_results.csv', n_factors)
# results_file = 'r_loadings.csv'
loadings = pd.read_csv(results_file, index_col=0)
loadings.columns = [f'Factor{x}' for x in range(1, n_factors+1)]
print(f"loadings shape = {loadings.shape}")

[1] "Args: data/popc_ipsatised_results.csv"
[2] "Args: 5"                              
[3] "Args: promax"                         


New names:
• `` -> `...1`
Rows: 310 Columns: 1703
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr    (1): ...1
dbl (1702): active, agreeable, anxious, artistic, assertive, bashful, bold, ...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.


# A tibble: 310 × 1,703
   ...1         active agreeable anxious artistic assertive bashful  bold bright
   <chr>         <dbl>     <dbl>   <dbl>    <dbl>     <dbl>   <dbl> <dbl>  <dbl>
 1 DaveThomas    1.08     -0.619  -0.619   -0.619     1.08   -1.30  1.08   0.743
 2 OliverHarris  1.26      0.244   0.244    0.584     0.924  -0.775 0.584  0.924
 3 DavidHarris…  0.998    -0.609  -1.25    -0.609     0.998  -1.25  0.676  0.998
 4 JerryMcAlis…  1.38      0.679  -0.369   -0.719     1.38   -1.42  1.38   1.03 
 5 GregoryHarr…  1.42     -0.572  -0.572   -0.572     1.09   -1.24  1.09   1.09 
 6 MargaretCon…  0.714     0.714  -0.607    0.384     1.04   -1.27  0.714  0.714
 7 CarolynPatt…  0.936     0.936   0.281    0.281     0.936  -1.36  0.936  0.936
 8 LorraineBet…  0.711     1.06    0.711    1.06     -0.703   0.711 0.358  0.711
 9 ClarenceCut…  1.38      1.03    0.671   -0.754     1.03   -1.11  1.03   0.671
10 JohnathonK    1.01      1.01   -0.608   -0.608     0.688  -1.26  0.364  0.364
# ℹ 

The determinant of the smoothed correlation was zero.
This means the objective function is not defined.
Chi square is based upon observed residuals.
The determinant of the smoothed correlation was zero.
This means the objective function is not defined for the null model either.
The Chi square is thus based upon observed correlations.


loadings shape = (1702, 5)


1: In cor.smooth(r) : Matrix was not positive definite, smoothing was done
2: In principal(r = r, nfactors = nfactors, residuals = residuals,  :
  The matrix is not positive semi-definite, scores found from Structure loadings


In [2]:
explained_variance = np.sum(loadings**2, axis=0)
variances = [x/len(loadings)*100 for x in explained_variance]
print("variances: "+", ".join([f"{x:.2f}%" for x in variances])+ f", cumulative: {sum(variances):.2f}%")

variances: 4.84%, 4.43%, 4.16%, 4.11%, 2.00%, cumulative: 19.54%


In [3]:
from pca_support import get_highest_loadings
highest_loadings, pp = get_highest_loadings(30, loadings)
agent_loadings_df = pd.DataFrame(highest_loadings)
pp

Unnamed: 0,Factor1,Factor2,Factor3,Factor4,Factor5
0,ungregarious(0.79),unbold(-0.77),deceptive(0.88),strait-laced(-0.68),homespun(0.59)
1,distant(0.78),placid(-0.75),manipulative(0.88),overrigid(-0.62),folksy(0.57)
2,uncompanionable(0.77),mild(-0.75),sly(0.87),overscrupulous(-0.61),unliterary(0.57)
3,aloof(0.76),soft-spoken(-0.74),sneaky(0.86),conventional(-0.60),unscholarly(0.57)
4,standoffish(0.76),unobtrusive(-0.70),devious(0.86),free-living(0.59),unstudious(0.56)
5,detached(0.74),forceful(0.69),deceitful(0.84),unconventional(0.59),sophisticated(-0.54)
6,unpersonable(0.73),outspoken(0.66),double-faced(0.84),unpredictable(0.58),unbookish(0.52)
7,untalkative(0.72),overpatient(-0.65),underhanded(0.84),spontaneous(0.58),ultraintellectual(-0.50)
8,unaccessible(0.71),fierce(0.65),double-tongued(0.83),unconstrained(0.58),old-fashioned(0.50)
9,unapproachable(0.71),bullish(0.63),unscrupulous(0.82),hit-or-miss(0.57),visionary(-0.50)


In [4]:
from cronbach_alpha import calc_cronbachs_alpha as cronbach_alpha
alphas = []
for dim in highest_loadings:
    cols = highest_loadings[dim]
    df_ = ipsatized_data[cols.index]
    alpha = cronbach_alpha(df_)
    alphas.append(alpha)
print("Cronbach's Alpha: "+", ".join([f"{x:.2f}" for x in alphas]))

Cronbach's Alpha: 0.95, -0.56, 0.94, 0.15, -0.39


In [5]:
from data_support import load_hexaco_data
from jaccard import calc_jaccard_with_hexaco
jaccards = calc_jaccard_with_hexaco(agent_loadings_df, load_hexaco_data())
highest_jaccards = list(zip(jaccards.index, jaccards.idxmax(axis=1), jaccards.max(axis=1)))
print("Highest Jaccard Similarities:")
[print(f" * {a} -> {b}, ({c:.3f})") for a, b, c in list(highest_jaccards)];

Highest Jaccard Similarities:
 * Factor1 -> Extraversion, (0.045)
 * Factor2 -> Agreeableness, (0.066)
 * Factor3 -> Honesty-Humility, (0.085)
 * Factor4 -> Openness, (0.032)
 * Factor5 -> Openness, (0.041)
