# PCA for 6 Factor Solution (PopCensus)

In [1]:
import pandas as pd
import numpy as np
from support.pca_support import perform_pca

n_factors = 6
ipsatized_data = pd.read_csv('data/popc_ipsatised_results.csv', index_col=0)
results_file = 'intermediate/loadings_popc_6.csv'
results_file = perform_pca('data/popc_ipsatised_results.csv', n_factors, 'intermediate/loadings_popc_6.csv')
loadings = pd.read_csv(results_file, index_col=0)
loadings.columns = [f'Factor{x}' for x in range(1, n_factors+1)]
print(f"loadings shape = {loadings.shape}")

Args: input_file= data/popc_ipsatised_results.csv , n_factors= 6 , rotation_method promax 


The determinant of the smoothed correlation was zero.
This means the objective function is not defined.
Chi square is based upon observed residuals.
The determinant of the smoothed correlation was zero.
This means the objective function is not defined for the null model either.
The Chi square is thus based upon observed correlations.


loadings shape = (1702, 6)


In [2]:
explained_variance = np.sum(loadings**2, axis=0)
variances = [x/len(loadings)*100 for x in explained_variance]
print("variances: "+", ".join([f"{x:.2f}%" for x in variances])+ f", cumulative: {sum(variances):.2f}%")

variances: 4.24%, 3.96%, 3.79%, 3.06%, 2.70%, 1.96%, cumulative: 19.71%


In [3]:
from support.pca_support import get_highest_loadings
highest_loadings, pp = get_highest_loadings(30, loadings)
agent_loadings_df = pd.DataFrame(highest_loadings)
labels_factors = ['Dishonesty', 'Introversion', 'Unconventionality', 
                  'Dominance', 'Disagreeableness', 'Provincial']
pp.columns = [pp.columns[x]+'('+labels_factors[x]+')' for x in range(len(pp.columns))]
pp

Unnamed: 0,Factor1(Dishonesty),Factor2(Introversion),Factor3(Unconventionality),Factor4(Dominance),Factor5(Disagreeableness),Factor6(Provincial)
0,sly(0.91),ungregarious(0.76),strait-laced(-0.64),unbold(0.72),sharp-tongued(0.63),unliterary(0.59)
1,deceptive(0.91),distant(0.74),unconventional(0.57),bullish(-0.64),faultfinding(0.60),unstudious(0.57)
2,manipulative(0.90),untalkative(0.73),conventional(-0.57),mild(0.64),overharsh(0.58),unscholarly(0.57)
3,sneaky(0.89),aloof(0.72),hit-or-miss(0.57),fierce(-0.64),abrasive(0.57),folksy(0.56)
4,devious(0.88),detached(0.72),overrigid(-0.57),soft-spoken(0.59),unforbearing(0.56),homespun(0.56)
5,deceitful(0.86),uncommunicative(0.71),distractible(0.56),placid(0.58),peevish(0.56),unbookish(0.54)
6,underhanded(0.85),uncompanionable(0.71),free-living(0.55),forceful(-0.58),argumentative(0.56),sophisticated(-0.54)
7,double-faced(0.85),taciturn(0.70),unpredictable(0.55),uncompetitive(0.57),harsh(0.56),rugged(0.53)
8,unscrupulous(0.84),seclusive(0.70),unreined(0.54),sedate(0.56),unforgiving(0.55),overstudious(-0.50)
9,scheming(0.84),undemonstrative(0.70),impulsive(0.54),forcible(-0.55),caustic(0.55),ultraintellectual(-0.49)


In [4]:
from support.cronbach_alpha import calc_cronbachs_alpha as cronbach_alpha
alphas = []
for dim in highest_loadings:
    cols = highest_loadings[dim]
    df_ = ipsatized_data[cols.index]
    alpha = cronbach_alpha(df_)
    alphas.append(alpha)
print("Cronbach's Alpha: "+", ".join([f"{x:.2f}" for x in alphas]))

Cronbach's Alpha: 0.94, 0.97, -0.20, -0.47, 0.91, -0.42


In [5]:
from support.data_support import load_hexaco_data
from support.jaccard import calc_jaccard_with_hexaco
jaccards = calc_jaccard_with_hexaco(agent_loadings_df, load_hexaco_data())
highest_jaccards = list(zip(jaccards.index, jaccards.idxmax(axis=1), jaccards.max(axis=1)))
print("Highest Jaccard Similarities:")
[print(f" * {a} -> {b}, ({c:.3f})") for a, b, c in list(highest_jaccards)];

Highest Jaccard Similarities:
 * Factor1 -> Honesty-Humility, (0.085)
 * Factor2 -> Extraversion, (0.081)
 * Factor3 -> Openness, (0.032)
 * Factor4 -> Extraversion, (0.015)
 * Factor5 -> Agreeableness, (0.121)
 * Factor6 -> Openness, (0.042)


In [6]:
from support.semantic_similarity import load_embedding_model, compute_similarity
model = load_embedding_model()
for i, m in enumerate(highest_loadings.keys()):
    # reduce terms to only those we know are in the model...
    ats = [x for x in highest_loadings[m].index if x in model]
    sim, matrix = compute_similarity(ats, model)
    print(f"Average similarity for {m}({labels_factors[i]}): {sim:.3f}")

Average similarity for Factor1(Dishonesty): 0.544
Average similarity for Factor2(Introversion): 0.504
Average similarity for Factor3(Unconventionality): 0.420
Average similarity for Factor4(Dominance): 0.469
Average similarity for Factor5(Disagreeableness): 0.486
Average similarity for Factor6(Provincial): 0.471
