# PCA for the 10-Factor Solution (PopCensus)

In [1]:
import pandas as pd
import numpy as np
from support.pca_support import perform_pca

n_factors = 10

# Each path is defined exactly once so the frame loaded here and the file
# handed to perform_pca cannot drift apart.
input_file = 'data/popc_ipsatised_results.csv'
loadings_file = 'intermediate/loadings_popc_10.csv'

# Ipsatized data, kept for later cells.
ipsatized_data = pd.read_csv(input_file, index_col=0)

# perform_pca's return value is used as the path the loadings are read back from.
results_file = perform_pca(input_file, n_factors, loadings_file)
loadings = pd.read_csv(results_file, index_col=0)

# Rename the loading columns to Factor1..Factor<n_factors>; pandas raises a
# ValueError here if the file does not contain exactly n_factors columns.
loadings.columns = [f'Factor{x}' for x in range(1, n_factors+1)]
print(f"loadings shape = {loadings.shape}")

Args: input_file= data/popc_ipsatised_results.csv , n_factors= 10 , rotation_method promax 


The determinant of the smoothed correlation was zero.
This means the objective function is not defined.
Chi square is based upon observed residuals.
The determinant of the smoothed correlation was zero.
This means the objective function is not defined for the null model either.
The Chi square is thus based upon observed correlations.


loadings shape = (1702, 10)


In [2]:
# Column-wise sum of squared loadings (one value per factor).
explained_variance = (loadings ** 2).sum(axis=0)

# Express each sum as a percentage of the number of rows in `loadings`.
n_rows = len(loadings)
variances = [ssl / n_rows * 100 for ssl in explained_variance]

per_factor_text = ", ".join(f"{pct:.2f}%" for pct in variances)
cumulative_text = f", cumulative: {sum(variances):.2f}%"
print("variances: " + per_factor_text + cumulative_text)

variances: 4.00%, 3.79%, 3.18%, 3.09%, 2.80%, 1.62%, 1.10%, 1.01%, 1.00%, 0.92%, cumulative: 22.51%


In [3]:
from support.pca_support import get_highest_loadings
# get_highest_loadings returns a per-factor collection plus a display table (`pp`).
# The first argument (30) is presumably the number of terms kept per factor - TODO confirm.
highest_loadings, pp = get_highest_loadings(30, loadings)
agent_loadings_df = pd.DataFrame(highest_loadings)

# Hand-assigned interpretive labels, positionally aligned with the factor columns.
labels_factors = [
    'Dishonesty',
    'Disagreeableness',
    'Introversion',
    'Unconscientiousness',
    'Unheroic',
    'Unscholarly',
    'Gendered-emotionality',
    'Unsentimentality',
    'Insensitivity',
    'Unartistic',
]

# Append each label to its column header, e.g. "Factor1(Dishonesty)".
pp.columns = [
    f'{column}({labels_factors[position]})'
    for position, column in enumerate(pp.columns)
]
pp

Unnamed: 0,Factor1(Dishonesty),Factor2(Disagreeableness),Factor3(Introversion),Factor4(Unconscientiousness),Factor5(Unheroic),Factor6(Unscholarly),Factor7(Gendered-emotionality),Factor8(Unsentimentality),Factor9(Insensitivity),Factor10(Unartistic)
0,sly(0.94),sharp-tongued(0.74),uncommunicative(0.72),overneat(-0.67),unbold(0.69),unbookish(0.71),womanly(1.07),unsentimental(0.53),finicky(-0.51),ultrarefined(-0.50)
1,sneaky(0.93),abrasive(0.69),aloof(0.71),overconscientious(-0.60),unadventurous(0.64),unscholarly(0.70),gentlemanlike(-0.96),earthy(-0.49),fretful(-0.48),chic(-0.50)
2,deceptive(0.93),unforbearing(0.67),untalkative(0.71),messy(0.60),venturesome(-0.63),unliterary(0.67),feminine(0.90),homespun(-0.49),fussy(-0.45),lavish(-0.49)
3,devious(0.91),harsh(0.66),seclusive(0.69),hit-or-miss(0.59),mild(0.62),bookish(-0.62),masculine(-0.89),long-suffering(-0.39),overnervous(-0.42),refined(-0.45)
4,undevious(-0.90),overharsh(0.66),ungregarious(0.68),unbusinesslike(0.58),venturous(-0.61),ultraintellectual(-0.61),manly(-0.89),competitory(0.38),worrying(-0.41),dapper(-0.45)
5,undeceptive(-0.89),argumentative(0.66),uncompanionable(0.68),planless(0.57),sedate(0.61),overbookish(-0.60),ladylike(0.80),mechanistic(0.38),anxious(-0.37),elegant(-0.45)
6,manipulative(0.89),unaccommodating(0.65),distant(0.67),unmethodical(0.57),fierce(-0.60),scholarly(-0.58),virile(-0.77),sentimental(-0.37),tense(-0.36),overrefined(-0.43)
7,deceitful(0.87),caustic(0.65),detached(0.67),unsystematic(0.56),daring(-0.60),unphilosophical(0.56),unmasculine(0.58),tenderminded(-0.37),overfastidious(-0.36),unextravagant(0.43)
8,underhanded(0.86),overbearing(0.65),withdrawn(0.67),ultrafastidious(-0.55),hectic(-0.57),unskeptical(0.55),rugged(-0.43),careworn(-0.37),perturbable(-0.35),extravagant(-0.41)
9,uncandid(0.85),unforgiving(0.64),taciturn(0.67),overscrupulous(-0.54),placid(0.56),overstudious(-0.55),maternal(0.43),tender-hearted(-0.36),compulsive(-0.35),cosmopolitan(-0.41)


In [4]:
from support.cronbach_alpha import calc_cronbachs_alpha as cronbach_alpha
# One Cronbach's alpha per factor, computed on the ipsatized-data columns named
# by the index of that factor's entry in `highest_loadings`.
# NOTE(review): the factors' top terms carry both positive and negative
# loadings; confirm whether calc_cronbachs_alpha reverse-keys items, since
# un-reversed mixed-sign items would depress alpha.
alphas = [
    cronbach_alpha(ipsatized_data[highest_loadings[factor].index])
    for factor in highest_loadings
]
alpha_text = ", ".join(f"{value:.2f}" for value in alphas)
print("Cronbach's Alpha: " + alpha_text)

Cronbach's Alpha: 0.94, 0.93, 0.96, 0.45, 0.68, 0.33, 0.55, 0.79, 0.86, 0.79


In [5]:
from support.data_support import load_hexaco_data
from support.jaccard import calc_jaccard_with_hexaco
jaccards = calc_jaccard_with_hexaco(agent_loadings_df, load_hexaco_data())

# For every row of `jaccards`: (row label, column holding the row maximum, that maximum).
highest_jaccards = list(zip(jaccards.index, jaccards.idxmax(axis=1), jaccards.max(axis=1)))

print("Highest Jaccard Similarities:")
# A plain loop: printing is a side effect, so no throwaway list of None is
# built and no trailing ';' is needed to hide it.
for factor, best_match, score in highest_jaccards:
    print(f" * {factor} -> {best_match}, ({score:.3f})")

Highest Jaccard Similarities:
 * Factor1 -> Honesty-Humility, (0.085)
 * Factor2 -> Agreeableness, (0.106)
 * Factor3 -> Extraversion, (0.083)
 * Factor4 -> Conscientiousness, (0.065)
 * Factor5 -> Emotionality, (0.024)
 * Factor6 -> Openness, (0.047)
 * Factor7 -> Emotionality, (0.166)
 * Factor8 -> Emotionality, (0.048)
 * Factor9 -> Emotionality, (0.089)
 * Factor10 -> Honesty-Humility, (0.021)


In [6]:
from support.semantic_similarity import load_embedding_model, compute_similarity
model = load_embedding_model()
for position, (factor, terms) in enumerate(highest_loadings.items()):
    # Keep only the terms the embedding model contains.
    known_terms = [term for term in terms.index if term in model]
    # Only the first returned value (printed as the average similarity) is used here.
    avg_similarity, _similarity_matrix = compute_similarity(known_terms, model)
    print(f"Average similarity for {factor}({labels_factors[position]}): {avg_similarity:.3f}")

Average similarity for Factor1(Dishonesty): 0.549
Average similarity for Factor2(Disagreeableness): 0.529
Average similarity for Factor3(Introversion): 0.498
Average similarity for Factor4(Unconscientiousness): 0.459
Average similarity for Factor5(Unheroic): 0.472
Average similarity for Factor6(Unscholarly): 0.466
Average similarity for Factor7(Gendered-emotionality): 0.549
Average similarity for Factor8(Unsentimentality): 0.437
Average similarity for Factor9(Insensitivity): 0.501
Average similarity for Factor10(Unartistic): 0.523
