In [None]:
# import packages
from __future__ import print_function
import gspread
from oauth2client.service_account import ServiceAccountCredentials

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import json
import os
from os.path import join, dirname
from datetime import datetime
import csv

from statsmodels import *
import statsmodels.api as sm
from scipy.stats import *

from sklearn import linear_model
import matplotlib as mpl



# 1 Import survey responses

In [None]:
# Import survey responses from Google Sheets.
# Connects to the Google API with the service-account JSON key stored in the
# local folder, downloads the latest responses from the first worksheet and
# loads every response into a pandas DataFrame (`survey_data`).

# Specify sheet name
# NOTE(review): this title embeds a form URL and an e-mail address; opening the
# workbook by key would keep personal details out of the notebook - confirm.
spreadsheet = '2020-08-14 Matt Matthew Walford BIG5 Psycometric Survey Free personality report from your online public profiles responses to forms survey https://docs.google.com/forms/d/e/1FAIpQLSd3hm7vkXpIaHg4KTzLriyxk71ec2qbSdgkV7beLOmSQIOszA/viewform shared with Vei Yie and alice alice.d.matthews@gmail.com'

# Specify JSON key location
json_file = 'survey-personality-71154dfbe30a.json'

# Load the JSON key; the context manager guarantees the file handle is closed
with open(json_file) as key_file:
    json_key = json.load(key_file)

# NOTE(review): no OAuth scopes are passed here - confirm the credentials are
# authorised for the Sheets/Drive APIs in the environment this runs in.
creds = ServiceAccountCredentials.from_json_keyfile_name(json_file)

# Authorise the client, find the workbook by name and open its first sheet
client = gspread.authorize(creds)

# Open the survey data worksheet and initialise it as a variable
survey_sheet = client.open(spreadsheet).sheet1

# Convert sheet to a pandas dataframe
survey_data = pd.DataFrame(survey_sheet.get_all_records())

# Set the index to match the Google Sheet row numbers (data starts on row 2).
# Important because the index functions as a unique ID for each respondent.
index = pd.Index(range(2, len(survey_data)+2))
survey_data.set_index(index, inplace=True)



In [None]:
# Display the survey responses DataFrame built from the Google Sheet
survey_data

# 2 Demographic Distributions

In [None]:
# Distribution of ages among survey respondents

age_data = survey_data["What age bracket do you fall in?"]

# Fixed bracket order so the bars read from youngest to oldest
age_brackets = ['18-24', '25-29', '30-34', '35-39', '40-44', '45-49', '50-54', '55-59', '60-64', '65+', 'Prefer not to say']

sns.set(rc = {'figure.figsize':(10, 6)}, font_scale = 1.2)
# Pass the series as `x=`: seaborn only accepts `data` positionally, so a
# positional vector is deprecated (0.11) or misread as `data` (later releases).
ax = sns.countplot(x = age_data, order = age_brackets)
ax.set_title(label = "Age distribution among survey respondents", fontsize = 18, loc = 'left')
   

In [None]:
# Distribution of countries among respondents

country_data = survey_data["What is your country of residence?"]
sns.set(rc = {'figure.figsize':(15,8)}, font_scale = 1.2)
# Bars are ordered from most to least frequent country.
# The series is passed as `x=` because seaborn only accepts `data` positionally;
# a positional vector is deprecated (0.11) or misread as `data` (later releases).
ax = sns.countplot(x = country_data,
                   order = country_data.value_counts().index)
# Tilt the country names so long labels do not overlap
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, horizontalalignment='right')
ax.set_title(label = 'Country distribution among survey respondents', fontsize = 18, loc = 'left')
   

In [None]:
# Gender distribution of survey respondents
gender_data = survey_data['What is your sex?']

sns.set(rc = {'figure.figsize':(10,8)}, font_scale = 1.2)
# The series is passed as `x=` because seaborn only accepts `data` positionally;
# a positional vector is deprecated (0.11) or misread as `data` (later releases).
ax = sns.countplot(x = gender_data, palette = "husl")
ax.set_title(label = 'Gender distribution among survey respondents', fontsize = 18, loc = 'left')



In [None]:
# Pie chart of the gender distribution.
# Count respondents per category in a fixed order so the pie displays nicely.
# The counts get their own name so `gender_data` keeps the raw answers and the
# cell can be re-run; fill_value=0 keeps a category with no respondents from
# turning into NaN, which matplotlib cannot draw.
gender_labels = ['Male', 'Prefer not to say', 'Female', 'Non-binary']
gender_counts = gender_data.value_counts().reindex(gender_labels, fill_value=0)

colours = sns.color_palette("Set2")

# Create the figure at its final size up front instead of resizing afterwards
fig, ax = plt.subplots(figsize=(5, 5))
ax.pie(gender_counts,
       labels=gender_labels,
       startangle=45,
       autopct='%1.1f%%',
       textprops={'fontsize': 15},
       colors=colours)

ax.set_title('Gender distribution among survey respondents', fontsize=18, loc='center');

# 3 Calculate IPIP 50-item Big 5 scores

In [None]:
# Read in the CSV file of the Big 5 survey questions, descriptions and scoring codes
survey_code = pd.read_csv('Big 5 Survey Code.csv')

# Label every question "Q<n> <code>", numbering from 1 in file order
question_codes = [f"Q{number} {code}" for number, code in enumerate(survey_code.Code, start=1)]

# Add useful column names and delete unnecessary columns.
# col_names is a new list, so question_codes keeps only the question labels.
col_names = question_codes + ['LinkedIn', 'Reddit', 'Twitter', 'Stacko', 'Gender', 'Age', 'Country']
columns_to_drop = [0, 58, 59, 60, 61, 62, 63, 64, 65] # drop the timestamp and unnecessary details

# Only reshape the sheet while it still carries its original headers: once the
# columns have been renamed, re-running this cell must not drop further columns.
if list(survey_data.columns) != col_names:
    survey_data = survey_data.drop(columns=survey_data.columns[columns_to_drop])
    survey_data.columns = col_names

In [None]:
# Convert the Likert answers to their numeric scores (1-5)
likert_scores = {
    "Very Accurate": 5,
    "Moderately Accurate": 4,
    "Neither Accurate Nor Inaccurate": 3,
    "Moderately Inaccurate": 2,
    "Very Inaccurate": 1,
}
survey_numerated = survey_data.replace(likert_scores)

# Questions with 'NEG' in the tag are negatively scored: subtract the score
# from 6 so that 5 becomes 1, 4 becomes 2, and so on.
reverse_keyed = [name for name in survey_numerated.columns if 'NEG' in name]
for name in reverse_keyed:
    survey_numerated[name] = 6 - survey_numerated[name]

survey_numerated

In [None]:
# Calculate each trait score as a decimal: the mean of the trait's items
# (columns whose name contains the trait code) divided by the maximum item score, 5.
trait_codes = {
    'openness': 'OPE',
    'conscientiousness': 'CON',
    'extraversion': 'EXT',
    'agreeableness': 'AGR',
    'emotional_stability': 'EMO',
}
for trait, code in trait_codes.items():
    trait_items = survey_numerated.filter(regex=code, axis = 1)
    survey_numerated[f'survey_{trait}_raw'] = trait_items.mean(axis = 1) / 5

# Keep only the five score columns, one row per respondent index
raw_score_columns = [f'survey_{trait}_raw' for trait in trait_codes]
survey_scores = survey_numerated[raw_score_columns].copy()
is_repeated_index = survey_scores.index.duplicated(keep = 'first')
survey_scores = survey_scores[~is_repeated_index] # remove duplicate indices, keeping the first occurrence

# 4 Distributions of IPIP 50-item scores

In [None]:
# Pairwise distributions of the survey-based Big 5 trait scores.
# Select the raw score columns explicitly and give them neater names, so the
# cell still works if further columns have been added to survey_scores.
raw_to_label = {
    'survey_openness_raw': "Openness",
    'survey_conscientiousness_raw': "Conscientiousness",
    'survey_extraversion_raw': "Extraversion",
    'survey_agreeableness_raw': "Agreeableness",
    'survey_emotional_stability_raw': "Emotional Stability",
}
plot_survey_scores = survey_scores[list(raw_to_label)].rename(columns = raw_to_label)

grid = sns.PairGrid(plot_survey_scores, diag_sharey=False)
grid.map_lower(sns.scatterplot)
grid.map_diag(sns.histplot)
grid.set(xlim = (0,1), ylim = (0, 1))

# Report the actual number of respondents plotted rather than a fixed figure
grid.fig.suptitle(f"Distributions of psychometric survey-based Big 5 trait scores, n = {len(plot_survey_scores)}", x = 0, y = 1.01, fontsize = 18, ha = 'left');

In [None]:
# Calculate and add percentile scores: each respondent's rank on the raw trait
# score, expressed as a fraction of the sample.
for trait in ('openness', 'conscientiousness', 'extraversion', 'agreeableness', 'emotional_stability'):
    raw_scores = survey_scores[f'survey_{trait}_raw']
    survey_scores[f'survey_{trait}_percentile'] = raw_scores.rank(pct=True)

survey_scores

In [None]:
# Histogram of the raw survey score for each Big 5 trait
big_5 = ['openness', 'conscientiousness', 'extraversion', 'agreeableness', 'emotional_stability']

# Configure seaborn once, before any figure is created, so that the requested
# size also applies to the first histogram.
# NOTE(review): sns.set() resets the style to seaborn's default, so a preceding
# sns.set_theme(style = 'ticks') call never took effect; pass style = 'ticks'
# here if the ticks style is actually wanted.
sns.set(rc = {'figure.figsize':(10,8)})

for trait in big_5:
    plt.figure()
    ax = sns.histplot(survey_scores[f"survey_{trait}_raw"])
    ax.set_xlabel(xlabel = f"survey {trait} score")
    ax.set_xlim(0, 1) # scores are fractions of the maximum, so show the full 0-1 range

# 5 Get Personality Insights Big 5 scores

In [None]:
# Open the Profile Data sheet (the workbook's second worksheet) and load it
profiles = client.open(spreadsheet).get_worksheet(1)
profile_records = profiles.get_all_records()
profile_data = pd.DataFrame(profile_records)

# Index the rows by the user_id values (which are the index of the survey sheet)
# and sort by that index. The Series itself is passed to set_index, so the
# user_id column stays in the frame and column positions are unchanged.
profile_data = profile_data.set_index(profile_data.user_id, drop = True).sort_index(axis = 0)

In [None]:
# Get Personality Insights NLP scores from the profile_data df.
# The score columns are selected by position; .copy() makes the edits below act
# on an independent frame rather than on a slice of profile_data.
NLP_scores = profile_data.iloc[:, 11:28].copy()
NLP_scores = NLP_scores[~NLP_scores.index.duplicated(keep = 'first')] # drop duplicate indices

# Drop empty rows (profiles whose openness percentile cell is blank)
NLP_scores = NLP_scores[NLP_scores['openness_percentile'] != ''].copy()

percentile_columns = [name for name in NLP_scores.columns if 'percentile' in name]
raw_columns = [name for name in NLP_scores.columns if 'raw' in name]

# Strip '%' signs and commas, then convert the percentages to fractions.
# A raw string keeps the pattern free of invalid escape sequences.
NLP_scores[percentile_columns] = (
    NLP_scores[percentile_columns].replace(r'[%,]', '', regex=True).astype(float) / 100
)

# Convert raw scores to floats
NLP_scores[raw_columns] = NLP_scores[raw_columns].astype(float)

# Prefix the NLP score column headers with 'NLP_' in a single rename
suffixes = ('percentile', 'raw', 'interpretation', 'my_percentile')
NLP_scores = NLP_scores.rename(columns = {
    f"{trait}_{suffix}": f"NLP_{trait}_{suffix}"
    for trait in big_5
    for suffix in suffixes
})

# 6 Distributions of Personality Insights scores

In [None]:
# Histogram of the raw Personality Insights (online content) score for each trait
big_5 = ['openness', 'conscientiousness', 'extraversion', 'agreeableness', 'emotional_stability']

# Configure seaborn once, before any figure or axes exist, so that every
# histogram (including the first) is drawn with the same size and theme.
# NOTE(review): sns.set() resets the style to seaborn's default, so a preceding
# sns.set_theme(style = 'ticks') call never took effect; pass style = 'ticks'
# here if the ticks style is actually wanted.
sns.set(rc = {'figure.figsize':(10,8)})

for trait in big_5:
    fig, ax = plt.subplots()
    sns.histplot(NLP_scores[f"NLP_{trait}_raw"], ax = ax)
    ax.set_xlim(0, 1) # show the full 0-1 score range on every plot
    ax.set(xlabel = f"{trait.capitalize()} Score")
    ax.set_title(label = 'Distribution of online content scores', fontsize = 18, loc = 'left')




# 7 Add all scores to a single df for analysis

In [None]:
# Join survey and NLP scores on the respondent index, keeping only respondents
# present in both frames
all_scores = pd.concat([survey_scores, NLP_scores], axis=1, join="inner")

# Create 3000+word and 1000+word dfs (the word-count column itself is dropped)
word_counts = all_scores.total_word_count_passed
all_scores_3000 = all_scores[word_counts > 2999].drop('total_word_count_passed', axis = 1).copy()
all_scores_1000 = all_scores[word_counts > 999].drop('total_word_count_passed', axis = 1).copy()

# Split on raw/percentile scores
raw_columns = [name for name in all_scores.columns if 'raw' in name]
percentile_columns = [name for name in all_scores.columns if 'percentile' in name]

all_scores_3000_raw = all_scores_3000[raw_columns]
all_scores_3000_percentiles = all_scores_3000[percentile_columns]

all_scores_1000_raw = all_scores_1000[raw_columns]
all_scores_1000_percentiles = all_scores_1000[percentile_columns]


all_scores_3000_raw.head(10)


In [None]:
# set the data for analysis to 1000+/3000+ words, and raw/percentile
# (here: raw scores of respondents with 3000+ words of content)

data = all_scores_3000_raw

# 8 Correlation analysis

In [None]:
# Pairwise relationship plots for every column of the dataset selected as `data`
sns.pairplot(data)

In [None]:
# Correlation matrix of the percentile scores for respondents with 3000+ words
# (DataFrame.corr with its default method, Pearson)

all_scores_3000_percentiles.corr()

In [None]:
# Correlation matrices for the heatmaps:
# Pearson for the raw scores, Spearman for the percentile scores
corr_matrix_3000_raw = all_scores_3000_raw.corr(method = 'pearson')
corr_matrix_1000_raw = all_scores_1000_raw.corr(method = 'pearson')
corr_matrix_3000_percentile = all_scores_3000_percentiles.corr(method = 'spearman')
corr_matrix_1000_percentile = all_scores_1000_percentiles.corr(method = 'spearman')

# Mask for the heatmap: True strictly above the diagonal (k=1), so the upper
# triangle is hidden while the diagonal 1s still show in the final graph
mask = np.triu(np.ones_like(corr_matrix_3000_raw, dtype=bool), k=1)

In [None]:
def plot_corr_heatmap(corr_matrix, title):
    """Draw an annotated lower-triangle heatmap of a correlation matrix.

    Parameters
    ----------
    corr_matrix : pandas.DataFrame
        Square correlation matrix to display.
    title : str
        Title placed at the top left of the plot.

    Returns
    -------
    matplotlib.axes.Axes
        The axes the heatmap was drawn on.
    """
    fig, ax = plt.subplots(figsize=(10, 8))
    cmap = sns.diverging_palette(0, 230, 90, 60, as_cmap=True)
    # Hide everything strictly above the diagonal. The mask is derived from the
    # matrix being plotted, so it always matches that matrix's shape.
    upper_triangle = np.triu(np.ones_like(corr_matrix, dtype=bool), k=1)
    sns.heatmap(corr_matrix, mask=upper_triangle, annot=True, fmt=".2f", cmap=cmap,
                vmin=-1, vmax=1, cbar_kws={"shrink": .8}, ax=ax)
    ax.set_title(title, loc='left', fontsize=18)
    return ax


# Correlation matrix for 3000-word respondents, raw scores (Pearson)
plot_corr_heatmap(corr_matrix_3000_raw,
                  'CORRELATION MATRIX - 3000 w of content\nmethod: Pearson\n')

# Correlation matrix for 3000-word respondents, percentiles (Spearman's rho)
plot_corr_heatmap(corr_matrix_3000_percentile,
                  "CORRELATION MATRIX - 3000 w of content\nmethod: Spearman's rho");


In [None]:
# Switch the analysis dataset to the 1000+ word percentiles and report the
# number of respondents in each word-count subset
data = all_scores_1000_percentiles
subset_sizes = (
    ("all_scores_1000", len(all_scores_1000_percentiles)),
    ("all_scores_3000", len(all_scores_3000_percentiles)),
)
for subset_name, subset_size in subset_sizes:
    print(f"n ({subset_name}) = {subset_size}")


## Correlation Plots

In [None]:
# Scatter plot with regression line of survey vs. online-content percentiles for
# each Big 5 trait, titled with Spearman's rho and its significance band.

# Configure seaborn once, before any figure is created, so that the requested
# size also applies to the first plot.
# NOTE(review): sns.set() resets the style to seaborn's default, so a preceding
# sns.set_theme(style = 'ticks') call never took effect; pass style = 'ticks'
# here if the ticks style is actually wanted.
sns.set(rc = {'figure.figsize':(10,8)})

for i in big_5:
    rho, p_value = spearmanr(data[f"survey_{i}_percentile"], data[f"NLP_{i}_percentile"])

    # Report the p-value as a conventional significance band; the label gets
    # its own name so the numeric p-value is not overwritten by a string.
    if p_value < 0.001:
        p_label = 'P < 0.001'
    elif p_value < 0.01:
        p_label = 'P < 0.01'
    elif p_value < 0.05:
        p_label = 'P < 0.05'
    else:
        p_label = "P > 0.05"

    plt.figure()
    ax = sns.regplot(x = f"NLP_{i}_percentile", y = f"survey_{i}_percentile", data = data)
    ax.set(xlabel = "Online Content", ylabel = "IPIP 50-item")
    ax.set_title(label = f"Online Content vs. IPIP 50-item {i.capitalize()} Percentiles \nSpearman's Rho = {round(rho, 2)}, {p_label}", fontsize = 18, loc = 'left')





# 9 Linear Regression

## Data

In [None]:
# Choose a dataset for analysis and add back the demographic variables,
# keeping only respondents present in both frames
regression_data = pd.concat([all_scores_1000_raw, survey_data[['Age', 'Country', 'Gender']]], axis = 1, join = 'inner')

# Convert categorical data to dummy variables for regression.
# dtype=float keeps every regressor numeric: with boolean dummies next to float
# scores the design matrix becomes object dtype, which statsmodels rejects.
regression_data = pd.get_dummies(regression_data, columns = ['Age', 'Country', 'Gender'], dtype = float)

## Multiple regression (with demographic data as independent variables)

In [None]:
# Regressions on each Big 5 trait with all demographic data.
# For every trait the survey score is regressed on the matching NLP score plus
# the demographic dummy variables listed below.
demographic_columns = [
    'Age_18-24', 'Age_25-29', 'Age_30-34', 'Age_35-39', 'Age_40-44',
    'Age_45-49', 'Age_50-54', 'Age_55-59', 'Age_65+',
    'Country_Argentina', 'Country_Australia', 'Country_India', 'Country_Nigeria',
    'Country_Philippines', 'Country_Qatar', 'Country_Romania',
    'Country_Switzerland', 'Country_United Arab Emirates',
    'Country_United Kingdom', 'Country_United States',
    'Gender_Female', 'Gender_Male', 'Gender_Prefer not to say',
]

for i in big_5:
    X = regression_data[[f"NLP_{i}_raw"] + demographic_columns]
    y = regression_data[f"survey_{i}_raw"]

    # NOTE(review): the intercept is left disabled, as in the line below -
    # confirm this is intended for the model with dummy variables.
    # X = sm.add_constant(X)

    # statsmodels takes the dependent variable first, then the regressors
    model = sm.OLS(y, X).fit()
    pred_ols = model.get_prediction()

    # Lower/upper bounds of the observation intervals from the prediction summary
    prediction_frame = pred_ols.summary_frame()
    iv_l = prediction_frame["obs_ci_lower"]
    iv_u = prediction_frame["obs_ci_upper"]

    # Print out the statistics
    print(model.summary())

## Simple regression (Survey Data ~ NLP Data)

In [None]:
# Simple regression of the survey score on the NLP score for each Big 5 trait.
# The independent variable (X) is the set of NLP scores for the trait.
for i in big_5:
    # statsmodels' OLS does not include an intercept unless one is added, and
    # without it the line is forced through the origin and the reported
    # R-squared is the uncentered one. Add the constant term explicitly.
    X = sm.add_constant(regression_data[f"NLP_{i}_raw"])
    y = regression_data[f"survey_{i}_raw"]

    # statsmodels takes the dependent variable first, then the regressors
    model = sm.OLS(y, X).fit()
    pred_ols = model.get_prediction()

    # Lower/upper bounds of the observation intervals from the prediction summary
    prediction_frame = pred_ols.summary_frame()
    iv_l = prediction_frame["obs_ci_lower"]
    iv_u = prediction_frame["obs_ci_upper"]

    # Print out the statistics
    print(model.summary())
