In [31]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import statsmodels.api as sm

import warnings
warnings.filterwarnings('ignore')

In [32]:
data = pd.read_csv('data/preprocessed_data.csv')
data.head()

Unnamed: 0,male_lecturers,university_gender_quota,workload_stress,dei_policies,netflix,attention_test,age,gender_identity,schooling,political_alignment,lat,lng,duration,is_direct,has_conformity_trigger,male_lecturers_SDB,university_gender_quota_SDB,workload_stress_SDB,dei_policies_SDB,netflix_SDB
0,2,1,2,1,3,5,4,1,5,11,41.387,2.1701,166,0,0,0.046923,0.46,-0.23,0.076154,-0.050769
1,2,2,3,4,3,5,3,2,4,1,41.387,2.1701,323,0,0,0.046923,0.46,-0.23,0.076154,-0.050769
2,2,2,2,3,3,5,2,2,4,-12,41.387,2.1701,315,0,1,0.046923,0.46,-0.23,0.076154,-0.050769
3,5,4,4,2,2,5,2,1,4,3,34.0034,-84.4605,210,0,0,0.046923,0.46,-0.23,0.076154,-0.050769
4,2,1,2,3,3,5,3,2,4,-41,40.4172,-3.684,572,0,0,0.046923,0.46,-0.23,0.076154,-0.050769


In [33]:
questions = ['male_lecturers',	'university_gender_quota',	'workload_stress',	'dei_policies',	'netflix']
ind_vars = ['age', 'gender_identity', 'schooling', 'political_alignment']

SDB_columns = {}

for question in questions: 
    col_name = question + '_SDB'
    SDB_columns[question] = col_name

first get summary statistics for each question with conformity and non conformity trigger

In [34]:
print(data.groupby('has_conformity_trigger')['university_gender_quota'].count())

has_conformity_trigger
0    50
1    52
Name: university_gender_quota, dtype: int64


create categories for political alignment

run percentages for each question with conformity and without conformity

In [35]:
    
def get_sdb(data, suffix):
    all_data = {'all' +suffix:[], 'conformity' + suffix:[], 'non_conformity' + suffix:[]}
    questions = ['male_lecturers',	'university_gender_quota',	'workload_stress',	'dei_policies',	'netflix']

    for question in questions:

        direct = data.loc[data['is_direct'] == 1, question].mean()
        veiled = data.loc[data['is_direct'] == 0, question].mean()
        pct_diff = ((veiled - direct)/direct) * 100
        all_data['all' + suffix].append(pct_diff)

        non_conformity = data[data['has_conformity_trigger'] == 0]
        direct = non_conformity.loc[non_conformity['is_direct'] == 1, question].mean()
        veiled = non_conformity.loc[non_conformity['is_direct'] == 0, question].mean()
        pct_diff = ((veiled - direct)/direct) * 100
        all_data['conformity' + suffix].append(pct_diff)

        conformity = data[data['has_conformity_trigger'] == 1]
        direct = conformity.loc[conformity['is_direct'] == 1, question].mean()
        veiled = conformity.loc[conformity['is_direct'] == 0, question].mean()
        pct_diff = ((veiled - direct)/direct) * 100
        all_data['non_conformity' + suffix].append(pct_diff)

    return pd.DataFrame(all_data, index=questions)
    

In [36]:
overall = get_sdb(data, '_all')
male = get_sdb(data.loc[data.gender_identity == 1, :], '_male')
female = get_sdb(data.loc[data.gender_identity == 2, :], '_female')

# concat all the dataframes column wise
all_sdb = pd.concat([overall, male, female], axis=1)


all_sdb

Unnamed: 0,all_all,conformity_all,non_conformity_all,all_male,conformity_male,non_conformity_male,all_female,conformity_female,non_conformity_female
male_lecturers,2.159292,8.333333,-3.571429,-2.64977,8.035714,-11.764706,6.595365,3.448276,10.0
university_gender_quota,30.666667,30.555556,30.769231,16.611296,0.396825,32.102273,48.571429,66.666667,29.411765
workload_stress,-8.363636,-15.410959,-1.428571,-8.43254,-17.5,-1.785714,-8.450704,-12.790698,-1.785714
dei_policies,3.168,13.411458,-6.557377,-8.59375,5.769231,-18.585526,18.181818,21.710526,14.782609
netflix,-1.859155,-5.208333,1.428571,-4.489796,-8.333333,-1.676829,1.767677,-1.162791,6.206897


In [37]:

all_sdb.loc['workload_stress', :] = -all_sdb.loc['workload_stress', :]

In [38]:
overall_sdb = all_sdb[['all_all','conformity_all','non_conformity_all']]

# absolute value workload stress because it is supposed to be negative
overall_sdb.columns = ['Overall SDB','Conformity Trigger','Non Conformity Trigger']

# sum columns and plot with plotly px
plot_data = overall_sdb.sum()

fig = px.bar(plot_data, title='SDB Across All Questions', color_discrete_sequence=px.colors.qualitative.Bold, labels={'value':'Social Desirability Bias', 'index':'Question'}, color=plot_data.index)


# Customize x-axis labels (e.g., rotate them for better readability)
fig.update_xaxes(title_text='', tickfont=dict( color='black', size=12))

# Customize y-axis labels if needed
fig.update_yaxes(title_text='SDB')

fig.update_layout(
    autosize=False,
    width=1000,
    height=600,
    coloraxis=dict(colorscale='viridis')
)

# Show the figure
fig.show()
#,'','','',''

what is driving differences in conformity and non conformity veiled or direct

In [39]:
import plotly.graph_objects as go


In [None]:
# trained_model, history = train_bert_model(
#   model_name="bert_model_gpt3_data",
#   model=bert_model,
#   tokenizer=tokenizer,
#   train_data=llm_aug_data,
#   val_data=val,
#   num_labels=num_labels,
#   max_length=max_length,
#   batch_size=32,
#   learning_rate=learning_rate,
#   num_epochs=num_epochs
# )

In [40]:
data.head()

Unnamed: 0,male_lecturers,university_gender_quota,workload_stress,dei_policies,netflix,attention_test,age,gender_identity,schooling,political_alignment,lat,lng,duration,is_direct,has_conformity_trigger,male_lecturers_SDB,university_gender_quota_SDB,workload_stress_SDB,dei_policies_SDB,netflix_SDB
0,2,1,2,1,3,5,4,1,5,11,41.387,2.1701,166,0,0,0.046923,0.46,-0.23,0.076154,-0.050769
1,2,2,3,4,3,5,3,2,4,1,41.387,2.1701,323,0,0,0.046923,0.46,-0.23,0.076154,-0.050769
2,2,2,2,3,3,5,2,2,4,-12,41.387,2.1701,315,0,1,0.046923,0.46,-0.23,0.076154,-0.050769
3,5,4,4,2,2,5,2,1,4,3,34.0034,-84.4605,210,0,0,0.046923,0.46,-0.23,0.076154,-0.050769
4,2,1,2,3,3,5,3,2,4,-41,40.4172,-3.684,572,0,0,0.046923,0.46,-0.23,0.076154,-0.050769


In [41]:

all_cols = [col for col in all_sdb.columns if 'all_' in col]
conform_cols = [col for col in all_sdb.columns if 'conformity' in col and 'non_' not in col]
nonconform_cols = [col for col in all_sdb.columns if 'non_' in col]

male_cols = [col for col in all_sdb.columns if '_male' in col]
female_cols = [col for col in all_sdb.columns if '_female' in col]
all_cols = [col for col in all_sdb.columns if '_all' in col]

sdb_table = all_sdb

#plot results in a grouped bar chart with plotly with x axis as the questions and y axis as the SDB groups should be 3 bars per question
all_sdb = all_sdb.reset_index()
all_sdb = all_sdb.rename(columns={'index':'question'})
all_sdb = all_sdb.melt(id_vars='question', var_name='group', value_name='sdb')


dont break it down by question either

In [42]:
temp_sdb = all_sdb.loc[all_sdb['group'].isin(all_cols), :]

# create 3 groups of bars for each question based on all_cols female_cols and male_cols
fig = px.bar(temp_sdb, x='question', y='sdb', color='group', barmode='group', color_discrete_sequence=px.colors.qualitative.Set2, title='Overall SDB by group')

# increase plot size and change color palette to viridis
fig.update_layout(
    autosize=False,
    width=1000,
    height=600,
    coloraxis=dict(colorscale='viridis')
)

fig.show()

In [43]:
temp_sdb = all_sdb.loc[all_sdb['group'].isin(conform_cols), :]

# create 3 groups of bars for each question based on all_cols female_cols and male_cols
fig = px.bar(temp_sdb, x='question', y='sdb', color='group', barmode='group', color_discrete_sequence=px.colors.qualitative.Set2, title='SDB by group with Conformity Trigger')

# increase plot size and change color palette to viridis
fig.update_layout(
    autosize=False,
    width=1000,
    height=600,
    coloraxis=dict(colorscale='viridis')
)

fig.show()

In [44]:
temp_sdb = all_sdb.loc[all_sdb['group'].isin(nonconform_cols), :]

# create 3 groups of bars for each question based on all_cols female_cols and male_cols
fig = px.bar(temp_sdb, x='question', y='sdb', color='group', color_discrete_sequence=px.colors.qualitative.Set2, barmode='group', title='SDB by group with Non Conformity Trigger')

# increase plot size and change color palette to viridis
fig.update_layout(
    autosize=False,
    width=1000,
    height=600,
    coloraxis=dict(colorscale='viridis')
)

fig.show()

#

In [45]:
temp_sdb = all_sdb.loc[all_sdb['group'].isin(male_cols), :]

# create 3 groups of bars for each question based on all_cols female_cols and male_cols
fig = px.bar(temp_sdb, x='question', y='sdb', color='group', color_discrete_sequence=px.colors.qualitative.Bold, barmode='group', title='Conformity Effects on SDB for Males')

# increase plot size and change color palette to viridis
fig.update_layout(
    autosize=False,
    width=1000,
    height=600,
    coloraxis=dict(colorscale='viridis')
)

fig.show()

In [46]:
temp_sdb = all_sdb.loc[all_sdb['group'].isin(female_cols), :]

# create 3 groups of bars for each question based on all_cols female_cols and male_cols
fig = px.bar(temp_sdb, x='question', y='sdb', color='group', color_discrete_sequence=px.colors.qualitative.Bold, barmode='group', title='Conformity Effects on SDB for Females')

# increase plot size and change color palette to viridis
fig.update_layout(
    autosize=False,
    width=1000,
    height=600,
    coloraxis=dict(colorscale='viridis')
)

fig.show()

In [47]:
temp_sdb = all_sdb.loc[all_sdb['group'].isin(all_cols), :]

# create 3 groups of bars for each question based on all_cols female_cols and male_cols
fig = px.bar(temp_sdb, x='question', y='sdb', color='group', color_discrete_sequence=px.colors.qualitative.Bold, barmode='group', title='Conformity Effects on SDB')

# increase plot size and change color palette to viridis
fig.update_layout(
    autosize=False,
    width=1000,
    height=600,
    coloraxis=dict(colorscale='viridis')
)

fig.show()

Figure out what to do with Political Alignment

Ideas: 
    
    - weight the SDB with political alignment
    
    - create a categorical for political alignment



In [48]:
# plot the politcal alignment in a histogram plotly exprss
fig = px.histogram(data, x='political_alignment', barmode='group', title='Political Alignment Distribution')
fig.show()

normalize distribution mean std

1 std left right middle - maybe extreme left and right

In [49]:
# normalize political alignment

from sklearn.preprocessing import Normalizer

political_alignment_normalized = data.political_alignment / np.linalg.norm(data.political_alignment, axis=0)

data['political_alignment_normalized'] = political_alignment_normalized

In [50]:
# plot the politcal alignment in a histogram plotly exprss
fig = px.histogram(data, x='political_alignment_normalized', barmode='group', title='Political Alignment Distribution')
fig.show()

In [51]:
# create left right and center groups ased on the normalized political alignment
data['political_alignment_group'] = pd.cut(data.political_alignment_normalized, bins=2, labels=['left', 'right'])
political_groups = data.groupby('political_alignment_group')['is_direct'].count()

In [52]:
# plot the politcal alignment in a histogram plotly exprss
fig = px.bar(political_groups, title='Political Alignment Group Distribution')
fig.show()

In [53]:
data.head()

Unnamed: 0,male_lecturers,university_gender_quota,workload_stress,dei_policies,netflix,attention_test,age,gender_identity,schooling,political_alignment,...,duration,is_direct,has_conformity_trigger,male_lecturers_SDB,university_gender_quota_SDB,workload_stress_SDB,dei_policies_SDB,netflix_SDB,political_alignment_normalized,political_alignment_group
0,2,1,2,1,3,5,4,1,5,11,...,166,0,0,0.046923,0.46,-0.23,0.076154,-0.050769,0.042499,right
1,2,2,3,4,3,5,3,2,4,1,...,323,0,0,0.046923,0.46,-0.23,0.076154,-0.050769,0.003864,right
2,2,2,2,3,3,5,2,2,4,-12,...,315,0,1,0.046923,0.46,-0.23,0.076154,-0.050769,-0.046362,left
3,5,4,4,2,2,5,2,1,4,3,...,210,0,0,0.046923,0.46,-0.23,0.076154,-0.050769,0.011591,right
4,2,1,2,3,3,5,3,2,4,-41,...,572,0,0,0.046923,0.46,-0.23,0.076154,-0.050769,-0.158404,left


In [54]:
gender_by_political_alignment = data.groupby(['political_alignment_group', 'gender_identity'])['is_direct'].count()
gender_by_political_alignment = gender_by_political_alignment.unstack()
gender_by_political_alignment.reset_index(inplace=True)
#gender_by_political_alignment.drop(columns=['gender_identity'], inplace=True)
gender_by_political_alignment.columns = ['political_alignment_group', 'male','female']


gender_by_political_alignment = gender_by_political_alignment.melt(id_vars='political_alignment_group')

fig = px.bar(gender_by_political_alignment, x='variable', y='value', color='political_alignment_group', barmode='group', title='Gender Distribution by Political Alignment Group')
fig.show()

In [55]:
# get sdb for political alignment
left_sdb = get_sdb(data.loc[data.political_alignment_group == 'left'], '_left')
right_sdb = get_sdb(data.loc[data.political_alignment_group == 'right'], '_right')

all_political_sdb = pd.concat([left_sdb, right_sdb], axis=1)

left_cols = [col for col in all_political_sdb.columns if '_left' in col]
right_cols = [col for col in all_political_sdb.columns if '_right' in col]

all_cols = [col for col in all_political_sdb.columns if 'all_' in col]

#plot results in a grouped bar chart with plotly with x axis as the questions and y axis as the SDB groups should be 3 bars per question
all_political_sdb = all_political_sdb.reset_index()
all_political_sdb = all_political_sdb.rename(columns={'index':'question'})
all_political_sdb = all_political_sdb.melt(id_vars='question', var_name='group', value_name='sdb')



In [57]:
all_political_sdb

Unnamed: 0,question,group,sdb
0,male_lecturers,all_left,3.83219
1,university_gender_quota,all_left,43.117536
2,workload_stress,all_left,-13.719154
3,dei_policies,all_left,7.722008
4,netflix,all_left,-4.798056
5,male_lecturers,conformity_left,3.020668
6,university_gender_quota,conformity_left,27.058824
7,workload_stress,conformity_left,-21.107266
8,dei_policies,conformity_left,21.008403
9,netflix,conformity_left,-7.082833


In [58]:
all_political_sdb.loc[all_political_sdb.question == 'workload_stress', 'sdb'] = -all_political_sdb.loc[all_political_sdb.question == 'workload_stress', 'sdb']

In [59]:
temp_sdb = all_political_sdb.loc[all_political_sdb['group'].isin(all_cols), :]
print(temp_sdb.shape)
# create 3 groups of bars for each question based on all_cols female_cols and male_cols
fig = px.bar(temp_sdb, x='question', y='sdb', color='group', barmode='group', title='Political Alignment Effects on SDB')

# increase plot size and change color palette to viridis
fig.update_layout(
    autosize=False,
    width=1000,
    height=600,
    coloraxis=dict(colorscale='viridis')
)

fig.show()

(10, 3)
