In [102]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import statsmodels.api as sm

In [103]:
data = pd.read_csv('data/preprocessed_data.csv')
data.head()

Unnamed: 0,male_lecturers,university_gender_quota,workload_stress,dei_policies,netflix,attention_test,age,gender_identity,schooling,political_alignment,lat,lng,duration,is_direct,has_conformity_trigger,male_lecturers_SDB,university_gender_quota_SDB,workload_stress_SDB,dei_policies_SDB,netflix_SDB
0,2,1,2,1,3,5,4,1,5,11,41.387,2.1701,166,0,0,0.046923,0.46,-0.23,0.076154,-0.050769
1,2,2,3,4,3,5,3,2,4,1,41.387,2.1701,323,0,0,0.046923,0.46,-0.23,0.076154,-0.050769
2,2,2,2,3,3,5,2,2,4,-12,41.387,2.1701,315,0,1,0.046923,0.46,-0.23,0.076154,-0.050769
3,5,4,4,2,2,5,2,1,4,3,34.0034,-84.4605,210,0,0,0.046923,0.46,-0.23,0.076154,-0.050769
4,2,1,2,3,3,5,3,2,4,-41,40.4172,-3.684,572,0,0,0.046923,0.46,-0.23,0.076154,-0.050769


In [104]:
questions = ['male_lecturers',	'university_gender_quota',	'workload_stress',	'dei_policies',	'netflix']
ind_vars = ['age', 'gender_identity', 'schooling', 'political_alignment']

SDB_columns = {}

for question in questions: 
    col_name = question + '_SDB'
    SDB_columns[question] = col_name

first get summary statistics for each question with conformity and non conformity trigger

In [105]:
print(data.groupby('has_conformity_trigger')['university_gender_quota'].count())

has_conformity_trigger
0    50
1    52
Name: university_gender_quota, dtype: int64


run percentages for each question with conformity and without conformity

In [106]:
all_data = {'all':[], 'conformity':[], 'non_conformity':[]}

for question in questions:

    direct = data.loc[data['is_direct'] == 1, question].mean()
    veiled = data.loc[data['is_direct'] == 0, question].mean()
    pct_diff = ((veiled - direct)/direct) * 100
    all_data['all'].append(pct_diff)

    non_conformity = data[data['has_conformity_trigger'] == 0]
    direct = non_conformity.loc[non_conformity['is_direct'] == 1, question].mean()
    veiled = non_conformity.loc[non_conformity['is_direct'] == 0, question].mean()
    pct_diff = ((veiled - direct)/direct) * 100
    all_data['conformity'].append(pct_diff)

    conformity = data[data['has_conformity_trigger'] == 1]
    direct = conformity.loc[conformity['is_direct'] == 1, question].mean()
    veiled = conformity.loc[conformity['is_direct'] == 0, question].mean()
    pct_diff = ((veiled - direct)/direct) * 100
    all_data['non_conformity'].append(pct_diff)

plot_data = pd.DataFrame(all_data, index=questions)
plot_data
    

Unnamed: 0,all,conformity,non_conformity
male_lecturers,2.159292,8.333333,-3.571429
university_gender_quota,30.666667,30.555556,30.769231
workload_stress,-8.363636,-15.410959,-1.428571
dei_policies,3.168,13.411458,-6.557377
netflix,-1.859155,-5.208333,1.428571


In [107]:
# log scale

# create a grouped bar chart for the percentage differences in the means of the questions in plotly
fig = px.bar(plot_data, x=plot_data.index, y=['all', 'conformity', 'non_conformity'], barmode='group', title='Percentage Difference in Means of Questions', color_discrete_map={'all':'#0293b0', 'conformity':'#02697e', 'non_conformity':'#013f4c'}, labels={'value':'Percentage Difference in Means', 'variable':'Group', 'index':'Questions'})
#increase plot size
fig.update_layout(
    autosize=False,
    width=800,
    height=400,
    margin=dict(
        l=50,
        r=50,
        b=100,
        t=100,
        pad=4
    )
)    

fig.show()

In [108]:
plot_data.to_csv('data/percentage_difference_in_means.csv')

run regression for each question

In [120]:
# run OLS regression
import statsmodels.api as sm

for column in questions:
    print(column)
    
    X = data[ind_vars]
    X = sm.add_constant(X)
    X['SDB'] = data[SDB_columns[column]] * data['is_direct']
    print(X.head())
    y = data[column]
    model = sm.OLS(y, X).fit()
    print(model.summary())
    # print summaries into excel like format 
    with open('data/model_summaries.txt', 'a') as f:
        f.write(model.summary().as_text())

    # print model summary to latex
    with open('data/model_summaries_latex.txt', 'a') as f:
        f.write(model.summary().as_latex())


male_lecturers
   const  age  gender_identity  schooling  political_alignment  SDB
0    1.0    4                1          5                   11  0.0
1    1.0    3                2          4                    1  0.0
2    1.0    2                2          4                  -12  0.0
3    1.0    2                1          4                    3  0.0
4    1.0    3                2          4                  -41  0.0
                            OLS Regression Results                            
Dep. Variable:         male_lecturers   R-squared:                       0.103
Model:                            OLS   Adj. R-squared:                  0.056
Method:                 Least Squares   F-statistic:                     2.201
Date:                Sun, 10 Mar 2024   Prob (F-statistic):             0.0604
Time:                        22:25:04   Log-Likelihood:                -118.58
No. Observations:                 102   AIC:                             249.2
Df Residuals:           

for male lecturers look into change in veiled unveiled at each age group

In [110]:
data.shape

(102, 20)

In [111]:

age_mapping = {1:'Under 18',2:'18-24',3:'25-34',4:'35-44',5:'45-54',6:'55-64',7:'65+'}

In [112]:
for age in data.age.unique():
    # for each age group get the difference in means for each the male_lecturers question
    print(age_mapping[age])
    direct = data.loc[(data['age'] == age), 'male_lecturers'].mean()
    print( data.loc[(data['age'] == age), 'male_lecturers'].shape)
    print(direct)

35-44
(1,)
2.0
25-34
(35,)
2.085714285714286
18-24
(64,)
2.265625
45-54
(1,)
2.0
55-64
(1,)
2.0


In [113]:

data_dict = {'age':[], 'conformity':[], 'non_conformity':[], 'all':[], 'count':[], 'mean':[], 'std':[]}


for age in data.age.unique():
    # for each age group get the difference in means for each the male_lecturers question
    direct = data.loc[(data['age'] == age) & (data['is_direct'] == 1), 'male_lecturers'].mean()
    veiled = data.loc[(data['age'] == age) & (data['is_direct'] == 0), 'male_lecturers'].mean()

    pct_diff_all = ((veiled - direct)/direct) * 100
    


    print(age_mapping[age])
    direct = data.loc[(data['age'] == age) & (data['is_direct'] == 1)].groupby('has_conformity_trigger')['male_lecturers'].mean()
    veiled = data.loc[(data['age'] == age) & (data['is_direct'] == 0)].groupby('has_conformity_trigger')['male_lecturers'].mean()
    pct_diff = ((veiled - direct)/direct) * 100
    
    if len(pct_diff) > 1:
        data_dict['age'].append(age_mapping[age])
        data_dict['conformity'].append(pct_diff.iloc[1])
        data_dict['non_conformity'].append(pct_diff.iloc[0])
        data_dict['all'].append(pct_diff_all)
        data_dict['count'].append(data.loc[(data['age'] == age), 'male_lecturers'].shape[0])
        data_dict['mean'].append(data.loc[(data['age'] == age), 'male_lecturers'].mean())
        data_dict['std'].append(data.loc[(data['age'] == age), 'male_lecturers'].std())
    print(data_dict)

plot_data = pd.DataFrame(data_dict)
plot_data


35-44
{'age': [], 'conformity': [], 'non_conformity': [], 'all': [], 'count': [], 'mean': [], 'std': []}
25-34
{'age': ['25-34'], 'conformity': [-19.999999999999996], 'non_conformity': [21.33333333333334], 'all': [1.6129032258064457], 'count': [35], 'mean': [2.085714285714286], 'std': [0.8530715634909972]}
18-24
{'age': ['25-34', '18-24'], 'conformity': [-19.999999999999996, 7.3684210526315725], 'non_conformity': [21.33333333333334, 0.9157509157509092], 'all': [1.6129032258064457, 4.464285714285719], 'count': [35, 64], 'mean': [2.085714285714286, 2.265625], 'std': [0.8530715634909972, 0.8211913045875234]}
45-54
{'age': ['25-34', '18-24'], 'conformity': [-19.999999999999996, 7.3684210526315725], 'non_conformity': [21.33333333333334, 0.9157509157509092], 'all': [1.6129032258064457, 4.464285714285719], 'count': [35, 64], 'mean': [2.085714285714286, 2.265625], 'std': [0.8530715634909972, 0.8211913045875234]}
55-64
{'age': ['25-34', '18-24'], 'conformity': [-19.999999999999996, 7.3684210526

Unnamed: 0,age,conformity,non_conformity,all,count,mean,std
0,25-34,-20.0,21.333333,1.612903,35,2.085714,0.853072
1,18-24,7.368421,0.915751,4.464286,64,2.265625,0.821191


In [114]:
data.age.unique()

array([4, 3, 2, 5, 6])

In [126]:
temp_data = data.loc[(data['age'] == 2) ]
print(temp_data.shape)

X = temp_data[ind_vars]
X = sm.add_constant(X)
X['SDB'] = temp_data[SDB_columns['male_lecturers']] * data['is_direct']
print(X.columns)
y = temp_data[column]
model = sm.OLS(y, X).fit()
print(model.summary())

(64, 20)
Index(['age', 'gender_identity', 'schooling', 'political_alignment', 'SDB'], dtype='object')
                            OLS Regression Results                            
Dep. Variable:                netflix   R-squared:                       0.007
Model:                            OLS   Adj. R-squared:                 -0.060
Method:                 Least Squares   F-statistic:                    0.1104
Date:                Sun, 10 Mar 2024   Prob (F-statistic):              0.978
Time:                        22:26:43   Log-Likelihood:                -74.504
No. Observations:                  64   AIC:                             159.0
Df Residuals:                      59   BIC:                             169.8
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                          coef    std err          t      P>|t|      [0.025      0.975]
--------------------

In [125]:
temp_data = data.loc[(data['age'] == 3) ]
print(temp_data.shape)

X = temp_data[ind_vars]
X = sm.add_constant(X)
X['SDB'] = temp_data[SDB_columns['male_lecturers']] * data['is_direct']
y = temp_data[column]
model = sm.OLS(y, X).fit()
print(model.summary())

(35, 20)
                            OLS Regression Results                            
Dep. Variable:                netflix   R-squared:                       0.299
Model:                            OLS   Adj. R-squared:                  0.205
Method:                 Least Squares   F-statistic:                     3.197
Date:                Sun, 10 Mar 2024   Prob (F-statistic):             0.0267
Time:                        22:26:31   Log-Likelihood:                -27.627
No. Observations:                  35   AIC:                             65.25
Df Residuals:                      30   BIC:                             73.03
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                          coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------
age                     0

In [92]:

data_dict = {'schooling':[], 'conformity':[], 'non_conformity':[], 'all':[], 'count':[], 'mean':[], 'std':[]}
education_mapping = {1:'Less than high school', 2:'High school', 3:'Some college', 4:'Bachelor\'s degree', 5:'Master\'s degree', 6:'Doctoral degree', 7:'Professional degree'}

for schooling in data.schooling.unique():
    # for each age group get the difference in means for each the male_lecturers question
    direct = data.loc[(data['schooling'] == schooling) & (data['is_direct'] == 1), 'male_lecturers'].mean()
    veiled = data.loc[(data['schooling'] == schooling) & (data['is_direct'] == 0), 'male_lecturers'].mean()

    pct_diff_all = ((veiled - direct)/direct) * 100
    


    print(education_mapping[schooling])
    direct = data.loc[(data['schooling'] == schooling) & (data['is_direct'] == 1)].groupby('has_conformity_trigger')['male_lecturers'].mean()
    veiled = data.loc[(data['schooling'] == schooling) & (data['is_direct'] == 0)].groupby('has_conformity_trigger')['male_lecturers'].mean()
    pct_diff = ((veiled - direct)/direct) * 100
    
    if len(pct_diff) > 1:
        data_dict['schooling'].append(education_mapping[schooling])
        data_dict['conformity'].append(pct_diff.iloc[1])
        data_dict['non_conformity'].append(pct_diff.iloc[0])
        data_dict['all'].append(pct_diff_all)
        data_dict['count'].append(data.loc[data['schooling'] == schooling].shape[0])
        data_dict['mean'].append(data.loc[(data['schooling'] == schooling), 'male_lecturers'].mean())
        data_dict['std'].append(data.loc[(data['schooling'] == schooling), 'male_lecturers'].std())
    print(data_dict)

plot_data = pd.DataFrame(data_dict)
plot_data


Master's degree
{'schooling': ["Master's degree"], 'conformity': [0.0], 'non_conformity': [12.5], 'all': [7.692307692307687], 'count': [17], 'mean': [2.1176470588235294], 'std': [0.48507125007266605]}
Bachelor's degree
{'schooling': ["Master's degree", "Bachelor's degree"], 'conformity': [0.0, -5.571847507331388], 'non_conformity': [12.5, 19.230769230769244], 'all': [7.692307692307687, 7.093821510297483], 'count': [17, 49], 'mean': [2.1176470588235294, 2.2653061224489797], 'std': [0.48507125007266605, 0.8107483735229252]}
Doctoral degree
{'schooling': ["Master's degree", "Bachelor's degree", 'Doctoral degree'], 'conformity': [0.0, -5.571847507331388, -50.0], 'non_conformity': [12.5, 19.230769230769244, 100.0], 'all': [7.692307692307687, 7.093821510297483, 0.0], 'count': [17, 49, 5], 'mean': [2.1176470588235294, 2.2653061224489797, 2.0], 'std': [0.48507125007266605, 0.8107483735229252, 1.4142135623730951]}
High school
{'schooling': ["Master's degree", "Bachelor's degree", 'Doctoral degr

Unnamed: 0,schooling,conformity,non_conformity,all,count,mean,std
0,Master's degree,0.0,12.5,7.692308,17,2.117647,0.485071
1,Bachelor's degree,-5.571848,19.230769,7.093822,49,2.265306,0.810748
2,Doctoral degree,-50.0,100.0,0.0,5,2.0,1.414214
3,High school,22.222222,-40.0,-10.344828,20,2.15,0.933302
4,Some college,20.0,-42.857143,16.666667,9,2.111111,0.927961


In [None]:

data_dict = {'political_alignment':[], 'conformity':[], 'non_conformity':[], 'all':[], 'count':[], 'mean':[], 'std':[]}

for schooling in data.schooling.unique():
    # for each age group get the difference in means for each the male_lecturers question
    direct = data.loc[(data['schooling'] == schooling) & (data['is_direct'] == 1), 'male_lecturers'].mean()
    veiled = data.loc[(data['schooling'] == schooling) & (data['is_direct'] == 0), 'male_lecturers'].mean()

    pct_diff_all = ((veiled - direct)/direct) * 100
    


    print(education_mapping[schooling])
    direct = data.loc[(data['schooling'] == schooling) & (data['is_direct'] == 1)].groupby('has_conformity_trigger')['male_lecturers'].mean()
    veiled = data.loc[(data['schooling'] == schooling) & (data['is_direct'] == 0)].groupby('has_conformity_trigger')['male_lecturers'].mean()
    pct_diff = ((veiled - direct)/direct) * 100
    
    if len(pct_diff) > 1:
        data_dict['schooling'].append(education_mapping[schooling])
        data_dict['conformity'].append(pct_diff.iloc[1])
        data_dict['non_conformity'].append(pct_diff.iloc[0])
        data_dict['all'].append(pct_diff_all)
        data_dict['count'].append(data.loc[data['schooling'] == schooling].shape[0])
        data_dict['mean'].append(data.loc[(data['schooling'] == schooling), 'male_lecturers'].mean())
        data_dict['std'].append(data.loc[(data['schooling'] == schooling), 'male_lecturers'].std())
    print(data_dict)

plot_data = pd.DataFrame(data_dict)
plot_data


In [131]:
combined_gender = data.copy()

print(combined_gender.shape)
combined_gender = combined_gender.loc[combined_gender.gender_identity == 1]
print(combined_gender.shape)

combined_gender['gender_identity'] 
# for each question plot the distribution of responses in veiled and unveiled conditions


combined_temp = combined_gender.copy()
direct = combined_temp.loc[combined_temp['is_direct'] == 1, questions].mean()
veiled = combined_temp.loc[combined_temp['is_direct'] == 0, questions].mean()

# get pct diff between veiled and unveiled
pct_diff = ((veiled - direct)/direct) * 100

pct_diff

# pct change formula is (new - old)/old * 100

(102, 20)
(54, 20)


male_lecturers             -2.649770
university_gender_quota    16.611296
workload_stress            -8.432540
dei_policies               -8.593750
netflix                    -4.489796
dtype: float64

In [130]:
combined_gender = data.copy()

print(combined_gender.shape)
combined_gender = combined_gender.loc[combined_gender.gender_identity == 2]
print(combined_gender.shape)

# for each question plot the distribution of responses in veiled and unveiled conditions


combined_temp = combined_gender.copy()
direct = combined_temp.loc[combined_temp['is_direct'] == 1, questions].mean()
veiled = combined_temp.loc[combined_temp['is_direct'] == 0, questions].mean()

# get pct diff between veiled and unveiled
pct_diff = ((veiled - direct)/direct) * 100

pct_diff

# pct change formula is (new - old)/old * 100

(102, 20)
(48, 20)


male_lecturers              6.595365
university_gender_quota    48.571429
workload_stress            -8.450704
dei_policies               18.181818
netflix                     1.767677
dtype: float64

In [133]:
data.groupby('is_direct')['university_gender_quota'].value_counts()

is_direct  university_gender_quota
0          2                          34
           1                           8
           3                           6
           0                           1
           4                           1
1          2                          26
           1                          17
           0                           6
           3                           3
Name: count, dtype: int64