# Implementation Linear Mixed Effects Models 

This code implementes and runs the regressions for the analysis for
* The effect of annotator demograpics on POPQUORN, SBIC and the merged data
* The subsets (gender, not gender) for each of the datasets each

## 0. Preparation 

### 0.1 Load libraries 

In [1]:
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.iolib.summary2 import summary_col

### 0.2 Define functions 

In [2]:
def create_summaries(model):
    """
    This function creates summary tables as output and LaTex for a given fitted LM statsmodel model. 

    Parameters
    ----------
    model : statsmodels.regression.mixed_linear_model.MixedLMResultsWrapper
        Fitted model in question.

    Returns
    -------
    sum : statsmodels.iolib.summary2.Summary
        Object with summary statistics of the model. Latex output is printed.

    """
    sum=model.summary()
    print(sum)
    sum_tex=sum.as_latex()
    print(sum_tex)
    return sum

## 1. Load Data 

### 1.1 POPQUORN, SBIC, merged Dataset

In [3]:
pop_df=pd.read_csv("preprocessed data/popquorndata.csv")
sbf_df=pd.read_csv("preprocessed data/socialbiascorpus.csv")
all_df = pd.read_csv("preprocessed data/merged_dataset.csv")

In [4]:
# Minor preprocessing necessary, since the dataset was also used for validation
# drop target category
sbf_df=sbf_df.drop('targetCategory',axis=1)

# drop multiple annotations related to assignment of multiple target categories 
sbf_df=sbf_df.drop_duplicates(subset=['HITId','WorkerId'])

In [5]:
# Map variable names and categories and everything is consistent
# variable names
col_dict={'annotatorGender':'gender', 'annotatorRace':'race', 'annotatorAge':'age'}
sbf_df.rename(col_dict,axis=1,inplace=True)

# gender 
sbf_df['gender']=sbf_df['gender'].map({'nonBinary':'Non-binary', 'man':'Man', 'woman':'Woman'})

#race
sbf_df['race']=sbf_df['race'].map({'black':'Black or African American', 'hisp':'Hispanic or Latino', 'native':'Native American','other':'Other','white':'White','asian':'Asian'})

### 1.2 Subsets of the data 

In [7]:
# Target category is gender
df_gendertarget = pd.read_csv("preprocessed data/df_gendertarget.csv")
#split into former datasets
ss_sbf_gen=df_gendertarget[df_gendertarget['Dataset']=='SBIC']
ss_pop_gen=df_gendertarget[df_gendertarget['Dataset']=='POPQUORN Data']

# Target category is not gender 
df_othertarget = pd.read_csv("preprocessed data/df_othertarget.csv")

#split into former datasets
ss_sbf_nogen=df_othertarget[df_othertarget['Dataset']=='SBIC']
ss_pop_nogen=df_othertarget[df_othertarget['Dataset']=='POPQUORN Data']


In [19]:
set(df_gendertarget['Dataset'])

{'SBIC'}

## 2. Run regressions 

### 2.1 POPQUORN, SBIC, merged Dataset

This part aims to answer our first research question.
 Do the demographics of annotators have an effect on how they annotate?

In [8]:
# POPQUORN
m_pop = smf.mixedlm("offensiveness ~ gender+race+age", pop_df, groups=pop_df["instance_id"])
mf_pop = m_pop.fit()

# SBIC
m_sbf = smf.mixedlm("offensiveYN ~ gender+race+age", sbf_df, groups=sbf_df["HITId"])
mf_sbf = m_sbf.fit()

# Merged dataset
m_merged = smf.mixedlm("offensiveness_merge ~ gender+race+age+Dataset", all_df, groups=all_df["instance_id"])
mf_merged = m_merged.fit()

In [9]:
type(mf_pop)

statsmodels.regression.mixed_linear_model.MixedLMResultsWrapper

In [10]:
# Cretae summaries (if one wants to explore the results deeper)

#sum_pop=create_summaries(mf_pop)
#sum_sbf=create_summaries(mf_sbf)
#sum_merged=create_summaries(mf_merged)

Format results and create LaTex output table

In [11]:
var_order=['Intercept','Dataset[T.SBIC]','gender[T.Woman]','gender[T.Non-binary]',
           'race[T.Black or African American]','race[T.Hispanic or Latino]','race[T.Native American]','race[T.White]','race[T.Other]', 
           'age','age[T.25-29]','age[T.30-34]','age[T.35-39]','age[T.40-44]','age[T.45-49]','age[T.50-54]','age[T.54-59]','age[T.60-64]','age[T.>65]',
           'Group Var']
print(summary_col([mf_pop,mf_sbf,mf_merged ],
                  float_format='%.3f', 
                  model_names=['POPQUORN','SBIC','merged'], 
                  info_dict = {"N":lambda x:(x.nobs)},
                  stars=True, regressor_order=var_order).as_latex())

\begin{table}
\caption{}
\label{}
\begin{center}
\begin{tabular}{llll}
\hline
                                  & POPQUORN   & SBIC        & merged       \\
\hline
Intercept                         & 1.995***   & 0.464***    & 0.124***     \\
                                  & (0.046)    & (0.005)     & (0.011)      \\
Dataset[T.SBIC]                   &            &             & 0.312***     \\
                                  &            &             & (0.010)      \\
gender[T.Woman]                   & -0.018     & -0.021***   & -0.014***    \\
                                  & (0.019)    & (0.002)     & (0.002)      \\
gender[T.Non-binary]              & -0.225***  & 0.085       & -0.071***    \\
                                  & (0.060)    & (0.163)     & (0.016)      \\
race[T.Black or African American] & 0.173***   & 0.064***    & 0.078***     \\
                                  & (0.044)    & (0.006)     & (0.006)      \\
race[T.Hispanic or Latino]        & -0.419*** 

### 2.2 Subsets of the data 
This part aims to answer our second research question.
 Does the effect of demographics of annotators on how they annotate, change when content is atrgetd toward gender?

In [15]:
ss_pop_gen

Unnamed: 0,offensiveness_merge,text,instance_id,user_id,gender,race,age,Dataset


In [13]:
# POPQUORN + Target category is Gender 
m_gen_pop = smf.mixedlm("offensiveness_merge ~ gender+race+age", ss_pop_gen, groups=ss_pop_gen["instance_id"])
mf_gen_pop = m_gen_pop.fit()

# POPQUORN + Target category is NOT Gender 
m_nogen_pop = smf.mixedlm("offensiveness_merge ~ gender+race+age", ss_pop_nogen, groups=ss_pop_nogen["instance_id"])
mf_nogen_pop = m_nogen_pop.fit()

# SBIC + Target category is Gender 
m_gen_sbf = smf.mixedlm("offensiveness_merge ~ gender+race+age", ss_sbf_gen, groups=ss_sbf_gen["instance_id"])
mf_gen_sbf = m_gen_sbf.fit()

# SBIC + Target category is NOT Gender 
m_nogen_sbf = smf.mixedlm("offensiveness_merge ~ gender+race+age", ss_sbf_nogen, groups=ss_sbf_nogen["instance_id"])
mf_nogen_sbf = m_nogen_sbf.fit()

# Merged data + Target category is Gender 
m_gen = smf.mixedlm("offensiveness_merge ~ gender+race+age+Dataset", df_gendertarget, groups=df_gendertarget["instance_id"])
mf_gen = m_gen.fit()

# Merged data + Target category is NOT Gender 
m_nogen = smf.mixedlm("offensiveness_merge ~ gender+race+age+Dataset", df_othertarget, groups=df_othertarget["instance_id"])
mf_nogen = m_nogen.fit()

ValueError: negative dimensions are not allowed

In [None]:
# Cretae summaries (if one wants to explore the results deeper)

#sum_gen_pop=create_summaries(mf_gen_pop)
#sum_nogen_pop=create_summaries(mf_nogen_pop)
#sum_gen_sbf=create_summaries(mf_gen_sbf)
#sum_nogen_sbf=create_summaries(mf_nogen_sbf)
#sum_gen=create_summaries(mf_gen)
#sum_nogen=create_summaries(mf_nogen)

Format results and create LaTex output table

In [None]:
var_order=['Intercept','Dataset[T.SBIC]','gender[T.Woman]','gender[T.Non-binary]',
           'race[T.Black or African American]','race[T.Hispanic or Latino]','race[T.Native American]','race[T.White]','race[T.Other]', 
           'age[T.25-29]','age[T.30-34]','age[T.35-39]','age[T.40-44]','age[T.45-49]','age[T.50-54]','age[T.54-59]','age[T.60-64]','age[T.>65]',
           'Group Var']
print(summary_col([mf_gen, mf_nogen, mf_gen_sbf,mf_nogen_sbf,mf_gen_pop, mf_nogen_pop],
                  float_format='%.3f',
                  info_dict = {"N":lambda x:x.nobs}, 
                  stars=True, regressor_order=var_order).as_latex())

\begin{table}
\caption{}
\label{}
\begin{center}
\begin{tabular}{lllllll}
\hline
                                  & offensiveness\_merge I & offensiveness\_merge II & offensiveness\_merge III & offensiveness\_merge IIII & offensiveness\_merge IIIII & offensiveness\_merge IIIIII  \\
\hline
Intercept                         & 0.208***               & 0.120***                & 0.587***                 & 0.404***                  & 0.390***                   & 0.204***                     \\
                                  & (0.034)                & (0.011)                 & (0.012)                  & (0.005)                   & (0.062)                    & (0.014)                      \\
Dataset[T.SBIC]                   & 0.386***               & 0.294***                &                          &                           &                            &                              \\
                                  & (0.033)                & (0.011)                 &              