In [2]:
import pandas as pd
import numpy as np
import scipy.stats
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from statsmodels.stats.multitest import multipletests

In [9]:
REPLICATION = True

file_path_add = ''
if REPLICATION:
    file_path_add = 'replication/'

basemodelname = 'gpt2-large'

In [4]:
models = ['country_based','topic based','random','removed topics']

#We didn't do 'removed topics' in reproduction so i excluded it
models = ['country_based','topic based','random']

In [5]:
index = 'token_pairs'

## Evaluating on PEW

In [10]:
list_rows = []
for model in models:
    pew_gpt2 = pd.read_csv(f'../data/{file_path_add}pew_{basemodelname}_{model}_on_wvs_{index}.csv')
    pew_gpt2_cultural = pew_gpt2.loc[pew_gpt2['country'] != 'universal']
    r, p = scipy.stats.pearsonr(pew_gpt2_cultural['pew_score'], pew_gpt2_cultural['log prob difference'])
    row = {'model':basemodelname,'train_data' : 'WVS', 'eval_data': 'PEW',
          'strategy': model, 'r': r, 'p': p, 'n': len(pew_gpt2_cultural)}
    list_rows.append(row)


In [11]:
df = pd.DataFrame(list_rows)
df['p'] = multipletests(df['p'], method = 'bonferroni', alpha = 0.5)[1]
df

Unnamed: 0,model,train_data,eval_data,strategy,r,p,n
0,gpt2-large,WVS,PEW,country_based,0.484518,2.7048889999999997e-19,312
1,gpt2-large,WVS,PEW,topic based,0.59775,3.933223e-31,312
2,gpt2-large,WVS,PEW,random,0.537835,2.591639e-24,312


## Evaluating on WVS

We remove the seen topic-country pairs from evaluation set

In [12]:
def included_function(pairs):
    def func(row):
        return (row['country'], row['topic']) in pairs
    return func

In [32]:
all_eval_pairs = pickle.load(open(f'../data/{file_path_add}wvs_eval_pairs.p', 'rb'))
list_rows = []
for model in models:
    eval_pairs = all_eval_pairs[model]
    if model == 'topic based':
        model_refined = 'topic_based'
    elif model == 'removed topics':
        model_refined = 'removed_topic'
    else:
        model_refined = model
    wvs_gpt2 = pd.read_csv(f'../data/{file_path_add}wvs_w7_{basemodelname}_{model}_on_wvs_{index}_in.csv')
    wvs_gpt2_cultural = wvs_gpt2.loc[wvs_gpt2['country'] != 'universal']
    wvs_gpt2_cultural = wvs_gpt2_cultural.loc[~pd.isna(wvs_gpt2_cultural['wvs_score'])]
    
    wvs_gpt2_cultural['in_eval'] = wvs_gpt2_cultural.apply(included_function(eval_pairs), axis = 1)
    wvs_gpt2_cultural = wvs_gpt2_cultural.loc[wvs_gpt2_cultural.in_eval == True]
    
    r, p = scipy.stats.pearsonr(wvs_gpt2_cultural['wvs_score'], wvs_gpt2_cultural['log prob difference'])
    row = {'model':basemodelname,'train_data' : 'WVS', 'eval_data': 'WVS',
          'strategy': model, 'r': r, 'p': p, 'n': len(wvs_gpt2_cultural)}
    list_rows.append(row)



In [33]:
df = pd.DataFrame(list_rows)
df['p'] = multipletests(df['p'], method = 'bonferroni', alpha = 0.5)[1]
df

Unnamed: 0,model,train_data,eval_data,strategy,r,p,n
0,gpt2-large,WVS,WVS,country_based,-0.229252,1.0,19
1,gpt2-large,WVS,WVS,topic based,0.762381,0.002854,15
2,gpt2-large,WVS,WVS,random,0.856039,0.000281,14


## Variation study

### WVS

In [34]:
topics = ['claiming government benefits to which you are not entitled',
 'avoiding a fare on public transport',
 'stealing property',
 'cheating on taxes',
 'someone accepting a bribe in the course of their duties',
 'homosexuality',
 'prostitution',
 'abortion',
 'divorce',
 'sex before marriage',
 'suicide',
 'euthanasia',
 'for a man to beat his wife',
 'parents beating children',
 'violence against other people',
 'terrorism as a political, ideological or religious mean',
 'having casual sex',
 'political violence',
 'death penalty']

In [37]:
variation_rows = []

for i, model in enumerate(models):

    wvs_gpt2 = pd.read_csv(f'../data/{file_path_add}wvs_w7_{basemodelname}_{model}_on_wvs_{index}_in.csv')
    wvs_gpt2_cultural = wvs_gpt2.loc[wvs_gpt2['country'] != 'universal']
    df = wvs_gpt2_cultural.loc[~pd.isna(wvs_gpt2_cultural['wvs_score'])]

    for t in topics:

        pew_gpt2_t = df.loc[df.topic == t]
        user_var = np.var(pew_gpt2_t['wvs_score'])
        model_var = np.var(pew_gpt2_t['log prob difference'])
        row = {'model': model, 'user variation': user_var, 'model variation': model_var, 'topic': t}
        variation_rows.append(row)

df = pd.DataFrame(variation_rows)

   


        

In [38]:
list_rows = []
for i, model in enumerate(models):
    
    model_df = df.loc[df.model == model] 
    r, p = scipy.stats.pearsonr(model_df['model variation'],model_df['user variation'])
    
    row = {'strategy': model, 'r': r, 'p':p,
          'n': len(model_df)}
    list_rows.append(row)

In [39]:
df = pd.DataFrame(list_rows)
df['p'] = multipletests(df['p'], method = 'bonferroni', alpha = 0.5)[1]
df

Unnamed: 0,strategy,r,p,n
0,country_based,0.246306,0.928163,19
1,topic based,0.498158,0.089864,19
2,random,0.845622,1.5e-05,19


## PEW

In [40]:
pew_topics = {'using contraceptives':['using contraceptives', 'use contraceptives'],
           'getting a divorce': ['getting a divorce','get a divorce'], 
           'having an abortion': ['having an abortion','have an abortion'],
           'homosexuality': ['homosexuality','be homosexual'], 
           'drinking alcohol': ['drinking alcohol','drink alcohol'],
           'married people having an affair' : ['married people having an affair', 'have an extramarital affair'],
            'gambling': ['gambling','gamble'],
       'sex between unmarried adults': [ 'sex between unmarried adults','have sex between unmarried adults']
                         }

pew_topics_list = list(pew_topics.keys())

variation_rows = []

for i, model in enumerate(models):

    pew_gpt2 = pd.read_csv(f'../data/{file_path_add}pew_{basemodelname}_{model}_on_wvs_{index}.csv')
    pew_gpt2_cultural = pew_gpt2.loc[wvs_gpt2['country'] != 'universal']
    df = pew_gpt2_cultural.loc[~pd.isna(pew_gpt2_cultural['pew_score'])]

    for t in pew_topics:

        pew_gpt2_t = df.loc[df.topic.isin(pew_topics[t])]
        user_var = np.var(pew_gpt2_t['pew_score'])
        model_var = np.var(pew_gpt2_t['log prob difference'])
        row = {'model': model, 'user variation': user_var, 'model variation': model_var, 'topic': t}
        variation_rows.append(row)

df = pd.DataFrame(variation_rows)

   



In [41]:
list_rows = [ ]
for i, model in enumerate(models):
    
    model_df = df.loc[df.model == model]
    
    r, p = scipy.stats.pearsonr(model_df['model variation'],model_df['user variation'])
    
    row = {'strategy': model, 'r': r, 'p':p,
          'n': len(model_df)}
    list_rows.append(row)


In [42]:
df = pd.DataFrame(list_rows)
df['p'] = multipletests(df['p'], method = 'bonferroni', alpha = 0.5)[1]
df

Unnamed: 0,strategy,r,p,n
0,country_based,0.815737,0.040676,8
1,topic based,0.722977,0.128152,8
2,random,0.696389,0.165007,8
