In [4]:
import pandas as pd
import numpy as np
import scipy.stats
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from statsmodels.stats.multitest import multipletests



In [1]:
# Switch between the replication data (True) and the files directly under ../data (False).
REPLICATION = True

# Sub-directory spliced into data paths; empty when REPLICATION is off.
file_path_add = 'replication/' if REPLICATION else ''

# Model name interpolated into the token-pair CSV file names and used as the
# 'model' label of the result rows.
basemodelname = 'gpt2-large'

In [2]:
# WVS wave number; interpolated into the wvs_mcm CSV file name when it is loaded.
wave = 7

In [5]:
# WVS reference tables. Only the first path honours `wave` and `file_path_add`;
# the other two hard-code wave 7 and read from ../data directly.
# NOTE(review): confirm that mixing replication and non-replication inputs is intended.
wvs_mcm = pd.read_csv(f'../data/{file_path_add}wvs_w{wave}_mcm.csv')
wvs_gpt3 = pd.read_csv('../data/wvs_w7_gpt3.csv')
# NOTE: `wvs_gpt2` is rebound further down from a different file (*_token_pairs_in.csv).
wvs_gpt2 = pd.read_csv(f'../data/wvs_w7_{basemodelname}_token_pairs.csv')

In [6]:
# GPT-2 model names.
# NOTE(review): not referenced in the cells shown here; the analysis uses `basemodelname`.
gpt2_models = ['gpt2', 'gpt2-medium','gpt2-large']

In [7]:
# Rows whose country column equals 'universal'.
wvs_gpt2_universal = wvs_gpt2[wvs_gpt2['country'] == 'universal']

In [8]:
# The 'removed topics' test type was not run in the reproduction, so it is excluded here.
# List with it included, kept for reference:
# test_types = ['random', 'country_based','topic based','removed topics']
test_types = ['random', 'country_based','topic based']

In [9]:
# WVS evaluation pairs, indexed by test type; each entry is later used for
# (country, topic) membership tests.
# A context manager is used so the file handle is closed after loading.
# NOTE: pickle can execute arbitrary code on load; only use trusted files.
with open(f'../data/{file_path_add}wvs_eval_pairs.p', 'rb') as pairs_file:
    all_eval_pairs = pickle.load(pairs_file)

In [12]:
# Token-pair scores of the pre-trained model.
wvs_gpt2 = pd.read_csv(f'../data/{file_path_add}wvs_w7_{basemodelname}_token_pairs_in.csv')

# Keep the country-specific rows (country != 'universal') that have a WVS score.
is_country_row = wvs_gpt2['country'] != 'universal'
has_wvs_score = wvs_gpt2['wvs_score'].notna()
wvs_gpt2_cultural = wvs_gpt2.loc[is_country_row & has_wvs_score]

In [13]:
# Topic phrasings: verb-form key -> noun-form value.
# NOTE(review): not referenced in the cells shown here; gpt2_topic_mapping further
# down has the same keys but differs in two values — confirm which one is needed.
pew_topics = {
    'use contraceptives': 'using contraceptives',
    'get a divorce': 'getting a divorce',
    'have an abortion': 'having an abortion',
    'be homosexual': 'homosexuality',
    'drink alcohol': 'drinking alcohol',
    'have an extramarital affair': 'married people having an affair',
    'gamble': 'gambling',
    'have sex between unmarried adults': 'sex between unmarried adults',
}

In [14]:
def included_function(pairs,data):
    """Build a row predicate testing whether (country, topic) is in `pairs`.

    Parameters
    ----------
    pairs : container supporting ``in``
        Collection checked against the tuple ``(row['country'], row['topic'])``.
    data : any
        Unused by this function; kept so existing call sites keep working.

    Returns
    -------
    callable
        Function taking one row (anything indexable by 'country' and 'topic')
        and returning the result of the membership test.
    """
    def is_in_pairs(row):
        key = (row['country'], row['topic'])
        return key in pairs

    return is_in_pairs

## Evaluating on WVS

In [15]:
# For each test type, correlate WVS scores with the model's log-probability
# differences over the rows whose (country, topic) is in that type's evaluation pairs.
list_rows = []
for test_type in test_types:
    eval_pairs = all_eval_pairs[test_type]

    # Work on a copy so the 'included' flag never lands on wvs_gpt2_cultural itself.
    wvs_gpt2_cultural_copy = wvs_gpt2_cultural.copy()
    wvs_gpt2_cultural_copy['included'] = wvs_gpt2_cultural_copy.apply(
        included_function(eval_pairs, 'wvs'),
        axis=1,
    )
    wvs_gpt2_cultural_test = wvs_gpt2_cultural_copy[wvs_gpt2_cultural_copy['included'] == True]

    # Pearson r and its p-value on the selected rows.
    r, p = scipy.stats.pearsonr(
        wvs_gpt2_cultural_test['wvs_score'],
        wvs_gpt2_cultural_test['log prob difference'],
    )
    row = {
        'model': basemodelname,
        'test type': test_type,
        'r': r,
        'p': p,
        'n': len(wvs_gpt2_cultural_test),
    }
    list_rows.append(row)



In [16]:
# Collect the per-test-type results into a table.
df = pd.DataFrame(list_rows)
# Bonferroni-adjust the p-values across the test types. Element [1] of the result
# holds the corrected p-values; `alpha` only drives the reject decisions in
# element [0], which are not used here. It is set to the conventional 0.05
# (previously 0.5, an apparent typo) so the call does not mislead readers.
df['p'] = multipletests(df['p'], method='bonferroni', alpha=0.05)[1]
df

Unnamed: 0,model,test type,r,p,n
0,gpt2,random,0.287649,8.3e-05,206
1,gpt2,country_based,0.267338,0.00036,202
2,gpt2,topic based,0.221804,0.003093,216


## Evaluating on PEW

In [17]:
# PEW reference tables. These paths do not use `file_path_add`.
# The first path had an f-prefix with no placeholders; it is now a plain string literal.
pew_mcm = pd.read_csv('../data/pew_mcm.csv')
pew_gpt3 = pd.read_csv('../data/pew_gpt3.csv')
# Topic renaming applied to the `topic` column of the PEW token-pair table.
# NOTE(review): 'using contraceptices' and the capitalised 'Homosexuality' are kept
# exactly as written; presumably they match the spelling in the PEW evaluation
# pairs — verify against the data before "correcting" them.
gpt2_topic_mapping = {
    'use contraceptives': 'using contraceptices',
    'get a divorce': 'getting a divorce',
    'have an abortion': 'having an abortion',
    'be homosexual': 'Homosexuality',
    'drink alcohol': 'drinking alcohol',
    'have an extramarital affair': 'married people having an affair',
    'gamble': 'gambling',
    'have sex between unmarried adults': 'sex between unmarried adults',
}

In [18]:
list_rows = []

# Load the PEW evaluation pairs with a context manager so the file handle is closed.
# NOTE: this rebinds `all_eval_pairs`, which held the WVS pairs before this cell;
# re-running the WVS evaluation cell afterwards would silently use the PEW pairs.
# NOTE: pickle can execute arbitrary code on load; only use trusted files.
with open(f'../data/{file_path_add}pew_eval_pairs.p', 'rb') as pairs_file:
    all_eval_pairs = pickle.load(pairs_file)

pew_gpt2 = pd.read_csv(f'../data/{file_path_add}pew_{basemodelname}_token_pairs.csv')
# Rewrite every topic through gpt2_topic_mapping; an unmapped topic raises KeyError.
pew_gpt2['topic'] = pew_gpt2['topic'].apply(lambda t : gpt2_topic_mapping[t])
# Keep the country-specific rows only.
pew_gpt2_cultural = pew_gpt2.loc[pew_gpt2['country'] != 'universal']

# For each test type, correlate PEW scores with the model's log-probability
# differences over the rows whose (country, topic) is in that type's evaluation pairs.
for test_type in test_types:

    eval_pairs = all_eval_pairs[test_type]
    # Work on a copy so the 'included' flag never lands on pew_gpt2_cultural itself.
    pew_gpt2_cultural_copy = pew_gpt2_cultural.copy()
    pew_gpt2_cultural_copy['included'] = pew_gpt2_cultural_copy.apply(included_function(eval_pairs, 'pew2'), axis = 1)
    pew_gpt2_cultural_test = pew_gpt2_cultural_copy.loc[pew_gpt2_cultural_copy.included == True]

    # Pearson r and its p-value on the selected rows.
    r, p = scipy.stats.pearsonr(pew_gpt2_cultural_test['pew_score'], pew_gpt2_cultural_test['log prob difference'])
    row = {'model': basemodelname, 'test type': test_type, 'r' : r, 'p': p ,'n': len(pew_gpt2_cultural_test)}
    list_rows.append(row)

    
    

In [19]:
# Collect the per-test-type results into a table.
df = pd.DataFrame(list_rows)
# Bonferroni-adjust the p-values across the test types. Element [1] of the result
# holds the corrected p-values; `alpha` only drives the reject decisions in
# element [0], which are not used here. It is set to the conventional 0.05
# (previously 0.5, an apparent typo) so the call does not mislead readers.
df['p'] = multipletests(df['p'], method='bonferroni', alpha=0.05)[1]
df

Unnamed: 0,model,test type,r,p,n
0,gpt2,random,0.123588,1.0,43
1,gpt2,country_based,0.498049,0.000947,48
2,gpt2,topic based,0.132275,0.74496,78
