# Preprocessing WVS & PEW and obtaining their moral scores


### Libraries

In [None]:
!pip install pyreadstat



In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn.functional as F
import transformers

from tqdm import tqdm
from transformers import AutoTokenizer,AutoModelForCausalLM
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from sklearn.metrics import mean_absolute_error, mean_squared_error
from scipy.stats import pearsonr

In [None]:
# Colab-only setup: mount Google Drive at /content/drive so that the
# '/content/drive/MyDrive/...' paths used elsewhere in this notebook resolve.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Functions for WVS Dataset

In [None]:
# Single-pair variant, kept for quick experiments with only one contrast:
#TOKEN_PAIRS = [ ('always justifiable', 'never justifiable')]

# Contrasting phrase pairs used to finish each prompt sentence ("<topic> is <phrase>.").
# The prompt builders unpack each tuple as (moral phrase, non-moral phrase) and
# emit one sentence per phrase, so every topic yields len(TOKEN_PAIRS) prompt pairs.
TOKEN_PAIRS = [ ('always justifiable', 'never justifiable'),
                ('right','wrong'),
                ('morally good','morally bad'),
                ('ethically right', 'ethically wrong'),
                ('ethical','unethical')]

In [None]:
def get_wvs_df(
    wvs_path='/content/drive/MyDrive/ADS thesis Mijntje/WVS_Cross-National_Wave_7_csv_v5_0.csv',
    country_codes_path='/content/drive/MyDrive/MSc Applied Data Science/ADS thesis Mijntje/Country_Codes_Names-correct.csv',
):
    """Load the WVS moral-question answers and attach country names.

    Only the columns ``B_COUNTRY`` and ``Q177`` .. ``Q195`` are read from the
    survey file. Selecting them at parse time (``usecols``) avoids loading and
    type-inferring every other column of the survey export.

    Args:
        wvs_path: Path (or file-like object) of the WVS CSV export. A
            pre-filtered file containing just the columns above works too.
        country_codes_path: Path (or file-like object) of a CSV that has a
            ``B_COUNTRY`` column plus the name column(s) to attach; the rest
            of this notebook expects a ``Country_Names`` column in it.

    Returns:
        A DataFrame indexed by ``B_COUNTRY`` holding the selected question
        columns followed by the columns of the country-code file. Survey rows
        whose code is missing from the country-code file are kept (left join)
        with missing values in the attached columns.

    Raises:
        KeyError: If either file lacks a ``B_COUNTRY`` column.
    """
    # NOTE(review): the two default paths live under different Drive folders
    # ('ADS thesis Mijntje' vs 'MSc Applied Data Science/ADS thesis Mijntje');
    # confirm both still exist.
    wanted_columns = {'B_COUNTRY'} | {f'Q{i}' for i in range(177, 196)}

    # A callable keeps the original tolerance for absent question columns
    # (a list would raise if any of them were missing from the file).
    wvs_df = pd.read_csv(wvs_path, usecols=lambda name: name in wanted_columns)
    country_names = pd.read_csv(country_codes_path)

    return wvs_df.set_index('B_COUNTRY').join(
        country_names.set_index('B_COUNTRY'), how='left'
    )

In [None]:
# One-country list for quick test runs of the scoring loop.
COUNTRIES_WVS_W7 = ['Andorra'] #subset of countries for testing

# Full list of country names to score. get_wvs_ratings compares these strings
# for equality with the 'Country_Names' column, so the spelling here has to
# match the country-code file exactly (e.g. 'Taiwan ROC') — TODO confirm against that file.
COUNTRIES_WVS_W7_ALL = [
    'Andorra', 'Argentina', 'Armenia', 'Australia', 'Bangladesh', 'Bolivia', 'Brazil', 'Canada',
    'Chile', 'China', 'Colombia', 'Cyprus', 'Ecuador', 'Egypt', 'Ethiopia', 'Germany', 'Greece',
    'Guatemala', 'Indonesia', 'Iran', 'Iraq', 'Japan', 'Jordan', 'Kazakhstan', 'Kenya',
    'Kyrgyzstan', 'Lebanon', 'Libya', 'Malaysia', 'Maldives', 'Mexico', 'Mongolia', 'Morocco',
    'Myanmar', 'Netherlands', 'New Zealand', 'Nicaragua', 'Nigeria', 'Pakistan', 'Peru',
    'Philippines', 'Romania', 'Russia', 'Singapore', 'South Korea', 'Taiwan ROC', 'Tajikistan',
    'Thailand', 'Tunisia', 'Turkey', 'Ukraine', 'United States', 'Venezuela',
    'Vietnam', 'Zimbabwe'
]

In [None]:
# Column names of the questions to score: 'Q177' through 'Q195' (19 columns).
W7_QUESTIONS = ['Q'+str(i) for i in range(177, 196)]

# Topic wording inserted into the prompts. The list is positionally aligned
# with W7_QUESTIONS (the prompt builder zips the two), so entry k describes
# question Q(177 + k); keep both lists the same length and order.
W7_QUESTIONS_TEXT = ['claiming government benefits to which you are not entitled',
                     'avoiding a fare on public transport',
                     'stealing property',
                     'cheating on taxes',
                     'someone accepting a bribe in the course of their duties',
                     'homosexuality',
                     'prostitution',
                     'abortion',
                     'divorce',
                     'sex before marriage',
                     'suicide',
                     'euthanasia',
                     'for a man to beat his wife',
                     'parents beating children',
                     'violence against other people',
                     'terrorism as a political, ideological or religious mean',
                     'having casual sex',
                     'political violence',
                     'death penalty']

# Lookup from wave number to (question columns, question texts); only wave 7 is defined.
QUESTION_WAVES_WVS = {7: (W7_QUESTIONS, W7_QUESTIONS_TEXT)}

In [None]:
# Linear rescaling applied to every valid answer: (answer - MINUS) / DIVIDE.
# It sends 1 to -1 and 10 to +1 (presumably the survey's 1-10 answer scale).
MINUS = 5.5
DIVIDE = 4.5


def get_wvs_ratings(wvs_df, culture, q):
    """Return the mean rescaled answer to question ``q`` for one country.

    Args:
        wvs_df: DataFrame with a 'Country_Names' column and a column named ``q``.
        culture: Country name, matched by equality against 'Country_Names'.
        q: Name of the question column.

    Returns:
        ``None`` when no row belongs to ``culture``; ``0`` when the country has
        rows but none with a strictly positive answer; otherwise the mean of
        ``(answer - MINUS) / DIVIDE`` over the strictly positive answers.
    """
    country_rows = wvs_df[['Country_Names', q]]
    country_rows = country_rows[country_rows['Country_Names'] == culture]
    if country_rows.shape[0] == 0:
        return None

    # Answers that are zero or negative are left out of the average.
    answers = country_rows[q]
    valid_answers = answers[answers > 0]
    if valid_answers.empty:
        return 0

    return ((valid_answers - MINUS) / DIVIDE).mean()

In [None]:
#Generating the WVS prompts

def wvs_gpt2_prompts_ratings_multiple_tokens(wvs_df: pd.DataFrame, culture='', prompt_mode='in', wave=7):
    """Build the prompt pairs for every WVS topic, tagged with the survey score.

    Args:
        wvs_df: Survey frame passed through to ``get_wvs_ratings``.
        culture: Country name; an empty string produces prompts without any
            country prefix.
        prompt_mode: 'in' prefixes prompts with "In <culture> ", 'people' with
            "People in <culture> believe ". Any other value leaves the prefix
            empty.
        wave: Key into ``QUESTION_WAVES_WVS``.

    Returns:
        Dict mapping each topic text to a list with one
        ``(moral_prompt, nonmoral_prompt, survey_score)`` tuple per entry of
        ``TOKEN_PAIRS``. The score is the same in every tuple of a topic.
    """
    has_culture = len(culture) > 0
    culture_prompt = ''
    if has_culture and prompt_mode == 'in':
        print("The token used is IN")
        culture_prompt = f'In {culture} '
    elif has_culture and prompt_mode == 'people':
        print("The token used is PEOPLE")
        culture_prompt = f'People in {culture} believe '

    question_ids, question_texts = QUESTION_WAVES_WVS[wave]

    prompts = {}
    for question_id, topic in zip(question_ids, question_texts):
        # Survey score for this country/topic; shared by all phrase pairs below.
        survey_score = get_wvs_ratings(wvs_df, culture, question_id)
        prompts[topic] = [
            (
                f'{culture_prompt}{topic} is {moral_phrase}.',
                f'{culture_prompt}{topic} is {nonmoral_phrase}.',
                survey_score,
            )
            for moral_phrase, nonmoral_phrase in TOKEN_PAIRS
        ]

    return prompts

In [None]:
# Compute one survey score per (country, topic) from WVS and save them as CSV.
wvs_df = get_wvs_df()

cultures = COUNTRIES_WVS_W7_ALL
# To also produce culture-agnostic rows (labelled 'universal' below), an empty
# string would have to be added to the list of cultures.

wvs_all = []
for culture in tqdm(cultures):
    prompts = wvs_gpt2_prompts_ratings_multiple_tokens(wvs_df, culture, 'in')
    culture_name = culture if culture else 'universal'

    for question, rating_pairs in prompts.items():
        # Every tuple of a topic carries the same survey score, so the first
        # one is enough.
        wvs_all.append({
            'country': culture_name,
            'topic': question,
            'wvs_score': rating_pairs[0][2],
        })

df = pd.DataFrame(wvs_all)

# Despite its name, save_dir is the full path of the output CSV file.
save_dir = f'/content/drive/MyDrive/MSc Applied Data Science/ADS thesis Mijntje/WVS_moral_scores.csv'
df.to_csv(save_dir, index=False)

  wvs_df = pd.read_csv('/content/drive/MyDrive/ADS thesis/cultural_inference-main/data/WVS/WVS_Cross-National_Wave_7_csv_v5_0.csv')
  0%|          | 0/55 [00:00<?, ?it/s]

The token used is IN


  2%|▏         | 1/55 [00:00<00:14,  3.62it/s]

The token used is IN


  4%|▎         | 2/55 [00:00<00:15,  3.51it/s]

The token used is IN


  5%|▌         | 3/55 [00:00<00:14,  3.48it/s]

The token used is IN


  7%|▋         | 4/55 [00:01<00:14,  3.53it/s]

The token used is IN


  9%|▉         | 5/55 [00:01<00:14,  3.52it/s]

The token used is IN


 11%|█         | 6/55 [00:01<00:14,  3.50it/s]

The token used is IN


 13%|█▎        | 7/55 [00:02<00:13,  3.48it/s]

The token used is IN


 15%|█▍        | 8/55 [00:02<00:13,  3.46it/s]

The token used is IN


 18%|█▊        | 10/55 [00:02<00:11,  4.04it/s]

The token used is IN
The token used is IN


 22%|██▏       | 12/55 [00:03<00:08,  4.90it/s]

The token used is IN
The token used is IN


 25%|██▌       | 14/55 [00:03<00:07,  5.50it/s]

The token used is IN
The token used is IN


 29%|██▉       | 16/55 [00:03<00:06,  5.62it/s]

The token used is IN
The token used is IN


 33%|███▎      | 18/55 [00:04<00:06,  5.85it/s]

The token used is IN
The token used is IN


 36%|███▋      | 20/55 [00:04<00:05,  5.98it/s]

The token used is IN
The token used is IN


 40%|████      | 22/55 [00:04<00:05,  5.95it/s]

The token used is IN
The token used is IN


 44%|████▎     | 24/55 [00:05<00:05,  6.08it/s]

The token used is IN
The token used is IN


 47%|████▋     | 26/55 [00:05<00:04,  6.14it/s]

The token used is IN
The token used is IN


 51%|█████     | 28/55 [00:05<00:04,  5.99it/s]

The token used is IN
The token used is IN


 55%|█████▍    | 30/55 [00:06<00:04,  6.08it/s]

The token used is IN
The token used is IN


 58%|█████▊    | 32/55 [00:06<00:03,  6.09it/s]

The token used is IN
The token used is IN


 62%|██████▏   | 34/55 [00:06<00:03,  5.76it/s]

The token used is IN
The token used is IN


 65%|██████▌   | 36/55 [00:07<00:03,  5.94it/s]

The token used is IN
The token used is IN


 69%|██████▉   | 38/55 [00:07<00:02,  6.01it/s]

The token used is IN
The token used is IN


 73%|███████▎  | 40/55 [00:07<00:02,  5.98it/s]

The token used is IN
The token used is IN


 76%|███████▋  | 42/55 [00:08<00:02,  5.87it/s]

The token used is IN
The token used is IN


 78%|███████▊  | 43/55 [00:08<00:02,  5.89it/s]

The token used is IN


 80%|████████  | 44/55 [00:08<00:02,  5.42it/s]

The token used is IN


 84%|████████▎ | 46/55 [00:08<00:01,  5.19it/s]

The token used is IN
The token used is IN


 87%|████████▋ | 48/55 [00:09<00:01,  5.41it/s]

The token used is IN
The token used is IN


 91%|█████████ | 50/55 [00:09<00:00,  5.56it/s]

The token used is IN
The token used is IN


 93%|█████████▎| 51/55 [00:09<00:00,  5.65it/s]

The token used is IN


 96%|█████████▋| 53/55 [00:10<00:00,  5.38it/s]

The token used is IN
The token used is IN


100%|██████████| 55/55 [00:10<00:00,  5.24it/s]

The token used is IN





### Functions for PEW Dataset

In [None]:
#1
import pandas as pd

def get_pew_df(path='/content/drive/MyDrive/MSc Applied Data Science/ADS thesis Mijntje/Pew Research Global Attitudes Project Spring 2013 Dataset for web.sav'):
    """Load the PEW moral questions and recode the answers as numbers.

    Args:
        path: Location of the SPSS (.sav) survey file.

    Returns:
        A DataFrame with the country column renamed to 'Country_Names' and the
        columns Q84A .. Q84H recoded as: 'Morally acceptable' -> 1,
        'Morally unacceptable' -> -1, and 'Not a moral issue',
        'Depends on situation (Volunteered)', 'Refused', "Don't know" -> 0.
        Any other answer becomes NaN.
    """
    pew_data_original = pd.read_spss(path)

    # The whole alternation is anchored. The earlier pattern
    # '^Q84[A-H]|COUNTRY' also kept every column that merely contained
    # "COUNTRY" or started with Q84A..Q84H.
    filtered_columns = pew_data_original.filter(regex='^(Q84[A-H]|COUNTRY)$').copy()
    filtered_columns = filtered_columns.rename(columns={'COUNTRY': 'Country_Names'})

    # Answer label -> numeric score. Non-committal answers count as 0 and
    # therefore still enter any mean computed later.
    replace_map = {
        'Morally acceptable': 1,
        'Not a moral issue': 0,
        'Morally unacceptable': -1,
        'Depends on situation (Volunteered)': 0,
        'Refused': 0,
        "Don't know": 0
    }
    filtered_columns = filtered_columns.replace(replace_map)

    # Pick the question columns by name, not by position, so the country
    # column is never coerced to numbers whatever the column order is.
    question_columns = [col for col in filtered_columns.columns if col != 'Country_Names']
    for col in question_columns:
        filtered_columns[col] = pd.to_numeric(filtered_columns[col], errors='coerce')

    return filtered_columns

In [None]:
#2 DONE!
# Country names to score for PEW. get_pew_ratings compares these strings for
# equality with the 'Country_Names' column, so they must be spelled the way the
# survey file labels its countries (e.g. 'Britain', 'Palestinian territories')
# — TODO confirm against the data file.
COUNTRIES_PEW_ALL = [
    'United States', 'Czech Republic', 'South Korea', 'Canada', 'France', 'Germany',
    'Spain', 'Mexico', 'Chile', 'Australia', 'Russia', 'Britain', 'Turkey', 'Greece',
    'Egypt', 'Poland', 'Senegal', 'Italy', 'Brazil', 'Lebanon', 'Nigeria', 'Japan',
    'Malaysia', 'Kenya', 'Indonesia', 'Uganda', 'Jordan', 'Argentina', 'Philippines',
    'Tunisia', 'China', 'Pakistan', 'Ghana', 'South Africa', 'Palestinian territories',
    'Israel', 'Bolivia', 'Venezuela', 'El Salvador'
]

In [None]:
#3 DONE!
# Column names of the questions to score: 'Q84A' through 'Q84H' (8 columns).
PEW_QUESTIONS = ['Q84' + chr(i) for i in range(ord('A'), ord('H')+1)]

# Topic wording inserted into the prompts. Positionally aligned with
# PEW_QUESTIONS (the prompt builder zips the two), so keep both lists the same
# length and order.
PEW_QUESTIONS_TEXT = ['using contraceptives',
                      'getting a divorce',
                      'having an abortion',
                      'homosexuality',
                      'drinking alcohol',
                      'married people having an affair',
                      'gambling',
                      'sex between unmarried adults']

# PEW counterpart of QUESTION_WAVES_WVS, under its own name so the two lookups
# can coexist. The key 13 is the default `wave` of the PEW prompt builder
# (presumably the 2013 survey year).
QUESTION_WAVES_PEW = {13: (PEW_QUESTIONS, PEW_QUESTIONS_TEXT)}

In [None]:
#4 done!
#My values are already normalized

def get_pew_ratings(pew_df, culture, q):
    """Return the mean answer to question ``q`` for one country.

    No rescaling is applied; the values in ``pew_df`` are averaged as they are.

    Args:
        pew_df: DataFrame with a 'Country_Names' column and a column named ``q``.
        culture: Country name, matched by equality against 'Country_Names'.
        q: Name of the question column.

    Returns:
        The mean of column ``q`` over the country's rows (missing values are
        skipped by the mean), or ``None`` (after printing a message) when the
        country has no rows or the mean is NaN.
    """
    country_rows = pew_df[['Country_Names', q]]
    country_rows = country_rows[country_rows['Country_Names'] == culture]
    if country_rows.empty:
        print("No data found for culture:", culture)
        return None

    average = country_rows[q].mean()
    if not pd.isna(average):
        return average

    # Reached when the country has no non-missing value for this question.
    print("Problem: Mean calculation resulted in NaN for culture:", culture)
    return None

In [None]:
#5
#Generating the PEW prompts
def pew_gpt2_prompts_ratings_multiple_tokens(pew_df: pd.DataFrame, culture='', prompt_mode='in', wave=13):
    """Build the prompt pairs for every PEW topic, tagged with the survey score.

    Args:
        pew_df: Survey frame passed through to ``get_pew_ratings``.
        culture: Country name; an empty string produces prompts without any
            country prefix.
        prompt_mode: 'in' prefixes prompts with "In <culture> ", 'people' with
            "People in <culture> believe ". Any other value leaves the prefix
            empty.
        wave: Key into ``QUESTION_WAVES_PEW``.

    Returns:
        Dict mapping each topic text to a list with one
        ``(moral_prompt, nonmoral_prompt, survey_score)`` tuple per entry of
        ``TOKEN_PAIRS``. The score is the same in every tuple of a topic.
    """
    has_culture = len(culture) > 0
    culture_prompt = ''
    if has_culture and prompt_mode == 'in':
        culture_prompt = f'In {culture} '
        print("Inside IN")
    elif has_culture and prompt_mode == 'people':
        culture_prompt = f'People in {culture} believe '
        print("Inside PEOPLE")

    question_ids, question_texts = QUESTION_WAVES_PEW[wave]

    prompts = {}
    for question_id, topic in zip(question_ids, question_texts):
        # Survey score for this country/topic; shared by all phrase pairs below.
        survey_score = get_pew_ratings(pew_df, culture, question_id)
        prompts[topic] = [
            (
                f'{culture_prompt}{topic} is {moral_phrase}.',
                f'{culture_prompt}{topic} is {nonmoral_phrase}.',
                survey_score,
            )
            for moral_phrase, nonmoral_phrase in TOKEN_PAIRS
        ]

    return prompts

In [None]:
# wvs_df = get_wvs_df()  #!!!!!!!!!!!!!!!!!!!!!!
# Compute one survey score per (country, topic) from PEW and save them as CSV.
pew_df = get_pew_df()

cultures = COUNTRIES_PEW_ALL
# To also produce culture-agnostic rows (labelled 'universal' below), an empty
# string would have to be added to the list of cultures.

pew_all = []
for culture in tqdm(cultures):
    prompts = pew_gpt2_prompts_ratings_multiple_tokens(pew_df, culture, prompt_mode='in')
    culture_name = culture if culture else 'universal'

    for question, rating_pairs in prompts.items():
        # Every tuple of a topic carries the same PEW score, so the first one
        # is enough.
        pew_all.append({
            'country': culture_name,
            'topic': question,
            'pew_score': rating_pairs[0][2],
        })

df = pd.DataFrame(pew_all)

# Despite its name, save_dir is the full path of the output CSV file.
save_dir = f'/content/drive/MyDrive/MSc Applied Data Science/ADS thesis Mijntje/PEW_moral_scores.csv'
df.to_csv(save_dir, index=False)

 26%|██▌       | 10/39 [00:00<00:00, 98.97it/s]

Inside IN
Inside IN
Inside IN
Inside IN
Inside IN
Inside IN
Inside IN
Inside IN
Inside IN
Inside IN
Inside IN
Inside IN
Inside IN
Inside IN
Inside IN
Inside IN
Inside IN
Inside IN
Inside IN
Inside IN


100%|██████████| 39/39 [00:00<00:00, 96.83it/s]

Inside IN
Inside IN
Inside IN
Inside IN
Inside IN
Inside IN
Inside IN
Inside IN
Inside IN
Inside IN
Inside IN
Inside IN
Inside IN
Inside IN
Inside IN
Inside IN
Inside IN
Inside IN
Inside IN



