In [11]:
import sys
import os
import pprint
from collections import Counter
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt


sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '../methods')))
from marked_words import marked_words 

def pprint(dic):
    full_list = []
    for word in sorted(dic, key=lambda x: x[1], reverse=True):
        full_list.append(word[0])
    return full_list

In [13]:
# Load datasets
df_gpt_4o = pd.read_csv('../new_data/gpt-4o/gpt-4o_personas.csv')
df_gpt_4 = pd.read_csv('../new_data/gpt-4/gpt-4_personas.csv')
df_gpt_4_turbo = pd.read_csv('../new_data/gpt-4-turbo/gpt-4-turbo_personas.csv')
df_gemini = pd.read_csv('../new_data/gemini/gemini_personas.csv')
df_gpt_35_turbo = pd.read_csv('../new_data/gpt-3.5-turbo/gpt-3.5-turbo-0125_personas.csv')
df_llama3 = pd.read_csv('../new_data/llama3/meta-llama/Llama-3-70b-chat-hf_personas.csv')
df_mixtral = pd.read_csv('../new_data/mixtral/mistralai/Mixtral-8x22B-Instruct-v0.1_personas.csv')

titles = [
    'GPT-4o_Dataset',
    'GPT-4_Dataset',
    'GPT-4_Turbo_Dataset',
    'Gemini-1.5-Flash_Dataset',
    'GPT-3.5_Turbo_Dataset',
    'Llama-3_Dataset',
    'Mixtral_Dataset'
]

dfs = [df_gpt_4o, df_gpt_4, df_gpt_4_turbo, df_gemini, df_gpt_35_turbo, df_llama3, df_mixtral]

In [15]:
# Clean text and drop NA values
for df in dfs:
    df['text_clean'] = df['text'].str.lower().str.replace('[^\w\s]', '', regex=True)
    df.dropna(inplace=True)
    
for title, df in zip(titles, dfs):
    print(f"{title}\n{'=' * len(title)}")
    display(df["prompt_num"].value_counts())
    display(df["gender"].value_counts())
    display(df["race"].value_counts())
    print("\n")

GPT-4o_Dataset


prompt_num
0    225
1    225
2    225
3    225
4    225
5    225
Name: count, dtype: int64

gender
W    450
M    450
N    450
Name: count, dtype: int64

race
a White             270
a Black             270
an Asian            270
a Middle-Eastern    270
a Latino            270
Name: count, dtype: int64



GPT-4_Dataset


prompt_num
0    225
1    225
2    225
3    225
4    225
5    225
Name: count, dtype: int64

gender
W    450
M    450
N    450
Name: count, dtype: int64

race
a White             270
a Black             270
an Asian            270
a Middle-Eastern    270
a Latino            270
Name: count, dtype: int64



GPT-4_Turbo_Dataset


prompt_num
0    225
1    225
2    225
3    225
4    225
5    225
Name: count, dtype: int64

gender
W    450
M    450
N    450
Name: count, dtype: int64

race
a White             270
a Black             270
an Asian            270
a Middle-Eastern    270
a Latino            270
Name: count, dtype: int64



Gemini-1.5-Flash_Dataset


prompt_num
1    225
2    222
0    209
3    205
5    201
4    165
Name: count, dtype: int64

gender
N    436
W    402
M    389
Name: count, dtype: int64

race
an Asian            270
a Middle-Eastern    270
a White             219
a Black             198
a Latino            180
a Latina             90
Name: count, dtype: int64



GPT-3.5_Turbo_Dataset


prompt_num
0    225
1    225
2    225
3    225
4    225
5    225
Name: count, dtype: int64

gender
W    450
M    450
N    450
Name: count, dtype: int64

race
a White             270
a Black             270
an Asian            270
a Middle-Eastern    270
a Latino            270
Name: count, dtype: int64



Llama-3_Dataset


prompt_num
0    225
1    225
2    225
3    225
4    225
5    225
Name: count, dtype: int64

gender
W    450
M    450
N    450
Name: count, dtype: int64

race
a White             270
a Black             270
an Asian            270
a Middle-Eastern    270
a Latino            270
Name: count, dtype: int64



Mixtral_Dataset


prompt_num
0    225
1    225
2    225
3    225
4    225
5    225
Name: count, dtype: int64

gender
W    450
M    450
N    450
Name: count, dtype: int64

race
a White             270
a Black             270
an Asian            270
a Middle-Eastern    270
a Latino            270
Name: count, dtype: int64





In [17]:
# Define target groups
races = ['a Black', 'an Asian', 'a Latino', 'a Middle-Eastern', 'a White']
genders = ['M', 'W', 'N']
group_titles = []

# Generate all combinations of races and genders
targets = []
for race in races:
    for gender in genders:
        targets.append((race, gender))
        group_titles.append(f'{race} {gender}')

# Display the groups and their titles
count = 1
for group_title, target in zip(group_titles, targets):
    print(count, ': ', group_title, '\n')
    count += 1
    
# Parameters for marked_words function
unmarked_val = ['a White', 'M']

1 :  a Black M 

2 :  a Black W 

3 :  a Black N 

4 :  an Asian M 

5 :  an Asian W 

6 :  an Asian N 

7 :  a Latino M 

8 :  a Latino W 

9 :  a Latino N 

10 :  a Middle-Eastern M 

11 :  a Middle-Eastern W 

12 :  a Middle-Eastern N 

13 :  a White M 

14 :  a White W 

15 :  a White N 



In [30]:
# Apply marked_words function to each dataset and each target group
results = {}
for title, df in zip(titles, dfs):
    results[title] = {}
    print('Model: ',title)
    print('----------------------------------')
    
    for race, gender in targets:
        target_val = [race, gender]
        target_col = ['race', 'gender']
        top_words = marked_words(df, target_val, target_col, unmarked_val, verbose=False)
        results[title][f'{race} {gender}'] = top_words
        print('Identity group: ',f'{race} {gender}', ' -', len(results[title][f'{race} {gender}']), '\n List of Significant Words: ', results[title][f'{race} {gender}'], '\n')
    
# for title in titles:
#     print(title, '\n')
#     print(results[title], '\n')

Model:  GPT-4o_Dataset
----------------------------------
Identity group:  a Black M  - 0 
 List of Significant Words:  [] 

Identity group:  a Black W  - 0 
 List of Significant Words:  [] 

Identity group:  a Black N  - 13 
 List of Significant Words:  [['that', 4.525515888326928], ['their', 12.144211709284985], ['traditional', 4.72704753340205], ['blend', 4.580114462864325], ['vibrant', 4.768290058391787], ['unique', 5.4882517369077135], ['patterns', 5.494728240773512], ['intricate', 4.094123379097157], ['curls', 5.028265462790729], ['bold', 7.216654558886772], ['braids', 5.0776221189921396], ['prints', 4.120088894677117], ['intersectionality', 4.424070682602778]] 

Identity group:  an Asian M  - 2 
 List of Significant Words:  [['asian', 7.442001315145362], ['almondshaped', 5.794283855994778]] 

Identity group:  an Asian W  - 2 
 List of Significant Words:  [['asian', 7.943774779193449], ['almondshaped', 6.174506177949402]] 

Identity group:  an Asian N  - 8 
 List of Significant W