In [None]:
from datasets import load_dataset, load_from_disk
import numpy as np
import json
from sklearn.metrics.pairwise import cosine_similarity
import re
import random
random.seed(42)
# from scipy.special import kl_div
from scipy.stats import wasserstein_distance
from transformers import BertTokenizer, BertModel
import torch
import matplotlib.pyplot as plt
%matplotlib inline

import pandas as pd
import seaborn as sns

# Set the style of seaborn for better visuals
sns.set_style("whitegrid")

In [None]:
def not_na(row):
    """Return True when no value in ``row`` is missing.

    ``row`` is iterated to obtain its keys and each key is used to index it
    (a dict-like record). A list/tuple value counts as missing if any of its
    items is NA according to ``pd.isna``; any other value is checked directly.
    """
    def _has_missing(value):
        # Sequences are inspected element by element, scalars as a whole.
        if isinstance(value, (list, tuple)):
            return any(pd.isna(item) for item in value)
        return pd.isna(value)

    for col in row:
        if _has_missing(row[col]):
            return False
    return True

def process_dataset(dataset_path):
    """Load the dataset saved at ``dataset_path`` and return its complete rows.

    The dataset is read with ``load_from_disk``, filtered with ``not_na`` so
    that rows containing a missing value are dropped, and converted to a
    pandas DataFrame.
    """
    complete_rows = load_from_disk(dataset_path).filter(not_na)
    return complete_rows.to_pandas()

# Load and NA-filter the two saved result sets (paths are relative to the
# notebook's working directory): baseline first, then the gendered variant.
df_baseline, df_gendered = map(process_dataset, ("baseline_values", "baseline_values_gendered"))
# print(df_baseline)
# print(df_gendered)


# df_baseline_engineer = df_baseline[[True if 'engineer' in title.lower() else False for title in df_baseline['title']]]
# print(df_baseline_engineer)
# df_baseline_front = df_baseline[[True if 'front' in title.lower() else False for title in df_baseline['title']]]
# print(df_baseline_front)

def grouped_bar_positions(n_groups, bar_width):
    """Return x positions for a two-series grouped bar chart.

    Returns a tuple ``(first, second, centers)`` of lists with ``n_groups``
    entries each: the centre of the first bar, the centre of the second bar
    (shifted right by ``bar_width``), and the midpoint of each pair, which is
    where the group's tick label belongs.
    """
    first = list(range(n_groups))
    second = [x + bar_width for x in first]
    centers = [x + bar_width / 2 for x in first]
    return first, second, centers


def _plot_paired_counts(df1, df2, columns, tick_labels, xlabel, title, figsize, rotation=0):
    """Draw column sums of ``df1`` and ``df2`` as side-by-side bars.

    ``columns`` are summed in each frame; ``tick_labels`` are the display
    names shown under each pair of bars, in the same order as ``columns``.
    """
    baseline_counts = df1[columns].sum()
    gendered_counts = df2[columns].sum()

    bar_width = 0.35
    r1, r2, tick_positions = grouped_bar_positions(len(columns), bar_width)

    fig, ax = plt.subplots(figsize=figsize)
    ax.bar(r1, baseline_counts.values, width=bar_width, label='Gender-free embeddings', color='b', align='center')
    ax.bar(r2, gendered_counts.values, width=bar_width, label='Gendered', color='r', align='center')

    ax.set_xlabel(xlabel, fontsize=14)
    ax.set_ylabel('Count', fontsize=14)
    ax.set_title(title, fontsize=16)
    # Ticks go at the midpoint of each pair; using r + bar_width would put
    # the label under the second bar only.
    ax.set_xticks(tick_positions)
    ax.set_xticklabels(tick_labels, rotation=rotation, fontsize=14)
    ax.legend()
    fig.tight_layout()
    plt.show()


# Plotting function for Location Distributions
def plot_combined_location_distribution(df1, df2):
    """Compare the summed location columns of ``df1`` (baseline) and ``df2`` (gendered)."""
    location_cols = ["na", "eu", "sa", "asia", "africa", "australia", "unknown", "remote"]
    location_indexes = ["North America", "Europe", "South America", "Asia", "Africa", "Australia/NZ", "Unknown", "Remote"]
    _plot_paired_counts(
        df1, df2, location_cols, location_indexes,
        xlabel='Location',
        title='Location Distributions Comparison',
        figsize=(12, 6),
        rotation=45,
    )


# Plotting function for Gender Distributions
def plot_combined_gender_distribution(df1, df2):
    """Compare the summed ``female_count`` / ``male_count`` columns of ``df1`` and ``df2``."""
    gender_cols = ["female_count", "male_count"]
    _plot_paired_counts(
        df1, df2, gender_cols, ["Female", "Male"],
        xlabel='Gender',
        title='Gender Distributions Comparison',
        figsize=(10, 5),
    )

# Plot combined distributions
# plot_combined_location_distribution(df_baseline, df_gendered)
# plot_combined_gender_distribution(df_baseline, df_gendered)

# Print the arithmetic mean of the three score columns, first for the
# baseline frame and then for the gendered frame (same message for both).
for scores_df in (df_baseline, df_gendered):
    location_avg, gender_avg, overall_avg = (
        sum(scores_df[col]) / len(scores_df[col])
        for col in ('location_count', 'gender_count', 'overall')
    )
    print(f"Location difference {location_avg}, gender difference {gender_avg} and overall difference {overall_avg}" )

In [None]:
# Column names of interest (not referenced again in the visible cells).
important_columns = [
    "gender_count", "location_count", "overall", "generated_embedding",
    "female_count", "male_count",
    "na", "eu", "sa", "asia", "africa", "australia", "unknown", "remote",
]

# Pair each baseline row with the gendered row that has the same text;
# columns present in both frames get a _baseline / _gendered suffix.
merged_df = pd.merge(df_baseline, df_gendered, on="text", suffixes=('_baseline', '_gendered'))

In [None]:
# Preview only the first rows instead of rendering the whole merged frame.
merged_df.head()

In [None]:
# Signed change in female_count (gendered minus baseline); positive means the
# gendered run scored higher.
merged_df['female_count_diff'] = merged_df['female_count_gendered'].sub(merged_df['female_count_baseline'])

# Largest increases first.
sorted_df = merged_df.sort_values('female_count_diff', ascending=False)

# Show the five rows with the largest increase.
preview_columns = ['text', 'female_count_gendered', 'female_count_baseline', 'male_count_gendered', 'male_count_baseline']
sorted_df.loc[:, preview_columns].head()


In [None]:
# Full text of the row with the largest female_count_diff.
sorted_df['text'].iloc[0]


In [None]:
# Lower-case the titles of the source frames. merged_df was built before this
# point, so its title columns are not affected by these two assignments.
df_gendered['title'] = df_gendered['title'].str.lower()
df_baseline['title'] = df_baseline['title'].str.lower()

# Collapse the two suffixed title columns produced by the merge into a single
# 'title' column. Guarded so the cell can be re-run after the suffixed columns
# have already been dropped.
if 'title_baseline' in merged_df.columns:
    merged_df['title'] = merged_df['title_baseline']
merged_df = merged_df.drop(columns=['title_baseline', 'title_gendered'], errors='ignore')

# Text of the row with the second-largest female_count_diff. Kept as the last
# expression of the cell so the notebook actually displays it.
sorted_df['text'].iloc[1]

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Lower-case the titles so differently-cased spellings share one bar.
merged_df = merged_df.assign(title=merged_df['title'].str.lower())

# The 20 rows with the largest female_count_diff.
top_20_df = merged_df.sort_values(by='female_count_diff', ascending=False).head(20)

plt.figure(figsize=(15, 8))

# One bar per title, coloured from the tab20 palette.
title_ax = sns.barplot(data=top_20_df, x='title', y='female_count_diff', palette='tab20')

title_ax.set_xlabel('Title')
title_ax.set_ylabel('Change in Female Selection Probability')
title_ax.set_title('Bar plot of Top 20 Female Count Differences by Title')
plt.xticks(rotation=45, ha='right')  # slanted labels so long titles stay legible
plt.tight_layout()
plt.show()



In [None]:
# Non-null value count of every column, per title.
merged_df.groupby('title').count()

In [None]:
import spacy
from sklearn.cluster import KMeans

# Load the spaCy `en_core_web_lg` pipeline used to embed the titles.
nlp = spacy.load('en_core_web_lg')

# One document vector per distinct title.
distinct_titles = merged_df['title'].unique()
embeddings = [nlp(title).vector for title in distinct_titles]

# Number of clusters; a tunable choice, not derived from the data.
n_clusters = 75

# Fit KMeans on the title vectors (fixed random_state for repeatable labels).
kmeans = KMeans(n_clusters=n_clusters, random_state=0)
kmeans.fit(embeddings)

# labels_ is aligned with distinct_titles, so zip gives title -> cluster id.
title_to_cluster = dict(zip(distinct_titles, kmeans.labels_))

# Attach each row's cluster id for later aggregation.
merged_df['title_cluster'] = merged_df['title'].map(title_to_cluster)

# Now, you can use the 'title_cluster' column as a processed title for aggregation or further analysis


In [None]:
# Spot-check: which titles were assigned to cluster 15?
merged_df.loc[merged_df['title_cluster'] == 15, ['title', 'title_cluster']]

In [None]:
import numpy as np

def find_closest_title(cluster_center, embeddings, titles):
    """Return the title whose embedding is nearest to ``cluster_center``.

    ``embeddings`` and ``titles`` are parallel sequences. Distance is the
    Euclidean norm of the difference; on ties the earliest entry wins.
    """
    nearest = np.argmin([np.linalg.norm(cluster_center - vec) for vec in embeddings])
    return titles[nearest]

# Distinct titles, in order of first appearance.
# NOTE(review): this must line up index-for-index with `embeddings`, which
# holds only if merged_df['title'] is unchanged since the clustering cell.
unique_titles = merged_df['title'].unique()

# For every cluster, pick the title whose vector lies closest to the centroid.
representative_titles = {
    cluster_id: find_closest_title(kmeans.cluster_centers_[cluster_id], embeddings, unique_titles)
    for cluster_id in range(n_clusters)
}

# Give each row a human-readable cluster label.
merged_df['title_cluster_name'] = merged_df['title_cluster'].map(representative_titles)


In [None]:
import pandas as pd

# Sum the female counts per title cluster for both runs, add the signed change
# (gendered minus baseline) and order clusters from largest increase to
# largest decrease.
grouped = (
    merged_df
    .groupby('title_cluster_name')
    .agg({
        'female_count_baseline': 'sum',
        'female_count_gendered': 'sum'
    })
    .reset_index()
    .assign(female_count_diff=lambda g: g['female_count_gendered'] - g['female_count_baseline'])
    .sort_values(by='female_count_diff', ascending=False)
)

# Leave the frame as the cell's last expression so Jupyter renders it as a
# table instead of printing plain text.
grouped[['title_cluster_name', 'female_count_baseline', 'female_count_gendered', 'female_count_diff']]


In [None]:
# Drop clusters whose representative name is on the exclusion list
# (presumably noise rather than real job titles - confirm with the data).
# .copy() gives an independent frame, so the column assignment below does not
# trigger pandas' SettingWithCopyWarning.
grouped_clean = grouped[~grouped['title_cluster_name'].isin(["finance guy", "engineer #2", "do"])].copy()

# Fold near-duplicate cluster names into one canonical label.
grouped_clean['title_cluster_name'] = grouped_clean['title_cluster_name'].replace({
    'business development': 'business developer',
    'business developer / assistant director': 'business developer',
    'data science, engineering': 'data engineer',
})

# Re-aggregate, because renaming can leave several rows with the same label.
grouped_clean = grouped_clean.groupby('title_cluster_name').agg({
    'female_count_baseline': 'sum',
    'female_count_gendered': 'sum'
}).reset_index()

# Signed change: gendered minus baseline.
grouped_clean['female_count_diff'] = grouped_clean['female_count_gendered'] - grouped_clean['female_count_baseline']

# Largest increase first.
grouped_clean = grouped_clean.sort_values(by='female_count_diff', ascending=False)

In [None]:
import textwrap

%config InlineBackend.figure_format = 'retina'

# Apply seaborn's 'whitegrid' style to the figures that follow.
sns.set(style='whitegrid')

# Explicit figure sizing is currently disabled; the plot uses the default figure.
# plt.figure(figsize=(6, 8), dpi=300)  # Adjusted figure size for vertical plot

# Sort ascending and keep the first 10 rows, i.e. the 10 clusters with the
# SMALLEST (most negative) female_count_diff.
# NOTE: grouped_clean must still hold the female aggregation here; a later
# cell rebinds the same name to the male aggregation.
top_10_df_female = grouped_clean.sort_values(by='female_count_diff', ascending=True).head(10)

# Horizontal bar chart (orient='h'): cluster names on the y axis, change on the x axis.
base_color = "#7F00FF"
palette = sns.dark_palette(base_color, n_colors=len(top_10_df_female))
sns.barplot(data=top_10_df_female, y='title_cluster_name', x='female_count_diff', palette=palette, orient='h')  # categories on y, values on x

min_val = -0.15  # hard-coded lower bound for the x ticks (not derived from the data)
max_val = top_10_df_female['female_count_diff'].max()  # largest plotted value
plt.xticks(np.arange(min_val, max_val + 0.05, 0.05), fontsize=10)  # ticks every 0.05 from min_val up to about max_val

# Axis labels; the y label is intentionally left empty.
plt.ylabel('', fontweight='bold', fontsize=12)
plt.xlabel('Change in Selection Probability of Female Candidates', fontweight='bold', fontsize=10)

# Fit the layout to the labels and render the figure.
plt.tight_layout()
plt.show()

In [None]:
# Sum the male counts per title cluster for both runs.
# NOTE: this rebinds `grouped` and `grouped_clean`, which previously held the
# female aggregation.
grouped = merged_df.groupby('title_cluster_name').agg({
    'male_count_baseline': 'sum',
    'male_count_gendered': 'sum'
}).reset_index()

# Drop clusters whose representative name is on the exclusion list
# (presumably noise rather than real job titles - confirm with the data).
# .copy() gives an independent frame, so the column assignment below does not
# trigger pandas' SettingWithCopyWarning.
grouped_clean = grouped[~grouped['title_cluster_name'].isin(["finance guy", "engineer #2", "do", "competitive salary and equity", "remote", "qualcomm r&#38;d\n"])].copy()

# Fold near-duplicate cluster names into one canonical label.
grouped_clean['title_cluster_name'] = grouped_clean['title_cluster_name'].replace({
    'business development': 'business developer',
    'business developer / assistant director': 'business developer',
    'data science, engineering': 'data engineer',
    'frontend developer': 'front-end developer',
    'full-time developer': 'software developers',
    'remote engineering team': 'engineer',
})

# Re-aggregate, because renaming can leave several rows with the same label.
grouped_clean = grouped_clean.groupby('title_cluster_name').agg({
    'male_count_baseline': 'sum',
    'male_count_gendered': 'sum'
}).reset_index()

# Signed change: gendered minus baseline.
grouped_clean['male_count_diff'] = grouped_clean['male_count_gendered'] - grouped_clean['male_count_baseline']

# Largest increase first.
grouped_clean = grouped_clean.sort_values(by='male_count_diff', ascending=False)

In [None]:
# # Set the Seaborn style and font scale
# sns.set(style='whitegrid')

# Set figure size and DPI (for high-quality image)
# plt.figure(figsize=(6, 8), dpi=300)  # Adjusted figure size for vertical plot

# Sort ascending and keep the first 10 rows, i.e. the 10 clusters with the
# SMALLEST (most negative) male_count_diff.
top_10_df_male = grouped_clean.sort_values(by='male_count_diff', ascending=True).head(10)

# Horizontal bar chart (orient='h'): cluster names on the y axis, change on the x axis.
base_color = "#478778"
palette = sns.dark_palette(base_color, n_colors=len(top_10_df_male))
sns.barplot(data=top_10_df_male, y='title_cluster_name', x='male_count_diff', palette=palette, orient='h')  # categories on y, values on x

# Axis labels; loc='top' places the y label at the top of the axis.
plt.ylabel('Job Titles', fontweight='bold', fontsize=12,loc='top')
plt.xlabel('Change in Selection Probability of Male Candidates', fontweight='bold', fontsize=10)

# Tick label font sizes.
plt.yticks(fontsize=12)
plt.xticks(fontsize=10)

# Fit the layout to the labels and render the figure.
plt.tight_layout()
plt.show()


In [None]:
# Cluster names of the 20 rows with the largest female_count_diff.
# Recomputed from merged_df: the top_20_df built in the earlier plotting cell
# was sliced before 'title_cluster_name' was added, so indexing it here raises
# KeyError when the notebook is run top to bottom.
merged_df.sort_values(by='female_count_diff', ascending=False).head(20)['title_cluster_name']

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import textwrap

sns.set_style("white")

def clean_and_group(df, gender):
    """Aggregate one gender's counts per cleaned title cluster.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``title_cluster_name`` plus ``{gender}_count_baseline``
        and ``{gender}_count_gendered``. The frame is not modified.
    gender : str
        Column prefix to aggregate, e.g. ``'female'`` or ``'male'``.

    Returns
    -------
    pandas.DataFrame
        One row per cleaned cluster name with the summed baseline and
        gendered columns and ``{gender}_count_diff`` (gendered minus
        baseline), sorted by that difference in descending order.

    Raises
    ------
    KeyError
        If one of the required columns is missing from ``df``.
    """
    column_prefix = f"{gender}_count"
    baseline_col = f'{column_prefix}_baseline'
    gendered_col = f'{column_prefix}_gendered'
    diff_col = f'{column_prefix}_diff'

    # Near-duplicate cluster names folded into one canonical label.
    replacements = {
        'business development': 'business developer',
        'business developer / assistant director': 'business developer',
        'data science, engineering': 'data engineer',
        'frontend developer': 'front-end developer',
        'full-time developer': 'software developers',
        'remote engineering team': 'engineer'
    }
    # Cluster names excluded from the analysis.
    excluded = ["finance guy", "engineer #2", "do", "competitive salary and equity", "remote", "qualcomm r&#38;d\n"]

    # Filter, then rename on a fresh copy via assign(); writing into the
    # filtered slice directly triggers pandas' SettingWithCopyWarning.
    keep = ~df['title_cluster_name'].isin(excluded)
    cleaned = df.loc[keep].assign(
        title_cluster_name=lambda frame: frame['title_cluster_name'].replace(replacements)
    )

    # Sum per cleaned title; renaming can merge several clusters into one.
    grouped = cleaned.groupby('title_cluster_name').agg({
        baseline_col: 'sum',
        gendered_col: 'sum'
    }).reset_index()

    # Signed change: gendered minus baseline.
    grouped[diff_col] = grouped[gendered_col] - grouped[baseline_col]

    return grouped.sort_values(by=diff_col, ascending=False)

# Per-cluster aggregates for each gender.
grouped_female = clean_and_group(merged_df, 'female')
grouped_male = clean_and_group(merged_df, 'male')

# One row per cluster name carrying both the female and the male columns.
merged_grouped = pd.merge(grouped_female, grouped_male, on="title_cluster_name", how="inner").fillna(0)

# The 10 clusters with the smallest (most negative) change for each gender.
top_10_female = merged_grouped.sort_values(by='female_count_diff', ascending=True).head(10)
top_10_male = merged_grouped.sort_values(by='male_count_diff', ascending=True).head(10)

# Union of the two selections. A cluster can be in both lists, so duplicates
# are dropped to keep exactly one bar group (and one tick label) per title;
# the frame may therefore hold fewer than 20 rows.
top_20_df = (
    pd.concat([top_10_female, top_10_male], ignore_index=True)
    .drop_duplicates(subset='title_cluster_name')
    .reset_index(drop=True)
)

# Grouped bar chart: female and male bars side by side for every title.
fig, ax = plt.subplots(figsize=(12, 6), dpi=512)

bar_width = 0.4
group_centers = np.arange(len(top_20_df))

ax.bar(group_centers - bar_width / 2, top_20_df['female_count_diff'],
       width=bar_width, color='#009688', label='Female')
ax.bar(group_centers + bar_width / 2, top_20_df['male_count_diff'],
       width=bar_width, color='#D55E00', label='Male')

# Axis labels and legend.
ax.set_xlabel('Job Titles', fontweight='bold', fontsize=12)
ax.set_ylabel('Change in Probability', fontweight='bold', fontsize=12)

# One wrapped label per bar group, placed at the centre of the group so the
# labels line up with the bars they describe.
wrapped_labels = [textwrap.fill(label, width=22) for label in top_20_df['title_cluster_name']]
ax.set_xticks(group_centers)
ax.set_xticklabels(wrapped_labels, rotation=45, ha='right', fontsize=10)
ax.tick_params(axis='y', labelsize=8)
ax.legend(loc='upper right', fontsize=14, frameon=True)

# Aesthetic tweak: remove the top and right spines.
sns.despine(top=True, right=True)

# Fit the layout to the labels and render the figure.
fig.tight_layout()
plt.show()


In [None]:
import textwrap
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

%config InlineBackend.figure_format = 'retina'

# Apply seaborn's 'whitegrid' style to the figure below.
sns.set(style='whitegrid')

# One tall figure that holds both panels.
fig = plt.figure(figsize=(12, 12), dpi=300)  # 12x12 inches at 300 dpi


# Upper panel: female candidates.
plt.subplot(2, 1, 1)  # 2 rows, 1 column, first plot

# Horizontal bars (orient='h') for top_10_df_female, which is defined in an
# earlier cell as the 10 clusters with the smallest female_count_diff.
base_color = "#7F00FF"
palette = sns.dark_palette(base_color, n_colors=len(top_10_df_female))
sns.barplot(data=top_10_df_female, y='title_cluster_name', x='female_count_diff', palette=palette, orient='h')

min_val = -0.15  # hard-coded lower bound for the x ticks (not derived from the data)
max_val = top_10_df_female['female_count_diff'].max()  # largest plotted value
plt.xticks(np.arange(min_val, max_val + 0.05, 0.05), fontsize=10)  # ticks every 0.05 from min_val up to about max_val
plt.yticks(fontsize=17)

# Axis labels; the y label is left empty because a shared label is added below.
plt.ylabel('', fontweight='bold', fontsize=12)
plt.xlabel('Change in Selection Probability of Female Candidates', fontweight='bold', fontsize=16)

# Lower panel: male candidates.
plt.subplot(2, 1, 2)  # 2 rows, 1 column, second plot

# Horizontal bars (orient='h') for top_10_df_male, which is defined in an
# earlier cell as the 10 clusters with the smallest male_count_diff.
base_color = "#478778"
palette = sns.dark_palette(base_color, n_colors=len(top_10_df_male))
sns.barplot(data=top_10_df_male, y='title_cluster_name', x='male_count_diff', palette=palette, orient='h')

# Axis labels; the y label is left empty because a shared label is added below.
plt.ylabel('', fontweight='bold', fontsize=12)
plt.xlabel('Change in Selection Probability of Male Candidates', fontweight='bold', fontsize=16)

# Shared y-axis caption, positioned in figure coordinates at the left edge.
fig.text(0.04, 0.5, 'Job Titles', va='center', rotation='vertical', fontweight='bold', fontsize=16)

# Tick label font sizes for the lower panel (the current axes).
plt.yticks(fontsize=17)
plt.xticks(fontsize=10)

# Fit the layout to the labels and render the figure.
plt.tight_layout()
plt.show()


In [None]:
# Leave the frame as the cell's last expression so Jupyter renders it as a
# table instead of printing plain text.
grouped_male