## Yearly Occupation Bias Visualization
This code generates the yearly gender-bias plots for occupations (the Nahar and Assafir plots).

In [4]:
import pandas as pd
from arabic_reshaper import reshape
from bidi.algorithm import get_display
import seaborn as sns
import os
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import matplotlib.colors as mcolors


# Load the yearly occupation projections from the "output" directory: one file
# with grammatical disentanglement applied and one without.
# os.path.join keeps the paths portable; the previous "output\occupation_..."
# literals relied on the invalid "\o" escape sequence and only resolved on
# Windows.
merged_df_dis = pd.read_csv(os.path.join("output", "occupation_disentangled.csv"))
merged_df_notdis = pd.read_csv(os.path.join("output", "occupation_notdisentangled.csv"))

# Remove every occurrence of 'ال' (presumably the Arabic definite article) from
# the occupation columns of both frames. The match is a plain substring, not a
# regular expression, and is not limited to the start of the string.
for frame in (merged_df_dis, merged_df_notdis):
    for column in ('occupation_combined', 'male occupation'):
        frame[column] = frame[column].str.replace('ال', '', regex=False)


In [None]:
# Lookup table from Arabic occupation words to the English label used in plots.
# Entries come in pairs of two word forms (e.g. 'مهندسة' / 'مهندس') that share
# one English label. Some synonyms also collapse onto the same label: both
# 'طبيب' and 'دكتور' map to 'Doctor', both 'كاتب' and 'أديب' map to 'Writer',
# and both 'بروفيسور' and 'أستاذ' map to 'Professor'.
# Keys carry no 'ال' prefix, matching the occupation columns after 'ال' has
# been stripped from them.
occupation_mapping = {
    'مهندسة': 'Engineer',
    'مهندس': 'Engineer',
    'طبيبة': 'Doctor',
    'دكتورة': 'Doctor',
    'طبيب': 'Doctor',
    'دكتور': 'Doctor',
    'محامية': 'Lawyer',
    'محامي': 'Lawyer',
    'شرطية': 'Police Officer',
    'شرطي': 'Police Officer',
    'ممرضة': 'Nurse',
    'ممرض': 'Nurse',
    'بائعة': 'Seller',
    'بائع': 'Seller',
    'صيدلانية': 'Pharmacist',
    'صيدلي': 'Pharmacist',
    'موظفة': 'Employee',
    'موظف': 'Employee',
    'مديرة': 'Manager',
    'مدير': 'Manager',
    'كاتبة': 'Writer',
    'أديبة': 'Writer',
    'كاتب': 'Writer',
    'أديب': 'Writer',
    'باحثة': 'Researcher',
    'باحث': 'Researcher',
    'صحفية': 'Journalist',
    'صحفي': 'Journalist',
    'سفيرة': 'Ambassador',
    'سفير': 'Ambassador',
    'وزيرة': 'Minister',
    'وزير': 'Minister',
    'خادمة': 'Servant',
    'خادم': 'Servant',
    'طباخة': 'Cook',
    'طباخ': 'Cook',
    'سكرتيرة': 'Secretary',
    'سكرتير': 'Secretary',
    'نائبة': 'Deputy',
    'نائب': 'Deputy',
    'بروفيسورة': 'Professor',
    'أستاذة': 'Professor',
    'بروفيسور': 'Professor',
    'أستاذ': 'Professor',
    'مصورة': 'Photographer',
    'مصور': 'Photographer',
    'سائقة': 'Driver',
    'سائق': 'Driver',
    'راقصة': 'Dancer',
    'راقص': 'Dancer',
    'مغنية': 'Singer',
    'مغني': 'Singer',
    'ممثلة': 'Actor',
    'ممثل': 'Actor',
    'فنانة': 'Artist',
    'فنان': 'Artist',
    'شاعرة': 'Poet',
    'شاعر': 'Poet',
    'خبازة': 'Baker',
    'خباز': 'Baker',
    'عاملة': 'Worker',
    'عامل': 'Worker',
    'بستانية': 'Gardener',
    'بستاني': 'Gardener',
    'ناطورة': 'Janitor',
    'ناطور': 'Janitor',
    'مفتشة': 'Inspector',
    'مفتش': 'Inspector',
    'عازفة': 'Musician',
    'عازف': 'Musician',
    'جندية': 'Soldier',
    'جندي': 'Soldier',
    'حارسة': 'Guard',
    'حارس': 'Guard',
    'رسامة': 'Painter',
    'رسام': 'Painter'
}

# Column holding the Arabic occupation words to translate. Note that this
# column name contains a space ('male occupation'), not an underscore.
column_to_translate = 'male occupation'

# Look up the English label for every row via occupation_mapping; words that
# are not keys of the mapping come back as NaN.
arabic_words = merged_df_dis[column_to_translate]
merged_df_dis['translated_occupation'] = arabic_words.map(occupation_mapping)

# Write the frame, including the new 'translated_occupation' column, to disk
# without the index column.
merged_df_dis.to_csv('translated_file.csv', index=False)

print("Translation completed and saved to translated_file.csv.")


### Mostly Positive Plot

In [None]:
# Add an 'occupation' column holding the Arabic text passed through
# arabic_reshaper.reshape and bidi get_display; it is used below both as the
# grouping key and as the plotted label.
merged_df_notdis['occupation'] = merged_df_notdis['occupation_combined'].apply(lambda x: get_display(reshape(x)))
merged_df_dis['occupation'] = merged_df_dis['occupation_combined'].apply(lambda x: get_display(reshape(x)))

# Build one summary row per occupation (in order of first appearance) from the
# non-disentangled projections. Rows are collected in a list and turned into a
# DataFrame once: DataFrame.append was removed in pandas 2.0 and copied the
# whole frame on every call.
summary_rows = []
for occupation in merged_df_notdis['occupation'].unique():
    df_occ = merged_df_notdis[merged_df_notdis['occupation'] == occupation]
    # Row count for this occupation (assumes one row per year - TODO confirm).
    total_years = len(df_occ)
    positive_years = int((df_occ['projection'] > 0).sum())
    summary_rows.append({
        'occupation': occupation,
        'total_years': total_years,
        'positive_years': positive_years,
        'percentage_positive': (positive_years / total_years) * 100,
    })

# Passing `columns` keeps the expected columns even when there are no rows.
occupation_summary = pd.DataFrame(
    summary_rows,
    columns=['occupation', 'total_years', 'positive_years', 'percentage_positive'],
)

# Keep the occupations whose projection is positive in more than 60% of rows.
occupations_mostly_positive = occupation_summary[occupation_summary['percentage_positive'] > 60]

print(occupations_mostly_positive)

# Occupations (display strings) handed to the plotting cell that follows.
elements = occupations_mostly_positive['occupation'].unique()
num_elements = len(elements)

In [None]:
# Grid layout: n panels per row, with enough rows to hold every selected
# occupation in `elements`.
n = 2
num_elements = len(elements)
if num_elements == 0:
    # A zero-row subplot grid cannot be created; fail early with a clear message.
    raise ValueError("No occupations selected for plotting.")
num_rows = (num_elements + n - 1) // n

# squeeze=False keeps `axes` two-dimensional even for a single-row grid, so
# axes[row, col] below also works when only one or two occupations are plotted.
fig, axes = plt.subplots(nrows=num_rows, ncols=n, figsize=(150, 35 * num_rows),
                         sharex=False, squeeze=False)

# Spacing between major x-axis ticks: one tick every m years.
m = 5

# Handles and labels for the figure-level legend.
# NOTE(review): legend_handles is never populated, so the legend created below
# appears to end up without entries - confirm what the legend should show.
legend_handles = []
legend_labels = ['Direct Bias', 'Direct Bias after grammatical disentanglement']

# Draw one bar-chart panel per occupation, using the disentangled projections.
for i, element in enumerate(elements):
    row, col = divmod(i, n)
    ax = axes[row, col]

    # Work on a copy so that converting 'year' does not write into a slice of
    # merged_df_dis (avoids pandas' SettingWithCopyWarning).
    df_dis = merged_df_dis[merged_df_dis['occupation'] == element].copy()
    df_dis['year'] = pd.to_numeric(df_dis['year'])
    df_dis = df_dis.sort_values('year')

    if df_dis.empty:
        # No disentangled rows for this occupation: leave the panel blank
        # instead of failing on .iloc[0] below.
        ax.set_axis_off()
        continue

    # Panel title such as "(a) Engineer": a running letter plus the English label.
    subplot_label = f"({chr(ord('a') + i)}) {df_dis['translated_occupation'].iloc[0]}"
    ax.text(0.5, 0.93, subplot_label, transform=ax.transAxes,
            fontsize=160, fontweight='bold', va='center', ha='center')

    # Bars are coloured by sign: "#36648B" for projection >= 0, "#CD6889" otherwise.
    colors = ["#36648B" if value >= 0 else "#CD6889" for value in df_dis['projection']]
    ax.bar(df_dis['year'], df_dis['projection'], color=colors)

    ax.set_ylabel(element, fontsize=200)
    # Fixed y-range for every panel. (A data-driven set_ylim used to precede
    # this call but was always overwritten by it, so it has been dropped.)
    ax.set_ylim(-0.2, 0.5)
    ax.axhline(y=0, color='black', linestyle='--')

    ax.tick_params(axis='x', rotation=45, labelsize=140)
    ax.tick_params(axis='y', labelsize=140)
    ax.xaxis.set_major_locator(ticker.MultipleLocator(base=m))

# Remove the unused trailing panels of the grid.
for unused_ax in axes.ravel()[num_elements:]:
    fig.delaxes(unused_ax)

# Adjust subplot spacing, then right-align the rotated x tick labels.
plt.tight_layout()
for ax in axes.flat:
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right')

# Figure-level legend anchored in the lower-right quadrant of the figure.
fig.legend(handles=legend_handles, labels=legend_labels, loc='lower right', fontsize=65,
           bbox_to_anchor=(0.5, 0., 0.5, 0.5))

# Save the plot to the "figures" directory, creating it if needed.
os.makedirs('figures', exist_ok=True)
plot_file_path = os.path.join('figures', 'nahar_occupations_mostly_positive.png')
plt.savefig(plot_file_path, dpi=100, bbox_inches="tight")

# Show the plot
plt.show()


### Mostly Negative Plot

In [None]:
# Add an 'occupation' column holding the Arabic text passed through
# arabic_reshaper.reshape and bidi get_display; it is used below both as the
# grouping key and as the plotted label.
merged_df_notdis['occupation'] = merged_df_notdis['occupation_combined'].apply(lambda x: get_display(reshape(x)))
merged_df_dis['occupation'] = merged_df_dis['occupation_combined'].apply(lambda x: get_display(reshape(x)))

# Build one summary row per occupation (in order of first appearance) from the
# non-disentangled projections. Rows are collected in a list and turned into a
# DataFrame once: DataFrame.append was removed in pandas 2.0 and copied the
# whole frame on every call.
summary_rows = []
for occupation in merged_df_notdis['occupation'].unique():
    df_occ = merged_df_notdis[merged_df_notdis['occupation'] == occupation]
    # Row count for this occupation (assumes one row per year - TODO confirm).
    total_years = len(df_occ)
    negative_years = int((df_occ['projection'] < 0).sum())
    summary_rows.append({
        'occupation': occupation,
        'total_years': total_years,
        'negative_years': negative_years,
        'percentage_negative': (negative_years / total_years) * 100,
    })

# The column names now match the keys written above. Previously the frame
# declared 'positive_years'/'percentage_positive' and the rows added a
# misspelled 'negayive_years', leaving all-NaN columns in the summary.
occupation_summary = pd.DataFrame(
    summary_rows,
    columns=['occupation', 'total_years', 'negative_years', 'percentage_negative'],
)

# Keep the occupations whose projection is negative in more than 60% of rows.
occupations_mostly_negative = occupation_summary[occupation_summary['percentage_negative'] > 60]

print(occupations_mostly_negative)

# Occupations (display strings) handed to the plotting cell that follows.
elements = occupations_mostly_negative['occupation'].unique()
num_elements = len(elements)

In [None]:
# Grid layout: n panels per row, with enough rows to hold every selected
# occupation in `elements`.
n = 2
num_elements = len(elements)
if num_elements == 0:
    # A zero-row subplot grid cannot be created; fail early with a clear message.
    raise ValueError("No occupations selected for plotting.")
num_rows = (num_elements + n - 1) // n

# squeeze=False keeps `axes` two-dimensional even for a single-row grid, so
# axes[row, col] below also works when only one or two occupations are plotted.
fig, axes = plt.subplots(nrows=num_rows, ncols=n, figsize=(150, 35 * num_rows),
                         sharex=False, squeeze=False)

# Spacing between major x-axis ticks: one tick every m years.
m = 5

# Handles and labels for the figure-level legend.
# NOTE(review): legend_handles is never populated, so the legend created below
# appears to end up without entries - confirm what the legend should show.
legend_handles = []
legend_labels = ['Direct Bias', 'Direct Bias after grammatical disentanglement']

# Draw one bar-chart panel per occupation, using the disentangled projections.
for i, element in enumerate(elements):
    row, col = divmod(i, n)
    ax = axes[row, col]

    # Work on a copy so that converting 'year' does not write into a slice of
    # merged_df_dis (avoids pandas' SettingWithCopyWarning).
    df_dis = merged_df_dis[merged_df_dis['occupation'] == element].copy()
    df_dis['year'] = pd.to_numeric(df_dis['year'])
    df_dis = df_dis.sort_values('year')

    if df_dis.empty:
        # No disentangled rows for this occupation: leave the panel blank
        # instead of failing on .iloc[0] below.
        ax.set_axis_off()
        continue

    # Panel title such as "(a) Engineer": a running letter plus the English label.
    subplot_label = f"({chr(ord('a') + i)}) {df_dis['translated_occupation'].iloc[0]}"
    ax.text(0.5, 0.93, subplot_label, transform=ax.transAxes,
            fontsize=160, fontweight='bold', va='center', ha='center')

    # Bars are coloured by sign: "#36648B" for projection >= 0, "#CD6889" otherwise.
    colors = ["#36648B" if value >= 0 else "#CD6889" for value in df_dis['projection']]
    ax.bar(df_dis['year'], df_dis['projection'], color=colors)

    ax.set_ylabel(element, fontsize=200)
    # Fixed y-range for every panel. (A data-driven set_ylim used to precede
    # this call but was always overwritten by it, so it has been dropped.)
    ax.set_ylim(-0.41, 0.18)
    ax.axhline(y=0, color='black', linestyle='--')

    ax.tick_params(axis='x', rotation=45, labelsize=140)
    ax.tick_params(axis='y', labelsize=140)
    ax.xaxis.set_major_locator(ticker.MultipleLocator(base=m))

# Remove the unused trailing panels of the grid.
for unused_ax in axes.ravel()[num_elements:]:
    fig.delaxes(unused_ax)

# Adjust subplot spacing, then right-align the rotated x tick labels.
plt.tight_layout()
for ax in axes.flat:
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right')

# Figure-level legend anchored in the lower-right quadrant of the figure.
fig.legend(handles=legend_handles, labels=legend_labels, loc='lower right', fontsize=65,
           bbox_to_anchor=(0.5, 0., 0.5, 0.5))

# Save the plot to the "figures" directory, creating it if needed.
os.makedirs('figures', exist_ok=True)
plot_file_path = os.path.join('figures', 'nahar_occupations_mostly_negative.png')
# NOTE(review): the mostly-positive figure is saved with bbox_inches="tight";
# this one is not - confirm whether the two should match.
plt.savefig(plot_file_path, dpi=100)

# Show the plot
plt.show()


## Static Data Occupation Bias Computation 
This code generates the occupation-bias plots for the static datasets (wiki, mnad, and uan).

In [None]:
import pandas as pd
from arabic_reshaper import reshape
from bidi.algorithm import get_display
#import plotly.express as px
import seaborn as sns
import os

# Read the occupation projections from visualize_words.csv, then remove every
# occurrence of 'ال' (presumably the Arabic definite article) from the
# 'male_occupation' column so the words match the mapping keys.
merged_df_dis = pd.read_csv("visualize_words.csv")
male_forms = merged_df_dis['male_occupation']
merged_df_dis['male_occupation'] = male_forms.str.replace('ال', '')
# Lookup table from Arabic occupation words to the English label used in plots.
# Entries come in pairs of two word forms (e.g. 'مهندسة' / 'مهندس') that
# usually share one English label. A few entries deliberately differ from
# their neighbours: 'طبيب' maps to 'Physician' while 'طبيبة', 'دكتورة' and
# 'دكتور' map to 'Doctor'; 'أديب' maps to 'Author' while 'كاتب', 'كاتبة' and
# 'أديبة' map to 'Writer'; 'أستاذ' / 'أستاذة' map to 'Teacher' while
# 'بروفيسور' / 'بروفيسورة' map to 'Professor'.
# NOTE(review): presumably this gives synonymous words in the translated
# 'male_occupation' column distinct labels in the plot - confirm.
occupation_mapping = {
    'مهندسة': 'Engineer',
    'مهندس': 'Engineer',
    'طبيبة': 'Doctor',
    'دكتورة': 'Doctor',
    'طبيب': 'Physician',
    'دكتور': 'Doctor',
    'محامية': 'Lawyer',
    'محامي': 'Lawyer',
    'شرطية': 'Police Officer',
    'شرطي': 'Police Officer',
    'ممرضة': 'Nurse',
    'ممرض': 'Nurse',
    'بائعة': 'Seller',
    'بائع': 'Seller',
    'صيدلانية': 'Pharmacist',
    'صيدلي': 'Pharmacist',
    'موظفة': 'Employee',
    'موظف': 'Employee',
    'مديرة': 'Manager',
    'مدير': 'Manager',
    'كاتبة': 'Writer',
    'أديبة': 'Writer',
    'كاتب': 'Writer',
    'أديب': 'Author',
    'باحثة': 'Researcher',
    'باحث': 'Researcher',
    'صحفية': 'Journalist',
    'صحفي': 'Journalist',
    'سفيرة': 'Ambassador',
    'سفير': 'Ambassador',
    'وزيرة': 'Minister',
    'وزير': 'Minister',
    'خادمة': 'Servant',
    'خادم': 'Servant',
    'طباخة': 'Cook',
    'طباخ': 'Cook',
    'سكرتيرة': 'Secretary',
    'سكرتير': 'Secretary',
    'نائبة': 'Deputy',
    'نائب': 'Deputy',
    'بروفيسورة': 'Professor',
    'أستاذة': 'Teacher',
    'بروفيسور': 'Professor',
    'أستاذ': 'Teacher',
    'مصورة': 'Photographer',
    'مصور': 'Photographer',
    'سائقة': 'Driver',
    'سائق': 'Driver',
    'راقصة': 'Dancer',
    'راقص': 'Dancer',
    'مغنية': 'Singer',
    'مغني': 'Singer',
    'ممثلة': 'Actor',
    'ممثل': 'Actor',
    'فنانة': 'Artist',
    'فنان': 'Artist',
    'شاعرة': 'Poet',
    'شاعر': 'Poet',
    'خبازة': 'Baker',
    'خباز': 'Baker',
    'عاملة': 'Worker',
    'عامل': 'Worker',
    'بستانية': 'Gardener',
    'بستاني': 'Gardener',
    'ناطورة': 'Janitor',
    'ناطور': 'Janitor',
    'مفتشة': 'Inspector',
    'مفتش': 'Inspector',
    'عازفة': 'Musician',
    'عازف': 'Musician',
    'جندية': 'Soldier',
    'جندي': 'Soldier',
    'حارسة': 'Guard',
    'حارس': 'Guard',
    'رسامة': 'Painter',
    'رسام': 'Painter'
}

# Column holding the Arabic occupation words to translate.
column_to_translate = 'male_occupation'

# Look up the English label for every row via occupation_mapping; words that
# are not keys of the mapping come back as NaN.
arabic_words = merged_df_dis[column_to_translate]
merged_df_dis['translated_occupation'] = arabic_words.map(occupation_mapping)

# Write the frame, including the new 'translated_occupation' column, to disk
# without the index column.
merged_df_dis.to_csv('translated_file.csv', index=False)

print("Translation completed and saved to translated_file.csv.")


In [None]:
# Alias (not a copy) of merged_df_dis; the scatter-plot cells read their data
# through this name.
projections_df=merged_df_dis
# Bare expression: shows the frame as the cell output when run in a notebook.
projections_df

### Plot English Occupations Bias

In [None]:
# Scatter plot visualization
fig, ax = plt.subplots(figsize=(6, 6))
plt.scatter(projections_df['projection'], range(len(projections_df['translated_occupation'])), marker='x', color="gray")

for i, (label, x, y) in enumerate(zip(projections_df['translated_occupation'], projections_df['projection'], range(len(projections_df['occupation_combined'])))):
    color = '#CD6889' if x < 0 else '#36648B'
    # Determine whether to place label to the left or right based on x-coordinate
    # Determine the length of the label
    label_length = len(label)//2

    # Determine whether to place label to the left or right based on x-coordinate
    if x < 0:
        xytext = (-6.5*label_length-5,2)  # Move label to the left, adjust multiplier as needed
        ha = 'right'       # Right-align the label text
    else:
        xytext = (0, 0)   # Move label to the right, adjust multiplier as needed
        ha = 'left'        # Left-align the label text
    plt.annotate(label, xy=(x, y), xytext=xytext, textcoords='offset points', fontsize=12, color=color, weight='bold')
# Add vertical dotted line at x=0
plt.axvline(x=0, color='gray', linestyle='--', linewidth=1.5)
# Adjust plot limits and add title
plt.xlim(min(projections_df['projection']) - 0.2, max(projections_df['projection']) + 0.2)
plt.ylim(0 - 2, len(projections_df['occupation_combined']) + 2)
#plt.title('Visualization of Occupation Words on Gender Direction')
# Remove y-axis labels
plt.gca().set_yticklabels([])
# Remove extra whitespace around the plot
plt.tight_layout()
# Display the plot
plt.savefig("mnad.png", dpi=600)
plt.show()

### Plot Arabic Occupation Bias

In [None]:
# Scatter plot visualization
fig, ax = plt.subplots(figsize=(6, 6))
plt.scatter(projections_df['projection'], range(len(projections_df['reshaped_labels'])), marker='x', color="gray")

for i, (label, x, y) in enumerate(zip(projections_df['reshaped_labels'], projections_df['projection'], range(len(projections_df['occupation_combined'])))):
    color = '#CD6889' if x < 0 else '#36648B'
    # Determine whether to place label to the left or right based on x-coordinate
    # Determine the length of the label
    label_length = len(label)//2

    # Determine whether to place label to the left or right based on x-coordinate
    if x < 0:
        xytext = (-6.5*label_length-5,2)  # Move label to the left, adjust multiplier as needed
        ha = 'right'       # Right-align the label text
    else:
        xytext = (0, 0)   # Move label to the right, adjust multiplier as needed
        ha = 'left'        # Left-align the label text
    plt.annotate(label, xy=(x, y), xytext=xytext, textcoords='offset points', fontsize=12, color=color, weight='bold')
# Add vertical dotted line at x=0
plt.axvline(x=0, color='gray', linestyle='--', linewidth=1.5)
# Adjust plot limits and add title
plt.xlim(min(projections_df['projection']) - 0.2, max(projections_df['projection']) + 0.2)
plt.ylim(0 - 2, len(projections_df['occupation_combined']) + 2)
#plt.title('Visualization of Occupation Words on Gender Direction')
# Remove y-axis labels
plt.gca().set_yticklabels([])
# Remove extra whitespace around the plot
plt.tight_layout()
# Display the plot
plt.savefig("mnad2.png", dpi=600)
plt.show()