## Yearly Occupation Bias Computation
This notebook computes the yearly gender bias of occupation terms (Nahar and Assafir data).

In [None]:
from GenderPCA import GenderPCA
from GenderPCAVisualizer import GenderPCAVisualizer
import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from arabic_reshaper import reshape
from bidi.algorithm import get_display
import seaborn as sns
import os
import re

# Occupation terms whose bias is measured, in masculine and feminine form.
# NOTE(review): the two lists have the same length and look index-aligned
# (entry i of female_occ is the feminine form of entry i of male_occ);
# confirm whether GenderPCA relies on that ordering.
male_occ=["مهندس","طبيب","دكتور","محامي","ممرض","بائع","صيدلي","موظف","مدير","كاتب","باحث","صحفي","سفير","وزير","خادم","سكرتير","نائب","بروفيسور","أديب","مصور","راقص","مغني","ممثل","فنان","شاعر","عامل","عازف","جندي","أستاذ","حارس","رسام"]
female_occ=["مهندسة","طبيبة","دكتورة","محامية","ممرضة","بائعة","صيدلانية","موظفة","مديرة","كاتبة","باحثة","صحفية","سفيرة","وزيرة","خادمة","سكرتيرة","نائبة","بروفيسورة","أديبة","مصورة","راقصة","مغنية","ممثلة","فنانة","شاعرة","عاملة","عازفة","جندية","أستاذة","حارسة","رسامة"]

# Gendered word pairs handed to GenderPCA together with the occupation terms.
# NOTE(review): same length and apparently index-aligned as well; presumably
# these define the gender direction -- verify against GenderPCA.
male_word_pairs=["ابن","أبوة","أخوة","أرمل","أزواج","آباء","أب","أبي","أعمام","ذكور","عم","أمير","حفيد","راهب","ذكر","أخ","سادة","رجال","أحفاد","سيد","عريس","عشيق","ملوك","صديق","رجل","له","نفسه","هو","ملك","أمراء","فتى","فتيان","زوج","شاب","بابا","أجداد","أبناء","جد"]
female_word_pairs=["ابنة","أمومة","أخوات","أرملة","زوجات","أمهات","أم","أمي","عمات","اناث","عمة","أميرة","حفيدة","راهبة","أنثى","أخت","سيدات","نساء","حفيدات","سيدة","عروس","عشيقة","ملكات","صديقة","امرأة","لها","نفسها","هي","ملكة","أميرات","فتاة","فتيات","زوجة","شابة","ماما","جدات","بنات","جدة"]

# Paths of the masculine / feminine noun lists that are passed to
# GenderPCA.get_grammatical_direction() in the per-model loop.
masculine_nouns_file=r"nouns/masculine_nouns_ar"
feminine_nouns_file=r"nouns/feminine_nouns_ar"


# Per-model results. The three lists are appended together, only after BOTH
# bias computations of a model succeeded, so index i always refers to the same
# model and every entry of `years` has both its 'notdis' and 'dis' result.
average_bias_list = []
semantic_average_bias_list = []
years = []

models_dir = './models'
pairs_dir = 'pairs'
year_pattern = re.compile(r'(\d{4})\.bin$')

# The per-model word-pair files are written here; create the directory up
# front instead of failing on the first open().
os.makedirs(pairs_dir, exist_ok=True)

# os.listdir() returns entries in arbitrary order; sorting makes the order of
# `years` (and of the rows merged later) reproducible between runs.
for filename in sorted(os.listdir(models_dir)):
    if not filename.endswith('.bin'):
        continue

    model_path = os.path.join(models_dir, filename)

    # The year is the four digits directly before the '.bin' extension.
    year_match = year_pattern.search(filename)
    if year_match:
        year = int(year_match.group(1))
        print(f"Processing file for year: {year}")
    else:
        # No year in the name: label the outputs with the bare file stem.
        # Using the full './models/...' path as the label would inject
        # directory separators into the output file names built from it.
        year = os.path.splitext(filename)[0]
        print(f"Processing file with no year: {model_path}")

    try:
        semantic_bias = GenderPCA(model_path, male_occ, female_occ, male_word_pairs, female_word_pairs)

        semantic_bias.load_model()
        semantic_bias.check_pairs()
        semantic_bias.check_terms()

        print("PCA before checking terms")
        semantic_bias.do_pca()

        print("PCA after checking terms")
        semantic_bias.modify_word_pairs()

        # Record the word pairs as they stand after modify_word_pairs(): one
        # comma-separated line for the male words, one for the female words.
        # The newline keeps the last male word and the first female word from
        # being fused into a single token.
        with open(os.path.join(pairs_dir, f'{year}.txt'), 'w', encoding='utf-8') as file:
            file.write(','.join(semantic_bias.male_word_pairs))
            file.write('\n')
            file.write(','.join(semantic_bias.female_word_pairs))
            file.write('\n')

        # Stage 1 ('notdis'): bias of every occupation term along
        # ar_gender_direction, as (bias, word) tuples.
        sp = [(semantic_bias.compute_bias(w, semantic_bias.ar_gender_direction), w)
              for w in semantic_bias.male_term]
        sp2 = [(semantic_bias.compute_bias(w, semantic_bias.ar_gender_direction), w)
               for w in semantic_bias.female_term]
        average_bias = GenderPCAVisualizer(
            sp, sp2, semantic_bias.male_term, semantic_bias.female_term
        ).visualize_histogram(str(year) + 'notdis')

        # Stage 2 ('dis'): same computation along ar_semantic_gender_direction,
        # which is read only after get_grammatical_direction() has been called.
        semantic_bias.get_grammatical_direction(masculine_nouns_file, feminine_nouns_file)

        sp = [(semantic_bias.compute_bias(w, semantic_bias.ar_semantic_gender_direction), w)
              for w in semantic_bias.male_term]
        sp2 = [(semantic_bias.compute_bias(w, semantic_bias.ar_semantic_gender_direction), w)
               for w in semantic_bias.female_term]
        semantic_average_bias = GenderPCAVisualizer(
            sp, sp2, semantic_bias.male_term, semantic_bias.female_term
        ).visualize_histogram(str(year) + 'dis')
    except Exception as exc:
        # Best effort: one broken model must not stop the remaining ones, but
        # say which model failed and why. `Exception` (rather than a bare
        # except) lets KeyboardInterrupt / SystemExit stop the run.
        print(f"error while processing {model_path}: {type(exc).__name__}: {exc}")
        continue

    # Both stages succeeded: record the model in all three lists at once.
    average_bias_list.append(average_bias)
    semantic_average_bias_list.append(semantic_average_bias)
    years.append(year)

# Merge the per-year result CSVs into one file per variant:
# 'dis' -> output/occupation_disentangled.csv,
# 'notdis' -> output/occupation_notdisentangled.csv.
# NOTE(review): assumes results/<year>dis.csv and results/<year>notdis.csv were
# written by visualize_histogram() for every entry of `years` -- confirm.


def _read_existing_csvs(paths):
    """Read every CSV in *paths* that exists, reporting the ones that do not."""
    frames = []
    for path in paths:
        if not os.path.isfile(path):
            # One missing year should not abort the merge of all the others.
            print(f"skipping missing result file: {path}")
            continue
        frames.append(pd.read_csv(path, encoding='utf-8'))
    return frames


csv_files_dis = [os.path.join('results', f'{year}dis.csv') for year in years]
csv_files_notdis = [os.path.join('results', f'{year}notdis.csv') for year in years]

data_frames_dis = _read_existing_csvs(csv_files_dis)
data_frames_notdis = _read_existing_csvs(csv_files_notdis)

# to_csv() does not create missing parent directories.
os.makedirs('output', exist_ok=True)

# pd.concat() raises ValueError on an empty list, so only merge and write when
# there is data; an existing output file is left untouched otherwise.
if data_frames_dis:
    merged_df_dis = pd.concat(data_frames_dis)
    merged_df_dis.to_csv("output/occupation_disentangled.csv")
else:
    merged_df_dis = pd.DataFrame()
    print("no disentangled results to merge")

if data_frames_notdis:
    merged_df_notdis = pd.concat(data_frames_notdis)
    merged_df_notdis.to_csv("output/occupation_notdisentangled.csv")
else:
    merged_df_notdis = pd.DataFrame()
    print("no non-disentangled results to merge")