In [37]:
!pip install gradio==4.17.0



In [38]:
import gradio as gr
print(gr.__version__)
import pandas as pd
import numpy as np

4.17.0


In [39]:
import seaborn as sns
import matplotlib.pyplot as plt

In [40]:
from plotly.offline import init_notebook_mode, iplot
import plotly.figure_factory as ff
import plotly.graph_objs as go
import plotly.offline as py


In [41]:
def preProcess(df):
    """Add a ``TotalPay`` column, drop incomplete rows and remove pay outliers.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the ``BasePay`` and ``Bonus`` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame holding only the rows without missing values whose
        ``TotalPay`` lies within 1.5 * IQR of the first and third quartiles.
        The frame passed in is not modified.
    """
    # Work on a copy so the caller's frame is not changed as a side effect.
    df = df.copy()
    df['TotalPay'] = df['BasePay'] + df['Bonus']

    # dropping nulls -- dropna() returns a new frame, so the result has to be
    # kept; a NaN left in TotalPay would turn both quartiles into NaN and the
    # range filter below would then reject every row.
    df = df.dropna()

    # removing outliers: quartiles use the 'midpoint' rule
    total_pay = df['TotalPay']
    Q1 = total_pay.quantile(0.25, interpolation='midpoint')
    Q3 = total_pay.quantile(0.75, interpolation='midpoint')

    IQR = Q3 - Q1
    low_lim = Q1 - 1.5 * IQR
    up_lim = Q3 + 1.5 * IQR

    # .copy() hands back an independent frame, so callers can add columns to
    # it without pandas warning about writing to a slice.
    return df[(total_pay >= low_lim) & (total_pay <= up_lim)].copy()

In [42]:

def jobTitleDistribution(fileobj):
    """Build side-by-side donut charts of job-title head-counts by gender.

    Parameters
    ----------
    fileobj : str or object with a ``name`` attribute
        Path of the CSV file, or an upload object whose ``name`` is that path.
        The CSV needs ``JobTitle`` and ``Gender`` columns plus whatever
        ``preProcess`` requires.

    Returns
    -------
    plotly.graph_objs.Figure
        A 1000x500 figure with the female distribution on the left and the
        male distribution on the right.
    """
    # The other handlers hand their argument straight to read_csv; accept a
    # plain path here as well instead of insisting on a `.name` attribute.
    csv_path = getattr(fileobj, 'name', fileobj)
    df = preProcess(pd.read_csv(csv_path))

    # Head-count per job title and gender. Only the Gender dummies are summed,
    # so unrelated columns are not aggregated, and reindex() guarantees both
    # columns exist even when one gender is absent from the data.
    title = (
        pd.get_dummies(df[['JobTitle', 'Gender']], columns=['Gender'])
        .groupby('JobTitle')
        .sum()
        .reindex(columns=['Gender_Female', 'Gender_Male'], fill_value=0)
    )

    # The two pies share the canvas: each one is confined to its own x-range.
    female = go.Pie(labels=title.index, values=title['Gender_Female'], name="Female",
                    hole=0.5, domain={'x': [0, 0.46]})
    male = go.Pie(labels=title.index, values=title['Gender_Male'], name="Male",
                  hole=0.5, domain={'x': [0.52, 1]})

    # The annotations put a caption in the hole of each donut.
    layout = dict(title='Job Title Distribution', font=dict(size=14), legend=dict(orientation="h"),
                  annotations=[dict(x=0.2, y=0.5, text='Female', showarrow=False, font=dict(size=20)),
                               dict(x=0.8, y=0.5, text='Male', showarrow=False, font=dict(size=20))])

    fig = dict(data=[female, male], layout=layout)

    return go.Figure(fig).update_layout(width=1000, height=500)

In [43]:
def histogram(fileobj):
    """Plot overlaid, normalised ``TotalPay`` histograms for men and women.

    Parameters
    ----------
    fileobj : path or buffer accepted by ``pandas.read_csv``
        CSV with a ``Gender`` column and the columns ``preProcess`` needs.

    Returns
    -------
    matplotlib.figure.Figure
    """
    df = preProcess(pd.read_csv(fileobj))

    fig, ax = plt.subplots()

    # Settings shared by both series so the two histograms are comparable.
    shared = dict(ax=ax, density=1, histtype='stepfilled', bins=20, alpha=0.5)

    # Male first (default colour), then female in pink on top of it.
    df.loc[df['Gender'] == 'Male', 'TotalPay'].hist(label='Male', **shared)
    df.loc[df['Gender'] == 'Female', 'TotalPay'].hist(color='#f270c7', label='Female', **shared)

    ax.set_xlabel('Total Pay', fontsize=12)
    ax.set_ylabel('PMF', fontsize=12)
    ax.legend()

    return fig




In [44]:
import statsmodels.formula.api as smf

In [45]:

def modifyForReg(df):
  """Add regression helper columns to ``df`` and return a tidy modelling frame.

  Side effect (relied on by callers that ignore the return value): ``df``
  itself gains the columns ``Male``, ``C_Age`` and ``TotalPay1``.

  Parameters
  ----------
  df : pandas.DataFrame
      Needs ``JobTitle``, ``Gender``, ``Education``, ``PerfEval``, ``Age``,
      ``Seniority`` and ``TotalPay``.

  Returns
  -------
  pandas.DataFrame
      Columns ``Job_Title, Male, Edu_Lvl, Perf_Eval, C_Age, Years_Exp,
      Salary`` sorted by the ordered categorical ``Edu_Lvl``.
  """
  # Gender indicator: 1 = Male, 0 = any other value. An explicit integer
  # column is built because get_dummies() yields booleans on recent pandas,
  # and a boolean predictor is treated as categorical by the formula
  # interface, which renames the term and breaks lookups of "Male".
  df['Male'] = (df['Gender'] == 'Male').astype(int)

  # centering the age based on average age
  df['C_Age'] = df['Age'] - df['Age'].mean()

  # salary in thousands of dollars; stored on df only.
  # NOTE(review): the modelling frame below still uses the unscaled TotalPay
  # as Salary -- confirm which unit the regressions are meant to use.
  df['TotalPay1'] = df['TotalPay'] / 1000

  order = ['High School', 'College', 'Masters', 'PhD']
  df1 = (
      df[['JobTitle', 'Male', 'Education', 'PerfEval', 'C_Age', 'Seniority', 'TotalPay']]
      .rename(columns={'JobTitle': 'Job_Title', 'Education': 'Edu_Lvl', 'PerfEval': 'Perf_Eval',
                       'Seniority': 'Years_Exp', 'TotalPay': 'Salary'})
      # assign() works on the new frame returned by rename(), so nothing is
      # written to a slice of df.
      .assign(Edu_Lvl=lambda d: pd.Categorical(d['Edu_Lvl'], ordered=True, categories=order))
      .sort_values(by="Edu_Lvl")
  )

  return df1



In [46]:
def print_coef_std_err(mod_results):
    """
    Tabulate a fitted model's coefficients next to their standard errors.

    ``mod_results`` must expose ``params`` and ``bse``; the returned DataFrame
    is indexed like ``params`` and has the columns "coef" and "std err".
    """
    estimates = mod_results.params
    errors = mod_results.bse

    # Stack the two vectors as columns: row i pairs estimate i with its error.
    table = np.column_stack((estimates, errors))
    return pd.DataFrame(table, index=estimates.index, columns=["coef", "std err"])


def create_graph_num(df1,mod_res, pred, interaction):
    """
    Scatter ``Salary`` against a numerical predictor and overlay one fitted
    line per gender.

    df1         -- modelling frame with the columns ``pred``, "Male" and "Salary"
    mod_res     -- fitted model whose terms are "Intercept", ``pred``, "Male"
                   and ``interaction``
    pred        -- name of the numerical predictor shown on the x-axis
    interaction -- name of the predictor-by-Male interaction term

    Returns the matplotlib Figure.
    """
    outcome = "Salary"
    group = "Male"

    # Per-gender styling, indexed by the Male flag (0 or 1).
    line_styles = ("--", "-")
    colours = ("c", "#e9bada")
    point_markers = ("x", ".")

    coef = print_coef_std_err(mod_results=mod_res)["coef"]

    fig, ax = plt.subplots()

    # Raw observations, one scatter layer per gender.
    for flag in (0, 1):
        rows = df1[df1[group] == flag]
        ax.scatter(rows[pred], rows[outcome], color=colours[flag], marker=point_markers[flag])

    intercept = coef["Intercept"]
    slope = coef[pred]
    male_shift = coef[group]
    male_slope_shift = coef[interaction]

    # Evaluate both regression lines on an even grid over the observed range.
    x_domain = np.linspace(np.min(df1[pred]), np.max(df1[pred]), 100)
    for flag in (0, 1):
        fitted = intercept + slope * x_domain + male_shift * flag + male_slope_shift * flag * x_domain
        ax.plot(x_domain, fitted, c=colours[flag], label=f"Fitted line (Male={flag})",
                linestyle=line_styles[flag])

    ax.legend()
    ax.set_ylabel(f"{outcome}", fontsize=12)
    ax.set_xlabel(f"{pred}", fontsize=12)

    return fig


def create_graph_cat(df1,results_edu):
    """
    Scatter ``Salary`` against education level and overlay one fitted line per
    gender, using the coefficients of a fitted ``Edu_Lvl * Male`` model.

    df1         -- modelling frame with the columns "Male", "Edu_Lvl", "Salary"
    results_edu -- fitted model exposing the ``Edu_Lvl[T.*]``, ``Male`` and
                   ``Edu_Lvl[T.*]:Male`` terms read below

    Returns the matplotlib Figure.
    """
    # Per-gender styling, indexed by the Male flag (0 or 1).
    line_styles = ("--", "-")
    colours = ("c", "#e9bada")
    point_markers = ("x", ".")

    fig, ax = plt.subplots()

    # Raw observations, one scatter layer per gender.
    for flag in (0, 1):
        rows = df1[df1['Male'] == flag]
        ax.scatter(rows['Edu_Lvl'], rows['Salary'], color=colours[flag], marker=point_markers[flag])

    coef = print_coef_std_err(mod_results=results_edu)["coef"]

    x_domain = ['High School', 'College', 'Masters', 'PhD']

    # Offsets relative to the first level, which contributes nothing itself.
    level_effect = np.array([0,
                             coef['Edu_Lvl[T.College]'],
                             coef['Edu_Lvl[T.Masters]'],
                             coef['Edu_Lvl[T.PhD]']])
    level_by_male = np.array([0,
                              coef['Edu_Lvl[T.College]:Male'],
                              coef['Edu_Lvl[T.Masters]:Male'],
                              coef['Edu_Lvl[T.PhD]:Male']])
    intercept = coef["Intercept"]
    male_shift = coef['Male']

    # One fitted value per education level for each gender.
    for flag in (0, 1):
        fitted = (level_effect + level_by_male * flag) + intercept + male_shift * flag
        ax.plot(x_domain, fitted, c=colours[flag], label=f"Fitted line (Male={flag})",
                linestyle=line_styles[flag])

    ax.legend()
    ax.set_ylabel("Salary", fontsize=12)
    ax.set_xlabel("Education Level", fontsize=12)

    return fig


In [47]:
def C_age_reg(fileobj):
  """Fit ``Salary ~ C_Age * Male`` on the uploaded CSV and plot the result.

  Returns the figure produced by ``create_graph_num`` for the centred-age
  predictor and its interaction with gender.
  """
  reg_frame = modifyForReg(preProcess(pd.read_csv(fileobj)))
  fitted = smf.ols("Salary ~ C_Age * Male", reg_frame).fit()

  return create_graph_num(reg_frame, fitted, "C_Age", "C_Age:Male")


In [48]:
def Years_Exp_reg(fileobj):
  """Fit ``Salary ~ Years_Exp * Male`` on the uploaded CSV and plot the result.

  Returns the figure produced by ``create_graph_num`` for the seniority
  predictor and its interaction with gender.
  """
  reg_frame = modifyForReg(preProcess(pd.read_csv(fileobj)))
  fitted = smf.ols("Salary ~ Years_Exp * Male", reg_frame).fit()

  return create_graph_num(reg_frame, fitted, "Years_Exp", "Years_Exp:Male")

In [49]:
def Perf_Eval_reg(fileobj):
  """Fit ``Salary ~ Perf_Eval * Male`` on the uploaded CSV and plot the result.

  Returns the figure produced by ``create_graph_num`` for the performance
  rating predictor and its interaction with gender.
  """
  reg_frame = modifyForReg(preProcess(pd.read_csv(fileobj)))
  fitted = smf.ols("Salary ~ Perf_Eval * Male", reg_frame).fit()

  return create_graph_num(reg_frame, fitted, "Perf_Eval", "Perf_Eval:Male")

In [50]:
def Edu_Lvl_reg(fileobj):
  """Fit ``Salary ~ Edu_Lvl * Male`` on the uploaded CSV and plot the result.

  Education is categorical, so the plot comes from ``create_graph_cat``
  rather than the numerical helper used by the other regressions.
  """
  reg_frame = modifyForReg(preProcess(pd.read_csv(fileobj)))
  fitted = smf.ols("Salary ~ Edu_Lvl * Male", reg_frame).fit()

  return create_graph_cat(reg_frame, fitted)


COMPARISON GRAPHS (Turkey)

In [51]:
# Grouping definitions shared by the two-year comparison charts.

# Lower edges of the age groups: consecutive entries form half-open ranges
# [a, b) and the final entry starts an open-ended group (see plot_pgs_age).
age_bins = [18, 28, 38, 48, 58]
# Seniority levels, in the order their bars are drawn (see pg_by_seniority).
seniority_order = [1, 2, 3, 4, 5]
# Performance ratings, in the order their bars are drawn (see pg_by_perf).
performance_order = [1, 2, 3, 4, 5]
# Education levels, in the order their bars are drawn (see pg_by_edu).
edu_order = ['High School', 'College', 'Masters', 'PhD']

def find_paygap(pgs,feature):
    """Summarise a list of per-group pay gaps as one sentence.

    Parameters
    ----------
    pgs : sequence of float or None
        Pay gaps as fractions (0.05 means men earn 5% more). Missing entries
        (``None`` / NaN, e.g. a group without both genders) are ignored.
    feature : str
        Name of the grouping the gaps were computed for, used in the text.

    Returns
    -------
    str
        A sentence quoting the unweighted mean gap as a percentage rounded to
        two decimals, or a "no data" message when nothing usable was passed.
    """
    # Drop undefined gaps first: a single NaN would otherwise turn the mean
    # into NaN and the message into "nan% less".
    valid = [] if pgs is None else [g for g in pgs if not pd.isna(g)]
    if len(valid) == 0:
        return ("No pay gap data available.")

    # Calculate the average pay gap
    x = round((sum(valid) / len(valid))*100,2)
    if x > 0:
        return "Our analysis on the basis of "+feature+" indicates that there is a gender-based pay gap within the company, with male employees earning, on average, " + str(x) + "% more than female employees in comparable roles."
    if x < 0:
        return "Our analysis on the basis of "+feature+" indicates that there is a gender-based pay gap within the company, with male employees earning, on average, " + str(abs(x))+ "% less than female employees in comparable roles."
    # A mean of exactly zero is not a gap and must not be reported as one.
    return "Our analysis on the basis of "+feature+" indicates that there is no gender-based pay gap within the company: male and female employees in comparable roles earn the same on average."


Age

In [52]:
def plot_pgs_age(age_bins, df):
    """Compute the gender pay gap for every age group (nothing is plotted).

    Consecutive entries of ``age_bins`` delimit half-open groups
    ``[low, high)``; the last entry opens a final group with no upper bound.
    ``df`` needs the columns ``Age``, ``Male`` (1/0) and ``TotalPay``.

    Returns a list with one gap per group, each computed as
    ``(male mean - female mean) / male mean``.
    """
    def gap_for(group):
        # Relative difference of the two gender means within one age group.
        male_mean = group.loc[group['Male'] == 1, 'TotalPay'].mean()
        female_mean = group.loc[group['Male'] == 0, 'TotalPay'].mean()
        return (male_mean - female_mean) / male_mean

    ages = df['Age']

    # Bounded groups first ...
    gaps = [
        gap_for(df[(ages >= low) & (ages < high)])
        for low, high in zip(age_bins[:-1], age_bins[1:])
    ]
    # ... then the open-ended group starting at the last edge.
    gaps.append(gap_for(df[ages >= age_bins[-1]]))

    return gaps

def pg_by_age(df1, df2, y1, y2, age_bins):
    """Draw a grouped bar chart comparing per-age-group pay gaps of two years.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Frames accepted by ``plot_pgs_age`` (``Age``, ``Male``, ``TotalPay``).
    y1, y2 : object
        Legend labels for ``df1`` and ``df2`` (converted with ``str``).
    age_bins : list of int
        Lower edges of the age groups; the last one is open-ended.

    Returns
    -------
    (matplotlib.figure.Figure, list)
        The chart and the pay gaps of ``df1``, which callers use to compute
        an average gap.
    """
    # e.g. [18, 28, 38] -> ['18-27', '28-37', '38+']
    age_labels = [f'{age_bins[i]}-{age_bins[i + 1] - 1}' for i in range(len(age_bins) - 1)] + [f'{age_bins[-1]}+']

    pay_gaps_df1 = plot_pgs_age(age_bins, df1)
    pay_gaps_df2 = plot_pgs_age(age_bins, df2)

    bar_width = 0.35
    index = np.arange(len(age_labels))

    fig, ax = plt.subplots()
    ax.bar(index, pay_gaps_df1, bar_width, label=str(y1), color='skyblue')
    ax.bar(index + bar_width, pay_gaps_df2, bar_width, label=str(y2), color='lightcoral')

    ax.set_xlabel('Age Category')
    ax.set_ylabel('Pay Gap')
    ax.set_title('Pay Gap in Each Age Category')
    # Centre each tick between the two bars of its group.
    ax.set_xticks(index + bar_width / 2)
    ax.set_xticklabels(age_labels, rotation=45)
    ax.legend()

    plt.tight_layout()

    # Reuse the gaps computed above instead of running plot_pgs_age again.
    return fig, pay_gaps_df1

def file_to_plot_age(f1,f2):
  """Compare pay gaps by age group between two uploaded CSV files.

  ``f1`` is the current year and ``f2`` the previous year. Returns
  ``[figure, summary sentence]``; the sentence is based on ``f1`` only.
  """
  frames = []
  for source in (f1, f2):
    frame = preProcess(pd.read_csv(source))
    # Called for its side effect: adds the 'Male' column to `frame` in place.
    modifyForReg(frame)
    frames.append(frame)
  current, previous = frames

  # The legend uses the default year labels ('Current Year', 'Previous Year').
  fig, pgs = pg_by_age(current, previous, 'Current Year', 'Previous Year', age_bins)

  return [fig, find_paygap(pgs, 'age')]

Seniority

In [53]:


def pg_by_seniority(df1, df2, y1, y2, seniority_order):
    """Draw a grouped bar chart comparing per-seniority pay gaps of two years.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Frames with the columns ``Seniority``, ``Male`` (1/0) and
        ``TotalPay``. They are not modified; the unused ``Seniority_Group``
        column is no longer written onto them.
    y1, y2 : object
        Legend labels for ``df1`` and ``df2`` (converted with ``str``).
    seniority_order : list
        Seniority levels in ascending order. Every level but the last covers
        ``[level, next level)``; the last matches its value exactly.

    Returns
    -------
    (matplotlib.figure.Figure, list)
        The chart and the pay gaps of ``df1``, one per seniority level.
    """
    def level_gaps(df):
        # One pay gap per seniority level for a single frame. To compare more
        # years, call this once per additional frame.
        seniority = df['Seniority']
        last = len(seniority_order) - 1
        gaps = []
        for pos, level in enumerate(seniority_order):
            if pos < last:
                in_level = (seniority >= level) & (seniority < seniority_order[pos + 1])
            else:
                in_level = seniority == level
            group = df[in_level]

            male_avg = group.loc[group['Male'] == 1, 'TotalPay'].mean()
            female_avg = group.loc[group['Male'] == 0, 'TotalPay'].mean()

            # A zero male average would make the ratio undefined; report 0.
            if male_avg != 0:
                gaps.append((male_avg - female_avg) / male_avg)
            else:
                gaps.append(0)
        return gaps

    pay_gaps_seniority_df1 = level_gaps(df1)
    pay_gaps_seniority_df2 = level_gaps(df2)

    # Plot pay gaps for seniority for both dataframes
    bar_width = 0.35
    index = np.arange(len(seniority_order))

    fig, ax = plt.subplots()
    ax.bar(index, pay_gaps_seniority_df1, bar_width, label=str(y1), color='skyblue')
    ax.bar(index + bar_width, pay_gaps_seniority_df2, bar_width, label=str(y2), color='lightcoral')

    ax.set_xlabel('Seniority Level')
    ax.set_ylabel('Pay Gap')
    ax.set_title('Pay Gap in Each Seniority Level')
    # Centre each tick between the two bars of its group.
    ax.set_xticks(index + bar_width / 2)
    ax.set_xticklabels(seniority_order)
    ax.legend()
    plt.tight_layout()

    return fig,pay_gaps_seniority_df1

def file_to_plot_seniority(f1,f2):
  """Compare pay gaps by seniority level between two uploaded CSV files.

  ``f1`` is the current year and ``f2`` the previous year. Returns
  ``[figure, summary sentence]``; the sentence is based on ``f1`` only.
  """
  frames = []
  for source in (f1, f2):
    frame = preProcess(pd.read_csv(source))
    # Called for its side effect: adds the 'Male' column to `frame` in place.
    modifyForReg(frame)
    frames.append(frame)
  current, previous = frames

  # The legend uses the default year labels ('Current Year', 'Previous Year').
  fig, pgs = pg_by_seniority(current, previous, 'Current Year', 'Previous Year', seniority_order)

  return [fig, find_paygap(pgs, 'seniority')]


Perf_eval

In [54]:
# @title Default title text

def pg_by_perf(df1, df2, y1, y2, performance_order):
    """Draw a grouped bar chart comparing per-performance pay gaps of two years.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Frames with the columns ``PerfEval``, ``Male`` (1/0) and ``TotalPay``.
        They are not modified: the numeric conversion of ``PerfEval`` is done
        on a local copy and no ``Performance_Group`` column is added.
    y1, y2 : object
        Legend labels for ``df1`` and ``df2`` (converted with ``str``).
    performance_order : list
        Performance ratings in ascending order. Every rating but the last
        covers ``[rating, next rating)``; the last matches its value exactly.

    Returns
    -------
    (matplotlib.figure.Figure, list)
        The chart and the pay gaps of ``df1``, one per performance rating.
    """
    def level_gaps(df):
        # One pay gap per performance rating for a single frame.
        # Ratings that cannot be parsed become NaN and so match no level.
        rating = pd.to_numeric(df['PerfEval'], errors='coerce')
        last = len(performance_order) - 1
        gaps = []
        for pos, level in enumerate(performance_order):
            if pos < last:
                in_level = (rating >= level) & (rating < performance_order[pos + 1])
            else:
                in_level = rating == level
            group = df[in_level]

            male_avg = group.loc[group['Male'] == 1, 'TotalPay'].mean()
            female_avg = group.loc[group['Male'] == 0, 'TotalPay'].mean()

            # Same zero-denominator rule for every level, not just the last.
            if male_avg != 0:
                gaps.append((male_avg - female_avg) / male_avg)
            else:
                gaps.append(0)
        return gaps

    # Calculate pay gaps for both dataframes
    pay_gaps_performance_df1 = level_gaps(df1)
    pay_gaps_performance_df2 = level_gaps(df2)

    # Plot pay gaps for performance for both dataframes
    bar_width = 0.35
    index = range(len(performance_order))

    fig, ax = plt.subplots()
    ax.bar(index, pay_gaps_performance_df1, bar_width, label=str(y1), color='skyblue')
    ax.bar([i + bar_width for i in index], pay_gaps_performance_df2, bar_width, label=str(y2), color='lightcoral')

    ax.set_xlabel('Performance Level')
    ax.set_ylabel('Pay Gap')
    ax.set_title('Pay Gap in Each Performance Level')
    # Centre each tick between the two bars of its group.
    ax.set_xticks([i + bar_width / 2 for i in index])
    ax.set_xticklabels(performance_order)
    ax.legend()
    plt.tight_layout()

    return fig, pay_gaps_performance_df1

def file_to_plot_perf(f1,f2):
  """Compare pay gaps by performance rating between two uploaded CSV files.

  ``f1`` is the current year and ``f2`` the previous year. Returns
  ``[figure, summary sentence]``; the sentence is based on ``f1`` only.
  """
  frames = []
  for source in (f1, f2):
    frame = preProcess(pd.read_csv(source))
    # Called for its side effect: adds the 'Male' column to `frame` in place.
    modifyForReg(frame)
    frames.append(frame)
  current, previous = frames

  # The legend uses the default year labels ('Current Year', 'Previous Year').
  fig, pgs = pg_by_perf(current, previous, 'Current Year', 'Previous Year', performance_order)

  return [fig, find_paygap(pgs,'performance')]


Edu_Lvl

In [59]:

def pg_by_edu(df1, df2, y1, y2, edu_order):
    """Draw a grouped bar chart comparing per-education pay gaps of two years.

    ``df1`` and ``df2`` need the columns ``Education``, ``Male`` (1/0) and
    ``TotalPay``; both gain an ordered categorical ``Education_Group`` column.
    ``y1`` / ``y2`` are the legend labels and ``edu_order`` lists the
    education levels in display order.

    Returns ``(figure, pay gaps of df1)`` with one gap per education level,
    each computed as ``(male mean - female mean) / male mean``.
    """
    for frame in (df1, df2):
        frame['Education_Group'] = pd.Categorical(frame['Education'], ordered=True, categories=edu_order)

    def gap_for(frame, level):
        # Relative difference of the two gender means within one level.
        group = frame[frame['Education_Group'] == level]
        male_mean = group.loc[group['Male'] == 1, 'TotalPay'].mean()
        female_mean = group.loc[group['Male'] == 0, 'TotalPay'].mean()
        return (male_mean - female_mean) / male_mean

    gaps_first = [gap_for(df1, level) for level in edu_order]
    gaps_second = [gap_for(df2, level) for level in edu_order]

    # Two bars per level: the first frame on the left, the second beside it.
    bar_width = 0.35
    slots = np.arange(len(edu_order))

    fig, ax = plt.subplots()
    ax.bar(slots, gaps_first, bar_width, label=str(y1), color='skyblue')
    ax.bar(slots + bar_width, gaps_second, bar_width, label=str(y2), color='lightcoral')

    ax.set_xlabel('Education Level')
    ax.set_ylabel('Pay Gap')
    ax.set_title('Pay Gap in Each Education Level')
    # Centre each tick between the two bars of its group.
    ax.set_xticks(slots + bar_width / 2)
    ax.set_xticklabels(edu_order, rotation=25)
    ax.legend()
    plt.tight_layout()

    return fig, gaps_first

def file_to_plot_edu(f1,f2):
  """Compare pay gaps by education level between two uploaded CSV files.

  ``f1`` is the current year and ``f2`` the previous year. Returns
  ``[figure, summary sentence]``; the sentence is based on ``f1`` only.
  """
  frames = []
  for source in (f1, f2):
    frame = preProcess(pd.read_csv(source))
    # Called for its side effect: adds the 'Male' column to `frame` in place.
    modifyForReg(frame)
    frames.append(frame)
  current, previous = frames

  # The legend uses the default year labels ('Current Year', 'Previous Year').
  fig, pgs = pg_by_edu(current, previous, 'Current Year', 'Previous Year', edu_order)

  return [fig, find_paygap(pgs,'education level')]

Linear Regression


In [56]:
import pandas as pd
"""from sklearnex import patch_sklearn
patch_sklearn()
from sklearn.linear_model import LinearRegression
""" # in oneAPI
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt

''' df1 = pd.read_csv("GenderPayGap4.csv")
df2 = pd.read_csv("Glassdoor Gender Pay Gap.csv")
df3 = pd.read_csv("GenderPAyGap2 - Sheet1 (2).csv")
df4 = pd.read_csv("GenderPayGap3.csv")
df5 = pd.read_csv("gender_pay_gap_data 2018 (1).csv") '''

# Calculate pay gaps (replace function if calculations differ)
def pay_gap(df):
    """Return the overall gender pay gap of ``df`` as a percentage.

    ``df`` needs ``BasePay``, ``Bonus`` and ``Gender``; a ``TotalPay`` column
    is (re)written onto it. The gap is ``(male mean - female mean) / male
    mean`` of ``TotalPay``, times 100 and rounded to two decimals.
    """
    df['TotalPay'] = df['BasePay'] + df['Bonus']

    mean_male = df.loc[df['Gender'] == 'Male', 'TotalPay'].mean()
    mean_female = df.loc[df['Gender'] == 'Female', 'TotalPay'].mean()

    gap_fraction = (mean_male - mean_female) / mean_male
    return round(gap_fraction * 100, 2)


def pg_trend(f1,f2,f3,f4,f5, n_future_years=1):
    """Fit a linear trend to five yearly pay gaps and extend it into the future.

    Parameters
    ----------
    f1, f2, f3, f4, f5 : path or buffer accepted by ``pandas.read_csv``
        Yearly CSV files, newest first: ``f1`` is 2024 and ``f5`` is 2020.
    n_future_years : int, default 1
        How many years after 2024 the fitted line is extended. ``0`` draws
        the fit over the observed years only.

    Returns
    -------
    matplotlib.figure.Figure
        Observed pay gaps plus the fitted line.

    Raises
    ------
    ValueError
        If one of the five files is missing.
    """
    files = [f1, f2, f3, f4, f5]
    if any(f is None for f in files):
        raise ValueError("All five yearly CSV files (2020-2024) are required to fit the trend.")

    # One overall pay gap per year. pay_gap() only needs BasePay, Bonus and
    # Gender, so the regression helper columns are not built here.
    latest_year = 2024
    records = [
        {'Year': latest_year - offset, 'PayGap': pay_gap(preProcess(pd.read_csv(f)))}
        for offset, f in enumerate(files)
    ]
    df = pd.DataFrame(records).sort_values('Year').reset_index(drop=True)

    # Train model with all data
    X = df[['Year']]
    y = df['PayGap']
    model = LinearRegression()
    model.fit(X, y)

    # Evaluate the fit on the observed years and on the requested future
    # years, so the chart actually shows a prediction.
    future_years = [latest_year + step for step in range(1, int(n_future_years) + 1)]
    trend_years = pd.DataFrame({'Year': df['Year'].tolist() + future_years})
    predicted_paygaps = model.predict(trend_years)

    # Visualize trend
    fig, ax = plt.subplots()
    ax.plot(df["Year"], df["PayGap"], marker='o', label='Observed')
    ax.plot(trend_years["Year"], predicted_paygaps, linestyle='--', label='Linear trend')
    ax.set_xlabel("Year")
    ax.set_ylabel("PayGap")
    ax.set_title("PayGap Trend Prediction (using {} model)".format(model.__class__.__name__))

    ax.set_xticks(trend_years['Year'])
    ax.legend()
    plt.tight_layout()

    return fig




**MAIN**

In [60]:
# Gradio UI: one upload for the current year, a tab per chart, plus tabs that
# take extra uploads for the two-year comparison and the multi-year trend.
with gr.Blocks() as demo:


    # Page header; also lists the CSV columns the handlers read.
    gr.Markdown(

"""              #        &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&#09;&#09;&#09; Gender Pay Equity Analysis Software
       ### Gain insights into your company's gender pay scenario
    Columns of file: JobTitle, Gender, Age, PerfEval, Education, Dept, Seniority, BasePay, Bonus


    """)

    # Label for the main upload: it holds the current (2024) data.
    gr.Markdown('2024')

    fileobj=gr.File()

    btn = gr.Button("Generate",interactive=True)

    # Each tab below registers its own listener on the same `btn`, passing the
    # same `fileobj` to a different handler and plotting into that tab.
    # NOTE(review): every listener in this layout passes api_name="btn1" --
    # confirm how Gradio resolves the repeated API name.

    # Normalised histogram of total pay, one series per gender.
    with gr.Tab('Histogram - PMF vs Total Pay'):
        btn.click(fn=histogram, inputs=fileobj, outputs=gr.Plot(), api_name="btn1")

    with gr.Tab('Job Title Distribution'):
        btn.click(fn=jobTitleDistribution, inputs=fileobj, outputs=gr.Plot(), api_name="btn1")

    with gr.Tab('Scatter plot - C_age'):
        btn.click(fn=C_age_reg, inputs=fileobj, outputs=gr.Plot(), api_name="btn1")

    with gr.Tab('Scatter plot - Years_Exp'):
        btn.click(fn=Years_Exp_reg, inputs=fileobj, outputs=gr.Plot(), api_name="btn1")

    with gr.Tab('Scatter plot - Perf_Eval'):
        btn.click(fn=Perf_Eval_reg, inputs=fileobj, outputs=gr.Plot(), api_name="btn1")

    with gr.Tab('Scatter plot - Edu_Lvl'):
        btn.click(fn=Edu_Lvl_reg, inputs=fileobj, outputs=gr.Plot(), api_name="btn1")

    # Two-year comparison: needs a second upload and has its own button; each
    # nested tab shows a chart plus a one-sentence summary.
    with gr.Tab('Comparison over two years'):

        gr.Markdown('Previous Year')
        fileobj2=gr.File()
        btn2=gr.Button("Generate",interactive=True)
        with gr.Tab('Age'):
          text_age=gr.Markdown()
          btn2.click(fn=file_to_plot_age, inputs=[fileobj,fileobj2], outputs=[gr.Plot(),text_age], api_name="btn1")
        with gr.Tab('Seniority'):
          text_sen=gr.Markdown()
          btn2.click(fn=file_to_plot_seniority, inputs=[fileobj,fileobj2], outputs=[gr.Plot(),text_sen], api_name="btn1")
        with gr.Tab('Performance'):
          text_perf=gr.Markdown()
          btn2.click(fn=file_to_plot_perf, inputs=[fileobj,fileobj2], outputs=[gr.Plot(),text_perf], api_name="btn1")
        with gr.Tab('Education'):
          text_edu=gr.Markdown()
          btn2.click(fn=file_to_plot_edu, inputs=[fileobj,fileobj2], outputs=[gr.Plot(),text_edu], api_name="btn1")

    # Trend tab: four more uploads (2023 down to 2020) are combined with the
    # main 2024 upload and passed to pg_trend, newest first.
    with gr.Tab('Future Pay Gap Trend'):

        gr.Markdown('2023')
        # NOTE(review): `fileobj2` is rebound to a new component here; the
        # comparison-tab listeners above were registered with the earlier one.
        fileobj2=gr.File()
        gr.Markdown('2022')
        fileobj3=gr.File()
        gr.Markdown('2021')
        fileobj4=gr.File()
        gr.Markdown('2020')
        fileobj5=gr.File()

        btn3=gr.Button("Generate",interactive=True)
        btn3.click(fn=pg_trend, inputs=[fileobj,fileobj2,fileobj3,fileobj4,fileobj5], outputs=gr.Plot(), api_name="btn1")





In [61]:
demo.launch(share=False)

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Note: opening Chrome Inspector may crash demo inside Colab notebooks.

To create a public link, set `share=True` in `launch()`.


<IPython.core.display.Javascript object>

