# Overview
- In this notebook, we fit one linear regression model per survey year to measure how gender and job role (developer, designer, QA, product manager) relate to salary over the years.

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Maps each canonical role label (key) to the raw `DevType` values (list) that
# are folded into it. The lists are used with `Series.isin`, so every raw value
# only needs to appear once; the "Developer" list is kept free of duplicates.
devTypes_map = {
    'Developer': [
        # Full-stack, including multi-select answers that mention it
        'Developer, full-stack',
        'Full-Stack Web Developer',
        'Full-stack web developer',
        'Developer, back-end;Developer, front-end;Developer, full-stack',
        'Full-stack developer',
        'Developer, front-end;Developer, full-stack',
        'Developer, front-end;Developer, full-stack;Developer, back-end',
        'Developer, back-end;Developer, full-stack',
        'Developer, full-stack;Developer, back-end',
        'Back-end developer;Front-end developer;Full-stack developer',
        'Back-end developer;Full-stack developer',
        # Back-end / enterprise services
        'Developer, back-end',
        'Back-end web developer',
        'Back-end developer',
        'Developer, back-end;Developer, desktop or enterprise applications',
        'Enterprise level services developer',
        # Front-end
        'Developer, front-end',
        'Front-end web developer',
        # Desktop
        'Developer, desktop or enterprise applications',
        'Desktop developer',
        # Mobile
        'Developer, mobile',
        'Mobile Dev (Android, iOS, WP & Multi-Platform)',
        'Mobile developer',
        # Embedded
        'Developer, embedded applications or devices',
        'Embedded application developer',
        # Data science / machine learning
        'Data scientist or machine learning specialist',
        'Mathematics Developers (Data Scientists, Machine Learning Devs & Devs with Stats & Math Backgrounds)',
        # Other roles grouped under "Developer"
        'DevOps specialist',
        'Developer, game or graphics',
        'Developer Experience',
        'Database administrator',
        'Developer Advocate',
    ],
    'QA Assurance': ['Developer, QA or test', 'Quality Assurance', 'QA Assurance'],
    'Product Manager': ['Project manager', 'Product manager'],
    # The remaining roles map only to themselves.
    'Engineering manager': ['Engineering manager'],
    'Student': ['Student'],
    'Academic researcher': ['Academic researcher'],
    'Research & Development role': ['Research & Development role'],
    'Senior Executive (C-Suite, VP, etc)': ['Senior Executive (C-Suite, VP, etc)'],
    'Engineer, data': ['Engineer, data'],
    'Cloud infrastructure engineer': ['Cloud infrastructure engineer'],
    'Data or business analyst': ['Data or business analyst'],
    'System administrator': ['System administrator'],
    'Security professional': ['Security professional'],
    'Engineer, site reliability': ['Engineer, site reliability'],
    'Educator': ['Educator'],
    'Scientist': ['Scientist'],
    'Blockchain': ['Blockchain'],
    'Hardware Engineer': ['Hardware Engineer'],
    'Designer': ['Designer'],
    'Marketing or sales professional': ['Marketing or sales professional'],
}

In [3]:
import pandas as pd

# Read the prepared dataset from the project's data folder and show its
# (rows, columns) size as the cell output.
woman_not_woman_df = pd.read_csv(filepath_or_buffer="../data/woman_not_woman_df.csv")
woman_not_woman_df.shape

(275867, 10)

In [5]:
# Preview the first rows to check the loaded columns and their values.
woman_not_woman_df.head()

Unnamed: 0,Year,Gender,Country,DevType,ConvertedCompYearly,YearsCode,gender_orig,Continent,country_alpha_code,Cargo
0,2014,Not Woman,United States of America,Full-Stack Web Developer,140000.0,11.0,Male,North America,USA,Desenvolvedor
1,2014,Woman,United States of America,Enterprise Level Services,110000.0,3.5,Female,North America,USA,
2,2014,Not Woman,United States of America,Enterprise Level Services,110000.0,11.0,Male,North America,USA,
3,2014,Not Woman,United States of America,Manager of Developers or Team Leader,90000.0,8.0,Male,North America,USA,
4,2014,Not Woman,United States of America,IT Staff / System Administrator,70000.0,11.0,Male,North America,USA,


In [6]:
# Build one subset per role of interest. Designers are matched by a
# case-insensitive substring search on DevType; the other roles are matched
# against the exact raw values listed in devTypes_map.
dev_type = woman_not_woman_df["DevType"]

designer_df = woman_not_woman_df.loc[dev_type.str.lower().str.contains("designer")]
qa_df = woman_not_woman_df.loc[dev_type.isin(devTypes_map["QA Assurance"])]
dev_df = woman_not_woman_df.loc[dev_type.isin(devTypes_map["Developer"])]
pm_df = woman_not_woman_df.loc[dev_type.isin(devTypes_map["Product Manager"])]

# Report the (rows, columns) size of each subset.
for label, subset in (
    ("Designer ", designer_df),
    ("QA ", qa_df),
    ("DEV ", dev_df),
    ("PM ", pm_df),
):
    print(label, subset.shape)

Designer  (5935, 10)
QA  (1664, 10)
DEV  (217064, 10)
PM  (729, 10)


In [7]:
# Add one 0/1 indicator column per role of interest. `assign` returns a new
# frame, so re-running this cell always starts from the current columns and
# simply overwrites the flags.
woman_not_woman_df = woman_not_woman_df.assign(
    # na=False: a missing DevType counts as "not a designer" instead of
    # producing NaN, which cannot be cast to int.
    is_designer=woman_not_woman_df.DevType.str.lower().str.contains("designer", na=False).astype(int),
    is_qa=woman_not_woman_df.DevType.isin(devTypes_map["QA Assurance"]).astype(int),
    is_dev=woman_not_woman_df.DevType.isin(devTypes_map["Developer"]).astype(int),
    is_pm=woman_not_woman_df.DevType.isin(devTypes_map["Product Manager"]).astype(int),
)

# Keep only rows that belong to at least one of the four roles. The explicit
# .copy() makes the result an independent frame, so adding columns to it in
# later cells does not raise SettingWithCopyWarning.
role_flag_cols = ["is_designer", "is_qa", "is_dev", "is_pm"]
woman_not_woman_df = woman_not_woman_df[woman_not_woman_df[role_flag_cols].eq(1).any(axis=1)].copy()

In [8]:
# Count the rows for every combination of the four role flags that occurs.
woman_not_woman_df.groupby(["is_designer", "is_qa", "is_dev", "is_pm"]).size().reset_index(name="count")

Unnamed: 0,is_designer,is_qa,is_dev,is_pm,count
0,0,0,0,1,729
1,0,0,1,0,217064
2,0,1,0,0,1664
3,1,0,0,0,5935


In [9]:
def process_salary(sal):
  """Map a yearly salary to an ordinal bin from 1 to 5.

  Bins (upper bounds inclusive):
    1: 0 to 25,000
    2: above 25,000 up to 50,000
    3: above 50,000 up to 75,000
    4: above 75,000 up to 125,000
    5: above 125,000

  Args:
    sal: Salary as a number.

  Returns:
    The bin number as an int, or float NaN when `sal` is NaN or negative,
    so that invalid salaries are not counted in any bin.
  """
  # `not (sal >= 0)` is true for negative values AND for NaN (every comparison
  # with NaN is False). Without this guard NaN would fall through to bin 5 and
  # negative values would land in bin 2.
  if not (sal >= 0):
    return float("nan")
  if sal <= 25000:
    return 1
  elif sal <= 50000:
    return 2
  elif sal <= 75000:
    return 3
  elif sal <= 125000:
    return 4
  else:
    return 5

# Attach the salary bin to every row. `assign` builds a new frame instead of
# writing into a filtered slice, which avoids SettingWithCopyWarning.
woman_not_woman_df = woman_not_woman_df.assign(
    ConvertedCompYearly_bins=woman_not_woman_df.ConvertedCompYearly.apply(process_salary)
)

# Distribution of rows across the bins: absolute count and share in percent.
temp_df = woman_not_woman_df.groupby("ConvertedCompYearly_bins").size().reset_index(name="Amount")
temp_df["(%) Percent"] = (temp_df["Amount"] / temp_df["Amount"].sum() * 100).round(2)
temp_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  woman_not_woman_df["ConvertedCompYearly_bins"] = woman_not_woman_df.ConvertedCompYearly.apply(lambda sal: process_salary(sal))


Unnamed: 0,ConvertedCompYearly_bins,Amount,(%) Percent
0,1,56019,24.85
1,2,47467,21.06
2,3,42868,19.02
3,4,44937,19.94
4,5,34101,15.13


# Linear regression with statsmodels: one model per year

In [10]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import OneHotEncoder
import sklearn, math
import plotly.express as px
import numpy as np
import warnings
from statsmodels.formula.api import ols
warnings.simplefilter(action='ignore', category=FutureWarning)

features_cols = ["is_designer", "is_qa", "is_dev", "is_pm", "Continent", "Gender", "country_alpha_code", "YearsCode", "ConvertedCompYearly"]
cat_cols = ["Continent", "Gender", "country_alpha_code"]

# Settings for the per-year regressions.
MAX_SALARY = 300000        # rows with a higher yearly compensation are excluded
MIN_ROWS_PER_YEAR = 30     # years with fewer rows are skipped
# NOTE(review): after the role filter every row appears to carry exactly one of
# the four is_* flags, so the four dummies together are collinear with the
# intercept and their individual coefficients are not uniquely identified.
# Consider dropping one flag as the reference category - confirm with the
# author, since that changes the columns reported downstream.
OLS_FORMULA = 'ConvertedCompYearly ~ C(Gender) + C(Continent) + is_designer + is_qa + is_pm + is_dev + C(country_alpha_code) + YearsCode'
# Model terms reported in each result row, in output order.
REPORTED_TERMS = ["C(Gender)[T.Woman]", "is_designer", "is_qa", "is_pm", "is_dev", "Intercept"]


def _pvalue_and_label(coef_pvalues_info, term):
    """Return [p-value, "coef (p-value)"] for one model term, rounded to 2 decimals.

    Args:
        coef_pvalues_info: dict with "Pvalues" and "Coef" sub-dicts keyed by term name.
        term: name of the model term to look up.

    Returns:
        A two-element list. When the term is absent from the fitted model
        (e.g. a year with a single gender), both values are based on NaN
        instead of raising KeyError.
    """
    missing = float("nan")
    pvalue = round(coef_pvalues_info["Pvalues"].get(term, missing), 2)
    coef = round(coef_pvalues_info["Coef"].get(term, missing), 2)
    return [pvalue, str(coef) + " (" + str(pvalue) + ")"]


# One row per fitted year; the column layout is defined where the results
# DataFrame is built from this list.
eval_data = []

for year in woman_not_woman_df.Year.drop_duplicates().tolist():
    # Rows of this year, restricted to the model columns and capped salaries.
    df = woman_not_woman_df.loc[woman_not_woman_df["Year"] == year, features_cols]
    df = df[df['ConvertedCompYearly'] <= MAX_SALARY]

    print(f"{year}")
    print("Amount of data: ", df.shape)
    if df.shape[0] < MIN_ROWS_PER_YEAR:
        continue

    fit = ols(OLS_FORMULA, data=df).fit()
    print()

    # Gender shares; .get(..., 0) tolerates a year in which one group is absent.
    gender_info_dict = df.groupby(["Gender"]).size().to_dict()
    amount_w = gender_info_dict.get("Woman", 0)
    amount_nw = gender_info_dict.get("Not Woman", 0)
    amount_total = amount_w + amount_nw
    pct_w = round((amount_w / amount_total) * 100, 2) if amount_total else float("nan")
    pct_nw = round((amount_nw / amount_total) * 100, 2) if amount_total else float("nan")

    # {"Pvalues": {term: p}, "Coef": {term: coef}} for every model term.
    coef_pvalues_info = pd.DataFrame({"Pvalues": fit.pvalues, "Coef": fit.params}).to_dict()

    row = [year,
           str(df.shape[0]),
           str(round(fit.rsquared, 2)),
           fit,
           pct_w,
           pct_nw]
    for term in REPORTED_TERMS:
        row.extend(_pvalue_and_label(coef_pvalues_info, term))
    row.append(coef_pvalues_info)

    eval_data.append(row)

2014
Amount of data:  (524, 9)

2015
Amount of data:  (10012, 9)

2016
Amount of data:  (31085, 9)

2017
Amount of data:  (892, 9)

2018
Amount of data:  (34062, 9)

2019
Amount of data:  (43135, 9)

2020
Amount of data:  (25861, 9)

2021
Amount of data:  (39616, 9)

2022
Amount of data:  (31133, 9)



In [11]:
# Column labels for the rows collected in eval_data, in positional order.
eval_columns = [
    "Ano", "Qtd. Dados", "R2", "Model", "Mulher(%)", "Não-Mulher (%)",
    "Classe Mulher P-value", "Classe Mulher Coef.",
    "Designer P-value", "Designer Coef.",
    "QA P-value", "QA  Coef.",
    "PM P-value", "PM Coef.",
    "Desenvolvedor P-value", "Desenvolvedor Coef.",
    "Intercept P-value", "Intercept Coef.",
    "Model_Info",
]

# Results table ordered chronologically.
eval_df_from_statmodel2 = pd.DataFrame(eval_data, columns=eval_columns).sort_values(by="Ano")

# Show the coefficient summary: "coef (p-value)" per term, plus sample size and R2.
coef_summary_columns = [
    "Ano",
    "Intercept Coef.",
    "Classe Mulher Coef.",
    "Designer Coef.",
    "QA  Coef.",
    "PM Coef.",
    "Desenvolvedor Coef.",
    "Qtd. Dados",
    "R2",
]
eval_df_from_statmodel2[coef_summary_columns]

Unnamed: 0,Ano,Intercept Coef.,Classe Mulher Coef.,Designer Coef.,QA Coef.,PM Coef.,Desenvolvedor Coef.,Qtd. Dados,R2
0,2014,25852.69 (0.0),-13803.06 (0.02),-0.0 (0.01),0.0 (nan),0.0 (nan),25852.69 (0.0),524,0.24
1,2015,17787.99 (0.0),-2336.93 (0.08),3797.75 (0.14),-2683.48 (0.23),16936.81 (0.0),-263.1 (0.83),10012,0.55
2,2016,8421.42 (0.0),-1936.26 (0.01),-2637.35 (0.1),403.25 (0.79),9843.52 (0.0),812.0 (0.35),31085,0.56
3,2017,5124.33 (0.32),1283.56 (0.68),-2245.21 (0.78),0.0 (0.1),-0.0 (0.1),7369.54 (0.11),892,0.74
4,2018,9371.79 (0.0),-3262.94 (0.0),61.67 (0.98),0.0 (0.07),1380.32 (0.83),7929.8 (0.0),34062,0.53
5,2019,9085.24 (0.0),-4494.56 (0.0),-4811.71 (0.0),-1669.35 (0.36),11926.31 (0.0),3639.99 (0.0),43135,0.53
6,2020,8290.35 (0.0),-4581.05 (0.0),-6253.86 (0.0),-3500.97 (0.15),15980.71 (0.0),2064.47 (0.2),25861,0.57
7,2021,9605.31 (0.0),-7298.22 (0.0),-1589.77 (0.72),-4743.12 (0.03),15568.65 (0.0),369.55 (0.81),39616,0.54
8,2022,11713.93 (0.0),-6310.2 (0.0),-2817.25 (0.65),-5345.04 (0.04),18240.01 (0.0),1636.21 (0.39),31133,0.53


In [13]:
# Rebuild the results table, this time ordered from best to worst R2.
# The last value pair of every eval_data row holds the Intercept p-value and
# coefficient, so those columns are labelled "Intercept ..." (they were
# previously mislabelled as "AnosExper ...").
eval_df_from_statmodel2 = pd.DataFrame(eval_data, columns=["Ano", "Qtd. Dados", "R2", "Model","Mulher(%)", "Não-Mulher (%)",
                                                          "Classe Mulher P-value", "Classe Mulher Coef.",
                                                          "Designer P-value",
                                                          "Designer Coef.",
                                                          "QA P-value",
                                                          "QA  Coef.",
                                                          "PM P-value",
                                                          "PM Coef.",
                                                          "Desenvolvedor P-value",
                                                          "Desenvolvedor Coef.",
                                                          "Intercept P-value" ,
                                                          "Intercept Coef." ,
                                                          "Model_Info"]).sort_values("R2", ascending=False)
# Focus on the gender term: p-value and "coef (p-value)" per year.
eval_df_from_statmodel2[["Ano", "Qtd. Dados", "R2","Classe Mulher P-value", "Classe Mulher Coef.",]]

Unnamed: 0,Ano,Qtd. Dados,R2,Classe Mulher P-value,Classe Mulher Coef.
3,2017,892,0.74,0.68,1283.56 (0.68)
6,2020,25861,0.57,0.0,-4581.05 (0.0)
2,2016,31085,0.56,0.01,-1936.26 (0.01)
1,2015,10012,0.55,0.08,-2336.93 (0.08)
7,2021,39616,0.54,0.0,-7298.22 (0.0)
4,2018,34062,0.53,0.0,-3262.94 (0.0)
5,2019,43135,0.53,0.0,-4494.56 (0.0)
8,2022,31133,0.53,0.0,-6310.2 (0.0)
0,2014,524,0.24,0.02,-13803.06 (0.02)
