# Salarios

## 1. importar las librerías + csv con los datos de la encuesta. 

In [31]:
# importar librerias

import pandas as pd
import plotly.express as px  
from dash import Dash, dcc, html, Input, Output
import numpy as np
import PyCurrency_Converter


#crear un dataframe con toda la informacion de la encuesta
df_csv = pd.read_csv ('../data/survey_results_public2021.csv', index_col = [0]) # El indice sera la columna con el ID de la respuesta
 #mostrar df ()

# fuente: https://stats.bis.org/statx/srs/table/I3?c=&p=202204&m=A
df_currency = pd.read_csv ('../data/conversor.csv')



## 2. Preprocesar datos.

Tratar las columnas/conjunto de datos para comenzar a crear los gráficos. En este caso Age1stcode

In [32]:

# nombre columna: lo que han preguntado en el survey.
#currency: Which currency do you use day-to-day?
#comptotal: What is your current total compensation (answered in currency from currency column)
#compfreq: Is that compensation weekly, monthly, or yearly?

#copia de df y se eliminan todas las columnas que tengan nan: Solo se seleccionan filas completas y se descartan las demas
df=df_csv[['DevType','Currency', 'CompTotal', 'CompFreq']].dropna().copy()

df= df.drop(df[df.Currency == "none"].index) #eliminar todas las none currency

# 1. convertir todas las currency a la misma moneda. X ej dolares

#convertir la moneda a un string de 3 letras.
df['Currency']= df.Currency.str.split().str.get(0)

#unir ambos df 
df3 = pd.merge(df_currency, df, how='inner')

#sustituir la compensacion en la moneda del pais por su equivalente en USD
df3["CompTotal"] = (df3["CompTotal"]/df3["Conversion"]).round(2) #redondear el numero a 2 decmales

df3

Unnamed: 0,Coutry,Currency,Conversion,DevType,CompTotal,CompFreq
0,Afghanistan,AFN,76.8135,"Developer, front-end;Developer, full-stack;Dev...",820.17,Monthly
1,Afghanistan,AFN,76.8135,"Developer, front-end;Developer, full-stack;Dev...",84.62,Weekly
2,Afghanistan,AFN,76.8135,"Developer, front-end;Developer, full-stack;Dev...",260.37,Monthly
3,Afghanistan,AFN,76.8135,"Developer, mobile;Developer, front-end;Develop...",455.65,Monthly
4,Afghanistan,AFN,76.8135,System administrator,13018.54,Monthly
...,...,...,...,...,...,...
292977,Vietnam,VND,23152.9000,"Developer, mobile;Developer, desktop or enterp...",3455.29,Monthly
292978,Vietnam,VND,23152.9000,"Developer, back-end",993.40,Monthly
292979,Yemen,YER,1240.7580,"Developer, mobile;Developer, front-end;Develop...",322.38,Monthly
292980,Yemen,YER,1240.7580,"Developer, mobile;Developer, full-stack;Studen...",80.60,Monthly


In [41]:
#ahora se debe normalizar la compensacion. Para ello se asumira que: 
# un mes tiene 4 seamanas y un anyo tiene 12 meses. 
# un anyo tiene tambien 52 semanas

condiciones = [(df3['CompFreq'] =='Monthly'), (df3['CompFreq'] =='Weekly'), (df3['CompFreq'] =='Yearly')]


In [42]:
df3['semana'] = np.select(condiciones, [(df3['CompTotal'])/4, df3['CompTotal'], (df3['CompTotal'])/52]).round(2)      
df3['mes'] = np.select(condiciones, [df3['CompTotal'], (df3['CompTotal'])*4, (df3['CompTotal'])/12]).round(2)      
df3['anyo'] = np.select(condiciones, [(df3['CompTotal'])*12, (df3['CompTotal'])*52, df3['CompTotal']]).round(2) 
df3

Unnamed: 0,Coutry,Currency,Conversion,DevType,CompTotal,CompFreq,semana,mes,anyo
0,Afghanistan,AFN,76.8135,"Developer, front-end;Developer, full-stack;Dev...",820.17,Monthly,205.04,820.17,9842.04
1,Afghanistan,AFN,76.8135,"Developer, front-end;Developer, full-stack;Dev...",84.62,Weekly,84.62,338.48,4400.24
2,Afghanistan,AFN,76.8135,"Developer, front-end;Developer, full-stack;Dev...",260.37,Monthly,65.09,260.37,3124.44
3,Afghanistan,AFN,76.8135,"Developer, mobile;Developer, front-end;Develop...",455.65,Monthly,113.91,455.65,5467.80
4,Afghanistan,AFN,76.8135,System administrator,13018.54,Monthly,3254.64,13018.54,156222.48
...,...,...,...,...,...,...,...,...,...
292977,Vietnam,VND,23152.9000,"Developer, mobile;Developer, desktop or enterp...",3455.29,Monthly,863.82,3455.29,41463.48
292978,Vietnam,VND,23152.9000,"Developer, back-end",993.40,Monthly,248.35,993.40,11920.80
292979,Yemen,YER,1240.7580,"Developer, mobile;Developer, front-end;Develop...",322.38,Monthly,80.60,322.38,3868.56
292980,Yemen,YER,1240.7580,"Developer, mobile;Developer, full-stack;Studen...",80.60,Monthly,20.15,80.60,967.20


# hay una columna con la conversion hecha. 

Rehacer el dataframe.

In [43]:
df4=df_csv[['DevType','ConvertedCompYearly', 'YearsCodePro']].dropna().copy()




In [46]:
# eliminar filas en anyos que tengan texto. 
df4.drop(df4[df4['YearsCodePro'] == "Less than 1 year"].index, inplace = True) 
df4.drop(df4[df4['YearsCodePro'] == "More than 50 years"].index, inplace = True)
# eliminar filas en anyos que tengan texto. 
df4.drop(df4[df4['YearsCodePro'] == "Less than 1 year"].index, inplace = True) 
df4.drop(df4[df4['YearsCodePro'] == "More than 50 years"].index, inplace = True)
df4.drop(df4[df4['DevType'] == "Other (please specify):"].index, inplace = True)
#separar las filas que tienen mas de 1 devtype
df4['DevType'] = df4['DevType'].str.split(';').apply(lambda x: [e.strip() for e in x]).tolist()
#contar el numero de respuestas por cada DevType (color en el grafico)
respuestas=df4.explode('DevType').groupby('DevType')['DevType'].size()

#agrupar por tipo de dev 
df4=df4.explode('DevType').groupby('DevType').agg(['median'])

df4['respuestas']=[i for i in respuestas] #anyadir el numero de respuestas

#resetear el formato del df
df4 = df4.reset_index() #devType es ahora una columna mas
df4.columns = df4.columns.droplevel(1) # eliminar el segundo nivel en las columnas

df4

Unnamed: 0,DevType,ConvertedCompYearly,YearsCodePro,median_month,median_year,median_week,respuestas
0,Academic researcher,51372.0,6.0,4281.0,51372.0,987.923077,1
1,Data or business analyst,62758.5,8.0,5229.875,62758.5,1206.894231,1
2,Data scientist or machine learning specialist,64859.0,6.0,5404.916667,64859.0,1247.288462,1
3,Database administrator,56616.0,10.0,4718.0,56616.0,1088.769231,1
4,Designer,52145.0,8.0,4345.416667,52145.0,1002.788462,1
5,DevOps specialist,71093.0,10.0,5924.416667,71093.0,1367.173077,1
6,"Developer, QA or test",57599.5,8.0,4799.958333,57599.5,1107.682692,1
7,"Developer, back-end",58368.0,7.0,4864.0,58368.0,1122.461538,1
8,"Developer, desktop or enterprise applications",59676.0,10.0,4973.0,59676.0,1147.615385,1
9,"Developer, embedded applications or devices",59454.0,8.0,4954.5,59454.0,1143.346154,1


In [47]:

def salario (df,anyo):
    
    #para poder reutilizar la columna en el df de 2021 y 2020 sin hacer grandes cambios, hay que renombrar ConvertedComp
    if anyo==20: df.rename(columns = {'ConvertedComp':'ConvertedCompYearly'}, inplace = True)
    
    #copia de df y se eliminan todas las columnas que tengan nan: Solo se seleccionan filas completas y se descartan las demas
    df4=df[['DevType','ConvertedCompYearly', 'YearsCodePro']].dropna().copy()
    
    # eliminar filas en anyos que tengan texto. 
    df4.drop(df4[df4['YearsCodePro'] == "Less than 1 year"].index, inplace = True) 
    df4.drop(df4[df4['YearsCodePro'] == "More than 50 years"].index, inplace = True)

    #separar las filas que tienen mas de 1 devtype
    df4['DevType'] = df4['DevType'].str.split(';').apply(lambda x: [e.strip() for e in x]).tolist()
    #contar el numero de respuestas por cada DevType (color en el grafico)
    respuestas=df4.explode('DevType').groupby('DevType')['DevType'].size()

    #agrupar por tipo de dev 
    df4=df4.explode('DevType').groupby('DevType').agg(['median'])

    #anyadir el numero de respuestas
    df4['respuestas']=[i for i in respuestas] 
    
    #resetear el formato del df
    df4 = df4.reset_index() #devType es ahora una columna mas
    df4.columns = df4.columns.droplevel(1) # eliminar el segundo nivel en las columnas

    return df4

Para lidiar con rangos de edades, algunos de los cuales tienen texto, se va a calcular una nueva columna con la media de todos ellos. 


## 3. Grafico. 

En este caso, un diagrama de barras.

In [48]:
app = Dash(__name__)
server = app.server #heroku
app.layout = html.Div([

    html.H1("Tipo de desarrollador", style={'text-align': 'center'}), #cabecero h1. Header
    
    #primera mini prueba con un menu desplegable.
    dcc.Dropdown(id="select_opt",  
                 options=[ #el usuario va a ver las label.
                     {"label": "#", "value": "numero"},
                     {"label": "%", "value": "porcentaje"}],
                 multi=False,
                 value="numero",
                 style={'width': "40%"}
                 ),

    dcc.Graph(id='my_survey', figure={}) # graph container

])

In [49]:
@app.callback(
    Output(component_id='my_survey', component_property='figure'),
    Input(component_id='select_opt', component_property='value'))
def update_graph(option_slctd):
    #filtered_df = df[df.year == selected_year]

    fig =  px.scatter(salario(df_csv,20), x= "YearsCodePro", y= "ConvertedCompYearly", color="respuestas", symbol="DevType",
                     hover_name="DevType", animation_frame="ConvertedCompYearly", animation_group="DevType")
        # no implementado la opcion con el porcentaje
    
    return fig

## 4. run server

In [50]:
app.run_server(debug=True, use_reloader=False)

Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

Dash is running on http://127.0.0.1:8050/

 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
[2m   Use a production WSGI server instead.[0m
 * Debug mode: on
