# Procesamiento de datos

In [1]:
import pandas as pd
import os

In [2]:
# Read de data
path = os.path.join(os.getcwd(), os.pardir, 'data', 'raw')
df_schema = pd.read_csv(os.path.join(path, 'survey_results_schema.csv'))
df_survey = pd.read_csv(os.path.join(path, 'survey_results_public.csv'))

El `df_schema` contiene las preguntas usadas en la encuesta.

In [3]:
# Explore what data to use
cols = ['qname', 'question']
df_schema[cols]

Unnamed: 0,qname,question
0,S0,"<div><span style=""font-size:19px;""><strong>Hel..."
1,MetaInfo,Browser Meta Info
2,S1,"<span style=""font-size:22px; font-family: aria..."
3,MainBranch,Which of the following options best describes ...
4,Employment,Which of the following best describes your cur...
5,Country,"Where do you live? <span style=""font-weight: b..."
6,US_State,<p>In which state or territory of the USA do y...
7,UK_Country,In which part of the United Kingdom do you liv...
8,S2,"<span style=""font-size:22px; font-family: aria..."
9,EdLevel,Which of the following best describes the high...


In [4]:
df_survey.sample(5)

Unnamed: 0,ResponseId,MainBranch,Employment,Country,US_State,UK_Country,EdLevel,Age1stCode,LearnCode,YearsCode,...,Age,Gender,Trans,Sexuality,Ethnicity,Accessibility,MentalHealth,SurveyLength,SurveyEase,ConvertedCompYearly
61129,61130,I am a developer by profession,Employed full-time,United States of America,Georgia,,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)",18 - 24 years,School;Online Courses or Certification;Books /...,10,...,25-34 years old,Woman,No,Straight / Heterosexual;Bisexual,White or of European descent,None of the above,I have an anxiety disorder,Appropriate in length,Easy,175000.0
27097,27098,I am a developer by profession,Employed full-time,India,,,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)",18 - 24 years,Coding Bootcamp;School,6,...,18-24 years old,Man,No,Straight / Heterosexual,South Asian,None of the above,None of the above,Appropriate in length,Easy,5585.0
64727,64728,I am a developer by profession,"Independent contractor, freelancer, or self-em...",Italy,,,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)",35 - 44 years,"Other online resources (ex: videos, blogs, etc...",18,...,35-44 years old,Man,No,Prefer not to say,Prefer not to say,Prefer not to say,Prefer not to say,Appropriate in length,Neither easy nor difficult,75000.0
56572,56573,I am a developer by profession,Employed full-time,Germany,,,Some college/university study without earning ...,18 - 24 years,Coding Bootcamp;Other online resources (ex: vi...,5,...,25-34 years old,Man,No,Straight / Heterosexual,White or of European descent,None of the above,I have a mood or emotional disorder (e.g. depr...,Appropriate in length,Easy,70044.0
28607,28608,I am a developer by profession,Employed part-time,France,,,"Master’s degree (M.A., M.S., M.Eng., MBA, etc.)",11 - 17 years,Coding Bootcamp;Other online resources (ex: vi...,8,...,25-34 years old,Woman,No,Bisexual,White or of European descent,None of the above,I have a concentration and/or memory disorder ...,Appropriate in length,Easy,44320.0


Seleccionar las columnas que pueden ayudar a contestar las preguntas (mostradas en el siguiente notebook). Y también los países hispanohablantes para filtrar la data (también se incluyó a Brasil).

In [5]:
# Select columns
columns = ['ResponseId', 'Age', 'Gender', 'Trans', 'Sexuality', 'Country',
            'EdLevel', 'Age1stCode', 'LearnCode', 'YearsCode', 'YearsCodePro',
            'MainBranch', 'Employment', 'DevType', 'ConvertedCompYearly']

# Filter registers by countries
latam = ['Peru', 'Colombia', 'Chile', 'Argentina', 'Costa Rica', 'Bolivia',
            'Uruguay', 'Mexico', 'Venezuela, Bolivarian Republic of...'
            'Dominican Republic', 'Ecuador', 'Guatemala', 'Paraguay', 'Panama',
            'El Salvador', 'Nicaragua', 'Brazil', 'Spain']

# New dataset
in_latam = df_survey.Country.isin(latam)
df = df_survey[in_latam][columns]

In [6]:
s = df.shape
p = s[0] / df_survey.shape[0] * 100
print(f'Registros: {s[0]}')
print(f'Porcentaje total del dataset: {round(p,2)}%')
print(f'Preguntas: {s[1]}')

Registros: 6337
Porcentaje total del dataset: 7.59%
Preguntas: 15


Veamos los tipos de datos y qué contienen las variables

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6337 entries, 11 to 83438
Data columns (total 15 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   ResponseId           6337 non-null   int64  
 1   Age                  6307 non-null   object 
 2   Gender               6305 non-null   object 
 3   Trans                6244 non-null   object 
 4   Sexuality            5885 non-null   object 
 5   Country              6337 non-null   object 
 6   EdLevel              6322 non-null   object 
 7   Age1stCode           6330 non-null   object 
 8   LearnCode            6316 non-null   object 
 9   YearsCode            6254 non-null   object 
 10  YearsCodePro         5160 non-null   object 
 11  MainBranch           6337 non-null   object 
 12  Employment           6335 non-null   object 
 13  DevType              5554 non-null   object 
 14  ConvertedCompYearly  4389 non-null   float64
dtypes: float64(1), int64(1), object(13)


In [8]:
df.describe(include='all')

Unnamed: 0,ResponseId,Age,Gender,Trans,Sexuality,Country,EdLevel,Age1stCode,LearnCode,YearsCode,YearsCodePro,MainBranch,Employment,DevType,ConvertedCompYearly
count,6337.0,6307,6305,6244,5885,6337,6322,6330,6316,6254.0,5160.0,6337,6335,5554,4389.0
unique,,8,11,4,21,16,9,9,306,51.0,43.0,6,9,1519,
top,,25-34 years old,Man,No,Straight / Heterosexual,Brazil,"Bachelor’s degree (B.A., B.S., B.Eng., etc.)",11 - 17 years,School,10.0,3.0,I am a developer by profession,Employed full-time,"Developer, full-stack",
freq,,2695,5870,6075,5070,2254,2535,3384,614,514.0,481.0,4848,4381,688,
mean,41783.380306,,,,,,,,,,,,,,49063.98
std,22858.458387,,,,,,,,,,,,,,137150.9
min,12.0,,,,,,,,,,,,,,12.0
25%,25064.0,,,,,,,,,,,,,,15672.0
50%,40617.0,,,,,,,,,,,,,,28105.0
75%,60783.0,,,,,,,,,,,,,,47184.0


Hay algunas columnas que podrían ser numéricas, pero están como categóricas, ¿por qué?

In [9]:
df.YearsCode.value_counts().sort_values().head()

50                    1
More than 50 years    1
47                    1
48                    1
41                    2
Name: YearsCode, dtype: int64

In [10]:
df.YearsCodePro.value_counts().sort_values().head()

38                    1
43                    1
More than 50 years    1
42                    2
34                    5
Name: YearsCodePro, dtype: int64

Como solo hay 1 valor de "más de 50 años" para cada columna, se lo reemplazará con `50`. Y los valores de "menos de 1 año" con `0.5`. De esa manera se podrá tener columnas numéricas.

In [11]:
df['YearsCode'] = pd.to_numeric(df.YearsCode.replace(['More than 50 years', 'Less than 1 year'], [50, 0.5]))
df['YearsCodePro'] = pd.to_numeric(df.YearsCodePro.replace(['More than 50 years', 'Less than 1 year'], [50, 0.5]))

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6337 entries, 11 to 83438
Data columns (total 15 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   ResponseId           6337 non-null   int64  
 1   Age                  6307 non-null   object 
 2   Gender               6305 non-null   object 
 3   Trans                6244 non-null   object 
 4   Sexuality            5885 non-null   object 
 5   Country              6337 non-null   object 
 6   EdLevel              6322 non-null   object 
 7   Age1stCode           6330 non-null   object 
 8   LearnCode            6316 non-null   object 
 9   YearsCode            6254 non-null   float64
 10  YearsCodePro         5160 non-null   float64
 11  MainBranch           6337 non-null   object 
 12  Employment           6335 non-null   object 
 13  DevType              5554 non-null   object 
 14  ConvertedCompYearly  4389 non-null   float64
dtypes: float64(3), int64(1), object(11)


El dataset está listo para poder trabajar.

In [15]:
path_processed = os.path.join(os.getcwd(), os.pardir, 'data', 'processed')
df.to_csv(os.path.join(path_processed, 'survey.csv'))

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=09491c61-3767-4289-98fd-88aee19bb45d' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>