# Tratamento e Limpeza de Dados com Pandas!

In [39]:
import pandas as pd

# Importando os Dados

In [40]:
df = pd.read_csv('multiple_choice_responses.csv', delimiter=',', header=0, skiprows=[1])
df.head()

Unnamed: 0,Time from Start to Finish (seconds),Q1,Q2,Q2_OTHER_TEXT,Q3,Q4,Q5,Q5_OTHER_TEXT,Q6,Q7,...,Q34_Part_4,Q34_Part_5,Q34_Part_6,Q34_Part_7,Q34_Part_8,Q34_Part_9,Q34_Part_10,Q34_Part_11,Q34_Part_12,Q34_OTHER_TEXT
0,510,22-24,Male,-1,France,Master’s degree,Software Engineer,-1,"1000-9,999 employees",0,...,,,,,,,,,,-1
1,423,40-44,Male,-1,India,Professional degree,Software Engineer,-1,"> 10,000 employees",20+,...,,,,,,,,,,-1
2,83,55-59,Female,-1,Germany,Professional degree,,-1,,,...,,,,,,,,,,-1
3,391,40-44,Male,-1,Australia,Master’s degree,Other,0,"> 10,000 employees",20+,...,,,,,,Azure SQL Database,,,,-1
4,392,22-24,Male,-1,India,Bachelor’s degree,Other,1,0-49 employees,0,...,,,,,,,,,,-1


In [41]:
df.info()
df.shape

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19717 entries, 0 to 19716
Columns: 246 entries, Time from Start to Finish (seconds) to Q34_OTHER_TEXT
dtypes: int64(29), object(217)
memory usage: 37.0+ MB


(19717, 246)

In [42]:
df1 = pd.read_csv('questions_only.csv')
pd.set_option('max_colwidth', 200)
pd.options.display.float_format = '{:,.2f}'.format
df1.T

Unnamed: 0,0
Time from Start to Finish (seconds),Duration (in seconds)
Q1,What is your age (# years)?
Q2,What is your gender? - Selected Choice
Q3,In which country do you currently reside?
Q4,What is the highest level of formal education that you have attained or plan to attain within the next 2 years?
Q5,Select the title most similar to your current role (or most recent title if retired): - Selected Choice
Q6,What is the size of the company where you are employed?
Q7,Approximately how many individuals are responsible for data science workloads at your place of business?
Q8,Does your current employer incorporate machine learning methods into their business?
Q9,Select any activities that make up an important part of your role at work: (Select all that apply) - Selected Choice


In [43]:
columns = {
    'Time from Start to Finish (seconds)': 'time_seconds',
    'Q1':  'age',
    'Q2':  'gender',
    'Q3':  'country',
    'Q4':  'education',
    'Q5':  'job_role',
    'Q6':  'company_size',
    'Q7':  'data_science_employees',
    'Q8':  'has_machine_learning',
    'Q10': 'income',
    'Q11': 'money_spent_on_ml',
    'Q14': 'primary_tool',
    'Q15': 'code_time',
    'Q23': 'ml_time'
}

In [45]:
df = df[columns.keys()].rename(columns=columns)

In [46]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19717 entries, 0 to 19716
Data columns (total 14 columns):
time_seconds              19717 non-null int64
age                       19717 non-null object
gender                    19717 non-null object
country                   19717 non-null object
education                 19323 non-null object
job_role                  19107 non-null object
company_size              14002 non-null object
data_science_employees    13623 non-null object
has_machine_learning      13227 non-null object
income                    12497 non-null object
money_spent_on_ml         12250 non-null object
primary_tool              15690 non-null object
code_time                 15627 non-null object
ml_time                   14182 non-null object
dtypes: int64(1), object(13)
memory usage: 2.1+ MB


In [19]:
[item for item in df['has_machine_learning'].unique()]

['I do not know',
 'We have well established ML methods (i.e., models in production for more than 2 years)',
 nan,
 'No (we do not use ML methods)',
 'We are exploring ML methods (and may one day put a model into production)',
 'We recently started using ML methods (i.e., models in production for less than 2 years)',
 'We use ML methods for generating insights (but do not put working models into production)']

In [75]:
df['time_minutes'] = df['time_seconds'].apply(lambda x: x / 60 )
df['code_time'] = df['code_time'].fillna(0)
df['has_machine_learning'] = df['has_machine_learning'].fillna("I Don't Know")

In [93]:
def machine_learning_bool(row):
    if row in ['I do not know',
               'No (we do not use ML methods)',
               'We are exploring ML methods (and may one day put a model into production)'
              ]:
        return 0
    elif row:
        return 1

In [76]:
def fix_code_time(row):
    if row == '1-2 years':
        return 2
    elif row == '< 1 years':
        return 1
    elif row == '3-5 years':
        return 3
    elif row == '5-10 years':
        return 4
    elif row == '10-20 years':
        return 5
    elif row == '20+ years':
        return '6'
    elif row == 'I have never written code':
        return 0
    elif row == 0:
        return 0


In [101]:
df.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 15669 entries, 0 to 19716
Data columns (total 17 columns):
time_seconds                    15669 non-null int64
age                             15669 non-null object
gender                          15669 non-null object
country                         15669 non-null object
education                       15578 non-null object
job_role                        15528 non-null object
company_size                    11861 non-null object
data_science_employees          11764 non-null object
has_machine_learning            11669 non-null object
income                          11450 non-null object
money_spent_on_ml               11368 non-null object
primary_tool                    14367 non-null object
code_time                       15669 non-null object
ml_time                         13478 non-null object
time_minutes                    15669 non-null float64
code_time_numeric               15669 non-null int64
has_machine_learning_numeric

In [78]:
df['code_time_numeric'] = df['code_time'].apply(fix_code_time)

In [94]:
df['has_machine_learning_numeric'] = df['has_machine_learning'].apply(machine_learning_bool)

In [96]:
df[['time_minutes', 'time_seconds', 'code_time', 'code_time_numeric', 'has_machine_learning', 'has_machine_learning_numeric']]

Unnamed: 0,time_minutes,time_seconds,code_time,code_time_numeric,has_machine_learning,has_machine_learning_numeric
0,8.50,510,1-2 years,2,I do not know,1
1,7.05,423,I have never written code,0,"We have well established ML methods (i.e., models in production for more than 2 years)",0
3,6.52,391,1-2 years,2,I do not know,1
4,6.53,392,< 1 years,1,No (we do not use ML methods),1
5,7.83,470,20+ years,6,"We have well established ML methods (i.e., models in production for more than 2 years)",0
...,...,...,...,...,...,...
19703,5.77,346,0,0,We use ML methods for generating insights (but do not put working models into production),0
19704,6.67,400,< 1 years,1,,0
19713,7.88,473,1-2 years,2,I do not know,1
19714,153.25,9195,0,0,,0


# Eliminando "apressadinhos"

In [87]:
df = df[df['time_minutes'] > 5]

In [88]:
df['code_time_numeric'] = df['code_time_numeric'].astype(int)
df['code_time_numeric']

0        2
1        0
3        2
4        1
5        6
        ..
19703    0
19704    1
19713    2
19714    0
19716    3
Name: code_time_numeric, Length: 15669, dtype: int64

In [89]:
df.sort_values(by=['time_seconds'])

Unnamed: 0,time_seconds,age,gender,country,education,job_role,company_size,data_science_employees,has_machine_learning,income,money_spent_on_ml,primary_tool,code_time,ml_time,time_minutes,code_time_numeric
207,301,25-29,Male,Belarus,Doctoral degree,Data Scientist,250-999 employees,3-4,We use ML methods for generating insights (but do not put working models into production),"1,000-1,999",$100-$999,"Local development environments (RStudio, JupyterLab, etc.)",5-10 years,1-2 years,5.02,4
17625,301,25-29,Male,Greece,Master’s degree,Not employed,,,,,,"Local development environments (RStudio, JupyterLab, etc.)",1-2 years,< 1 years,5.02,2
6514,301,25-29,Female,Other,Master’s degree,Software Engineer,0-49 employees,1-2,No (we do not use ML methods),"10,000-14,999",$0 (USD),"Local development environments (RStudio, JupyterLab, etc.)",1-2 years,1-2 years,5.02,2
1269,301,22-24,Male,India,Bachelor’s degree,Student,,,,,,"Basic statistical software (Microsoft Excel, Google Sheets, etc.)",I have never written code,,5.02,0
4463,301,50-54,Male,Brazil,Master’s degree,Product/Project Manager,250-999 employees,3-4,No (we do not use ML methods),"7,500-9,999",$0 (USD),"Business intelligence software (Salesforce, Tableau, Spotfire, etc.)",I have never written code,,5.02,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16775,663220,45-49,Female,Ireland,Professional degree,Other,50-249 employees,0,No (we do not use ML methods),"80,000-89,999","$10,000-$99,999",Other,I have never written code,,11053.67,0
19041,673263,18-21,Male,United States of America,No formal education past high school,Student,,,,,,"Basic statistical software (Microsoft Excel, Google Sheets, etc.)",1-2 years,1-2 years,11221.05,2
18141,688947,18-21,Male,China,Bachelor’s degree,Student,,,,,,"Basic statistical software (Microsoft Excel, Google Sheets, etc.)",< 1 years,,11482.45,1
18016,782843,22-24,Male,Other,Master’s degree,Student,,,,,,,0,,13047.38,0


# Exportando os Dados

In [82]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15669 entries, 0 to 19716
Data columns (total 16 columns):
time_seconds              15669 non-null int64
age                       15669 non-null object
gender                    15669 non-null object
country                   15669 non-null object
education                 15578 non-null object
job_role                  15528 non-null object
company_size              11861 non-null object
data_science_employees    11764 non-null object
has_machine_learning      11669 non-null object
income                    11450 non-null object
money_spent_on_ml         11368 non-null object
primary_tool              14367 non-null object
code_time                 15669 non-null object
ml_time                   13478 non-null object
time_minutes              15669 non-null float64
code_time_numeric         15669 non-null object
dtypes: float64(1), int64(1), object(14)
memory usage: 2.0+ MB


In [37]:
df[['age']].to_csv('final_df.csv', index=False)

In [91]:
df

Unnamed: 0,time_seconds,age,gender,country,education,job_role,company_size,data_science_employees,has_machine_learning,income,money_spent_on_ml,primary_tool,code_time,ml_time,time_minutes,code_time_numeric
0,510,22-24,Male,France,Master’s degree,Software Engineer,"1000-9,999 employees",0,I do not know,"30,000-39,999",$0 (USD),"Basic statistical software (Microsoft Excel, Google Sheets, etc.)",1-2 years,1-2 years,8.50,2
1,423,40-44,Male,India,Professional degree,Software Engineer,"> 10,000 employees",20+,"We have well established ML methods (i.e., models in production for more than 2 years)","5,000-7,499","> $100,000 ($USD)","Cloud-based data software & APIs (AWS, GCP, Azure, etc.)",I have never written code,,7.05,0
3,391,40-44,Male,Australia,Master’s degree,Other,"> 10,000 employees",20+,I do not know,"250,000-299,999","$10,000-$99,999","Local development environments (RStudio, JupyterLab, etc.)",1-2 years,2-3 years,6.52,2
4,392,22-24,Male,India,Bachelor’s degree,Other,0-49 employees,0,No (we do not use ML methods),"4,000-4,999",$0 (USD),"Local development environments (RStudio, JupyterLab, etc.)",< 1 years,< 1 years,6.53,1
5,470,50-54,Male,France,Master’s degree,Data Scientist,0-49 employees,3-4,"We have well established ML methods (i.e., models in production for more than 2 years)","60,000-69,999","$10,000-$99,999","Advanced statistical software (SPSS, SAS, etc.)",20+ years,10-15 years,7.83,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19703,346,22-24,Male,India,Bachelor’s degree,Data Scientist,50-249 employees,3-4,We use ML methods for generating insights (but do not put working models into production),"7,500-9,999",$100-$999,,0,,5.77,0
19704,400,18-21,Male,China,Bachelor’s degree,Student,,,,,,"Local development environments (RStudio, JupyterLab, etc.)",< 1 years,< 1 years,6.67,1
19713,473,18-21,Male,India,Bachelor’s degree,Other,250-999 employees,3-4,I do not know,$0-999,$0 (USD),"Local development environments (RStudio, JupyterLab, etc.)",1-2 years,,7.88,2
19714,9195,35-39,Male,India,Master’s degree,Student,,,,,,,0,,153.25,0
