In [1]:
import pandas as pd
import numpy as np

In [2]:
# Location of the raw survey CSV (per its file name, the 2020 JetBrains Python
# survey) in the mattharrison/datasets repository on GitHub.
url = 'https://github.com/mattharrison/datasets/raw/master/data/2020-jetbrains-python-survey.csv'

In [3]:
# Load the survey: parse with the pyarrow engine and request pyarrow-backed dtypes.
jb = pd.read_csv(
    url,
    engine='pyarrow',
    dtype_backend='pyarrow',
)

In [5]:
# Preview ten random rows. The fixed seed makes the preview reproducible:
# without it every Restart & Run All displays a different set of rows.
jb.sample(10, random_state=42)

Unnamed: 0,is.python.main,other.lang.None,other.lang.Java,other.lang.JavaScript,other.lang.C/C++,other.lang.PHP,other.lang.C#,other.lang.Ruby,other.lang.Bash / Shell,other.lang.Objective-C,...,job.role.Technical support,job.role.Data analyst,job.role.Business analyst,job.role.Team lead,job.role.Product manager,job.role.CIO / CEO / CTO,job.role.Systems analyst,job.role.Other,age,country.live
28574,Yes,,,,C/C++,,,,,,...,,,,,,,,,30–39,Norway
49718,,,,,,,,,,,...,,,,,,,,,,
12042,Yes,,Java,JavaScript,,,,,Bash / Shell,,...,,,,,,,,,30–39,Israel
10769,Yes,,,JavaScript,,,,,Bash / Shell,,...,,,,,,,,Other,30–39,United Kingdom
11193,Yes,,Java,JavaScript,,PHP,,,Bash / Shell,,...,,,,,,,,,21–29,Tunisia
46795,Yes,,,,C/C++,,,,Bash / Shell,,...,,,,,,,,,,
5889,,,,,,,,,,,...,,,,,,,,,,
33350,Yes,,,,,,,,Bash / Shell,,...,,,,,,,,,18–20,United States
1939,Yes,,,,,,,,,,...,,,,,,,,,,
24298,,,,,,,,,,,...,,,,,,,,,,


## Cleaning the data

In [6]:
import collections

# Group the column names by prefix. The prefix is the first two dot-separated
# parts when the name contains at least two periods (i.e. three or more parts),
# otherwise just the first part.
counter = collections.defaultdict(list)
for name in sorted(jb.columns):
    pieces = name.split('.')
    prefix_len = 2 if len(pieces) >= 3 else 1
    counter['.'.join(pieces[:prefix_len])].append(name)

# Keep only the columns that are alone in their prefix group; groups with
# several members are skipped.
uniq_cols = [group[0] for group in counter.values() if len(group) == 1]

In [7]:
# Display the columns that are the only member of their prefix group.
uniq_cols

['age',
 'are.you.datascientist',
 'company.size',
 'country.live',
 'employment.status',
 'first.learn.about.main.ide',
 'how.often.use.main.ide',
 'ide.main',
 'is.python.main',
 'job.team',
 'main.purposes',
 'missing.features.main.ide',
 'nps.main.ide',
 'python.years',
 'python2.version.most',
 'python3.version.most',
 'several.projects',
 'team.size',
 'use.python.most',
 'years.of.coding']

In [12]:
# Note: the code is written as one chain because none of the operations used
# so far modify the original DataFrame; each returns a new DataFrame (or a new
# Series) that the next step can build on.
(
    jb[uniq_cols]
    .rename(columns=lambda name: name.replace('.', '_'))
    ['age']
    .str.slice(start=0, stop=2)  # keep the first two characters, e.g. '30–39' -> '30'
    .replace(to_replace='', value=np.nan)
    .astype('int8[pyarrow]')
)

# Chaining like this also helps readability: the code reads like a recipe, one step per line.

0          30
1          21
2          30
3        <NA>
4          21
         ... 
54457      21
54458    <NA>
54459      21
54460      30
54461      21
Name: age, Length: 54462, dtype: int8[pyarrow]

In [16]:
# Convert the are_you_datascientist column to a boolean dtype.
# Not every column conversion or cleaning step is shown in this notebook, but
# this cell can be used as a template for the remaining ones.
(
    jb[uniq_cols]
    .rename(columns=lambda name: name.replace('.', '_'))
    .assign(
        age=lambda frame: (
            frame['age']
            .str.slice(start=0, stop=2)
            .astype('int8[pyarrow]')
        ),
        are_you_datascientist=lambda frame: (
            frame['are_you_datascientist']
            # Map the answers onto '1' / '0' strings before casting to a boolean dtype.
            .replace({'Yes':'1','No':'0','':'0','Other':'0'})
            .astype('bool[pyarrow]')
        ),
    )
    ['are_you_datascientist']
)

0         <NA>
1         True
2        False
3         <NA>
4         <NA>
         ...  
54457    False
54458    False
54459     <NA>
54460     True
54461     <NA>
Name: are_you_datascientist, Length: 54462, dtype: bool[pyarrow]