In [2]:
# Input: raw CSV read by this notebook.
DATA_PATH = '../data/raw/raw_data.csv'
# Output: pickle file the preprocessed frame is written to.
EXPORT_PATH = '../data/processed/preprocessed_data.pkl'

In [3]:
import numpy as np
import pandas as pd
# Raise the row display limit so long outputs are shown without truncation.
pd.set_option('display.max_rows', 10000)
from sklearn.preprocessing import MultiLabelBinarizer
import pickle

## Functions

In [4]:
def split_answers(data_series, delimiters=(';', ',')):
    """
    Split multi-answer strings of a series into lists of answers.

    The delimiters are tried in the given order and only the first one that occurs
    (as a literal substring, not as a regular expression) anywhere in the series is
    used; that single delimiter is then applied to every row.

    parameters:
        data_series(pd.Series): series of answer strings; may contain missing values
        delimiters(iterable of str): candidate delimiters in priority order - default = (';', ',')
    Returns:
        (pd.Series): if one of the delimiters occurs in the series, a series with the same
            index and name holding one list per row (string -> list of its parts,
            missing or non-string value -> empty list); otherwise the original series
            object, returned unchanged
    """

    def find_delimiter(series):
        for candidate in delimiters:
            # An empty delimiter cannot be used to split a string.
            if not candidate:
                continue
            # regex=False: characters such as '.' or '|' must be matched literally.
            # na=False: missing values simply count as "does not contain".
            if series.str.contains(candidate, regex=False, na=False).any():
                return candidate
        return None

    delimiter = find_delimiter(data_series)
    if delimiter is None:
        return data_series

    def split_value(value):
        # Missing and non-string values become an empty list so that every row
        # of the result is a list.
        if isinstance(value, str):
            return value.split(delimiter)
        return []

    # Element-wise pass: does not depend on index alignment, so duplicate index
    # labels are handled as well.
    return data_series.map(split_value)

In [5]:
# Textual answers of the "years" columns and the numbers that stand in for them.
_YEARS_TEXT_TO_NUMBER = {'Less than 1 year': 0, 'More than 50 years': 51}

# column name -> {textual answer -> numeric replacement}; each column gets its own dict.
REPLACE_DICT = {
    column: dict(_YEARS_TEXT_TO_NUMBER)
    for column in ('YearsCodePro', 'YearsCode')
}

## Preprocessing

In [6]:
# Keep the frame exactly as read from disk in raw_df; all later steps work on the copy df.
raw_df = pd.read_csv(filepath_or_buffer=DATA_PATH)
df = raw_df.copy(deep=True)

#### replacing values

In [7]:
# For every column listed in REPLACE_DICT: swap the listed answers for their
# replacements, then cast the whole column to float32.
for col, replacements in REPLACE_DICT.items():
    replaced = df[col].replace(replacements)
    df[col] = replaced.astype(np.float32)

#### splitting multiple answers

In [8]:
# Only object-dtype columns are passed to split_answers.
obj_cols = list(df.select_dtypes(include='object').columns)
for col in obj_cols:
    df[col] = split_answers(data_series=df[col])

## sanity check

In [9]:
# Show one randomly drawn row as a Series (one entry per column).
df.sample(n=1).iloc[0]

ResponseId                                                                         11974
Q120                                                                             I agree
MainBranch                                              [I am a developer by profession]
Age                                                                      25-34 years old
Employment                                                         [Employed, full-time]
RemoteWork                                       [Hybrid (some remote,  some in-person)]
CodingActivities                                          [I don’t code outside of work]
EdLevel                                [Master’s degree (M.A.,  M.S.,  M.Eng.,  MBA, ...
LearnCode                                   [Colleague, Online Courses or Certification]
LearnCodeOnline                                                                       []
LearnCodeCoursesCert                                                             [Other]
YearsCode            

## Exporting data

In [10]:
# Write the preprocessed frame to EXPORT_PATH as a pickle file.
df.to_pickle(path=EXPORT_PATH)