In [13]:
# Constants
# Location of the raw survey CSV and of the pickled, preprocessed frame
DATA_PATH = "../data/raw/survey_results_public.csv"
EXPORT_PATH = "../data/processed/1_preprocessed_df.pkl"

# Column name -> {text answer -> numeric value}.
# Both columns share the same two text answers; each column gets its own dict.
REPLACED_DICT = {
    column: {'Less than 1 year': 0, 'More than 50 years': 51}
    for column in ('YearsCodePro', 'YearsCode')
}

In [14]:
# Load packages
import pandas as pd
import numpy as np
import logging
import pickle

## Functions

In [15]:
def split_answers(data_series, delimiter=";"):
    """
    Split multi-answer strings into lists of single answers.

    Parameters:
    * data_series (pd.Series) : String series with answers
    * delimiter (string)      : Literal text separating the answers
                                (not interpreted as a regular expression).
                                Defaults to ";"

    Returns : (pd.Series) : If no value contains the delimiter, the original
                            series object is returned unchanged.
                            Otherwise a new series is returned in which every
                            string is replaced by the list of its answers and
                            every non-string value (e.g. NaN) by an empty list.
    """
    # Check if multiple answers exist - if none return original.
    # regex=False keeps delimiters such as "." or "|" literal;
    # na=False makes missing values count as "not splittable".
    has_delimiter = data_series.str.contains(delimiter, regex=False, na=False)
    if not has_delimiter.any():
        return data_series

    def to_answer_list(value):
        """ Split one answer string; anything that is not a string becomes [] """
        if isinstance(value, str):
            return value.split(delimiter)
        return []

    # Else, turn each value into a list in a single pass. Building a new
    # series (instead of assigning through a mask) leaves the input untouched
    # and does not rely on index alignment.
    return data_series.apply(to_answer_list)

## Processing

### Preprocess Data

In [16]:
# Keep the frame exactly as loaded; every transformation below is applied to a deep copy
raw_df = pd.read_csv(filepath_or_buffer=DATA_PATH)
df = raw_df.copy(deep=True)

#### 1. Replace values and parse

In [17]:
# For each configured column: swap the text answers for their numeric
# stand-ins, then cast the whole column to float32.
for col in REPLACED_DICT:
    replaced = df[col].replace(REPLACED_DICT[col])
    df[col] = replaced.astype(np.float32)

#### 2. Split multiple answers

In [18]:
# Run split_answers over every object-dtype column; it hands back the column
# unchanged when none of its values contains the delimiter.
object_cols = list(df.select_dtypes(include=['object']).columns)
for col in object_cols:
    column_values = df[col]
    df[col] = split_answers(column_values)

### Visually verify results

In [19]:
# Compare one random row before and after preprocessing.
# sample() yields an index *label*, so the row is looked up with .loc
# (label-based) rather than .iloc (position-based).
i = df.sample(1).index[0]
print(raw_df.loc[i, 'LanguageHaveWorkedWith'])
print(df.loc[i, 'LanguageHaveWorkedWith'])


Python
['Python']


In [20]:
# Compare one random row before and after preprocessing.
# sample() yields an index *label*, so the row is looked up with .loc
# (label-based) rather than .iloc (position-based).
i = df.sample(1).index[0]
print(raw_df.loc[i, 'DevType'])
print(df.loc[i, 'DevType'])

nan
[]


In [21]:
# Compare one random row before and after preprocessing.
# sample() yields an index *label*, so the row is looked up with .loc
# (label-based) rather than .iloc (position-based).
i = df.sample(1).index[0]
print(raw_df.loc[i, 'YearsCodePro'])
print(df.loc[i, 'YearsCodePro'])

17
17.0


In [22]:
# Display one randomly chosen row as a Series so every column can be eyeballed
df.sample(n=1).iloc[0]

ResponseId                                                                  18566
MainBranch                                 I am a student who is learning to code
Employment                                                     Student, full-time
Country                                                                     China
US_State                                                                      NaN
UK_Country                                                                    NaN
EdLevel                         Some college/university study without earning ...
Age1stCode                                                          18 - 24 years
LearnCode                       [School, Online Courses or Certification, Book...
YearsCode                                                                     0.0
YearsCodePro                                                                  NaN
DevType                                                                        []
OrgSize         

### Export Data

In [24]:
# Persist the preprocessed frame as a pickle at EXPORT_PATH
df.to_pickle(path=EXPORT_PATH)