In [1]:
from pathlib import Path

import numpy as np
import pandas as pd

In [2]:
# Input: raw survey CSV. Output: pickled, preprocessed DataFrame.
DATA_PATH = '../data/raw/survey_results_public.csv'
EXPORT_PATH = '../data/processed/1_preprocessed_df.pkl'

# Per-column mapping of textual answers to numbers, applied before the
# columns are cast to a numeric dtype. Each column gets its own dict object.
REPLACE_DICT = {
    column: {'Less than 1 year': 0, 'More than 50 years': 51}
    for column in ('YearsCode', 'YearsCodePro')
}

## Functions

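Normalize each value before trying to split it:
- if the value is already a list => keep it as is
- if the value is NaN => convert it to an empty list
- if the value is a string => keep it as is (a string)
- if the value is anything else => convert it to a string

Strings are then split on the delimiter; lists are left unchanged.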


In [3]:
def split_answers(data_series, delimiter=';'):
    """Turn every entry of a Series into a list of answers.

    Parameters
    ----------
    data_series : pd.Series
        Series whose entries are delimiter-joined strings, missing values,
        lists, or other scalars.
    delimiter : str, default ';'
        Separator used to split string entries.

    Returns
    -------
    pd.Series
        Series with the same index where each entry is a list:
        lists are passed through unchanged, missing values become ``[]``,
        and everything else is converted to ``str`` (if needed) and split
        on ``delimiter``.
    """
    def _as_answer_list(value):
        # Lists are checked first: they are returned as-is and never reach
        # the scalar-only pd.isna check below.
        if isinstance(value, list):
            return value
        if pd.isna(value):
            return []
        text = value if isinstance(value, str) else str(value)
        return text.split(delimiter)

    return data_series.apply(_as_answer_list)

## Processing

### Preprocess Data

In [4]:
# Load the raw survey export; raw_df is kept untouched so that processed
# values can later be compared against the original ones.
raw_df = pd.read_csv(filepath_or_buffer=DATA_PATH)

# All preprocessing is applied to an independent (deep) copy.
df = raw_df.copy(deep=True)

#### 1. Replace values and parse

In [5]:
# Show each configured column followed by its replacement mapping,
# one item per line.
for col, replacement in REPLACE_DICT.items():
    print(col, replacement, sep='\n')

YearsCode
{'Less than 1 year': 0, 'More than 50 years': 51}
YearsCodePro
{'Less than 1 year': 0, 'More than 50 years': 51}


In [6]:
# For every configured column: swap the textual answers for their numeric
# stand-ins, then cast the whole column to float32.
for col, replacement in REPLACE_DICT.items():
    substituted = df[col].replace(to_replace=replacement)
    df[col] = substituted.astype(np.float32)

#### 2. Split multiple answers

In [7]:
# Every object-dtype column is run through split_answers, so each of its
# cells becomes a list.
object_cols = list(df.select_dtypes(include=["object"]).columns)
for col in object_cols:
    df[col] = df[col].pipe(split_answers)

## Visually verify results

In [8]:
# Pick one random row and compare the raw value with the processed one.
# `i` is an index *label* (taken from .index), so it must be looked up with
# the label-based .loc; the positional .iloc only matched by coincidence
# while the frame has a default RangeIndex.
i = df.sample(1).index[0]
print(raw_df.loc[i, 'DevType'])
print(df.loc[i, 'DevType'])

Developer, back-end
['Developer, back-end']


In [13]:
# Pick one random row and compare the raw value with the processed one.
# `i` is an index *label* (taken from .index), so it must be looked up with
# the label-based .loc; the positional .iloc only matched by coincidence
# while the frame has a default RangeIndex.
i = df.sample(1).index[0]
print(raw_df.loc[i, 'LanguageHaveWorkedWith'])
print(df.loc[i, 'LanguageHaveWorkedWith'])

nan
[]


In [25]:
# Pick one random row and compare the raw value with the processed one.
# `i` is an index *label* (taken from .index), so it must be looked up with
# the label-based .loc; the positional .iloc only matched by coincidence
# while the frame has a default RangeIndex.
i = df.sample(1).index[0]
print(raw_df.loc[i, 'LanguageHaveWorkedWith'])
print(df.loc[i, 'LanguageHaveWorkedWith'])

Bash/Shell (all shells);C#;Crystal;Dart;Go;Groovy;HTML/CSS;Java;JavaScript;Kotlin;Lua;Objective-C;PHP;PowerShell;Python;Ruby;Rust;Solidity;Swift;TypeScript
['Bash/Shell (all shells)', 'C#', 'Crystal', 'Dart', 'Go', 'Groovy', 'HTML/CSS', 'Java', 'JavaScript', 'Kotlin', 'Lua', 'Objective-C', 'PHP', 'PowerShell', 'Python', 'Ruby', 'Rust', 'Solidity', 'Swift', 'TypeScript']


In [22]:
# Pick one random row and compare the raw value with the processed one.
# `i` is an index *label* (taken from .index), so it must be looked up with
# the label-based .loc; the positional .iloc only matched by coincidence
# while the frame has a default RangeIndex.
i = df.sample(1).index[0]
print(raw_df.loc[i, 'YearsCode'])
print(df.loc[i, 'YearsCode'])

30
30.0


## Export Data

In [11]:
# Make sure the output directory exists so the export also works on a fresh
# checkout; to_pickle does not create missing parent directories.
Path(EXPORT_PATH).parent.mkdir(parents=True, exist_ok=True)
df.to_pickle(EXPORT_PATH)

####