# IT Educational Institute

In [14]:
# Constants: input/output locations of the survey data
data_path = "/Users/ahmedahmed/IT_Educational_Institute/data/raw/survey_results_public.csv"
export_path = "/Users/ahmedahmed/IT_Educational_Institute/data/processed/1_preprocessed_df.pkl"

# Per-column replacements that turn the textual answers of
# "YearsCode" and "YearsCodePro" into numbers (both columns share the same mapping)
replace_dict = {
    column: {'Less than 1 year': 0, 'More than 50 years': 51}
    for column in ('YearsCode', 'YearsCodePro')
}

In [15]:
# Importing libraries
import numpy as np
import pandas as pd
import logging
import pickle

## Functions

In [16]:
# Creating a function to split values based on a specific delimiter
def split_answers(data_series, delimiter=';'):
    """
    Split multiple answers stored in a single string
    into a list of strings, each representing
    a single answer.

    The delimiter is always matched literally (never as a regular
    expression), so delimiters such as '.', '|', '+' or '||' are safe.

    If no value in the series contains the delimiter, the input series
    is returned unchanged (same object, missing values left as they are).
    Otherwise every string is split into a list and every value that
    is null after splitting is replaced with an empty list.

    Parameters:
        - data_series (pd.Series): String series with answers
        - delimiter (string): delimiter to split on, default is ;

    Returns:
        pd.Series
    """

    # Sub functions
    def is_splittable(pd_series, delimiter):
        """ Checks which values contain the delimiter - Returns: Boolean pd.Series """
        # regex=False: treat the delimiter as plain text, not as a pattern
        # na=False: missing values count as "not splittable", giving a pure boolean mask
        return pd_series.str.contains(delimiter, regex=False, na=False)

    def split_answer(pd_series, delimiter):
        """ Splits every string of the series on the literal delimiter """
        # regex=False keeps multi-character delimiters literal as well
        # (the `regex` keyword of str.split requires pandas >= 1.4)
        return pd_series.str.split(delimiter, regex=False)

    # Check if multiple answers exist - if none: return original
    splittable_values = is_splittable(data_series, delimiter)
    if not splittable_values.any():
        return data_series

    # Else, split each value to a list
    modified_series = split_answer(data_series, delimiter)

    # Replace NAs with empty lists (a fresh list per row, never a shared one)
    mask_null = modified_series.isnull()
    modified_series.loc[mask_null] = modified_series.loc[mask_null].apply(lambda _: [])

    return modified_series

## Processing
### Preprocess Data

In [17]:
# Read the raw survey CSV from disk
raw_df = pd.read_csv(filepath_or_buffer=data_path)

# Work on an independent (deep) copy so raw_df stays untouched for later comparison
df = raw_df.copy(deep=True)

### 1. Replace Values and parse

In [18]:
# For each configured column ("YearsCode", "YearsCodePro"): swap the textual
# answers for their numeric stand-ins, then parse the whole column to float
for column_name, replacements in replace_dict.items():
    replaced_column = df[column_name].replace(replacements)
    df[column_name] = replaced_column.astype(float)

### 2. Split Multiple Answers

In [19]:
# Collect the names of all columns stored with the object data type
object_cols = df.select_dtypes(include='object').columns.tolist()

# Run split_answers over each of those columns and store the result back in place
for column_name in object_cols:
    answers = df[column_name]
    df[column_name] = split_answers(answers)

## Visually Verify Results

In [20]:
# Comparing one randomly sampled row of raw_df and the processed df in the "LanguageHaveWorkedWith" column
# sample(1).index[0] yields an index *label*, so the row is looked up with
# label-based .loc; positional .iloc would only agree on a default RangeIndex
row_label = df.sample(1).index[0]
print("The raw data format:")
print(raw_df.loc[row_label, 'LanguageHaveWorkedWith'])
print()
print('-'*100)
print()
print('Processed data format:')
print(df.loc[row_label, 'LanguageHaveWorkedWith'])

The raw data format:
C#;Python;SQL

----------------------------------------------------------------------------------------------------

Processed data format:
['C#', 'Python', 'SQL']


In [21]:
# Comparing one randomly sampled row of raw_df and the processed df in the "DevType" column
# sample(1).index[0] yields an index *label*, so the row is looked up with
# label-based .loc; positional .iloc would only agree on a default RangeIndex
row_label = df.sample(1).index[0]
print("The raw data format:")
print(raw_df.loc[row_label, 'DevType'])
print()
print('-'*100)
print()
print('Processed data format:')
print(df.loc[row_label, 'DevType'])

The raw data format:
Developer, front-end;Developer, full-stack;Developer, back-end;Developer, desktop or enterprise applications;Developer, mobile;Project manager;Product manager

----------------------------------------------------------------------------------------------------

Processed data format:
['Developer, front-end', 'Developer, full-stack', 'Developer, back-end', 'Developer, desktop or enterprise applications', 'Developer, mobile', 'Project manager', 'Product manager']


In [22]:
# Comparing one randomly sampled row of raw_df and the processed df in the "YearsCodePro" column
# sample(1).index[0] yields an index *label*, so the row is looked up with
# label-based .loc; positional .iloc would only agree on a default RangeIndex
row_label = df.sample(1).index[0]
print("The raw data format:")
print(raw_df.loc[row_label, 'YearsCodePro'])
print()
print('-'*100)
print()
print('Processed data format:')
print(df.loc[row_label, 'YearsCodePro'])

The raw data format:
14

----------------------------------------------------------------------------------------------------

Processed data format:
14.0


## Export Data

In [23]:
# Persist the preprocessed DataFrame to export_path in pickle format
df.to_pickle(path=export_path)