In [1]:
#Constants

DATA_PATH = '../data/raw/survey_results_public.csv'
EXPORT_PATH = '../data/processed/1_preprocessed_df.pkl'

In [2]:
#Load Libraries
import pandas as pd
import numpy as np
import pickle

In [10]:
pd.options.display.max_rows = 200

In [3]:
#Replaced Values

REPLACED_VALUES = {
    'More than 50 years' : 51,
    'Less than 1 year' : 0
}

AGE_BUCKETS_AVERAGE = {
    'Under 18 years old': 15,
    '18-24 years old': 20,
    '25-34 years old': 30,
    '35-44 years old': 40,
    '45-54 years old': 50,
    '55-64 years old': 60,
    '65 years or older': 65,
    'Prefer not to say': np.nan
}

In [4]:
def split_answer(data_series,delimeter=';'):
    
    '''Split the data series by the delimeter'''
    
    def is_splitable(pd_series,delimeter):
        #Check if the data series is splitable by the delimeter
        return pd_series.str.contains(delimeter)
    
    def split_data(pd_series,delimeter):
        #Split the data series by the delimeter
        return pd_series.str.split(delimeter,expand=False)
    
    
    split_case = is_splitable(data_series,delimeter)
    
    
    if  not split_case.any():
        return data_series
    
    
    
    modefied_series = split_data(data_series,delimeter)
    
    handel_null = modefied_series.isnull()
    modefied_series.loc[handel_null] = modefied_series.loc[handel_null].apply(lambda x: [])
    
    return modefied_series

# Preprocessing

In [6]:
raw_df = pd.read_csv(DATA_PATH)
df = raw_df.copy(deep=True)

In [11]:
print("YearsCode unique values:")
print(sorted(df['YearsCode'].value_counts().index.tolist()))
print("\nYearsCodePro unique values:") 
print(sorted(df['YearsCodePro'].value_counts().index.tolist()))

YearsCode unique values:
[0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0, 47.0, 48.0, 49.0, 50.0, 51.0]

YearsCodePro unique values:
[0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0, 47.0, 48.0, 49.0, 50.0, 51.0]


In [12]:
df['YearsCode'] = df['YearsCode'].replace(REPLACED_VALUES).astype(np.float32)
df['YearsCodePro'] = df['YearsCodePro'].replace(REPLACED_VALUES).astype(np.float32)

In [13]:
print("YearsCode unique values after transforming:")
print(sorted(df['YearsCode'].value_counts().index.tolist()))
print("\nYearsCodePro unique values:") 
print(sorted(df['YearsCodePro'].value_counts().index.tolist()))

YearsCode unique values after transforming:
[0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0, 47.0, 48.0, 49.0, 50.0, 51.0]

YearsCodePro unique values:
[0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0, 47.0, 48.0, 49.0, 50.0, 51.0]


## Splitting

In [14]:
df.dtypes

ResponseId                          int64
MainBranch                         object
Age                                object
Employment                         object
RemoteWork                         object
Check                              object
CodingActivities                   object
EdLevel                            object
LearnCode                          object
LearnCodeOnline                    object
TechDoc                            object
YearsCode                         float32
YearsCodePro                      float32
DevType                            object
OrgSize                            object
PurchaseInfluence                  object
BuyNewTool                         object
BuildvsBuy                         object
TechEndorse                        object
Country                            object
Currency                           object
CompTotal                         float64
LanguageHaveWorkedWith             object
LanguageWantToWorkWith            

In [17]:
object_columns = df.select_dtypes(include=['object']).columns
object_columns.tolist()

['MainBranch',
 'Age',
 'Employment',
 'RemoteWork',
 'Check',
 'CodingActivities',
 'EdLevel',
 'LearnCode',
 'LearnCodeOnline',
 'TechDoc',
 'DevType',
 'OrgSize',
 'PurchaseInfluence',
 'BuyNewTool',
 'BuildvsBuy',
 'TechEndorse',
 'Country',
 'Currency',
 'LanguageHaveWorkedWith',
 'LanguageWantToWorkWith',
 'LanguageAdmired',
 'DatabaseHaveWorkedWith',
 'DatabaseWantToWorkWith',
 'DatabaseAdmired',
 'PlatformHaveWorkedWith',
 'PlatformWantToWorkWith',
 'PlatformAdmired',
 'WebframeHaveWorkedWith',
 'WebframeWantToWorkWith',
 'WebframeAdmired',
 'EmbeddedHaveWorkedWith',
 'EmbeddedWantToWorkWith',
 'EmbeddedAdmired',
 'MiscTechHaveWorkedWith',
 'MiscTechWantToWorkWith',
 'MiscTechAdmired',
 'ToolsTechHaveWorkedWith',
 'ToolsTechWantToWorkWith',
 'ToolsTechAdmired',
 'NEWCollabToolsHaveWorkedWith',
 'NEWCollabToolsWantToWorkWith',
 'NEWCollabToolsAdmired',
 'OpSysPersonal use',
 'OpSysProfessional use',
 'OfficeStackAsyncHaveWorkedWith',
 'OfficeStackAsyncWantToWorkWith',
 'OfficeSt

In [18]:
for column in object_columns:
    df[column] = split_answer(df[column])

In [19]:
df['Age'] = df['Age'].replace(AGE_BUCKETS_AVERAGE).astype(np.float32)  

  df['Age'] = df['Age'].replace(AGE_BUCKETS_AVERAGE).astype(np.float32)


In [20]:
df['Age'].unique()

array([15., 40., 50., 20., 30., 60., nan, 65.], dtype=float32)

# Verification of Prep.

In [24]:
i = np.random.randint(0,df.shape[0])

In [25]:
print(raw_df['LanguageHaveWorkedWith'].iloc[i])
print(df['LanguageHaveWorkedWith'].iloc[i])

C#;HTML/CSS;Java;JavaScript;Python;SQL;TypeScript
['C#', 'HTML/CSS', 'Java', 'JavaScript', 'Python', 'SQL', 'TypeScript']


In [26]:
print(raw_df['DevType'].iloc[i])
print(df['DevType'].iloc[i])

Senior Executive (C-Suite, VP, etc.)
Senior Executive (C-Suite, VP, etc.)


In [27]:
df['DevType'].sample(10)

56082                                          Student
3345                               Developer, back-end
2499                           Other (please specify):
59527                                              NaN
23365                            Developer, full-stack
64694                                              NaN
20562    Data scientist or machine learning specialist
38189                                Developer, mobile
57398                              Developer, back-end
38527                            Developer, full-stack
Name: DevType, dtype: object

In [28]:
print(raw_df['YearsCodePro'].iloc[i])
print(df['YearsCodePro'].iloc[i])

17
17.0


In [29]:
# Create age bins
age_bins = [0, 20, 30, 40, 50, 60, 100]
age_labels = ['<20', '20-30', '30-40', '40-50', '50-60', '>60']

# Create age groups
df['AgeGroup'] = pd.cut(df['Age'], bins=age_bins, labels=age_labels)

  df['AgeGroup'] = pd.cut(df['Age'], bins=age_bins, labels=age_labels)


In [30]:
df.head()

Unnamed: 0,ResponseId,MainBranch,Age,Employment,RemoteWork,Check,CodingActivities,EdLevel,LearnCode,LearnCodeOnline,...,JobSatPoints_7,JobSatPoints_8,JobSatPoints_9,JobSatPoints_10,JobSatPoints_11,SurveyLength,SurveyEase,ConvertedCompYearly,JobSat,AgeGroup
0,1,I am a developer by profession,15.0,"[Employed, full-time]",Remote,Apples,[Hobby],Primary/elementary school,[Books / Physical media],[],...,,,,,,,,,,<20
1,2,I am a developer by profession,40.0,"[Employed, full-time]",Remote,Apples,"[Hobby, Contribute to open-source projects, Ot...","Bachelor’s degree (B.A., B.S., B.Eng., etc.)","[Books / Physical media, Colleague, On the job...","[Technical documentation, Blogs, Books, Writte...",...,0.0,0.0,0.0,0.0,0.0,,,,,30-40
2,3,I am a developer by profession,50.0,"[Employed, full-time]",Remote,Apples,"[Hobby, Contribute to open-source projects, Ot...","Master’s degree (M.A., M.S., M.Eng., MBA, etc.)","[Books / Physical media, Colleague, On the job...","[Technical documentation, Blogs, Books, Writte...",...,,,,,,Appropriate in length,Easy,,,40-50
3,4,I am learning to code,20.0,"[Student, full-time]",,Apples,[],Some college/university study without earning ...,"[Other online resources (e.g., videos, blogs, ...","[Stack Overflow, How-to videos, Interactive tu...",...,,,,,,Too long,Easy,,,<20
4,5,I am a developer by profession,20.0,"[Student, full-time]",,Apples,[],"Secondary school (e.g. American high school, G...","[Other online resources (e.g., videos, blogs, ...","[Technical documentation, Blogs, Written Tutor...",...,,,,,,Too short,Easy,,,<20


In [31]:
df.to_pickle(EXPORT_PATH)