# Wrangling

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

warnings.filterwarnings('ignore')

In [2]:
df_main = pd.read_csv('developer_dataset_clean.csv', index_col='Unnamed: 0')
df_main.head()

Unnamed: 0,RespondentID,Year,Country,Employment,UndergradMajor,DevType,LanguageWorkedWith,LanguageDesireNextYear,DatabaseWorkedWith,DatabaseDesireNextYear,PlatformWorkedWith,PlatformDesireNextYear,YearsCodePro,JobSeek,ConvertedComp,WorkWeekHrs
1,1,2019,United States,Employed full-time,CompSci,"Developer, full-stack",C,C;C#;JavaScript;SQL,MySQL;SQLite,MySQL;SQLite,Linux;Windows,Linux;Windows,1.0,I am not interested in new job opportunities,60000.0,80.0
2,1,2020,United States,Employed full-time,CompSci,,HTML/CSS,Java;Ruby;Scala,MySQL;PostgreSQL;Redis;SQLite,MySQL;PostgreSQL,AWS;Docker;Linux;MacOS;Windows,Docker;Google Cloud Platform;Heroku;Linux;Windows,8.0,,,
3,2,2018,United States,Employed full-time,CompSci,"Developer, full-stack",C#,C#;JavaScript;SQL;TypeScript;HTML;CSS;Bash/Shell,"SQL Server;Microsoft Azure (Tables, CosmosDB, ...","SQL Server;Microsoft Azure (Tables, CosmosDB, ...",Azure,Azure,4.0,,50000.0,
4,2,2019,United States,Employed full-time,CompSci,Data or business analyst,Bash/Shell/PowerShell,Bash/Shell/PowerShell;HTML/CSS;JavaScript;Rust...,Couchbase;DynamoDB;Firebase;MySQL,Firebase;MySQL;Redis,Android;AWS;Docker;IBM Cloud or Watson;iOS;Lin...,Android;AWS;Docker;IBM Cloud or Watson;Linux;S...,8.0,I am not interested in new job opportunities,90000.0,40.0
5,2,2020,United States,Employed full-time,CompSci,"Developer, back-end",Python,JavaScript,MariaDB;MySQL;Redis,MongoDB,Docker,iOS;Slack Apps and Integrations,13.0,"I’m not actively looking, but I am open to new...",120000.0,40.0


In [3]:
# # Explode
# df_main['LanguageWorkedWith'] = df_main['LanguageWorkedWith'].str.split(';')
# df_expanded = df_main.explode('LanguageWorkedWith')
# df_ohe = pd.get_dummies(df_expanded, columns=['LanguageWorkedWith'], prefix='', prefix_sep='')
# df_grouped = df_ohe.groupby(df_ohe.index).sum().reset_index(inplace=False)

In [4]:
df_main.columns

Index(['RespondentID', 'Year', 'Country', 'Employment', 'UndergradMajor',
       'DevType', 'LanguageWorkedWith', 'LanguageDesireNextYear',
       'DatabaseWorkedWith', 'DatabaseDesireNextYear', 'PlatformWorkedWith',
       'PlatformDesireNextYear', 'YearsCodePro', 'JobSeek', 'ConvertedComp',
       'WorkWeekHrs'],
      dtype='object')

In [5]:
# Disassemble into multiple frames
df_main_respondents = df_main.loc[:, ['RespondentID', 'Year', 'Country', 'Employment', 'UndergradMajor', 'DevType', 'JobSeek', 'YearsCodePro', 'ConvertedComp', 'WorkWeekHrs']]
df_resp_languages = df_main.loc[:, ['RespondentID', 'Year', 'LanguageWorkedWith', 'LanguageDesireNextYear']]
df_resp_databases = df_main.loc[:, ['RespondentID', 'Year','DatabaseWorkedWith', 'DatabaseDesireNextYear']]
df_resp_platforms = df_main.loc[:, ['RespondentID', 'Year','PlatformWorkedWith', 'PlatformDesireNextYear']]

In [6]:
# Dummy tables for languages
# df_resp_languages['LanguageWorkedWith'] = df_resp_languages['LanguageWorkedWith'].str.split(';').explode('LanguageWorkedWith')
# df_resp_languages_ohe = pd.get_dummies(df_resp_languages, columns=['LanguageWorkedWith'], prefix='Workedwith_', prefix_sep='')
# df_resp_languages_grouped = df_resp_languages_ohe.groupby(df_resp_languages_ohe.index).sum().reset_index(inplace=False).reset_index()

def get_dumb(df, col_name, prefix_str):
    df = df.loc[:,['RespondentID', 'Year', col_name]]
    df[col_name] = df[col_name].str.split(';').explode(col_name)
    df_ohe = pd.get_dummies(df, columns=[col_name], prefix=prefix_str, prefix_sep='')
    df_grouped = df_ohe.groupby(df_ohe.index).sum()
    return df_grouped

df_resp_languages_dummy1 = get_dumb(df_resp_languages, 'LanguageWorkedWith', 'LangWorkedWith_')
df_resp_languages_dummy2 = get_dumb(df_resp_languages, 'LanguageDesireNextYear', 'LangDesire_')
df_resp_databases1 = get_dumb(df_resp_databases, 'DatabaseWorkedWith', 'DBWorkedWith_')
df_resp_databases2 = get_dumb(df_resp_databases, 'DatabaseDesireNextYear', 'DBDesire_')
df_resp_platforms1 = get_dumb(df_resp_platforms, 'PlatformWorkedWith', 'PltfrmWorkedWith_')
df_resp_platforms2 = get_dumb(df_resp_platforms, 'PlatformDesireNextYear', 'PltfrmDesire_')


In [7]:
# Remerge Dummy Tables
df_resp_languages = pd.merge(df_resp_languages_dummy1, df_resp_languages_dummy2, how='inner', on=['RespondentID', 'Year'])
df_resp_databases = pd.merge(df_resp_databases1, df_resp_databases2, how='inner', on=['RespondentID', 'Year'])
df_resp_platforms = pd.merge(df_resp_platforms1, df_resp_platforms2, how='inner', on=['RespondentID', 'Year'])

In [8]:
df_main_respondents.head()

Unnamed: 0,RespondentID,Year,Country,Employment,UndergradMajor,DevType,JobSeek,YearsCodePro,ConvertedComp,WorkWeekHrs
1,1,2019,United States,Employed full-time,CompSci,"Developer, full-stack",I am not interested in new job opportunities,1.0,60000.0,80.0
2,1,2020,United States,Employed full-time,CompSci,,,8.0,,
3,2,2018,United States,Employed full-time,CompSci,"Developer, full-stack",,4.0,50000.0,
4,2,2019,United States,Employed full-time,CompSci,Data or business analyst,I am not interested in new job opportunities,8.0,90000.0,40.0
5,2,2020,United States,Employed full-time,CompSci,"Developer, back-end","I’m not actively looking, but I am open to new...",13.0,120000.0,40.0


In [9]:
df_resp_languages.groupby(['RespondentID', 'Year']).agg('sum')

Unnamed: 0_level_0,Unnamed: 1_level_0,LangWorkedWith_Assembly,LangWorkedWith_Bash/Shell,LangWorkedWith_Bash/Shell/PowerShell,LangWorkedWith_C,LangWorkedWith_C#,LangWorkedWith_C++,LangWorkedWith_Clojure,LangWorkedWith_Cobol,LangWorkedWith_CoffeeScript,LangWorkedWith_Dart,...,LangDesire_Ruby,LangDesire_Rust,LangDesire_SQL,LangDesire_Scala,LangDesire_Swift,LangDesire_TypeScript,LangDesire_VB.NET,LangDesire_VBA,LangDesire_Visual Basic 6,LangDesire_WebAssembly
RespondentID,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1,2019,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2020,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2018,0,0,1,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2,2019,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2020,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42831,2018,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
42833,2018,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
42835,2018,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
42836,2018,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
df_resp_databases.groupby(['RespondentID', 'Year']).agg('sum')

Unnamed: 0_level_0,Unnamed: 1_level_0,DBWorkedWith_Amazon DynamoDB,DBWorkedWith_Amazon RDS/Aurora,DBWorkedWith_Amazon Redshift,DBWorkedWith_Apache HBase,DBWorkedWith_Apache Hive,DBWorkedWith_Cassandra,DBWorkedWith_Couchbase,DBWorkedWith_DynamoDB,DBWorkedWith_Elasticsearch,DBWorkedWith_Firebase,...,DBDesire_Microsoft SQL Server,DBDesire_MongoDB,DBDesire_MySQL,DBDesire_Neo4j,DBDesire_Oracle,DBDesire_Other(s):,DBDesire_PostgreSQL,DBDesire_Redis,DBDesire_SQL Server,DBDesire_SQLite
RespondentID,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1,2019,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,2020,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2,2018,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,2019,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,2020,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42831,2018,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
42833,2018,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
42835,2018,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
42836,2018,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
df_resp_platforms.groupby(['RespondentID', 'Year']).agg('sum')

Unnamed: 0_level_0,Unnamed: 1_level_0,PltfrmWorkedWith_AWS,PltfrmWorkedWith_Amazon Echo,PltfrmWorkedWith_Android,PltfrmWorkedWith_Apple Watch or Apple TV,PltfrmWorkedWith_Arduino,PltfrmWorkedWith_Azure,PltfrmWorkedWith_Docker,PltfrmWorkedWith_Drupal,PltfrmWorkedWith_ESP8266,PltfrmWorkedWith_Firebase,...,PltfrmDesire_Salesforce,PltfrmDesire_Serverless,PltfrmDesire_SharePoint,PltfrmDesire_Slack,PltfrmDesire_Slack Apps and Integrations,PltfrmDesire_Windows,PltfrmDesire_Windows Desktop or Server,PltfrmDesire_Windows Phone,PltfrmDesire_WordPress,PltfrmDesire_iOS
RespondentID,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1,2019,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,2020,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2018,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2019,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2020,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42831,2018,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
42833,2018,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
42835,2018,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
42836,2018,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
# Export all
df_main_respondents.to_csv('main_respondents.csv', index=False)
df_resp_languages.to_csv('resp_languages.csv', index=False)
df_resp_databases.to_csv('resp_databases.csv', index=False)
df_resp_platforms.to_csv('resp_platforms.csv', index=False)