In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

In [2]:
data = pd.read_csv("survey_results_public.csv")
data.columns

Index(['ResponseId', 'MainBranch', 'Employment', 'RemoteWork',
       'CodingActivities', 'EdLevel', 'LearnCode', 'LearnCodeOnline',
       'LearnCodeCoursesCert', 'YearsCode', 'YearsCodePro', 'DevType',
       'OrgSize', 'PurchaseInfluence', 'BuyNewTool', 'Country', 'Currency',
       'CompTotal', 'CompFreq', 'LanguageHaveWorkedWith',
       'LanguageWantToWorkWith', 'DatabaseHaveWorkedWith',
       'DatabaseWantToWorkWith', 'PlatformHaveWorkedWith',
       'PlatformWantToWorkWith', 'WebframeHaveWorkedWith',
       'WebframeWantToWorkWith', 'MiscTechHaveWorkedWith',
       'MiscTechWantToWorkWith', 'ToolsTechHaveWorkedWith',
       'ToolsTechWantToWorkWith', 'NEWCollabToolsHaveWorkedWith',
       'NEWCollabToolsWantToWorkWith', 'OpSysProfessional use',
       'OpSysPersonal use', 'VersionControlSystem', 'VCInteraction',
       'VCHostingPersonal use', 'VCHostingProfessional use',
       'OfficeStackAsyncHaveWorkedWith', 'OfficeStackAsyncWantToWorkWith',
       'OfficeStackSyncHaveWork

In [3]:
cols_to_keep = ['MainBranch', 'Employment', 'RemoteWork', 'YearsCode', 'CompTotal', 'CompFreq', 'LanguageHaveWorkedWith']

In [4]:
data = data[cols_to_keep]

In [5]:
data = data[~data["MainBranch"].isin(["None of these", "I used to be a developer by profession, but no longer am"])] # remove non-developers

In [6]:
data["YearsCode"].value_counts()

10                    5162
5                     5143
6                     4614
4                     4449
7                     4202
8                     4198
3                     4084
2                     3322
12                    2951
15                    2883
20                    2568
9                     2562
11                    1793
14                    1784
13                    1604
1                     1559
25                    1535
Less than 1 year      1399
16                    1383
30                    1264
22                    1138
18                    1125
17                    1042
40                     719
24                     717
35                     708
23                     630
19                     547
21                     476
26                     475
27                     468
28                     408
32                     353
38                     275
37                     267
34                     233
36                     225
3

In [7]:
# transform years to code
transformation_dict = {
    "Less than 1 year": 0.0,
    "More than 50 years": 51
}

def years_code_to_float(data):
    try:
        return float(data)
    except:
        return transformation_dict[data]


In [8]:
data["YearsCode"] = data["YearsCode"].apply(years_code_to_float) # convert YearsCode to float

In [9]:
data

Unnamed: 0,MainBranch,Employment,RemoteWork,YearsCode,CompTotal,CompFreq,LanguageHaveWorkedWith
1,I am a developer by profession,"Employed, full-time",Fully remote,,,,JavaScript;TypeScript
2,"I am not primarily a developer, but I write co...","Employed, full-time","Hybrid (some remote, some in-person)",14.0,32000.0,Yearly,C#;C++;HTML/CSS;JavaScript;Python
3,I am a developer by profession,"Employed, full-time",Fully remote,20.0,60000.0,Monthly,C#;JavaScript;SQL;TypeScript
4,I am a developer by profession,"Employed, full-time","Hybrid (some remote, some in-person)",8.0,,,C#;HTML/CSS;JavaScript;SQL;Swift;TypeScript
5,"I am not primarily a developer, but I write co...","Student, full-time",,15.0,,,C++;Lua
...,...,...,...,...,...,...,...
73262,I am learning to code,"Student, full-time;Employed, part-time","Hybrid (some remote, some in-person)",4.0,,,Assembly;C;C++;Java;TypeScript
73263,I am a developer by profession,"Employed, full-time",Fully remote,8.0,60000.0,Yearly,Bash/Shell;Dart;JavaScript;PHP;Python;SQL;Type...
73264,I am a developer by profession,"Employed, full-time",Full in-person,6.0,107000.0,Yearly,Bash/Shell;HTML/CSS;JavaScript;Python;SQL
73265,"I am not primarily a developer, but I write co...","Employed, full-time","Hybrid (some remote, some in-person)",42.0,,,HTML/CSS;JavaScript;PHP;Python;SQL


In [10]:
data = data[data["CompTotal"] < 1e+7] # remove rows with too large salary

In [11]:
data[data["CompFreq"] == "Monthly"].describe() # there is a problem with montly computed data, some people still provide yearly salary

Unnamed: 0,YearsCode,CompTotal
count,14380.0,14410.0
mean,12.368289,196302.8
std,8.535292,804799.1
min,0.0,0.0
25%,6.0,4350.0
50%,10.0,17656.0
75%,16.0,68000.0
max,51.0,9700000.0


In [12]:
languages = data["LanguageHaveWorkedWith"]

In [13]:
def apply_split(row):
    try:
        return row.split(";")
    except:
        return [row]

In [14]:
languages = languages.apply(apply_split)

In [15]:
unique = np.unique(np.concatenate(languages.values)) # get all the unique languages

In [16]:
# create columns for each language and mark if user have worked with it

for unique_lang in unique:
    data[unique_lang] = data["LanguageHaveWorkedWith"].apply(apply_split)

for unique_lang in unique:
    data[unique_lang] = data[unique_lang].apply(lambda r: unique_lang in r)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[unique_lang] = data["LanguageHaveWorkedWith"].apply(apply_split)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[unique_lang] = data["LanguageHaveWorkedWith"].apply(apply_split)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[unique_lang] = data["LanguageHaveWorkedWith"].apply(apply_sp

In [17]:
data

Unnamed: 0,MainBranch,Employment,RemoteWork,YearsCode,CompTotal,CompFreq,LanguageHaveWorkedWith,APL,Assembly,Bash/Shell,...,Ruby,Rust,SAS,SQL,Scala,Solidity,Swift,TypeScript,VBA,nan
2,"I am not primarily a developer, but I write co...","Employed, full-time","Hybrid (some remote, some in-person)",14.0,32000.0,Yearly,C#;C++;HTML/CSS;JavaScript;Python,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,I am a developer by profession,"Employed, full-time",Fully remote,20.0,60000.0,Monthly,C#;JavaScript;SQL;TypeScript,False,False,False,...,False,False,False,True,False,False,False,True,False,False
8,I am a developer by profession,"Employed, full-time","Hybrid (some remote, some in-person)",6.0,46000.0,Yearly,,False,False,False,...,False,False,False,False,False,False,False,False,False,False
10,I am a developer by profession,"Employed, full-time","Hybrid (some remote, some in-person)",5.0,48000.0,Yearly,Bash/Shell;C#;HTML/CSS;JavaScript;PowerShell;SQL,False,False,True,...,False,False,False,True,False,False,False,False,False,False
11,"I am not primarily a developer, but I write co...","Employed, full-time;Independent contractor, fr...",Fully remote,12.0,194400.0,Yearly,C#;HTML/CSS;JavaScript;PowerShell;Python;Rust;SQL,False,False,False,...,False,True,False,True,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73259,I am a developer by profession,"Employed, full-time","Hybrid (some remote, some in-person)",6.0,8000.0,Monthly,Elixir,False,False,False,...,False,False,False,False,False,False,False,False,False,False
73261,I am a developer by profession,"Employed, full-time","Hybrid (some remote, some in-person)",9.0,36000.0,Yearly,Bash/Shell;HTML/CSS;Java;JavaScript;Python;SQL...,False,False,True,...,False,False,False,True,False,False,False,True,False,False
73263,I am a developer by profession,"Employed, full-time",Fully remote,8.0,60000.0,Yearly,Bash/Shell;Dart;JavaScript;PHP;Python;SQL;Type...,False,False,True,...,False,False,False,True,False,False,False,True,False,False
73264,I am a developer by profession,"Employed, full-time",Full in-person,6.0,107000.0,Yearly,Bash/Shell;HTML/CSS;JavaScript;Python;SQL,False,False,True,...,False,False,False,True,False,False,False,False,False,False


In [18]:
data.drop(columns=["LanguageHaveWorkedWith", "nan"], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.drop(columns=["LanguageHaveWorkedWith", "nan"], inplace=True)


In [19]:
data.to_csv("survey_results_prepared.csv")