In [1]:
import warnings
import numpy as np
import pandas as pd
from scipy import stats
warnings.filterwarnings('ignore')

In [2]:
data = pd.read_csv('survey_results_public.csv')
continent = pd.read_csv('Countries-Continents.csv')
schema_name = 'survey_results_schema.csv'
schema = pd.read_csv(schema_name,index_col='Column').QuestionText

In [3]:
data.head()

Unnamed: 0,Respondent,MainBranch,Hobbyist,OpenSourcer,OpenSource,Employment,Country,Student,EdLevel,UndergradMajor,...,WelcomeChange,SONewContent,Age,Gender,Trans,Sexuality,Ethnicity,Dependents,SurveyLength,SurveyEase
0,1,I am a student who is learning to code,Yes,Never,The quality of OSS and closed source software ...,"Not employed, and not looking for work",United Kingdom,No,Primary/elementary school,,...,Just as welcome now as I felt last year,Tech articles written by other developers;Indu...,14.0,Man,No,Straight / Heterosexual,,No,Appropriate in length,Neither easy nor difficult
1,2,I am a student who is learning to code,No,Less than once per year,The quality of OSS and closed source software ...,"Not employed, but looking for work",Bosnia and Herzegovina,"Yes, full-time","Secondary school (e.g. American high school, G...",,...,Just as welcome now as I felt last year,Tech articles written by other developers;Indu...,19.0,Man,No,Straight / Heterosexual,,No,Appropriate in length,Neither easy nor difficult
2,3,"I am not primarily a developer, but I write co...",Yes,Never,The quality of OSS and closed source software ...,Employed full-time,Thailand,No,"Bachelor’s degree (BA, BS, B.Eng., etc.)",Web development or web design,...,Just as welcome now as I felt last year,Tech meetups or events in your area;Courses on...,28.0,Man,No,Straight / Heterosexual,,Yes,Appropriate in length,Neither easy nor difficult
3,4,I am a developer by profession,No,Never,The quality of OSS and closed source software ...,Employed full-time,United States,No,"Bachelor’s degree (BA, BS, B.Eng., etc.)","Computer science, computer engineering, or sof...",...,Just as welcome now as I felt last year,Tech articles written by other developers;Indu...,22.0,Man,No,Straight / Heterosexual,White or of European descent,No,Appropriate in length,Easy
4,5,I am a developer by profession,Yes,Once a month or more often,"OSS is, on average, of HIGHER quality than pro...",Employed full-time,Ukraine,No,"Bachelor’s degree (BA, BS, B.Eng., etc.)","Computer science, computer engineering, or sof...",...,Just as welcome now as I felt last year,Tech meetups or events in your area;Courses on...,30.0,Man,No,Straight / Heterosexual,White or of European descent;Multiracial,No,Appropriate in length,Easy


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 88883 entries, 0 to 88882
Data columns (total 85 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Respondent              88883 non-null  int64  
 1   MainBranch              88331 non-null  object 
 2   Hobbyist                88883 non-null  object 
 3   OpenSourcer             88883 non-null  object 
 4   OpenSource              86842 non-null  object 
 5   Employment              87181 non-null  object 
 6   Country                 88751 non-null  object 
 7   Student                 87014 non-null  object 
 8   EdLevel                 86390 non-null  object 
 9   UndergradMajor          75614 non-null  object 
 10  EduOther                84260 non-null  object 
 11  OrgSize                 71791 non-null  object 
 12  DevType                 81335 non-null  object 
 13  YearsCode               87938 non-null  object 
 14  Age1stCode              87634 non-null

In [5]:
data.isnull().sum()

Respondent          0
MainBranch        552
Hobbyist            0
OpenSourcer         0
OpenSource       2041
                ...  
Sexuality       12736
Ethnicity       12215
Dependents       5824
SurveyLength     1899
SurveyEase       1802
Length: 85, dtype: int64

In [6]:
schema

Column
Respondent      Randomized respondent ID number (not in order ...
MainBranch      Which of the following options best describes ...
Hobbyist                                  Do you code as a hobby?
OpenSourcer           How often do you contribute to open source?
OpenSource      How do you feel about the quality of open sour...
                                      ...                        
Sexuality       Which of the following do you currently identi...
Ethnicity       Which of the following do you identify as? Ple...
Dependents      Do you have any dependents (e.g., children, el...
SurveyLength    How do you feel about the length of the survey...
SurveyEase      How easy or difficult was this survey to compl...
Name: QuestionText, Length: 85, dtype: object

## 1. Find the average age of developers when they wrote their first line of code.

In [7]:
data['Age1stCode'].isnull().sum()

1249

In [8]:
data['Age1stCode'].unique()

array(['10', '17', '22', '16', '14', '15', '11', '20', '13', '18', '12',
       '19', '21', '8', '35', '6', '9', '29', '7', '5', '23', '30', nan,
       '27', '24', 'Younger than 5 years', '33', '25', '26', '39', '36',
       '38', '28', '31', 'Older than 85', '32', '37', '50', '65', '42',
       '34', '40', '67', '43', '44', '60', '46', '45', '49', '51', '41',
       '55', '83', '48', '53', '54', '47', '56', '79', '61', '68', '77',
       '66', '52', '80', '62', '84', '57', '58', '63'], dtype=object)

In [9]:
# converting object to str
data['Age1stCode'] = data['Age1stCode'].astype(str) 

In [10]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 88883 entries, 0 to 88882
Data columns (total 85 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Respondent              88883 non-null  int64  
 1   MainBranch              88331 non-null  object 
 2   Hobbyist                88883 non-null  object 
 3   OpenSourcer             88883 non-null  object 
 4   OpenSource              86842 non-null  object 
 5   Employment              87181 non-null  object 
 6   Country                 88751 non-null  object 
 7   Student                 87014 non-null  object 
 8   EdLevel                 86390 non-null  object 
 9   UndergradMajor          75614 non-null  object 
 10  EduOther                84260 non-null  object 
 11  OrgSize                 71791 non-null  object 
 12  DevType                 81335 non-null  object 
 13  YearsCode               87938 non-null  object 
 14  Age1stCode              88883 non-null

In [11]:
for i in range(0,len(data['Age1stCode'])):
        split = data.Age1stCode[i].split()
        for j in range(0,len(split)):
                if split[j].isdigit():
                    data.Age1stCode[i] = split[j]

In [12]:
data['Age1stCode'] = data['Age1stCode'].astype(float) 
mean_age = data.Age1stCode.mean()
mean_age = round(mean_age)
mean_age

15

*Average age of developers when they wrote their first line of code is 15.*

## 2. Deduce the percentage of developers who know python in each country.

In [13]:
target = 'Python'
python_data = data[data['LanguageWorkedWith'].str.contains(target,na=True)]

In [14]:
python_data.head()

Unnamed: 0,Respondent,MainBranch,Hobbyist,OpenSourcer,OpenSource,Employment,Country,Student,EdLevel,UndergradMajor,...,WelcomeChange,SONewContent,Age,Gender,Trans,Sexuality,Ethnicity,Dependents,SurveyLength,SurveyEase
0,1,I am a student who is learning to code,Yes,Never,The quality of OSS and closed source software ...,"Not employed, and not looking for work",United Kingdom,No,Primary/elementary school,,...,Just as welcome now as I felt last year,Tech articles written by other developers;Indu...,14.0,Man,No,Straight / Heterosexual,,No,Appropriate in length,Neither easy nor difficult
1,2,I am a student who is learning to code,No,Less than once per year,The quality of OSS and closed source software ...,"Not employed, but looking for work",Bosnia and Herzegovina,"Yes, full-time","Secondary school (e.g. American high school, G...",,...,Just as welcome now as I felt last year,Tech articles written by other developers;Indu...,19.0,Man,No,Straight / Heterosexual,,No,Appropriate in length,Neither easy nor difficult
3,4,I am a developer by profession,No,Never,The quality of OSS and closed source software ...,Employed full-time,United States,No,"Bachelor’s degree (BA, BS, B.Eng., etc.)","Computer science, computer engineering, or sof...",...,Just as welcome now as I felt last year,Tech articles written by other developers;Indu...,22.0,Man,No,Straight / Heterosexual,White or of European descent,No,Appropriate in length,Easy
4,5,I am a developer by profession,Yes,Once a month or more often,"OSS is, on average, of HIGHER quality than pro...",Employed full-time,Ukraine,No,"Bachelor’s degree (BA, BS, B.Eng., etc.)","Computer science, computer engineering, or sof...",...,Just as welcome now as I felt last year,Tech meetups or events in your area;Courses on...,30.0,Man,No,Straight / Heterosexual,White or of European descent;Multiracial,No,Appropriate in length,Easy
7,8,I code primarily as a hobby,Yes,Less than once per year,"OSS is, on average, of HIGHER quality than pro...","Not employed, but looking for work",India,,"Bachelor’s degree (BA, BS, B.Eng., etc.)","Computer science, computer engineering, or sof...",...,A lot more welcome now than last year,Tech articles written by other developers;Indu...,24.0,Man,No,Straight / Heterosexual,,,Appropriate in length,Neither easy nor difficult


In [15]:
python_count = python_data.groupby('Country').size()
python_count

Country
Afghanistan                             13
Albania                                 26
Algeria                                 48
Angola                                   3
Antigua and Barbuda                      2
                                        ..
Venezuela, Bolivarian Republic of...    29
Viet Nam                                89
Yemen                                    5
Zambia                                   4
Zimbabwe                                14
Length: 167, dtype: int64

In [16]:
all_count = data.groupby('Country').size()
all_count

Country
Afghanistan                              44
Albania                                  86
Algeria                                 134
Andorra                                   7
Angola                                    5
                                       ... 
Venezuela, Bolivarian Republic of...     88
Viet Nam                                231
Yemen                                    19
Zambia                                   12
Zimbabwe                                 39
Length: 179, dtype: int64

In [17]:
final_data = round(((python_count/all_count)*100))

In [18]:
final_data = pd.DataFrame(final_data)
final_data.columns = ['Percantage of developer knows python']
final_data

Unnamed: 0_level_0,Percantage of developer knows python
Country,Unnamed: 1_level_1
Afghanistan,30.0
Albania,30.0
Algeria,36.0
Andorra,
Angola,60.0
...,...
"Venezuela, Bolivarian Republic of...",33.0
Viet Nam,39.0
Yemen,26.0
Zambia,33.0


In [19]:
final_data=final_data['Percantage of developer knows python'].dropna()
final_data.head()

Country
Afghanistan            30.0
Albania                30.0
Algeria                36.0
Angola                 60.0
Antigua and Barbuda    22.0
Name: Percantage of developer knows python, dtype: float64

## 3. In which countries do developers work the highest number of hours per week? Consider countries with more than 250 responses only.

In [20]:
data.columns

Index(['Respondent', 'MainBranch', 'Hobbyist', 'OpenSourcer', 'OpenSource',
       'Employment', 'Country', 'Student', 'EdLevel', 'UndergradMajor',
       'EduOther', 'OrgSize', 'DevType', 'YearsCode', 'Age1stCode',
       'YearsCodePro', 'CareerSat', 'JobSat', 'MgrIdiot', 'MgrMoney',
       'MgrWant', 'JobSeek', 'LastHireDate', 'LastInt', 'FizzBuzz',
       'JobFactors', 'ResumeUpdate', 'CurrencySymbol', 'CurrencyDesc',
       'CompTotal', 'CompFreq', 'ConvertedComp', 'WorkWeekHrs', 'WorkPlan',
       'WorkChallenge', 'WorkRemote', 'WorkLoc', 'ImpSyn', 'CodeRev',
       'CodeRevHrs', 'UnitTests', 'PurchaseHow', 'PurchaseWhat',
       'LanguageWorkedWith', 'LanguageDesireNextYear', 'DatabaseWorkedWith',
       'DatabaseDesireNextYear', 'PlatformWorkedWith',
       'PlatformDesireNextYear', 'WebFrameWorkedWith',
       'WebFrameDesireNextYear', 'MiscTechWorkedWith',
       'MiscTechDesireNextYear', 'DevEnviron', 'OpSys', 'Containers',
       'BlockchainOrg', 'BlockchainIs', 'BetterLife'

In [21]:
schema.WorkWeekHrs

'On average, how many hours per week do you work?'

In [22]:
high = data[data['Respondent']>250]
high.head(10)

Unnamed: 0,Respondent,MainBranch,Hobbyist,OpenSourcer,OpenSource,Employment,Country,Student,EdLevel,UndergradMajor,...,WelcomeChange,SONewContent,Age,Gender,Trans,Sexuality,Ethnicity,Dependents,SurveyLength,SurveyEase
249,251,I am a student who is learning to code,Yes,Once a month or more often,"OSS is, on average, of HIGHER quality than pro...","Not employed, but looking for work",Zimbabwe,"Yes, full-time","Secondary school (e.g. American high school, G...",,...,A lot more welcome now than last year,Tech meetups or events in your area;Courses on...,20.0,Man,No,Straight / Heterosexual,Black or of African descent,No,Appropriate in length,Neither easy nor difficult
250,252,I am a developer by profession,Yes,Once a month or more often,"OSS is, on average, of HIGHER quality than pro...",Employed full-time,United Kingdom,No,"Bachelor’s degree (BA, BS, B.Eng., etc.)","Computer science, computer engineering, or sof...",...,Just as welcome now as I felt last year,,27.0,Man,No,Straight / Heterosexual,White or of European descent,No,Appropriate in length,Neither easy nor difficult
251,253,I am a developer by profession,Yes,Less than once a month but more than once per ...,The quality of OSS and closed source software ...,Employed full-time,Germany,No,"Secondary school (e.g. American high school, G...",,...,Just as welcome now as I felt last year,Tech meetups or events in your area,28.0,Man,No,Bisexual,White or of European descent,No,Appropriate in length,Neither easy nor difficult
252,254,I am a developer by profession,Yes,Less than once per year,The quality of OSS and closed source software ...,Employed full-time,Australia,No,Associate degree,"Information systems, information technology, o...",...,Just as welcome now as I felt last year,Tech articles written by other developers;Tech...,41.0,Man,No,Straight / Heterosexual,White or of European descent,Yes,Appropriate in length,Neither easy nor difficult
253,255,I am a student who is learning to code,Yes,Never,The quality of OSS and closed source software ...,"Independent contractor, freelancer, or self-em...",Thailand,"Yes, part-time","Bachelor’s degree (BA, BS, B.Eng., etc.)","Computer science, computer engineering, or sof...",...,Just as welcome now as I felt last year,Tech articles written by other developers;Indu...,20.0,Man,Yes,Straight / Heterosexual,East Asian,Yes,Too long,Neither easy nor difficult
254,256,I am a student who is learning to code,Yes,Never,"OSS is, on average, of HIGHER quality than pro...",Employed part-time,France,"Yes, full-time","Bachelor’s degree (BA, BS, B.Eng., etc.)","Computer science, computer engineering, or sof...",...,Not applicable - I did not use Stack Overflow ...,,20.0,Man,No,Straight / Heterosexual,White or of European descent,,Too long,Neither easy nor difficult
255,257,I am a developer by profession,Yes,Less than once a month but more than once per ...,The quality of OSS and closed source software ...,Employed full-time,Czech Republic,No,Some college/university study without earning ...,,...,Not applicable - I did not use Stack Overflow ...,Tech articles written by other developers;Indu...,43.0,Man,No,Straight / Heterosexual,White or of European descent,No,Appropriate in length,Easy
256,258,I am a developer by profession,No,Never,The quality of OSS and closed source software ...,Employed full-time,France,No,"Master’s degree (MA, MS, M.Eng., MBA, etc.)","Computer science, computer engineering, or sof...",...,Just as welcome now as I felt last year,,23.0,Man,No,Straight / Heterosexual,White or of European descent,No,Too long,Easy
257,259,I am a developer by profession,Yes,Never,The quality of OSS and closed source software ...,Employed full-time,Denmark,No,Associate degree,"Computer science, computer engineering, or sof...",...,Not applicable - I did not use Stack Overflow ...,,31.0,Man,No,Straight / Heterosexual,White or of European descent,Yes,Appropriate in length,Easy
258,260,I am a developer by profession,Yes,Less than once a month but more than once per ...,"OSS is, on average, of HIGHER quality than pro...","Independent contractor, freelancer, or self-em...",Germany,No,"Master’s degree (MA, MS, M.Eng., MBA, etc.)","Computer science, computer engineering, or sof...",...,Somewhat more welcome now than last year,Tech articles written by other developers;Tech...,,Man,No,Straight / Heterosexual,White or of European descent,Yes,Too long,Easy


In [23]:
high = data.groupby('Country')[['WorkWeekHrs']].max().sort_values('WorkWeekHrs',ascending=False)
high.head(10)

Unnamed: 0_level_0,WorkWeekHrs
Country,Unnamed: 1_level_1
India,4850.0
Switzerland,4125.0
Sweden,3875.0
Sri Lanka,1012.0
Turkey,475.0
Brazil,425.0
Poland,425.0
Austria,425.0
Germany,425.0
Spain,415.0


## 4.Based on this survey, what will be the most desired programming language for the year 2020?

In [24]:
schema.LanguageDesireNextYear

'Which of the following programming, scripting, and markup languages have you done extensive development work in over the past year, and which do you want to work in over the next year?  (If you both worked with the language and want to continue to do so, please check both boxes in that row.)'

In [25]:
data.LanguageDesireNextYear

0          C;C++;C#;Go;HTML/CSS;Java;JavaScript;Python;SQL
1                              C++;HTML/CSS;JavaScript;SQL
2                                          Elixir;HTML/CSS
3                                      C;C#;JavaScript;SQL
4                 HTML/CSS;Java;JavaScript;SQL;WebAssembly
                               ...                        
88878    C++;HTML/CSS;JavaScript;SQL;WebAssembly;Other(s):
88879                                                  NaN
88880                                                  NaN
88881                                                  NaN
88882         C;C++;C#;HTML/CSS;Java;Objective-C;SQL;Swift
Name: LanguageDesireNextYear, Length: 88883, dtype: object

In [26]:
counts = {}
for i in data['LanguageDesireNextYear']:
    to_string = str(i)
    splited = to_string.split(';')
    for j in splited:
        if j in counts:
            counts[j] += 1
        else:
            counts[j] = 1

In [27]:
Language_desired_2020 = pd.DataFrame.from_dict(counts,orient='index',columns=['No. of Users'])
Language_desired_2020 = Language_desired_2020.sort_values('No. of Users',ascending=False)
Language_desired_2020.head()

Unnamed: 0,No. of Users
JavaScript,44739
Python,40006
HTML/CSS,36645
SQL,33566
TypeScript,23720


## 5. What is the distribution of people who code as a hobby based on gender and continent (hint: use your mapping of country to continent)?

In [28]:
schema.Hobbyist

'Do you code as a hobby?'

In [29]:
data.Hobbyist

0        Yes
1         No
2        Yes
3         No
4        Yes
        ... 
88878    Yes
88879     No
88880     No
88881     No
88882    Yes
Name: Hobbyist, Length: 88883, dtype: object

In [30]:
data = data.merge(continent[['Country', 'Continent']], on='Country', how='left')
data.Hobbyist = data.Hobbyist.replace({"Yes": 1, "No": 0})

In [31]:
def sex(string):
    if string == 'Man':
        return 'Man'
    elif string == 'Woman':
        return 'Woman'
    else:
        return "Others"

In [32]:
data.Gender = list(map(sex,data.Gender))
data.groupby(['Continent','Gender'])['Hobbyist'].count()

Continent      Gender
Africa         Man        2334
               Others      130
               Woman       191
Asia           Man       16598
               Others     1304
               Woman      1191
Europe         Man       30190
               Others     1329
               Woman      2048
North America  Man        3962
               Others      211
               Woman       372
Oceania        Man        2160
               Others      130
               Woman       144
South America  Man        3166
               Others       88
               Woman       180
Name: Hobbyist, dtype: int64

### 6.Generate the report for job and career satisfaction of developer based on their gender and continent?

In [33]:
data.groupby(['Continent','Gender','JobSat'])['Respondent'].count()

Continent      Gender  JobSat                            
Africa         Man     Neither satisfied nor dissatisfied    225
                       Slightly dissatisfied                 253
                       Slightly satisfied                    515
                       Very dissatisfied                     133
                       Very satisfied                        339
                                                            ... 
South America  Woman   Neither satisfied nor dissatisfied     20
                       Slightly dissatisfied                  21
                       Slightly satisfied                     50
                       Very dissatisfied                      11
                       Very satisfied                         43
Name: Respondent, Length: 90, dtype: int64

In [34]:
data.groupby(['Continent','Gender','CareerSat'])['Respondent'].count()

Continent      Gender  CareerSat                         
Africa         Man     Neither satisfied nor dissatisfied    201
                       Slightly dissatisfied                 175
                       Slightly satisfied                    541
                       Very dissatisfied                     125
                       Very satisfied                        558
                                                            ... 
South America  Woman   Neither satisfied nor dissatisfied     13
                       Slightly dissatisfied                  25
                       Slightly satisfied                     56
                       Very dissatisfied                       7
                       Very satisfied                         52
Name: Respondent, Length: 90, dtype: int64