In [1]:
!pip install pycountry-convert

Collecting pycountry-convert
  Downloading pycountry_convert-0.7.2-py3-none-any.whl (13 kB)
Collecting pytest-mock>=1.6.3
  Downloading pytest_mock-3.5.1-py3-none-any.whl (12 kB)
Collecting pprintpp>=0.3.0
  Downloading pprintpp-0.4.0-py2.py3-none-any.whl (16 kB)
Collecting pytest-cov>=2.5.1
  Downloading pytest_cov-2.11.1-py2.py3-none-any.whl (20 kB)
Collecting repoze.lru>=0.7
  Downloading repoze.lru-0.7-py3-none-any.whl (10 kB)
Collecting coverage>=5.2.1
  Downloading coverage-5.5-cp37-cp37m-manylinux2010_x86_64.whl (242 kB)
[K     |████████████████████████████████| 242 kB 1.2 MB/s 
Installing collected packages: coverage, repoze.lru, pytest-mock, pytest-cov, pprintpp, pycountry-convert
Successfully installed coverage-5.5 pprintpp-0.4.0 pycountry-convert-0.7.2 pytest-cov-2.11.1 pytest-mock-3.5.1 repoze.lru-0.7


In [2]:
from pycountry_convert import country_alpha2_to_continent_code, country_name_to_country_alpha2
import pandas as pd
import numpy as np

In [3]:
df = pd.read_csv("../input/stack-overflow-developer-survey-results-2019/survey_results_public.csv")

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 88883 entries, 0 to 88882
Data columns (total 85 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Respondent              88883 non-null  int64  
 1   MainBranch              88331 non-null  object 
 2   Hobbyist                88883 non-null  object 
 3   OpenSourcer             88883 non-null  object 
 4   OpenSource              86842 non-null  object 
 5   Employment              87181 non-null  object 
 6   Country                 88751 non-null  object 
 7   Student                 87014 non-null  object 
 8   EdLevel                 86390 non-null  object 
 9   UndergradMajor          75614 non-null  object 
 10  EduOther                84260 non-null  object 
 11  OrgSize                 71791 non-null  object 
 12  DevType                 81335 non-null  object 
 13  YearsCode               87938 non-null  object 
 14  Age1stCode              87634 non-null

## Q1.What is the average age of developers when they wrote their first line of code?

In [5]:
df.Age1stCode.unique()

array(['10', '17', '22', '16', '14', '15', '11', '20', '13', '18', '12',
       '19', '21', '8', '35', '6', '9', '29', '7', '5', '23', '30', nan,
       '27', '24', 'Younger than 5 years', '33', '25', '26', '39', '36',
       '38', '28', '31', 'Older than 85', '32', '37', '50', '65', '42',
       '34', '40', '67', '43', '44', '60', '46', '45', '49', '51', '41',
       '55', '83', '48', '53', '54', '47', '56', '79', '61', '68', '77',
       '66', '52', '80', '62', '84', '57', '58', '63'], dtype=object)

As this column contains string values, a simple .mean() method will raise an error. So, 1st we will convert dtype from object to float while ignoring the string values with pd.to_numeric ( https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.to_numeric.html )

In [6]:
df.Age1stCode = pd.to_numeric(df.Age1stCode, errors = 'coerce')

average_age = df.Age1stCode.mean()

In [7]:
print(f"The average age of people when they write their first code is: {round(average_age)}.")

The average age of people when they write their first code is: 15.


## Q2.Deduce the percentage of developers who know python in each country.

In [8]:
df.LanguageWorkedWith

0                          HTML/CSS;Java;JavaScript;Python
1                                      C++;HTML/CSS;Python
2                                                 HTML/CSS
3                                      C;C++;C#;Python;SQL
4              C++;HTML/CSS;Java;JavaScript;Python;SQL;VBA
                               ...                        
88878                        HTML/CSS;JavaScript;Other(s):
88879                                                  NaN
88880                                                  NaN
88881                                                  NaN
88882    Bash/Shell/PowerShell;HTML/CSS;Java;JavaScript...
Name: LanguageWorkedWith, Length: 88883, dtype: object

In [9]:
def python(n):
    try:
        if 'Python' in n:
            return 1
        return 0
    except:
        return 0

In [10]:
df['%_knows_python'] = list(map(python,df.LanguageWorkedWith))

In [11]:
df.groupby('Country')['%_knows_python'].mean()*100

Country
Afghanistan                             18.181818
Albania                                 26.744186
Algeria                                 29.850746
Andorra                                  0.000000
Angola                                  40.000000
                                          ...    
Venezuela, Bolivarian Republic of...    31.818182
Viet Nam                                33.766234
Yemen                                   15.789474
Zambia                                  33.333333
Zimbabwe                                35.897436
Name: %_knows_python, Length: 179, dtype: float64

## Q3.Generate a report for the average salary of developer based on continent.


In [12]:
df.Country = df.Country.replace({"Congo, Republic of the...":"Congo",
                                                             "Hong Kong (S.A.R.)":"Hong Kong",
                                                             "Libyan Arab Jamahiriya":"Libya",
                                                             "Other Country (Not Listed Above)":"Others",
                                                             "Republic of Korea":"South Korea",
                                                             "The former Yugoslav Republic of Macedonia":"North Macedonia",
                                                             "Timor-Leste":"India",#As Timor-Leste is an Asian country but not showing in pycountry_convert module
                                                             "Venezuela, Bolivarian Republic of...":"Venezuela"})

In [13]:
comp_per_country = df.groupby('Country')['ConvertedComp'].mean().to_frame().reset_index()

In [14]:
comp_per_country

Unnamed: 0,Country,ConvertedComp
0,Afghanistan,101953.333333
1,Albania,21833.700000
2,Algeria,34924.047619
3,Andorra,160931.000000
4,Angola,7764.000000
...,...,...
172,Venezuela,14581.627907
173,Viet Nam,17233.436782
174,Yemen,16909.166667
175,Zambia,10075.375000


In [15]:
def get_continent(col):
    try:
        cn_a2_code =  country_name_to_country_alpha2(col)
    except:
        cn_a2_code = 'Others' 
    try:
        cn_continent = country_alpha2_to_continent_code(cn_a2_code)
    except:
        cn_continent = 'Others' 
    return (cn_continent)

In [16]:
comp_per_country['Continent'] = list(map(get_continent,comp_per_country.Country))

Average Compensation of a Developer per Continent:

In [17]:
comp_per_country.groupby('Continent')['ConvertedComp'].mean()

Continent
AF         29188.687423
AS         44648.579043
EU         95948.496200
NA         48141.173701
OC        117917.473400
Others     99339.803279
SA         32751.810780
Name: ConvertedComp, dtype: float64

## Q4.Based on this survey, what will be the most desired programming language for the year 2020?

In [18]:
df.LanguageDesireNextYear

0          C;C++;C#;Go;HTML/CSS;Java;JavaScript;Python;SQL
1                              C++;HTML/CSS;JavaScript;SQL
2                                          Elixir;HTML/CSS
3                                      C;C#;JavaScript;SQL
4                 HTML/CSS;Java;JavaScript;SQL;WebAssembly
                               ...                        
88878    C++;HTML/CSS;JavaScript;SQL;WebAssembly;Other(s):
88879                                                  NaN
88880                                                  NaN
88881                                                  NaN
88882         C;C++;C#;HTML/CSS;Java;Objective-C;SQL;Swift
Name: LanguageDesireNextYear, Length: 88883, dtype: object

In [19]:
counts = {}
for i in df['LanguageDesireNextYear']:
    to_string = str(i)
    splited = to_string.split(';')
    for j in splited:
        if j in counts:
            counts[j] += 1
        else:
            counts[j] = 1

In [20]:
Language_desired_2020 = pd.DataFrame.from_dict(counts,orient='index',columns=['No. of Users'])

In [21]:
print(f"The most desired programming language for the year 2020 is: {Language_desired_2020.sort_values('No. of Users',ascending=False).reset_index().iloc[0,0]}.")

The most desired programming language for the year 2020 is: JavaScript.


## Q5.What is the distribution of people who code as a hobby based on gender and continent?

In [22]:
df['Continent'] = list(map(get_continent,df.Country))

In [23]:
df.Hobbyist = df.Hobbyist.replace({"Yes": 1, "No": 0})

In [24]:
def sex(string):
    if string == 'Man':
        return 'Man'
    elif string == 'Woman':
        return 'Woman'
    else:
        return "Others"

In [25]:
df.Gender = list(map(sex,df.Gender))

Hobbyist based on Gender and Continent.

In [26]:
df.groupby(['Continent','Gender'])['Hobbyist'].count()

Continent  Gender
AF         Man        2421
           Others      134
           Woman       194
AS         Man       16039
           Others     1348
           Woman      1186
EU         Man       32283
           Others     1396
           Woman      2164
NA         Man       21662
           Others     1368
           Woman      2464
OC         Man        2160
           Others      130
           Woman       144
Others     Man         102
           Others      155
           Woman        11
SA         Man        3252
           Others       89
           Woman       181
Name: Hobbyist, dtype: int64

## Q6.Generate the report for job and career satisfaction of developer based on their gender and continent?

Job satisfaction based on gender and continent

In [27]:
df.groupby(['Continent','Gender','JobSat'])['Respondent'].count()

Continent  Gender  JobSat                            
AF         Man     Neither satisfied nor dissatisfied    242
                   Slightly dissatisfied                 259
                   Slightly satisfied                    540
                   Very dissatisfied                     137
                   Very satisfied                        352
                                                        ... 
SA         Woman   Neither satisfied nor dissatisfied     20
                   Slightly dissatisfied                  21
                   Slightly satisfied                     51
                   Very dissatisfied                      11
                   Very satisfied                         43
Name: Respondent, Length: 104, dtype: int64

Career satisfaction based on gender and continent

In [28]:
df.groupby(['Continent','Gender','CareerSat'])['Respondent'].count()

Continent  Gender  CareerSat                         
AF         Man     Neither satisfied nor dissatisfied    212
                   Slightly dissatisfied                 184
                   Slightly satisfied                    566
                   Very dissatisfied                     126
                   Very satisfied                        583
                                                        ... 
SA         Woman   Neither satisfied nor dissatisfied     13
                   Slightly dissatisfied                  25
                   Slightly satisfied                     56
                   Very dissatisfied                       7
                   Very satisfied                         53
Name: Respondent, Length: 104, dtype: int64