In [1]:
all_langs = ['Python', 'Java', 'JavaScript', 'TypeScript', 'PHP', 'C', 'C++', 'C#',
             'Ruby', 'R', 'Matlab', 'Go', 'Rust', 'Objective-C', 'Swift',
             'Visual Basic', 'Perl', 'Cobol', 'Fortran', 'Lisp', 'Assembly']

# Load data

In [2]:
from pprint import pprint
import pandas as pd

df = pd.read_csv('../data/Programming language survey.csv', header=0, names=('timestamp', 'languages', 'other_langs', 'age'), usecols=('languages', 'other_langs', 'age'))
    
df

Unnamed: 0,languages,other_langs,age
0,Python;Java;C;C++;R;Matlab;Rust;Fortran;Assembly,"awk,Julia",50 - 59
1,Python;Java;PHP;Visual Basic,,40 - 49
2,Python;Java;JavaScript;C;C++;Ruby;R;Objective-...,,30 - 39


In [3]:
import re

for lang in all_langs:
    regex = rf'(^|;){re.escape(lang)}($|;)'
    df[lang] = df['languages'].str.contains(regex)
df

  df[lang] = df['languages'].str.contains(regex)


Unnamed: 0,languages,other_langs,age,Python,Java,JavaScript,TypeScript,PHP,C,C++,...,Go,Rust,Objective-C,Swift,Visual Basic,Perl,Cobol,Fortran,Lisp,Assembly
0,Python;Java;C;C++;R;Matlab;Rust;Fortran;Assembly,"awk,Julia",50 - 59,True,True,False,False,False,True,True,...,False,True,False,False,False,False,False,True,False,True
1,Python;Java;PHP;Visual Basic,,40 - 49,True,True,False,False,True,False,False,...,False,False,False,False,True,False,False,False,False,False
2,Python;Java;JavaScript;C;C++;Ruby;R;Objective-...,,30 - 39,True,True,True,False,False,True,True,...,False,False,True,False,False,False,False,False,False,True


## Find number of languages known
Print `"{# known by class} / {# in list} languages known by this class (as %)"`.

E.g. **12/21 languages known by this class (57%)**

In [4]:
known_langs = df.columns[df.eq(True).any()].tolist()
known_langs

['Python',
 'Java',
 'JavaScript',
 'PHP',
 'C',
 'C++',
 'Ruby',
 'R',
 'Matlab',
 'Rust',
 'Objective-C',
 'Visual Basic',
 'Fortran',
 'Assembly']

In [5]:
percent = round(len(known_langs)/len(all_langs) * 100)
print(f"{len(known_langs)}/{len(all_langs)} languages known by this class ({percent}%)")

14/21 languages known by this class (67%)


## List languages not known by anyone in the class

In [6]:
print('Not known:')
', '.join(df.columns[df.eq(False).all()].tolist())

Not known:


'TypeScript, C#, Go, Swift, Perl, Cobol, Lisp'

# Rank languages by most commonly known
Print each language as `"{position}: {language} ({count})"`, in order from most to least known

e.g. **1: Python (30)**

In [7]:
langs_count = df.iloc[:, 3:].sum()
langs_count

Python          3
Java            3
JavaScript      1
TypeScript      0
PHP             1
C               2
C++             2
C#              0
Ruby            1
R               2
Matlab          1
Go              0
Rust            1
Objective-C     1
Swift           0
Visual Basic    1
Perl            0
Cobol           0
Fortran         1
Lisp            0
Assembly        2
dtype: int64

In [8]:
langs_count.sort_values(ascending=False, inplace=True)
langs_count

Python          3
Java            3
C               2
R               2
C++             2
Assembly        2
PHP             1
Ruby            1
JavaScript      1
Rust            1
Objective-C     1
Visual Basic    1
Fortran         1
Matlab          1
C#              0
TypeScript      0
Go              0
Swift           0
Perl            0
Cobol           0
Lisp            0
dtype: int64

In [9]:
for i, (lang, count) in enumerate(langs_count.iteritems(), start=1):
    print(f'{i}: {lang} ({count})')

1: Python (3)
2: Java (3)
3: C (2)
4: R (2)
5: C++ (2)
6: Assembly (2)
7: PHP (1)
8: Ruby (1)
9: JavaScript (1)
10: Rust (1)
11: Objective-C (1)
12: Visual Basic (1)
13: Fortran (1)
14: Matlab (1)
15: C# (0)
16: TypeScript (0)
17: Go (0)
18: Swift (0)
19: Perl (0)
20: Cobol (0)
21: Lisp (0)


## Bonus: rank languages known by age group

In [10]:
age_ranges = ['<= 19', '20 - 29', '30 - 39', '40 - 49', '50 - 59', '>= 60', 'Unknown']

df2 = df
df2['age'] = pd.Categorical(df['age'], categories=age_ranges)
df2['age'].fillna("Unknown", inplace=True)
df2['total'] = 1
df2

Unnamed: 0,languages,other_langs,age,Python,Java,JavaScript,TypeScript,PHP,C,C++,...,Rust,Objective-C,Swift,Visual Basic,Perl,Cobol,Fortran,Lisp,Assembly,total
0,Python;Java;C;C++;R;Matlab;Rust;Fortran;Assembly,"awk,Julia",50 - 59,True,True,False,False,False,True,True,...,True,False,False,False,False,False,True,False,True,1
1,Python;Java;PHP;Visual Basic,,40 - 49,True,True,False,False,True,False,False,...,False,False,False,True,False,False,False,False,False,1
2,Python;Java;JavaScript;C;C++;Ruby;R;Objective-...,,30 - 39,True,True,True,False,False,True,True,...,False,True,False,False,False,False,False,False,True,1


In [11]:
by_age = df2.groupby('age').sum()
by_age

Unnamed: 0_level_0,Python,Java,JavaScript,TypeScript,PHP,C,C++,C#,Ruby,R,...,Rust,Objective-C,Swift,Visual Basic,Perl,Cobol,Fortran,Lisp,Assembly,total
age,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
<= 19,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20 - 29,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
30 - 39,1,1,1,0,0,1,1,0,1,1,...,0,1,0,0,0,0,0,0,1,1
40 - 49,1,1,0,0,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
50 - 59,1,1,0,0,0,1,1,0,0,1,...,1,0,0,0,0,0,1,0,1,1
>= 60,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Unknown,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
for index in by_age.index:
    num_respondents = by_age.loc[index, 'total']
    print(f"For {index} ({num_respondents})")
    if num_respondents == 0:
        print('  No data')
    else:
        print(by_age.loc[index, by_age.loc[index] >= 1].sort_values(ascending=False).to_string())
    print()

For <= 19 (0)
  No data

For 20 - 29 (0)
  No data

For 30 - 39 (1)
Python         1
Java           1
JavaScript     1
C              1
C++            1
Ruby           1
R              1
Objective-C    1
Assembly       1
total          1

For 40 - 49 (1)
Python          1
Java            1
PHP             1
Visual Basic    1
total           1

For 50 - 59 (1)
Python      1
Java        1
C           1
C++         1
R           1
Matlab      1
Rust        1
Fortran     1
Assembly    1
total       1

For >= 60 (0)
  No data

For Unknown (0)
  No data



## Bonus: other languages known

In [13]:
df3 = df.loc[:,['other_langs', 'age']]
df3.index = df3.pop('age')
df3

Unnamed: 0_level_0,other_langs
age,Unnamed: 1_level_1
50 - 59,"awk,Julia"
40 - 49,
30 - 39,


In [14]:
df3['other_langs'] = df3['other_langs'].str.split(',')
df3.dropna(axis=0, inplace=True)
other_langs = df3.explode('other_langs')
other_langs

Unnamed: 0_level_0,other_langs
age,Unnamed: 1_level_1
50 - 59,awk
50 - 59,Julia


In [15]:
other_langs.value_counts()

other_langs
Julia          1
awk            1
dtype: int64