# Survey analysis
Analyse the results from the [languages survey](https://forms.gle/5b3mZRVcgAsoNG1FA)

In [1]:
all_langs = ['Python', 'Java', 'JavaScript', 'TypeScript', 'PHP', 'C', 'C++', 'C#',
             'Ruby', 'R', 'Matlab', 'Go', 'Rust', 'Objective-C', 'Swift',
             'Visual Basic', 'Perl', 'Cobol', 'Fortran', 'Lisp', 'Assembly']

## Load data

In [2]:
import csv
from pprint import pprint

with open('../data/Programming language survey.csv') as file:
    file.readline()
    reader = csv.DictReader(file, fieldnames=('timestamp', 'languages', 'other_langs', 'age'))
    langs_by_entry = [line['languages'].split(';') for line in reader]

pprint(langs_by_entry)

[['Python', 'Java', 'C', 'C++', 'R', 'Matlab', 'Rust', 'Fortran', 'Assembly'],
 ['Python', 'Java', 'PHP', 'Visual Basic'],
 ['Python',
  'Java',
  'JavaScript',
  'C',
  'C++',
  'Ruby',
  'R',
  'Objective-C',
  'Assembly'],
 ['Python', 'Java', 'JavaScript', 'C', 'C++', 'C#', 'R', 'Perl'],
 ['JavaScript', 'C', 'C#', 'Objective-C', 'Visual Basic'],
 ['Python', 'Java', 'Perl'],
 ['Python', 'Java', 'C++'],
 ['Python', 'JavaScript', 'Go'],
 ['Python', 'Matlab'],
 ['Python', 'Java', 'C', 'C#', 'Lisp'],
 ['Python'],
 ['Python', 'Ruby', 'Rust', 'Perl'],
 ['Python', 'JavaScript', 'TypeScript', 'C++', 'R'],
 ['Python', 'C++', 'Matlab', 'Visual Basic'],
 ['Python', 'JavaScript', 'PHP'],
 ['Python'],
 ['Python', 'JavaScript', 'PHP', 'C', 'C++', 'R', 'Go'],
 ['Python', 'C++', 'R', 'Matlab', 'Fortran', 'Assembly'],
 ['Python', 'Visual Basic', 'Perl'],
 ['Python', 'TypeScript', 'Swift'],
 ['Python',
  'Java',
  'C',
  'C++',
  'C#',
  'R',
  'Visual Basic',
  'Perl',
  'Cobol',
  'Fortran',
  'Asse

## Find number of languages known
Print `"{# known by class} / {# in list} languages known by this class (as %)"`.

E.g. **12/21 languages known by this class (57%)**

In [3]:
langs_set = {lang for langs in langs_by_entry for lang in langs}

num_known = len(langs_set)
num_all = len(all_langs)
percent = round(num_known / num_all * 100)

print(f"{num_known} / {num_all} languages known by this class ({percent}%)")

21 / 21 languages known by this class (100%)


## List languages not known by anyone in the class

In [4]:
not_known = set(all_langs) - langs_set
print(not_known)

set()


## Rank languages by most commonly known
Print each language as `"{position}: {language} ({count})"`, in order from most to least known

e.g. **1: Python (30)**

In [5]:
langs_list = [lang for langs in langs_by_entry for lang in langs]
langs_count = {lang: langs_list.count(lang) for lang in langs_set}

print(langs_count)

{'Matlab': 7, 'C++': 19, 'Lisp': 5, 'R': 12, 'Java': 19, 'TypeScript': 6, 'JavaScript': 16, 'Cobol': 4, 'PHP': 5, 'C': 22, 'Assembly': 9, 'Visual Basic': 13, 'Rust': 3, 'Go': 5, 'Swift': 1, 'Perl': 15, 'Ruby': 5, 'C#': 9, 'Objective-C': 2, 'Fortran': 7, 'Python': 47}


In [6]:
def sort_func(x):
    return -x[1]


langs_count_list = list(langs_count.items())
langs_count_list.sort(key=lambda x: -x[1])
langs_count_list

[('Python', 47),
 ('C', 22),
 ('C++', 19),
 ('Java', 19),
 ('JavaScript', 16),
 ('Perl', 15),
 ('Visual Basic', 13),
 ('R', 12),
 ('Assembly', 9),
 ('C#', 9),
 ('Matlab', 7),
 ('Fortran', 7),
 ('TypeScript', 6),
 ('Lisp', 5),
 ('PHP', 5),
 ('Go', 5),
 ('Ruby', 5),
 ('Cobol', 4),
 ('Rust', 3),
 ('Objective-C', 2),
 ('Swift', 1)]

In [7]:
num_people = len(langs_by_entry)

for i, (name, count) in enumerate(langs_count_list, start=1):
    print(f"{i}: {name} ({round(count / num_people * 100)}%)")

1: Python (94%)
2: C (44%)
3: C++ (38%)
4: Java (38%)
5: JavaScript (32%)
6: Perl (30%)
7: Visual Basic (26%)
8: R (24%)
9: Assembly (18%)
10: C# (18%)
11: Matlab (14%)
12: Fortran (14%)
13: TypeScript (12%)
14: Lisp (10%)
15: PHP (10%)
16: Go (10%)
17: Ruby (10%)
18: Cobol (8%)
19: Rust (6%)
20: Objective-C (4%)
21: Swift (2%)
