# Survey analysis
Analyse the results from the [languages survey](https://forms.gle/5b3mZRVcgAsoNG1FA)

In [1]:
# Reference list of languages that the survey answers are compared against
# (used below for the "known / in list" ratio and the "not known" set).
all_langs = [
    "Python", "Java", "JavaScript", "TypeScript",
    "PHP", "SQL", "C", "C++",
    "C#", "Ruby", "R", "Matlab",
    "Go", "Rust", "Objective-C", "Swift",
    "Visual Basic", "Perl", "Cobol", "Fortran",
    "Lisp", "Assembly", "Kotlin", "Dart",
    "Scala", "Lua", "Delphi", "Haskell",
    "Julia", "Clojure", "Elixir", "Pascal",
]

## Load data

In [2]:
import csv
from pprint import pprint

# Every language mentioned in any response (with repeats across responses),
# plus the number of responses read from the file.
langs_known: list[str] = []
num_responses = 0

# newline='' is what the csv module expects so it can handle line endings
# (including newlines inside quoted fields) itself. The encoding is made
# explicit so the result does not depend on the platform default.
# NOTE(review): assumes the export is UTF-8 - confirm if the file comes from elsewhere.
with open('../data/Survey-2024-Jul.csv', newline='', encoding='utf-8') as file:
    # Column names are supplied explicitly, so the file's own header row is
    # returned as an ordinary record and has to be skipped.
    reader = csv.DictReader(file, fieldnames=('timestamp', 'languages', 'years'))
    next(reader, None)  # Skip the header row (no error if the file is empty)
    for response in reader:
        # 'languages' holds a ';'-separated list. A short row yields None and an
        # unanswered question yields '', neither of which is a real language.
        raw_langs = response['languages'] or ''
        langs = [lang.strip() for lang in raw_langs.split(';') if lang.strip()]
        langs_known.extend(langs)
        num_responses += 1

print(len(langs_known))
print(num_responses)

98
22


## Find number of languages known
Print `"{# known by class} / {# in list} languages known by this class ({percent known}%)"`.

E.g. **12 / 21 languages known by this class (57%)**

In [3]:
# Distinct languages mentioned in at least one response.
langs_known_set = set(langs_known)

known_total = len(langs_known_set)
listed_total = len(all_langs)
known_percent = round(known_total / listed_total * 100)
print(f"{known_total} / {listed_total} languages known by this class ({known_percent}%)")

25 / 32 languages known by this class (78%)


## List languages not known by anyone in the class

In [4]:
# Languages from the reference list that no response mentioned.
all_langs_set = set(all_langs)
not_known = all_langs_set.difference(langs_known_set)
print(not_known)

{'Swift', 'Clojure', 'Julia', 'Kotlin', 'Rust', 'Delphi', 'Dart'}


## Rank languages by most commonly known
Print each language as `"{position}: {language} ({percent_known}%)"`, ordered from most to least commonly known, where `percent_known` is the percentage of survey responses that include the language.

E.g. **1: Python (93%)**

In [5]:
from collections import Counter

# Tally how many times each language appears across all responses.
# Counter is a dict subclass that remembers insertion order, so the pairs
# below come out in order of first appearance in langs_known.
lang_counts = Counter(langs_known)

# (language, count) pairs, kept as a list so they can be sorted afterwards.
lang_counts_list = list(lang_counts.items())
pprint(lang_counts_list)

[('Python', 18),
 ('Java', 10),
 ('JavaScript', 8),
 ('SQL', 9),
 ('C', 7),
 ('Ruby', 2),
 ('R', 4),
 ('Objective-C', 1),
 ('Visual Basic', 2),
 ('C++', 5),
 ('Matlab', 3),
 ('Go', 4),
 ('Perl', 4),
 ('Elixir', 1),
 ('Lisp', 3),
 ('Assembly', 3),
 ('PHP', 1),
 ('TypeScript', 1),
 ('Fortran', 3),
 ('Lua', 2),
 ('Haskell', 1),
 ('Pascal', 2),
 ('C#', 2),
 ('Scala', 1),
 ('Cobol', 1)]


In [6]:
# Order the (language, count) pairs from highest to lowest count. The sort is
# stable, so languages with equal counts keep their existing relative order.
# Slice assignment updates the existing list object in place.
lang_counts_list[:] = sorted(
    lang_counts_list,
    key=lambda entry: entry[1],
    reverse=True,
)
pprint(lang_counts_list)

[('Python', 18),
 ('Java', 10),
 ('SQL', 9),
 ('JavaScript', 8),
 ('C', 7),
 ('C++', 5),
 ('R', 4),
 ('Go', 4),
 ('Perl', 4),
 ('Matlab', 3),
 ('Lisp', 3),
 ('Assembly', 3),
 ('Fortran', 3),
 ('Ruby', 2),
 ('Visual Basic', 2),
 ('Lua', 2),
 ('Pascal', 2),
 ('C#', 2),
 ('Objective-C', 1),
 ('Elixir', 1),
 ('PHP', 1),
 ('TypeScript', 1),
 ('Haskell', 1),
 ('Scala', 1),
 ('Cobol', 1)]


In [7]:
# Print the ranking, numbering positions from 1. The percentage is the share
# of responses that mention the language, rounded to a whole number.
for rank, entry in enumerate(lang_counts_list, start=1):
    name, times_mentioned = entry
    share = round(times_mentioned / num_responses * 100)
    print(f"{rank}: {name} ({share}%)")

1: Python (82%)
2: Java (45%)
3: SQL (41%)
4: JavaScript (36%)
5: C (32%)
6: C++ (23%)
7: R (18%)
8: Go (18%)
9: Perl (18%)
10: Matlab (14%)
11: Lisp (14%)
12: Assembly (14%)
13: Fortran (14%)
14: Ruby (9%)
15: Visual Basic (9%)
16: Lua (9%)
17: Pascal (9%)
18: C# (9%)
19: Objective-C (5%)
20: Elixir (5%)
21: PHP (5%)
22: TypeScript (5%)
23: Haskell (5%)
24: Scala (5%)
25: Cobol (5%)
