# Survey analysis
Analyse the results from the [languages survey](https://forms.gle/5b3mZRVcgAsoNG1FA)

In [1]:
# Candidate languages offered by the survey. Order matters: later cells seed
# their counts from this list, so ties in the ranking keep this ordering.
all_langs = [
    'Python', 'Java', 'JavaScript', 'TypeScript',
    'PHP', 'SQL', 'C', 'C++',
    'C#', 'Ruby', 'R', 'Matlab',
    'Go', 'Rust', 'Objective-C', 'Swift',
    'Visual Basic', 'Perl', 'Cobol', 'Fortran',
    'Lisp', 'Assembly', 'Kotlin', 'Dart',
    'Scala', 'Lua', 'Delphi', 'Haskell',
    'Julia', 'Clojure', 'Elixir', 'Pascal',
]

## Load data

In [2]:
import csv

# Read the survey export and build, per respondent, the list of languages they know.
# newline='' leaves newline handling to the csv module, as the csv documentation recommends.
with open('../data/Survey-2024-May.csv', newline='') as file:
    # Rows become dicts keyed by the names passed in fieldnames
    reader = csv.DictReader(file, fieldnames=('timestamp', 'languages', 'years'))
    # Skip the header row through the csv parser (safe for quoted headers and empty files)
    next(reader, None)
    # Answers are ';'-separated. A blank or missing answer becomes [] rather than [''] / an
    # AttributeError, so the respondent is still counted but contributes no languages.
    langs_known = [
        [lang for lang in (row['languages'] or '').split(';') if lang]
        for row in reader
    ]

num_people = len(langs_known)
print(langs_known)
print(num_people)

27
[['Python', 'C'], ['Python', 'JavaScript', 'C', 'Perl'], ['Python'], ['Python', 'SQL'], ['Python', 'Java', 'JavaScript', 'SQL', 'C', 'C++', 'Perl', 'Kotlin', 'Pascal'], ['Python', 'C'], ['C++'], ['Python', 'PHP', 'SQL', 'Visual Basic'], ['Python', 'JavaScript', 'SQL', 'R'], ['Python', 'Java', 'JavaScript', 'TypeScript', 'SQL', 'C', 'R', 'Perl', 'Lisp', 'Haskell'], ['Java', 'SQL', 'C', 'C#', 'Assembly'], ['Python', 'SQL'], ['Python', 'SQL', 'C', 'C++', 'Perl'], ['Python', 'SQL', 'C', 'C++', 'C#', 'Visual Basic', 'Pascal'], ['Java'], ['Python', 'Java', 'JavaScript', 'SQL', 'C'], ['Python', 'JavaScript', 'SQL'], ['C', 'C++'], ['Java', 'SQL'], ['Java', 'Kotlin'], ['Python', 'SQL', 'C++', 'Perl', 'Cobol', 'Pascal'], ['Python', 'SQL', 'C', 'C++', 'C#', 'Matlab', 'Pascal'], ['Python', 'SQL', 'Visual Basic'], ['Python', 'C', 'C++', 'C#', 'Matlab', 'Perl'], ['Python', 'PHP', 'SQL', 'Visual Basic'], ['Python', 'Java', 'C', 'C++', 'Perl'], ['Java', 'SQL', 'C++']]


## Find number of languages known
Print `"{# known by class} / {# in list} languages known by this class (as %)"`.

E.g. **12 / 21 languages known by this class (57%)**

In [3]:
# Merge every respondent's language list into one set of distinct languages.
unique_langs_known = set().union(*langs_known)
unique_langs_known

19 / 32 languages known by this class (59%)


In [None]:
# Share of the listed languages that at least one respondent knows. Rounded to a whole
# percent so the line reads e.g. "(59%)" instead of "(59.375%)".
percent_langs_known = round(len(unique_langs_known) / len(all_langs) * 100)
print(
    f"{len(unique_langs_known)} / {len(all_langs)} languages known by this class ({percent_langs_known}%)")

## List languages not known by anyone in the class

In [4]:
# Languages from the full list that no respondent reported knowing.
not_known = {lang for lang in all_langs if lang not in unique_langs_known}
not_known

Clojure
Dart
Delphi
Elixir
Fortran
Go
Julia
Lua
Objective-C
Ruby
Rust
Scala
Swift


## Rank languages by most commonly known
Print each language as `"{position}: {language} ({percent_known}%)"`, in order from most to least known.

e.g. **1: Python (93%)**

In [5]:
from pprint import pprint

# Start every listed language at zero so languages nobody knows still appear in the tally.
langs_count = dict.fromkeys(all_langs, 0)

# Walk each respondent's answers and bump the counter for every language they named.
for known_lang in (lang for respondent_langs in langs_known for lang in respondent_langs):
    langs_count[known_lang] = langs_count[known_lang] + 1

pprint(langs_count)

{'Python': 20,
 'Java': 9,
 'JavaScript': 6,
 'TypeScript': 1,
 'PHP': 2,
 'SQL': 17,
 'C': 13,
 'C++': 10,
 'C#': 4,
 'Ruby': 0,
 'R': 2,
 'Matlab': 2,
 'Go': 0,
 'Rust': 0,
 'Objective-C': 0,
 'Swift': 0,
 'Visual Basic': 4,
 'Perl': 7,
 'Cobol': 1,
 'Fortran': 0,
 'Lisp': 1,
 'Assembly': 1,
 'Kotlin': 2,
 'Dart': 0,
 'Scala': 0,
 'Lua': 0,
 'Delphi': 0,
 'Haskell': 1,
 'Julia': 0,
 'Clojure': 0,
 'Elixir': 0,
 'Pascal': 4}

In [6]:
# (language, count) pairs ordered from most to least known; the sort is stable, so
# languages with equal counts keep the order they have in langs_count.
lang_count_list = sorted(langs_count.items(), key=lambda pair: pair[1], reverse=True)
lang_count_list

1: Python (74%)
2: SQL (63%)
3: C (48%)
4: C++ (37%)
5: Java (33%)
6: Perl (26%)
7: JavaScript (22%)
8: C# (15%)
9: Visual Basic (15%)
10: Pascal (15%)
11: PHP (7%)
12: R (7%)
13: Matlab (7%)
14: Kotlin (7%)
15: TypeScript (4%)
16: Cobol (4%)
17: Lisp (4%)
18: Assembly (4%)
19: Haskell (4%)
20: Ruby (0%)
21: Go (0%)
22: Rust (0%)
23: Objective-C (0%)
24: Swift (0%)
25: Fortran (0%)
26: Dart (0%)
27: Scala (0%)
28: Lua (0%)
29: Delphi (0%)
30: Julia (0%)
31: Clojure (0%)
32: Elixir (0%)


In [None]:
# Print the ranking, one line per language, with the share of respondents who know it.
for rank, (language, known_by) in enumerate(lang_count_list, 1):
    pct = round(known_by / num_people * 100)  # whole-number percentage of respondents
    print(f"{rank}: {language} ({pct}%)")