# Survey analysis
Analyse the results from the [languages survey](https://forms.gle/5b3mZRVcgAsoNG1FA)

In [1]:
# Reference list of languages that the survey answers are compared against.
all_langs = [
    'Python', 'Java', 'JavaScript', 'TypeScript',
    'PHP', 'SQL', 'C', 'C++',
    'C#', 'Ruby', 'R', 'Matlab',
    'Go', 'Rust', 'Objective-C', 'Swift',
    'Visual Basic', 'Perl', 'Cobol', 'Fortran',
    'Lisp', 'Assembly', 'Kotlin', 'Dart',
    'Scala', 'Lua', 'Delphi', 'Haskell',
    'Julia', 'Clojure', 'Elixir', 'Pascal',
]

## Load data

In [30]:
import csv
from pprint import pprint

# Read the survey export and keep, for each response, the list of languages given.
# newline='' hands line-ending handling (including newlines inside quoted fields)
# to the csv module; the explicit encoding avoids depending on the platform default.
with open('../data/Survey-2024-Jul.csv', newline='', encoding='utf-8') as file:
    reader = csv.DictReader(file, fieldnames=('timestamp', 'languages', 'years'))
    # fieldnames are supplied explicitly, so the file's own header arrives as the
    # first record. Skip it through the reader (not file.readline()) so a quoted,
    # multi-line header is consumed as a single record.
    next(reader, None)

    # One list of language names per response. A blank or missing answer becomes
    # an empty list rather than [''], so '' is never counted as a language, while
    # the response itself still counts towards len(responses).
    responses = [
        [lang.strip() for lang in (response['languages'] or '').split(';') if lang.strip()]
        for response in reader
    ]

print(len(responses))

22


## Find number of languages known
Print `"{# known by class} / {# in list} languages known by this class (as %)"`.

E.g. **12 / 21 languages known by this class (57%)**

In [31]:
# Distinct languages that appear in at least one response.
langs_known = set().union(*responses)

num_known = len(langs_known)
num_listed = len(all_langs)
share_known = round(num_known / num_listed * 100)  # whole-number percentage

print(f"{num_known} / {num_listed} languages known by this class ({share_known}%)")


25 / 32 languages known by this class (78%)


## List languages not known by anyone in the class

In [32]:
# Languages from the reference list that no response mentions.
not_known = {lang for lang in all_langs if lang not in langs_known}
not_known

{'Clojure', 'Dart', 'Delphi', 'Julia', 'Kotlin', 'Rust', 'Swift'}

## Rank languages by most commonly known
Print each language as `"{position}: {language} ({percent_known}%)"`, in order from most to least known

e.g. **1: Python (93%)**

In [39]:
from collections import defaultdict

# Tally how many times each language is mentioned; unseen keys start at 0.
lang_counts = defaultdict(lambda: 0)

# Flatten the per-response lists into a single stream of language names.
mentions = (name for answer in responses for name in answer)
for name in mentions:
    lang_counts[name] = lang_counts[name] + 1

pprint(lang_counts)

defaultdict(<function <lambda> at 0x104b57ba0>,
            {'Assembly': 3,
             'C': 7,
             'C#': 2,
             'C++': 5,
             'Cobol': 1,
             'Elixir': 1,
             'Fortran': 3,
             'Go': 4,
             'Haskell': 1,
             'Java': 10,
             'JavaScript': 8,
             'Lisp': 3,
             'Lua': 2,
             'Matlab': 3,
             'Objective-C': 1,
             'PHP': 1,
             'Pascal': 2,
             'Perl': 4,
             'Python': 18,
             'R': 4,
             'Ruby': 2,
             'SQL': 9,
             'Scala': 1,
             'TypeScript': 1,
             'Visual Basic': 2})


In [45]:
from collections import Counter

# Start every listed language at zero so languages nobody mentioned still appear
# in the ranking. most_common() orders equal counts by first insertion, so ties
# follow the order of all_langs.
lang_counts = Counter(dict.fromkeys(all_langs, 0))

for response in responses:
    lang_counts.update(response)

# Derive the denominator here rather than relying on a `num_responses` variable
# left over from another cell, which raises NameError on a fresh kernel.
num_responses = len(responses)

for pos, (lang, count) in enumerate(lang_counts.most_common(), start=1):
    # With no responses nothing is known; avoid dividing by zero.
    percent = round(count / num_responses * 100) if num_responses else 0
    print(f"{pos}: {lang} ({percent}%)")

1: Python (82%)
2: Java (45%)
3: SQL (41%)
4: JavaScript (36%)
5: C (32%)
6: C++ (23%)
7: R (18%)
8: Go (18%)
9: Perl (18%)
10: Matlab (14%)
11: Fortran (14%)
12: Lisp (14%)
13: Assembly (14%)
14: C# (9%)
15: Ruby (9%)
16: Visual Basic (9%)
17: Lua (9%)
18: Pascal (9%)
19: TypeScript (5%)
20: PHP (5%)
21: Objective-C (5%)
22: Cobol (5%)
23: Scala (5%)
24: Haskell (5%)
25: Elixir (5%)
26: Rust (0%)
27: Swift (0%)
28: Kotlin (0%)
29: Dart (0%)
30: Delphi (0%)
31: Julia (0%)
32: Clojure (0%)
