In [1]:
import pandas as pd
import numpy as np
import re
import os

In [2]:
df = pd.read_csv('./Data/kaggle_survey_2022_responses.csv', skiprows=1, dtype=object)
df.head()

Unnamed: 0,Duration (in seconds),Q2,Q3,Q4,Q5,Q6_1,Q6_2,Q6_3,Q6_4,Q6_5,...,Q44_3,Q44_4,Q44_5,Q44_6,Q44_7,Q44_8,Q44_9,Q44_10,Q44_11,Q44_12
0,121,30-34,Man,India,No,,,,,,...,,,,,,,,,,
1,462,30-34,Man,Algeria,No,,,,,,...,,,,,,,,,,
2,293,18-21,Man,Egypt,Yes,Coursera,edX,,DataCamp,,...,,"Kaggle (notebooks, forums, etc)",,"YouTube (Kaggle YouTube, Cloud AI Adventures, ...","Podcasts (Chai Time Data Science, O’Reilly Dat...",,,,,
3,851,55-59,Man,France,No,Coursera,,Kaggle Learn Courses,,,...,,"Kaggle (notebooks, forums, etc)","Course Forums (forums.fast.ai, Coursera forums...",,,"Blogs (Towards Data Science, Analytics Vidhya,...",,,,
4,232,45-49,Man,India,Yes,,,,,,...,,,,,,"Blogs (Towards Data Science, Analytics Vidhya,...",,,,


In [3]:
# Create a dictionary so we can get the survey question by providing the code
questions_map_df = pd.read_csv('./Data/kaggle_survey_2022_responses.csv', nrows=1)
questions_map_df

code_to_question_map = {}
for (colName, colData) in questions_map_df.iteritems():
    code_to_question_map[str(colData.values[0])] = colName 

In [4]:
# Refactored code goes here (resued functions, etc.)

# Method to create a df of percentages chosen by item
def build_percentage_chosen_by_item_chart(current_df, columns, sortBy):
    unique_answers = [x for x in current_df.unique() if not pd.isnull(x)]
    all_answers = [x for x in current_df if not pd.isnull(x)]

    columns.append("Count")
    data = []
    for answer in unique_answers:
        count = all_answers.count(answer)
        data.append((count/len(all_answers), answer, count))
        
    percentage_by_item_df = pd.DataFrame(data, columns=columns)
    percentage_by_item_df = percentage_by_item_df.sort_values(by=sortBy, ascending=False)
    return percentage_by_item_df

# Build frequency analysis function
def build_frequency_analysis_multi_select_chart(current_df, unique_selections_set, all_selections_set, filters, labels, sortBy):
    for item in current_df:

        # skip if is nan
        if pd.isnull(item):
            continue

        # filter because of comma interfering with split
        for filterItem in filters:
            if filterItem in item:
                item = item.replace(filterItem, "")

        # split on comma
        split_line = item.split(",")    

        # remove empties and strip whitespaces
        split_line = [l.strip() for l in split_line if l]

        # re-add filtered
        for filterItem in filters:
            split_line.append(filterItem)   

        for y in split_line:
            if y == "nan":
                continue
                
            unique_selections_set.add(y)
            all_selections_set.append(y)
            
    frequency_df = build_chart(unique_selections_set, all_selections_set, labels, sortBy)
    return frequency_df

# Build chart function
def build_chart(unique_selections_set, all_selections_set, labels, sortBy):
    data = []

    for item in unique_selections_set:
        count = all_selections_set.count(item)
        data.append((count, item))

    frequency_df = pd.DataFrame(data, columns=labels)
    frequency_df = frequency_df.sort_values(by=sortBy, ascending=False)
    
    return frequency_df

# Build frequency analysis for single choices
def build_frequency_analysis_single_select_chart(current_df, all_selections_set, columns, sortBy):
    for item in current_df:    
        # skip if is nan
        if pd.isnull(item):
            continue          
        all_selections_set.append(item)

    unique_selections_set = set()
    for item in all_selections_set:
        unique_selections_set.add(item)

    data = []

    for item in unique_selections_set:        
        count = all_selections_set.count(item)
        data.append((count, item))

    frequency_df = pd.DataFrame(data, columns=columns)
    frequency_df = frequency_df.sort_values(by=sortBy, ascending=False)
    return frequency_df

def export_to_excel(df, dirname, sheetname, name):
    path = f"./{dirname}/{name}.xlsx"
    isExist = os.path.exists(path)
    if not isExist:
        with pd.ExcelWriter(path, engine='xlsxwriter') as writer:
            df.to_excel(writer, sheet_name=f"{sheetname}", index=False)
    else:                         
        with pd.ExcelWriter(path, engine='openpyxl', mode='a', if_sheet_exists="replace") as writer:    
            df.to_excel(writer, sheet_name=f"{sheetname}", index=False)
            
    export_to_excel_standalone_workbook(df, dirname, sheetname, name)
        
def export_to_excel_standalone_workbook(df, dirname, sheetname, name):
    path = f"./{dirname}/{name}-{sheetname}.xlsx"
    isExist = os.path.exists(path)
    if not isExist:
        with pd.ExcelWriter(path, engine='xlsxwriter') as writer:
            df.to_excel(writer, sheet_name=f"{sheetname}", index=False)
        
chart_output_dir = "Charts_KAGGLE2022"
isExist = os.path.exists(f"./{chart_output_dir}")
if not isExist:
    os.mkdir(f"./{chart_output_dir}")
    
chart_output_filename = "kaggle_2022_students"

In [5]:
# Different roles in this dataset not counting students
unique_roles = df["Q23"].unique()
unique_roles

array([nan, 'Data Scientist', 'Software Engineer', 'Research Scientist',
       'Other', 'Developer Advocate',
       'Data Analyst (Business, Marketing, Financial, Quantitative, etc)',
       'Data Engineer', 'Currently not employed',
       'Machine Learning/ MLops Engineer', 'Engineer (non-software)',
       'Teacher / professor', 'Statistician',
       'Manager (Program, Project, Operations, Executive-level, etc)',
       'Data Administrator', 'Data Architect'], dtype=object)

In [6]:
# Testing to ensure students have no additional roles as they were not included in Q23 roles question
# nrows of students_df + nrows of non-nan roles df should be less than total rows
students_df = df[df["Q5"] == "Yes"]
total_students_count = students_df.shape[0]
print("Total students: %s" % total_students_count)
print("Student role: %s" % students_df["Q23"].unique()) # should be nan or other

other_roles = [x for x in df["Q23"] if not pd.isnull(x)]
other_roles_count = len(other_roles)
print("Other roles: %s" % other_roles_count)
total_records = df.shape[0]
print("Total records: %s" % total_records)

# There is a discrepancy
# likely rows that answered "No" for Q5 student question, and also did not select a role for Q23 roles question (nan)
print("Discrepancy: %s" % (df.shape[0]-len(other_roles)-students_df.shape[0]))
discrepancy = df[df["Q23"].isna()]
discrepancy = discrepancy[discrepancy["Q5"] == "No"]
discrepancy_count = discrepancy.shape[0]

# The assumption on discrepancy is correct, all the records now equate to the total
print("Total rows = total student rows count + total other roles rows count + not student and nan role")
print("%s = %s" % (total_records, (total_students_count + other_roles_count + discrepancy_count)))

# Manually adding students is valid as prior test indicates
unique_roles = np.append(unique_roles, "Student")

Total students: 11961
Student role: [nan]
Other roles: 10630
Total records: 23997
Discrepancy: 1406
Total rows = total student rows count + total other roles rows count + not student and nan role
23997 = 23997


In [7]:
# What proportion of this dataset are in various roles. Get percentages per roles.
def get_count_by_role_name(role_name: str):
    selection = df[df["Q23"] == role_name]
    if role_name == "Student":
        return total_students_count
    return selection.shape[0]

data = []

total_rows = df.shape[0]
for role in unique_roles:   
    if(pd.isnull(role)):
        continue
    role_count = get_count_by_role_name(role)
    percentage = role_count/total_rows
    count = role_count    
    data.append((role, percentage, count))
    
roles_percentages_df = pd.DataFrame(data, columns=["Role", "Percentage", "Count"])
roles_percentages_df = roles_percentages_df.sort_values(by=["Percentage"], ascending=False)
export_to_excel(roles_percentages_df, chart_output_dir, "Proportion Students", chart_output_filename)
roles_percentages_df

Unnamed: 0,Role,Percentage,Count
15,Student,0.498437,11961
0,Data Scientist,0.080385,1929
5,"Data Analyst (Business, Marketing, Financial, ...",0.064091,1538
7,Currently not employed,0.059674,1432
1,Software Engineer,0.040838,980
10,Teacher / professor,0.034713,833
12,"Manager (Program, Project, Operations, Executi...",0.034671,832
3,Other,0.031421,754
2,Research Scientist,0.024711,593
8,Machine Learning/ MLops Engineer,0.023795,571


In [8]:
# What geolocation do students live in?
current_df = students_df["Q4"]

# Build percentages by item selected
columns = ["Percentage", "Label"]
sortBy = columns[0]

percentage_by_item_df = build_percentage_chosen_by_item_chart(current_df, columns, sortBy)   
export_to_excel(percentage_by_item_df, chart_output_dir, "Geography", chart_output_filename)
percentage_by_item_df

Unnamed: 0,Percentage,Label,Count
1,0.415266,India,4967
8,0.077251,United States of America,924
2,0.059192,Other,708
7,0.040298,Nigeria,482
11,0.036535,Brazil,437
3,0.034947,Pakistan,418
5,0.024245,China,290
0,0.020065,Egypt,240
12,0.019062,Indonesia,228
4,0.016052,Mexico,192


In [9]:
# What questions were the most relevant to students?
# Total students
print("Total students: %s" % total_students_count)
data = []

for (colCode, colData) in students_df.iteritems():
    total_answered = total_students_count - pd.isnull(colData.values).sum()
    percentage = total_answered/total_students_count
    data.append((colCode, percentage))
    
most_relevant_questions_df = pd.DataFrame(data, columns=["Code", "Percentage"])
filter = (most_relevant_questions_df["Percentage"] < 1) & (most_relevant_questions_df["Percentage"] > 0)
sorted_most_relevant_questions_df = most_relevant_questions_df[filter].sort_values(by=["Percentage"], ascending=False)

top_codes_sorted_df = sorted_most_relevant_questions_df[0:35]
top_codes = top_codes_sorted_df["Code"].values
toppercentages = top_codes_sorted_df["Percentage"].values
print("%s \n" % top_codes_sorted_df)

for code in top_codes:
    print("%s: %s  \n" % (code, code_to_question_map[code]))
    

Total students: 11961
       Code  Percentage
24       Q8    0.969317
29      Q11    0.962211
90      Q16    0.820082
30    Q12_1    0.785302
75    Q15_1    0.578380
55   Q13_11    0.567260
20     Q7_4    0.554301
18     Q7_2    0.542095
21     Q7_5    0.527046
289   Q44_6    0.524454
287   Q44_4    0.464677
48    Q13_4    0.451718
25       Q9    0.445615
106   Q18_1    0.445113
91    Q17_1    0.438090
76    Q15_2    0.417356
60    Q14_2    0.380821
5      Q6_1    0.377895
107   Q18_2    0.359753
32    Q12_3    0.356743
14    Q6_10    0.353984
17     Q7_1    0.339102
92    Q17_2    0.331076
59    Q14_1    0.312683
291   Q44_8    0.288521
73   Q14_15    0.278572
7      Q6_3    0.278405
49    Q13_5    0.274392
93    Q17_3    0.267954
142   Q21_9    0.262854
35    Q12_6    0.248056
141   Q21_8    0.246802
112   Q18_7    0.244127
11     Q6_7    0.237606
16    Q6_12    0.219296 

Q8: What is the highest level of formal education that you have attained or plan to attain within the next 2 yea

In [10]:
# Q8
current_question_code = "Q8"
print(code_to_question_map[current_question_code])
current_df = students_df[current_question_code]
total_answers = total_students_count - pd.isnull(students_df[current_question_code].values).sum()
print("Total answers: %s" % (total_answers))

# Build percentages by item selected
columns = ["Percentage", "Label"]
sortBy = columns[0]

percentage_by_item_df = build_percentage_chosen_by_item_chart(current_df, columns, sortBy)  
export_to_excel(percentage_by_item_df, chart_output_dir, "Education", chart_output_filename)
percentage_by_item_df

What is the highest level of formal education that you have attained or plan to attain within the next 2 years?
Total answers: 11594


Unnamed: 0,Percentage,Label,Count
0,0.392444,Bachelor’s degree,4550
1,0.360531,Master’s degree,4180
5,0.081076,Doctoral degree,940
2,0.080041,Some college/university study without earning ...,928
3,0.047007,I prefer not to answer,545
6,0.019579,No formal education past high school,227
4,0.01932,Professional doctorate,224


In [11]:
# Q11
current_question_code = "Q11"
print(code_to_question_map[current_question_code])
current_df = students_df[current_question_code]
total_answers = total_students_count - pd.isnull(students_df[current_question_code].values).sum()
print("Total answers: %s" % (total_answers))

# Build percentages by item selected
columns = ["Percentage", "Label"]
sortBy = columns[0]

percentage_by_item_df = build_percentage_chosen_by_item_chart(current_df, columns, sortBy)   
export_to_excel(percentage_by_item_df, chart_output_dir, "Coding Experience", chart_output_filename)
percentage_by_item_df

For how many years have you been writing code and/or programming?
Total answers: 11509


Unnamed: 0,Percentage,Label,Count
0,0.364584,1-3 years,4196
3,0.294205,< 1 years,3386
2,0.147537,3-5 years,1698
5,0.083413,I have never written code,960
1,0.065775,5-10 years,757
4,0.026675,10-20 years,307
6,0.017812,20+ years,205


In [12]:
# Q12_1: we will join columns Q12_1 through Q12_15 with a comma so that an existing method can easily be used to chart
current_question_code = "Q12_1"
print(code_to_question_map[current_question_code])
current_df = students_df[current_question_code]
print("Total students: %s" % (total_students_count))

# subset the df to just Q12_1 through Q12_15
column_selections = []
for x in range(1,16):
    column_selections.append(f"Q12_{x}")

subset_df = students_df[column_selections]
subset_aggregated_df = subset_df.apply(lambda x: ','.join(x.astype(str).values), axis=1)
subset_aggregated_df = subset_aggregated_df[df["Q5"] == "Yes"]

# Frequency analysis
unique_selections_set = set()
all_selections_set = []
filters = []
columns = ["Frequency", "Label"]
sortBy = columns[0]

frequency_df = build_frequency_analysis_multi_select_chart(subset_aggregated_df, unique_selections_set, all_selections_set, filters, columns, sortBy)
export_to_excel(frequency_df, chart_output_dir, "Programming Languages", chart_output_filename)
frequency_df

What programming languages do you use on a regular basis? (Select all that apply) - Selected Choice - Python
Total students: 11961


Unnamed: 0,Frequency,Label
5,9393,Python
0,4267,SQL
1,2967,C++
7,2474,C
13,2322,Java
11,2197,R
8,1788,Javascript
6,1412,MATLAB
12,808,PHP
10,673,C#


In [13]:
# Q15_1: we will join columns Q15_1 through Q15_15 with a comma so that an existing method can easily be used to chart
current_question_code = "Q15_1"
print(code_to_question_map[current_question_code])
current_df = students_df[current_question_code]
print("Total students: %s" % (total_students_count))

# subset the df 
column_selections = []
for x in range(1,16):
    column_selections.append(f"Q15_{x}")

subset_df = students_df[column_selections]
subset_aggregated_df = subset_df.apply(lambda x: ','.join(x.astype(str).values), axis=1)
subset_aggregated_df = subset_aggregated_df[df["Q5"] == "Yes"]

# Frequency analysis
unique_selections_set = set()
all_selections_set = []
filters = []
columns = ["Frequency", "Label"]
sortBy = columns[0]

frequency_df = build_frequency_analysis_multi_select_chart(subset_aggregated_df, unique_selections_set, all_selections_set, filters, columns, sortBy)
export_to_excel(frequency_df, chart_output_dir, "Visualization Libraries", chart_output_filename)
frequency_df

Do you use any of the following data visualization libraries on a regular basis?  (Select all that apply) - Selected Choice -  Matplotlib 
Total students: 11961


Unnamed: 0,Frequency,Label
1,6918,Matplotlib
13,4992,Seaborn
8,2154,Plotly / Plotly Express
0,1911,Ggplot / ggplot2
14,1889,
12,626,Geoplotlib
9,398,Shiny
3,341,Other
5,285,Bokeh
11,281,D3 js


In [14]:
# Q13_11
current_question_code = "Q13_11"
print(code_to_question_map[current_question_code])
current_df = students_df[current_question_code]
total_answers = total_students_count - pd.isnull(students_df[current_question_code].values).sum()
print("Total students: %s" % (total_students_count))

# subset the df 
column_selections = []
for x in range(1,15):
    column_selections.append(f"Q13_{x}")

subset_df = students_df[column_selections]
subset_aggregated_df = subset_df.apply(lambda x: ','.join(x.astype(str).values), axis=1)
subset_aggregated_df = subset_aggregated_df[df["Q5"] == "Yes"]

# Frequency analysis
unique_selections_set = set()
all_selections_set = []
filters = []
columns = ["Frequency", "Label"]
sortBy = columns[0]

frequency_df = build_frequency_analysis_multi_select_chart(subset_aggregated_df, unique_selections_set, all_selections_set, filters, columns, sortBy)
export_to_excel(frequency_df, chart_output_dir, "IDEs", chart_output_filename)
frequency_df

Which of the following integrated development environments (IDE's) do you use on a regular basis?  (Select all that apply) - Selected Choice -  Jupyter Notebook
Total students: 11961


Unnamed: 0,Frequency,Label
5,6785,Jupyter Notebook
8,5403,Visual Studio Code (VSCode)
3,3282,PyCharm
4,2432,Visual Studio
12,2287,JupyterLab
2,1863,RStudio
1,1736,Notepad++
7,1441,MATLAB
9,1429,Spyder
0,1161,Sublime Text


In [15]:
# Q7_1 - Q7_8
current_question_code = "Q7_4"
print(code_to_question_map[current_question_code])
current_df = students_df[current_question_code]
total_answers = total_students_count - pd.isnull(students_df[current_question_code].values).sum()
print("Total students: %s" % (total_students_count))

# subset the df 
column_selections = []
for x in range(1,8):
    column_selections.append(f"Q7_{x}")

subset_df = students_df[column_selections]
subset_aggregated_df = subset_df.apply(lambda x: ','.join(x.astype(str).values), axis=1)
subset_aggregated_df = subset_aggregated_df[df["Q5"] == "Yes"]

# Frequency analysis
unique_selections_set = set()
all_selections_set = []
filters = ["Online courses (Coursera, EdX, etc)","Social media platforms (Reddit, Twitter, etc)", "Video platforms (YouTube, Twitch, etc)", "Kaggle (notebooks, competitions, etc)"]
columns = ["Frequency", "Label"]
sortBy = columns[0]

frequency_df = build_frequency_analysis_multi_select_chart(subset_aggregated_df, unique_selections_set, all_selections_set, filters, columns, sortBy)
export_to_excel(frequency_df, chart_output_dir, "Helpful Platforms", chart_output_filename)
frequency_df

What products or platforms did you find to be most helpful when you first started studying data science?  (Select all that apply) - Selected Choice - Video platforms (YouTube, Twitch, etc)
Total students: 11961


Unnamed: 0,Frequency,Label
0,11961,"Online courses (Coursera, EdX, etc)"
2,11961,"Social media platforms (Reddit, Twitter, etc)"
4,11961,"Kaggle (notebooks, competitions, etc)"
6,11961,"Video platforms (YouTube, Twitch, etc)"
5,4056,University courses
1,779,Other
3,537,None / I do not study data science


In [16]:
#Q44_1 - Q44_12
current_question_code = "Q44_1"
print(code_to_question_map[current_question_code])
print("Total students: %s" % (total_students_count))

# subset the df 
column_selections = []
for x in range(1,13):
    column_selections.append(f"Q44_{x}")

subset_df = students_df[column_selections]
subset_aggregated_df = subset_df.apply(lambda x: ','.join(x.astype(str).values), axis=1)
subset_aggregated_df = subset_aggregated_df[df["Q5"] == "Yes"]

# Frequency analysis
unique_selections_set = set()
all_selections_set = []
filters = ["Email newsletters (Data Elixir, O'Reilly Data & AI, etc)","Kaggle (notebooks, forums, etc)",
           "Course Forums (forums.fast.ai, Coursera forums, etc)","YouTube (Kaggle YouTube, Cloud AI Adventures, etc)",
          "Podcasts (Chai Time Data Science, O’Reilly Data Show, etc)","Blogs (Towards Data Science, Analytics Vidhya, etc)",
          "Journal Publications (peer-reviewed journals, conference proceedings, etc)","Slack Communities (ods.ai, kagglenoobs, etc)", 
           "Reddit (r/machinelearning, etc)"]
columns = ["Frequency", "Label"]
sortBy = columns[0]

frequency_df = build_frequency_analysis_multi_select_chart(subset_aggregated_df, unique_selections_set, all_selections_set, filters, columns, sortBy)
export_to_excel(frequency_df, chart_output_dir, "Media Sources", chart_output_filename)
frequency_df

Who/what are your favorite media sources that report on data science topics? (Select all that apply) - Selected Choice - Twitter (data science influencers)
Total students: 11961


Unnamed: 0,Frequency,Label
0,11961,"Journal Publications (peer-reviewed journals, ..."
1,11961,"YouTube (Kaggle YouTube, Cloud AI Adventures, ..."
2,11961,"Podcasts (Chai Time Data Science, O’Reilly Dat..."
4,11961,"Blogs (Towards Data Science, Analytics Vidhya,..."
6,11961,"Course Forums (forums.fast.ai, Coursera forums..."
8,11961,"Kaggle (notebooks, forums, etc)"
9,11961,"Slack Communities (ods.ai, kagglenoobs, etc)"
10,11961,"Email newsletters (Data Elixir, O'Reilly Data ..."
11,11961,"Reddit (r/machinelearning, etc)"
5,1908,Twitter (data science influencers)


In [17]:
# Q9
current_question_code = "Q9"
print(code_to_question_map[current_question_code])
current_df = students_df[current_question_code]
total_answers = total_students_count - pd.isnull(students_df[current_question_code].values).sum()
print("Total answers: %s" % (total_answers))

# Build percentages by item selected
columns = ["Percentage", "Label"]
sortBy = columns[0]

percentage_by_item_df = build_percentage_chosen_by_item_chart(current_df, columns, sortBy)   
export_to_excel(percentage_by_item_df, chart_output_dir, "Published Research", chart_output_filename)
percentage_by_item_df

Have you ever published any academic research (papers, preprints, conference proceedings, etc)?
Total answers: 5330


Unnamed: 0,Percentage,Label,Count
0,0.656848,No,3501
1,0.343152,Yes,1829


In [18]:
#Q18_1 - Q18_14
current_question_code = "Q18_1"
print(code_to_question_map[current_question_code])
print("Total students: %s" % (total_students_count))

# subset the df 
column_selections = []
for x in range(1,15):
    column_selections.append(f"Q18_{x}")

subset_df = students_df[column_selections]
subset_aggregated_df = subset_df.apply(lambda x: ','.join(x.astype(str).values), axis=1)
subset_aggregated_df = subset_aggregated_df[df["Q5"] == "Yes"]

# Frequency analysis
unique_selections_set = set()
all_selections_set = []
filters = ["Gradient Boosting Machines (xgboost, lightgbm, etc)","Dense Neural Networks (MLPs, etc)",
          "Transformer Networks (BERT, gpt-3, etc)","Autoencoder Networks (DAE, VAE, etc)"]
columns = ["Frequency", "Label"]
sortBy = columns[0]

frequency_df = build_frequency_analysis_multi_select_chart(subset_aggregated_df, unique_selections_set, all_selections_set, filters, columns, sortBy)
export_to_excel(frequency_df, chart_output_dir, "ML Algorithms", chart_output_filename)
frequency_df

Which of the following ML algorithms do you use on a regular basis? (Select all that apply): - Selected Choice - Linear or Logistic Regression
Total students: 11961


Unnamed: 0,Frequency,Label
5,11961,"Autoencoder Networks (DAE, VAE, etc)"
7,11961,"Transformer Networks (BERT, gpt-3, etc)"
8,11961,"Gradient Boosting Machines (xgboost, lightgbm,..."
11,11961,"Dense Neural Networks (MLPs, etc)"
4,5324,Linear or Logistic Regression
0,4303,Decision Trees or Random Forests
2,2920,Convolutional Neural Networks
12,1730,Bayesian Approaches
9,1689,Recurrent Neural Networks
3,771,Graph Neural Networks


In [19]:
#Q14_1 - Q14_16
current_question_code = "Q14_1"
print(code_to_question_map[current_question_code])
print("Total students: %s" % (total_students_count))

# subset the df 
column_selections = []
for x in range(1,17):
    column_selections.append(f"Q14_{x}")

subset_df = students_df[column_selections]
subset_aggregated_df = subset_df.apply(lambda x: ','.join(x.astype(str).values), axis=1)
subset_aggregated_df = subset_aggregated_df[df["Q5"] == "Yes"]

# Frequency analysis
unique_selections_set = set()
all_selections_set = []
filters = []
columns = ["Frequency", "Label"]
sortBy = columns[0]

frequency_df = build_frequency_analysis_multi_select_chart(subset_aggregated_df, unique_selections_set, all_selections_set, filters, columns, sortBy)
export_to_excel(frequency_df, chart_output_dir, "Notebook Products", chart_output_filename)
frequency_df

Do you use any of the following hosted notebook products?  (Select all that apply) - Selected Choice -  Kaggle Notebooks
Total students: 11961


Unnamed: 0,Frequency,Label
4,4555,Colab Notebooks
9,3740,Kaggle Notebooks
15,3332,
5,540,IBM Watson Studio
3,435,Google Cloud Vertex AI Workbench
2,397,Azure Notebooks
6,284,Other
14,229,Amazon Sagemaker Studio
13,204,Amazon Sagemaker Studio Lab
0,187,Noteable Notebooks


In [20]:
#Q6_1 - Q6_12
current_question_code = "Q6_1"
print(code_to_question_map[current_question_code])
print("Total students: %s" % (total_students_count))

# subset the df 
column_selections = []
for x in range(1,13):
    column_selections.append(f"Q6_{x}")

subset_df = students_df[column_selections]
subset_aggregated_df = subset_df.apply(lambda x: ','.join(x.astype(str).values), axis=1)
subset_aggregated_df = subset_aggregated_df[df["Q5"] == "Yes"]

# Frequency analysis
unique_selections_set = set()
all_selections_set = []
filters = ["Cloud-certification programs (direct from AWS, Azure, GCP, or similar)"]
columns = ["Frequency", "Label"]
sortBy = columns[0]

frequency_df = build_frequency_analysis_multi_select_chart(subset_aggregated_df, unique_selections_set, all_selections_set, filters, columns, sortBy)
export_to_excel(frequency_df, chart_output_dir, "Complete Courses", chart_output_filename)
frequency_df

On which platforms have you begun or completed data science courses? (Select all that apply) - Selected Choice - Coursera
Total students: 11961


Unnamed: 0,Frequency,Label
1,11961,"Cloud-certification programs (direct from AWS,..."
10,4520,Coursera
7,4234,University Courses (resulting in a university ...
5,3330,Kaggle Learn Courses
2,2842,Udemy
4,2623,Other
9,1761,DataCamp
3,1346,LinkedIn Learning
11,1310,
0,1029,edX


In [21]:
#Q17_1 - Q17_15
current_question_code = "Q17_1"
print(code_to_question_map[current_question_code])
print("Total students: %s" % (total_students_count))

# subset the df 
column_selections = []
for x in range(1,16):
    column_selections.append(f"Q17_{x}")

subset_df = students_df[column_selections]
subset_aggregated_df = subset_df.apply(lambda x: ','.join(x.astype(str).values), axis=1)
subset_aggregated_df = subset_aggregated_df[df["Q5"] == "Yes"]

# Frequency analysis
unique_selections_set = set()
all_selections_set = []
filters = []
columns = ["Frequency", "Label"]
sortBy = columns[0]

frequency_df = build_frequency_analysis_multi_select_chart(subset_aggregated_df, unique_selections_set, all_selections_set, filters, columns, sortBy)
export_to_excel(frequency_df, chart_output_dir, "ML Frameworks", chart_output_filename)
frequency_df

Which of the following machine learning frameworks do you use on a regular basis? (Select all that apply) - Selected Choice -   Scikit-learn 
Total students: 11961


Unnamed: 0,Frequency,Label
11,5240,Scikit-learn
12,3960,TensorFlow
0,3205,Keras
7,2469,PyTorch
8,1696,Xgboost
14,946,
10,633,LightGBM
3,477,PyTorch Lightning
4,473,Huggingface
9,436,CatBoost


In [22]:
#Q21_1 - Q21_10
current_question_code = "Q21_1"
print(code_to_question_map[current_question_code])
print("Total students: %s" % (total_students_count))

# subset the df 
column_selections = []
for x in range(1,11):
    column_selections.append(f"Q21_{x}")

subset_df = students_df[column_selections]
subset_aggregated_df = subset_df.apply(lambda x: ','.join(x.astype(str).values), axis=1)
subset_aggregated_df = subset_aggregated_df[df["Q5"] == "Yes"]

# Frequency analysis
unique_selections_set = set()
all_selections_set = []
filters = ["No, I do not download pre-trained model weights on a regular basis"]
columns = ["Frequency", "Label"]
sortBy = columns[0]

frequency_df = build_frequency_analysis_multi_select_chart(subset_aggregated_df, unique_selections_set, all_selections_set, filters, columns, sortBy)
export_to_excel(frequency_df, chart_output_dir, "Pre-Trained Models", chart_output_filename)
frequency_df

Do you download pre-trained model weights from any of the following services? (Select all that apply) - Selected Choice -   TensorFlow Hub 
Total students: 11961


Unnamed: 0,Frequency,Label
5,11961,"No, I do not download pre-trained model weight..."
3,2952,Kaggle datasets
0,1505,TensorFlow Hub
7,974,PyTorch Hub
1,648,Huggingface Models
4,377,Other storage services (i.e. google drive)
8,313,NVIDIA NGC models
6,176,ONNX models
9,160,Timm
2,153,Jumpstart
