In [1]:
import pandas as pd
import numpy as np
import re
import os

In [2]:
df = pd.read_csv('./Data/anaconda-2022-SODS-raw-data.csv', skiprows=2, header=1)
df.head()

Unnamed: 0,1.1,1.2,1.3,1.4,1.5,1.6,1.7,1.8,1.9,2,...,5.5_3,5.5_4,5.5_5,5.5_6,5.6,5.7,5.8,5.9,5.1,5.11
0,United States,26-41,Doctoral degree,Male,Educational institution,Data Scientist,1-2 years,Data Scientist,Very satisfied,More flexibility with my work hours,...,4.0,2.0,5.0,6.0,A reduction in job opportunities caused by aut...,"Hands-on projects,Mentorship opportunities","Reading technical books, blogs, newsletters, a...",Further innovation in the open-source data sci...,Undermanagement,No
1,United States,42-57,Doctoral degree,Male,Commercial (for-profit) entity,Product Manager,5-6 years,,Very satisfied,More responsibility/opportunity for career adv...,...,2.0,5.0,4.0,6.0,Social impacts from bias in data and models,Tailored learning paths,Free video content (e.g. YouTube),More specialized data science hardware,Public trust,Yes
2,India,18-25,Bachelor's degree,Female,Educational institution,Data Scientist,,,,,...,1.0,4.0,2.0,6.0,A reduction in job opportunities caused by aut...,"Hands-on projects,Mentorship opportunities","Reading technical books, blogs, newsletters, a...",Further innovation in the open-source data sci...,Undermanagement,I'm not sure
3,United States,42-57,Bachelor's degree,Male,Commercial (for-profit) entity,Professor/Instructor/Researcher,10+ years,,Moderately satisfied,More responsibility/opportunity for career adv...,...,1.0,5.0,4.0,6.0,Social impacts from bias in data and models,Hands-on projects,"Reading technical books, blogs, newsletters, a...",New optimized models that allow for more compl...,Talent shortage,No
4,Singapore,18-25,High School or equivalent,Male,,Student,,,,,...,4.0,2.0,3.0,6.0,Social impacts from bias in data and models,"Community engagement and learning platforms,Ta...","Reading technical books, blogs, newsletters, a...",Further innovation in the open-source data sci...,Undermanagement,Yes


In [3]:
# Create a dictionary so we can get the survey question by providing the code
questions_map_df = pd.read_csv('./Data/anaconda-2022-SODS-raw-data.csv', skiprows=2, nrows=1)
questions_map_df

code_to_question_map = {}
for (colName, colData) in questions_map_df.iteritems():
    code_to_question_map[str(colData.values[0])] = colName
    

In [4]:
# Refactored code goes here (resued functions, etc.)

# Method to create a df of percentages chosen by item
def build_percentage_chosen_by_item_chart(current_df, columns, sortBy):
    unique_answers = [x for x in current_df.unique() if not pd.isnull(x)]
    all_answers = [x for x in current_df if not pd.isnull(x)]

    columns.append("Count")
    data = []
    for answer in unique_answers:
        count = all_answers.count(answer)
        data.append((count/len(all_answers), answer, count))
        
    percentage_by_item_df = pd.DataFrame(data, columns=columns)
    percentage_by_item_df = percentage_by_item_df.sort_values(by=sortBy, ascending=False)
    return percentage_by_item_df

# Build frequency analysis function
def build_frequency_analysis_multi_select_chart(current_df, unique_selections_set, all_selections_set, filters, labels, sortBy):
    for item in current_df:

        # skip if is nan
        if pd.isnull(item):
            continue

        # filter because of comma interfering with split
        for filterItem in filters:
            if filterItem in item:
                item = item.replace(filterItem, "")

        # split on comma
        split_line = item.split(",")    

        # remove empties and strip whitespaces
        split_line = [l.strip() for l in split_line if l]

        # re-add filtered
        for filterItem in filters:
            split_line.append(filterItem)   

        for y in split_line:
            unique_selections_set.add(y)
            all_selections_set.append(y)
            
    frequency_df = build_chart(unique_selections_set, all_selections_set, labels, sortBy)
    return frequency_df

# Build chart function
def build_chart(unique_selections_set, all_selections_set, labels, sortBy):
    data = []

    for item in unique_selections_set:
        count = all_selections_set.count(item)
        data.append((count, item))

    frequency_df = pd.DataFrame(data, columns=labels)
    frequency_df = frequency_df.sort_values(by=sortBy, ascending=False)
    
    return frequency_df

# Build frequency analysis for single choices
def build_frequency_analysis_single_select_chart(current_df, all_selections_set, columns, sortBy):
    for item in current_df:    
        # skip if is nan
        if pd.isnull(item):
            continue          
        all_selections_set.append(item)

    unique_selections_set = set()
    for item in all_selections_set:
        unique_selections_set.add(item)

    data = []

    for item in unique_selections_set:        
        count = all_selections_set.count(item)
        data.append((count, item))

    frequency_df = pd.DataFrame(data, columns=columns)
    frequency_df = frequency_df.sort_values(by=sortBy, ascending=False)
    return frequency_df

def export_to_excel(df, dirname, sheetname, name):
    path = f"./{dirname}/{name}.xlsx"
    isExist = os.path.exists(path)
    if not isExist:
        with pd.ExcelWriter(path, engine='xlsxwriter') as writer:
            df.to_excel(writer, sheet_name=f"{sheetname}", index=False)
    else:                         
        with pd.ExcelWriter(path, engine='openpyxl', mode='a', if_sheet_exists="replace") as writer:    
            df.to_excel(writer, sheet_name=f"{sheetname}", index=False)
        
    export_to_excel_standalone_workbook(df, dirname, sheetname, name)
            
def export_to_excel_standalone_workbook(df, dirname, sheetname, name):
    path = f"./{dirname}/{name}-{sheetname}.xlsx"
    isExist = os.path.exists(path)
    if not isExist:
        with pd.ExcelWriter(path, engine='xlsxwriter') as writer:
            df.to_excel(writer, sheet_name=f"{sheetname}", index=False)
        
chart_output_dir = "Charts_SODS2022"
isExist = os.path.exists(f"./{chart_output_dir}")
if not isExist:
    os.mkdir(f"./{chart_output_dir}")
    
chart_output_filename = "sods_2022_students"

In [5]:
# Different roles in this dataset
unique_roles = df["1.6"].unique()
unique_roles

array(['Data Scientist', 'Product Manager',
       'Professor/Instructor/Researcher', 'Student', 'Developer',
       'Line-of-Business Manager', 'Other (please indicate below)',
       'System Administrator ', 'Data Engineer', 'DevOps',
       'Applied Scientist', 'Business Analyst', 'Research Scientist',
       'Cloud Security Manager', 'ML Engineer', 'CloudOps',
       'Cloud Engineer', 'MLOps'], dtype=object)

In [6]:
# What proportion of this dataset are students? Get percentages per roles.
def get_count_by_role_name(role_name: str):
    selection = df[df["1.6"] == role_name]
    return selection.shape[0]

data = []

total_rows = df.shape[0]
for role in unique_roles:
    role_count = get_count_by_role_name(role)
    percentage = role_count/total_rows
    count = role_count
    data.append((role, percentage, count))

print("Total Sample Size: %s" % total_rows)
roles_percentages_df = pd.DataFrame(data, columns=["Role", "Percentage", "Count"])
roles_percentages_df = roles_percentages_df.sort_values(by=["Percentage"], ascending=False)
export_to_excel(roles_percentages_df, chart_output_dir, "Proportion Students", chart_output_filename)
roles_percentages_df

Total Sample Size: 3493


Unnamed: 0,Role,Percentage,Count
0,Data Scientist,0.164615,575
3,Student,0.135127,472
6,Other (please indicate below),0.105926,370
2,Professor/Instructor/Researcher,0.086745,303
11,Business Analyst,0.083596,292
12,Research Scientist,0.079015,276
4,Developer,0.077011,269
8,Data Engineer,0.076152,266
1,Product Manager,0.04008,140
5,Line-of-Business Manager,0.033209,116


In [7]:
# Set students as the student_df
students_df = df[df["1.6"] == "Student"]
total_students_count = students_df.shape[0]

# Total students
print("Total students: %s" % total_students_count)

# What geolocation do students live in?
current_df = students_df["1.1"]

# Build percentages by item selected
columns = ["Percentage", "Label"]
sortBy = columns[0]

percentage_by_item_df = build_percentage_chosen_by_item_chart(current_df, columns, sortBy)   
export_to_excel(percentage_by_item_df, chart_output_dir, "Geography", chart_output_filename)
percentage_by_item_df

Total students: 472


Unnamed: 0,Percentage,Label,Count
3,0.233051,United States,110
1,0.199153,India,94
20,0.048729,Nigeria,23
26,0.036017,Brazil,17
10,0.029661,Kenya,14
...,...,...,...
53,0.002119,Venezuela,1
54,0.002119,Benin,1
55,0.002119,Romania,1
56,0.002119,Iraq,1


In [8]:
# 1.3 level of educaiton
current_question_code = "1.3"
print(code_to_question_map[current_question_code])
current_df = students_df[current_question_code]
total_answers = total_students_count - pd.isnull(students_df[current_question_code].values).sum()
print("Total answers: %s" % (total_answers))
      
# Build percentages by item selected
columns = ["Percentage", "Label"]
sortBy = columns[0]

percentage_by_item_df = build_percentage_chosen_by_item_chart(current_df, columns, sortBy)   
export_to_excel(percentage_by_item_df, chart_output_dir, f"Level Of Education", chart_output_filename)
percentage_by_item_df

What is the highest level of education you've achieved?
Total answers: 472


Unnamed: 0,Percentage,Label,Count
1,0.417373,Bachelor's degree,197
0,0.188559,High School or equivalent,89
2,0.169492,Master's degree,80
3,0.169492,Some college,80
5,0.03178,No degree,15
4,0.023305,Doctoral degree,11


In [9]:
# What questions were the most relevant to students?
# Total students
print("Total students: %s" % total_students_count)
data = []

for (colCode, colData) in students_df.iteritems():
    total_answered = total_students_count - pd.isnull(colData.values).sum()
    percentage = total_answered/total_students_count
    data.append((colCode, percentage))
    
most_relevant_questions_df = pd.DataFrame(data, columns=["Code", "Percentage"])
filter = (most_relevant_questions_df["Percentage"] < 1) & (most_relevant_questions_df["Percentage"] > 0)
sorted_most_relevant_questions_df = most_relevant_questions_df[filter].sort_values(by=["Percentage"], ascending=False)

top_20_codes_sorted_df = sorted_most_relevant_questions_df[0:20]
top_20_codes = top_20_codes_sorted_df["Code"].values
top_20_percentages = top_20_codes_sorted_df["Percentage"].values
print("%s \n" % top_20_codes_sorted_df)

for code in top_20_codes:
    print("%s: %s  \n" % (code, code_to_question_map[code]))
    

Total students: 472
      Code  Percentage
86   5.1_1    0.743644
79     4.1    0.694915
82     4.4    0.694915
85     4.7    0.694915
84     4.6    0.694915
83     4.5    0.694915
81     4.3    0.694915
80     4.2    0.694915
115    5.7    0.661017
114    5.6    0.661017
116    5.8    0.661017
102  5.4_2    0.661017
117    5.9    0.661017
101  5.4_1    0.661017
107  5.4_7    0.661017
106  5.4_6    0.661017
105  5.4_5    0.661017
104  5.4_4    0.661017
103  5.4_3    0.661017
119   5.11    0.656780 

5.1_1: How often do you use the following languages? - Python  

4.1: What topics, tools, or skills are in covered your courses in preparation for entering the data science/ML field? (Select all that apply).  

4.4: What types of bias have you personally encountered in your work? (Select all that apply). - Selected Choice.1  

4.7: What is the ideal setting that you would like to work in? - Selected Choice  

4.6: How often are topics related to open-source security taught in classes or lec

In [10]:
# Q5.1_1: 5.1_1 through 5.1_14 are Likert scale questions about a specific language
# Build percentages by item selected but for all similar questions 5.1_1 through 5.1_14
for x in range(1, 15):
    current_question_code = f"5.1_{x}"
    print(code_to_question_map[current_question_code])
    current_df = students_df[current_question_code]
    total_answers = total_students_count - pd.isnull(students_df[current_question_code].values).sum()
    print("Total answers: %s" % (total_answers))
    
    columns = ["Percentage", "Label"]
    sortBy = columns[0]

    percentage_by_item_df = build_percentage_chosen_by_item_chart(current_df, columns, sortBy)
    
    # get just the specific topic
    txt = code_to_question_map[current_question_code]
    regex_search = re.search(r"[-] (.+)", txt)
    topic = regex_search.group().lstrip("- ")
    if topic == "C/C++":
        topic = "C CPlusPlus"
        
    if topic == "HTML/CSS":
        topic = "HTML CSS"
        
    if topic == "Bash/Shell":
        topic = "Bash Shell"
    
    
    export_to_excel(percentage_by_item_df, chart_output_dir, f"Usage {topic}", chart_output_filename)
    print(percentage_by_item_df)

How often do you use the following languages? - Python
Total answers: 351
   Percentage       Label  Count
1    0.370370      Always    130
0    0.341880  Frequently    120
2    0.176638   Sometimes     62
3    0.076923      Rarely     27
4    0.034188       Never     12
How often do you use the following languages? - R
Total answers: 295
   Percentage       Label  Count
0    0.383051       Never    113
1    0.233898   Sometimes     69
2    0.223729      Rarely     66
3    0.111864  Frequently     33
4    0.047458      Always     14
How often do you use the following languages? - Java
Total answers: 281
   Percentage       Label  Count
2    0.402135       Never    113
4    0.206406   Sometimes     58
0    0.185053      Rarely     52
1    0.177936  Frequently     50
3    0.028470      Always      8
How often do you use the following languages? - JavaScript
Total answers: 278
   Percentage       Label  Count
1    0.363309       Never    101
2    0.258993   Sometimes     72
3    0.205036 

In [11]:
# Q4.1: These are "select all that apply" questions
current_question_code = "4.1"
print(code_to_question_map[current_question_code])
current_df = students_df[current_question_code]
total_answers = total_students_count - pd.isnull(students_df[current_question_code].values).sum()
print("Total answers: %s" % (total_answers))
      

# Frequency analysis
unique_selections_set = set()
all_selections_set = []
filters = ["Proprietary tools (ex. Tableau, Google Colab, SAS, MATLAB, etc)"]
columns = ["Frequency", "Label"]
sortBy = columns[0]

frequency_df = build_frequency_analysis_multi_select_chart(current_df, unique_selections_set, all_selections_set, filters, columns, sortBy)
frequency_df["Percentage"] = frequency_df["Frequency"] / total_answers
export_to_excel(frequency_df, chart_output_dir, f"Topics Tools Skills", chart_output_filename)
frequency_df

What topics, tools, or skills are in covered your courses in preparation for entering the data science/ML field? (Select all that apply).
Total answers: 328


Unnamed: 0,Frequency,Label,Percentage
6,328,"Proprietary tools (ex. Tableau, Google Colab, ...",1.0
0,249,Python,0.759146
3,195,Machine learning,0.594512
5,159,Data visualization,0.484756
10,158,Probability and statistics,0.481707
17,144,SQL,0.439024
4,119,Deep learning,0.362805
16,109,Development languages in addition to Python/R,0.332317
8,105,Communication skills,0.320122
9,97,Advanced mathematics,0.295732


In [12]:
# Q4.4: These are "select all that apply" questions
current_question_code = "4.4"
print(code_to_question_map[current_question_code])
current_df = students_df[current_question_code]
total_answers = total_students_count - pd.isnull(students_df[current_question_code].values).sum()
print("Total answers: %s" % (total_answers))      

# Frequency analysis
unique_selections_set = set()
all_selections_set = []
filters = []
columns = ["Frequency", "Label"]
sortBy = columns[0]

frequency_df = build_frequency_analysis_multi_select_chart(current_df, unique_selections_set, all_selections_set, filters, columns, sortBy)
export_to_excel(frequency_df, chart_output_dir, f"Bias Encountered", chart_output_filename)
frequency_df

What types of bias have you personally encountered in your work? (Select all that apply). - Selected Choice.1
Total answers: 328


Unnamed: 0,Frequency,Label
1,133,Sample bias
3,129,Algorithmic bias
5,79,Measurement bias
6,76,Outlier bias
0,70,Confirmation bias
7,53,Prejudice bias
2,44,Exclusion bias
4,35,Other (please indicate below)


In [13]:
# Q4.7
current_question_code = "4.7"
print(code_to_question_map[current_question_code])
current_df = students_df[current_question_code]
total_answers = total_students_count - pd.isnull(students_df[current_question_code].values).sum()
print("Total answers: %s" % (total_answers))
      
# Build percentages by item selected
columns = ["Percentage", "Label"]
sortBy = columns[0]

percentage_by_item_df = build_percentage_chosen_by_item_chart(current_df, columns, sortBy)   
export_to_excel(percentage_by_item_df, chart_output_dir, f"Ideal Setting", chart_output_filename)
percentage_by_item_df

What is the ideal setting that you would like to work in? - Selected Choice
Total answers: 328


Unnamed: 0,Percentage,Label,Count
0,0.265244,An established startup in growth mode,87
4,0.243902,A well-established industry giant,80
1,0.207317,An academic or research center,68
3,0.134146,An early-stage startup company,44
2,0.085366,A government position,28
5,0.04878,A nonprofit organization,16
6,0.015244,Other (please indicate below),5


In [14]:
# Q4.6
current_question_code = "4.6"
print(code_to_question_map[current_question_code])
current_df = students_df[current_question_code]
total_answers = total_students_count - pd.isnull(students_df[current_question_code].values).sum()
print("Total answers: %s" % (total_answers))      

# Build percentages by item selected
columns = ["Percentage", "Label"]
sortBy = columns[0]

percentage_by_item_df = build_percentage_chosen_by_item_chart(current_df, columns, sortBy)   
export_to_excel(percentage_by_item_df, chart_output_dir, f"Open Source Security", chart_output_filename)
percentage_by_item_df

How often are topics related to open-source security taught in classes or lectures?.1
Total answers: 328


Unnamed: 0,Percentage,Label,Count
1,0.329268,It hasn't come up in any discussions or lectures,108
0,0.216463,"Sometimes, when there is a specific lecture",71
2,0.20122,"Rarely, only if there is a one-off conversatio...",66
3,0.146341,"Often, but only during a specific class",48
4,0.106707,"Frequently, in multiple classes and lectures",35


In [15]:
# Q4.5
current_question_code = "4.5"
print(code_to_question_map[current_question_code])
current_df = students_df[current_question_code]
total_answers = total_students_count - pd.isnull(students_df[current_question_code].values).sum()
print("Total answers: %s" % (total_answers))

# Build percentages by item selected
columns = ["Percentage", "Label"]
sortBy = columns[0]

percentage_by_item_df = build_percentage_chosen_by_item_chart(current_df, columns, sortBy)   
export_to_excel(percentage_by_item_df, chart_output_dir, f"Biggest Obstacle", chart_output_filename)
percentage_by_item_df

In your opinion, what is the biggest obstacle to obtaining the experience required for a career in data science or related fields? - Selected Choice
Total answers: 328


Unnamed: 0,Percentage,Label,Count
0,0.268293,Finding an internship,88
3,0.204268,Lack of clarity surrounding required experience,67
1,0.137195,Lack of social capital or mentor relationships,45
5,0.125,Cost,41
4,0.103659,Learning new languages,34
2,0.097561,Saturated field makes it difficult to stand out,32
7,0.042683,Lack of resources due to COVID-19,14
6,0.021341,Other (please indicate below),7


In [16]:
# Q4.3
current_question_code = "4.3"
print(code_to_question_map[current_question_code])
current_df = students_df[current_question_code]
total_answers = total_students_count - pd.isnull(students_df[current_question_code].values).sum()
print("Total answers: %s" % (total_answers))

# Frequency analysis
unique_selections_set = set()
all_selections_set = []
filters = []
columns = ["Frequency", "Label"]
sortBy = columns[0]

frequency_df = build_frequency_analysis_multi_select_chart(current_df, unique_selections_set, all_selections_set, filters, columns, sortBy)
export_to_excel(frequency_df, chart_output_dir, f"Bias Topics Covered", chart_output_filename)
frequency_df

What types of AI/ML/data science bias are covered in your classes or lectures? (Select all that apply). - Selected Choice
Total answers: 328


Unnamed: 0,Frequency,Label
3,132,Algorithmic bias
1,121,Sample bias
5,118,AI/ML/data science bias has not been not taugh...
0,77,Confirmation bias
7,71,Outlier bias
6,65,Measurement bias
8,47,Prejudice bias
2,43,Exclusion bias
4,14,Other (please indicate below)


In [17]:
# Q4.2
current_question_code = "4.2"
print(code_to_question_map[current_question_code])
current_df = students_df[current_question_code]
total_answers = total_students_count - pd.isnull(students_df[current_question_code].values).sum()
print("Total answers: %s" % (total_answers))

# Analyze frequency of each item
all_selections_set = []
columns = ["Percentage", "Label"]
sortBy = columns[0]

percentage_by_item_df = build_percentage_chosen_by_item_chart(current_df, columns, sortBy)   
export_to_excel(percentage_by_item_df, chart_output_dir, f"Bias Covered In Lectures", chart_output_filename)
percentage_by_item_df

How often is bias in AI/ML/data science discussed in classes or lectures?
Total answers: 328


Unnamed: 0,Percentage,Label,Count
1,0.219512,"Frequently, in multiple classes and lectures",72
0,0.213415,"Sometimes, when there is a specific lecture",70
3,0.210366,"Often, but only during a specific class",69
2,0.204268,It hasn't come up in any discussions or lectures,67
4,0.152439,"Rarely, only if there is a one-off conversatio...",50


In [18]:
# Q5.7
current_question_code = "5.7"
print(code_to_question_map[current_question_code])
current_df = students_df[current_question_code]
total_answers = total_students_count - pd.isnull(students_df[current_question_code].values).sum()
print("Total answers: %s" % (total_answers))

# Frequency analysis
unique_selections_set = set()
all_selections_set = []
filters = []
columns = ["Frequency", "Label"]
sortBy = columns[0]

frequency_df = build_frequency_analysis_multi_select_chart(current_df, unique_selections_set, all_selections_set, filters, columns, sortBy)
frequency_df["Percentage"] = frequency_df["Frequency"] / total_answers
export_to_excel(frequency_df, chart_output_dir, f"Resources Lacking", chart_output_filename)
frequency_df

What tools and resources do you feel are lacking for data scientists who want to learn and develop their skills? (Select all that apply). - Selected Choice
Total answers: 312


Unnamed: 0,Frequency,Label,Percentage
0,195,Mentorship opportunities,0.625
3,146,Tailored learning paths,0.467949
4,139,Hands-on projects,0.445513
2,125,Community engagement and learning platforms,0.400641
1,9,Other (please indicate below),0.028846


In [19]:
# Q5.6
current_question_code = "5.6"
print(code_to_question_map[current_question_code])
current_df = students_df[current_question_code]
total_answers = total_students_count - pd.isnull(students_df[current_question_code].values).sum()
print("Total answers: %s" % (total_answers))

# Analyze frequency of each item
all_selections_set = []
columns = ["Percentage", "Label"]
sortBy = columns[0]

percentage_by_item_df = build_percentage_chosen_by_item_chart(current_df, columns, sortBy)   
export_to_excel(percentage_by_item_df, chart_output_dir, f"Biggest Problem", chart_output_filename)
percentage_by_item_df

What do you think is the biggest problem in the data science/AI/ML space today? - Selected Choice
Total answers: 312


Unnamed: 0,Percentage,Label,Count
0,0.25,Social impacts from bias in data and models,78
2,0.205128,Lack of diversity and inclusion in the profession,64
1,0.201923,A reduction in job opportunities caused by aut...,63
4,0.176282,Impacts to individual privacy,55
3,0.134615,Advanced information warfare,42
5,0.032051,Other (please indicate below),10


In [20]:
# Q5.8
current_question_code = "5.8"
print(code_to_question_map[current_question_code])
current_df = students_df[current_question_code]
total_answers = total_students_count - pd.isnull(students_df[current_question_code].values).sum()
print("Total answers: %s" % (total_answers))

# Use function to build frequency chart
unique_selections_set = set()
all_selections_set = []
filters = ["Reading technical books, blogs, newsletters, and papers", "Paid online courses (e.g. Coursera, EdX)"]
columns = ["Frequency", "Label"]
sortBy = columns[0]

frequency_df = build_frequency_analysis_multi_select_chart(current_df, unique_selections_set, all_selections_set, filters, columns, sortBy)
export_to_excel(frequency_df, chart_output_dir, f"Learning Strategy", chart_output_filename)
frequency_df

How do you typically learn about new tools and topics relevant to your role? (Select all that apply). - Selected Choice
Total answers: 312


Unnamed: 0,Frequency,Label
2,312,"Reading technical books, blogs, newsletters, a..."
4,312,"Paid online courses (e.g. Coursera, EdX)"
1,253,Free video content (e.g. YouTube)
5,136,Community platforms and forums
0,106,Paid university courses
3,9,Other (please indicate below)


In [21]:
# Q5.9
current_question_code = "5.9"
print(code_to_question_map[current_question_code])
current_df = students_df[current_question_code]
total_answers = total_students_count - pd.isnull(students_df[current_question_code].values).sum()
print("Total answers: %s" % (total_answers))

# Build percentages by item selected
columns = ["Percentage", "Label"]
sortBy = columns[0]

percentage_by_item_df = build_percentage_chosen_by_item_chart(current_df, columns, sortBy)   
export_to_excel(percentage_by_item_df, chart_output_dir, f"Hoping to See", chart_output_filename)
percentage_by_item_df

What are you most hoping to see from the data science industry this year? - Selected Choice
Total answers: 312


Unnamed: 0,Percentage,Label,Count
0,0.387821,Further innovation in the open-source data sci...,121
3,0.246795,New optimized models that allow for more compl...,77
1,0.160256,More resources given to data science teams wit...,50
2,0.121795,More specialized data science hardware,38
4,0.054487,Further innovation in vendor-specific data sci...,17
5,0.019231,Other (please indicate below),6
6,0.009615,,3


In [22]:
# Q5.11
current_question_code = "5.11"
print(code_to_question_map[current_question_code])
current_df = students_df[current_question_code]
total_answers = total_students_count - pd.isnull(students_df[current_question_code].values).sum()
print("Total answers: %s" % (total_answers))

# Build percentages by item selected
columns = ["Percentage", "Label"]
sortBy = columns[0]

percentage_by_item_df = build_percentage_chosen_by_item_chart(current_df, columns, sortBy)   
export_to_excel(percentage_by_item_df, chart_output_dir, f"Supply Chain Impact", chart_output_filename)
percentage_by_item_df

Have supply chain disruption problems, such as the ongoing chip shortage, impacted your access to computing resources?
Total answers: 310


Unnamed: 0,Percentage,Label,Count
1,0.470968,No,146
2,0.348387,I'm not sure,108
0,0.180645,Yes,56


In [23]:
# Q5.4_1 - Questions go together 5.4_1 through 5.1_7 and make up a 100% about importance of X topic
# Get averages for each question's responses for 5.4_1 through 5.1_7

def get_average_by_topic_tuple(current_question_code, current_df):
    total_answers = (total_students_count - pd.isnull(current_df.values).sum())   
    sum_of_values = current_df.sum()
    average = sum_of_values/total_answers
    
    # get just the specific topic
    txt = code_to_question_map[current_question_code]
    regex_search = re.search(r"[-] (\w.+)", txt)
    topic = regex_search.group().lstrip("- ") 
    
    # return built data tuple
    data = (average, topic)
    return data

# Use function to get each topic's average as a tuple of (average, topic)
data = []
for x in range(1, 8):
    current_question_code = f"5.4_{x}"  
    current_df = students_df[current_question_code]
    data_tuple = get_average_by_topic_tuple(current_question_code, current_df)
    data.append(data_tuple)

# Display
whole_question = code_to_question_map[current_question_code]
regex_search = re.search(r"^(\w.+)\?", whole_question)
question_part = regex_search.group()

total_answers = total_students_count - pd.isnull(students_df[current_question_code].values).sum()
print("Total answers: %s" % (total_answers))
print("Question: %s" % (question_part))

# Chart
topic_average_df = pd.DataFrame(data, columns = ["Average", "Topic"])
topic_average_df = topic_average_df.sort_values(by=["Average"], ascending=False)
export_to_excel(topic_average_df, chart_output_dir, f"Open Source Valued", chart_output_filename)
topic_average_df

Total answers: 312
Question: What do you value most about open-source technology?


Unnamed: 0,Average,Topic
0,20.896154,Speed of innovation
1,20.282372,Most economical option
2,18.166667,Most useful tools for my needs
4,16.09391,Product quality
5,12.036538,Offers security
3,11.258333,Avoid vendor lock-in
6,1.266026,Other (please indicate)
