In [1]:
import pandas as pd
import numpy as np
import re
import os

In [2]:
df = pd.read_csv('./Data/anaconda-2022-SODS-raw-data.csv', skiprows=2, header=1)
df.head()

Unnamed: 0,1.1,1.2,1.3,1.4,1.5,1.6,1.7,1.8,1.9,2,...,5.5_3,5.5_4,5.5_5,5.5_6,5.6,5.7,5.8,5.9,5.1,5.11
0,United States,26-41,Doctoral degree,Male,Educational institution,Data Scientist,1-2 years,Data Scientist,Very satisfied,More flexibility with my work hours,...,4.0,2.0,5.0,6.0,A reduction in job opportunities caused by aut...,"Hands-on projects,Mentorship opportunities","Reading technical books, blogs, newsletters, a...",Further innovation in the open-source data sci...,Undermanagement,No
1,United States,42-57,Doctoral degree,Male,Commercial (for-profit) entity,Product Manager,5-6 years,,Very satisfied,More responsibility/opportunity for career adv...,...,2.0,5.0,4.0,6.0,Social impacts from bias in data and models,Tailored learning paths,Free video content (e.g. YouTube),More specialized data science hardware,Public trust,Yes
2,India,18-25,Bachelor's degree,Female,Educational institution,Data Scientist,,,,,...,1.0,4.0,2.0,6.0,A reduction in job opportunities caused by aut...,"Hands-on projects,Mentorship opportunities","Reading technical books, blogs, newsletters, a...",Further innovation in the open-source data sci...,Undermanagement,I'm not sure
3,United States,42-57,Bachelor's degree,Male,Commercial (for-profit) entity,Professor/Instructor/Researcher,10+ years,,Moderately satisfied,More responsibility/opportunity for career adv...,...,1.0,5.0,4.0,6.0,Social impacts from bias in data and models,Hands-on projects,"Reading technical books, blogs, newsletters, a...",New optimized models that allow for more compl...,Talent shortage,No
4,Singapore,18-25,High School or equivalent,Male,,Student,,,,,...,4.0,2.0,3.0,6.0,Social impacts from bias in data and models,"Community engagement and learning platforms,Ta...","Reading technical books, blogs, newsletters, a...",Further innovation in the open-source data sci...,Undermanagement,Yes


In [3]:
# Create a dictionary so we can get the survey question by providing the code
questions_map_df = pd.read_csv('./Data/anaconda-2022-SODS-raw-data.csv', skiprows=2, nrows=1)
questions_map_df

code_to_question_map = {}
for (colName, colData) in questions_map_df.iteritems():
    code_to_question_map[str(colData.values[0])] = colName
    

In [4]:
# Refactored code goes here (resued functions, etc.)

# Method to create a df of percentages chosen by item
def build_percentage_chosen_by_item_chart(current_df, columns, sortBy):
    unique_answers = [x for x in current_df.unique() if not pd.isnull(x)]
    all_answers = [x for x in current_df if not pd.isnull(x)]

    columns.append("Count")
    data = []
    for answer in unique_answers:
        count = all_answers.count(answer)
        data.append((count/len(all_answers), answer, count))
        
    percentage_by_item_df = pd.DataFrame(data, columns=columns)
    percentage_by_item_df = percentage_by_item_df.sort_values(by=sortBy, ascending=False)
    return percentage_by_item_df

# Build frequency analysis function
def build_frequency_analysis_multi_select_chart(current_df, unique_selections_set, all_selections_set, filters, labels, sortBy):
    for item in current_df:

        # skip if is nan
        if pd.isnull(item):
            continue

        # filter because of comma interfering with split
        for filterItem in filters:
            if filterItem in item:
                item = item.replace(filterItem, "")

        # split on comma
        split_line = item.split(",")    

        # remove empties and strip whitespaces
        split_line = [l.strip() for l in split_line if l]

        # re-add filtered
        for filterItem in filters:
            split_line.append(filterItem)   

        for y in split_line:
            unique_selections_set.add(y)
            all_selections_set.append(y)
            
    frequency_df = build_chart(unique_selections_set, all_selections_set, labels, sortBy)
    return frequency_df

# Build chart function
def build_chart(unique_selections_set, all_selections_set, labels, sortBy):
    data = []

    for item in unique_selections_set:
        count = all_selections_set.count(item)
        data.append((count, item))

    frequency_df = pd.DataFrame(data, columns=labels)
    frequency_df = frequency_df.sort_values(by=sortBy, ascending=False)
    
    return frequency_df

# Build frequency analysis for single choices
def build_frequency_analysis_single_select_chart(current_df, all_selections_set, columns, sortBy):
    for item in current_df:    
        # skip if is nan
        if pd.isnull(item):
            continue          
        all_selections_set.append(item)

    unique_selections_set = set()
    for item in all_selections_set:
        unique_selections_set.add(item)

    data = []

    for item in unique_selections_set:        
        count = all_selections_set.count(item)
        data.append((count, item))

    frequency_df = pd.DataFrame(data, columns=columns)
    frequency_df = frequency_df.sort_values(by=sortBy, ascending=False)
    return frequency_df

def export_to_excel(df, dirname, sheetname, name):
    path = f"./{dirname}/{name}.xlsx"
    isExist = os.path.exists(path)
    if not isExist:
        with pd.ExcelWriter(path, engine='xlsxwriter') as writer:
            df.to_excel(writer, sheet_name=f"{sheetname}", index=False)
    else:                         
        with pd.ExcelWriter(path, engine='openpyxl', mode='a', if_sheet_exists="replace") as writer:    
            df.to_excel(writer, sheet_name=f"{sheetname}", index=False)
            
    export_to_excel_standalone_workbook(df, dirname, sheetname, name)
        
def export_to_excel_standalone_workbook(df, dirname, sheetname, name):
    path = f"./{dirname}/{name}-{sheetname}.xlsx"
    isExist = os.path.exists(path)
    if not isExist:
        with pd.ExcelWriter(path, engine='xlsxwriter') as writer:
            df.to_excel(writer, sheet_name=f"{sheetname}", index=False)
            
chart_output_dir = "Charts_SODS2022"
isExist = os.path.exists(f"./{chart_output_dir}")
if not isExist:
    os.mkdir(f"./{chart_output_dir}")
    
chart_output_filename = "sods_2022_professors"

In [5]:
# Different roles in this dataset
unique_roles = df["1.6"].unique()
unique_roles

array(['Data Scientist', 'Product Manager',
       'Professor/Instructor/Researcher', 'Student', 'Developer',
       'Line-of-Business Manager', 'Other (please indicate below)',
       'System Administrator ', 'Data Engineer', 'DevOps',
       'Applied Scientist', 'Business Analyst', 'Research Scientist',
       'Cloud Security Manager', 'ML Engineer', 'CloudOps',
       'Cloud Engineer', 'MLOps'], dtype=object)

In [6]:
# What proportion of this dataset are prfoessors? Get percentages per roles.
def get_count_by_role_name(role_name: str):
    selection = df[df["1.6"] == role_name]
    return selection.shape[0]

data = []

total_rows = df.shape[0]
for role in unique_roles:
    role_count = get_count_by_role_name(role)
    percentage = role_count/total_rows
    count = role_count
    data.append((role, percentage, count))

print("Total Sample Size: %s" % total_rows)
roles_percentages_df = pd.DataFrame(data, columns=["Role", "Percentage", "Count"])
roles_percentages_df = roles_percentages_df.sort_values(by=["Percentage"], ascending=False)
export_to_excel(roles_percentages_df, chart_output_dir, "Proportion Professors", chart_output_filename)
roles_percentages_df

Total Sample Size: 3493


Unnamed: 0,Role,Percentage,Count
0,Data Scientist,0.164615,575
3,Student,0.135127,472
6,Other (please indicate below),0.105926,370
2,Professor/Instructor/Researcher,0.086745,303
11,Business Analyst,0.083596,292
12,Research Scientist,0.079015,276
4,Developer,0.077011,269
8,Data Engineer,0.076152,266
1,Product Manager,0.04008,140
5,Line-of-Business Manager,0.033209,116


In [7]:
# Set students as the student_df
professors_df = df[df["1.6"] == "Professor/Instructor/Researcher"]
total_professors_count = professors_df.shape[0]

# Total professors
print("Total professors: %s" % total_professors_count)

# What geolocation do professors live in?
current_df = professors_df["1.1"]

# Build percentages by item selected
columns = ["Percentage", "Label"]
sortBy = columns[0]

percentage_by_item_df = build_percentage_chosen_by_item_chart(current_df, columns, sortBy)   
export_to_excel(percentage_by_item_df, chart_output_dir, "Geography", chart_output_filename)
percentage_by_item_df

Total professors: 303


Unnamed: 0,Percentage,Label,Count
0,0.379538,United States,115
2,0.072607,India,22
15,0.049505,Brazil,15
8,0.042904,Nigeria,13
14,0.029703,Japan,9
...,...,...,...
21,0.003300,Bahrain,1
7,0.003300,Armenia,1
11,0.003300,Paraguay,1
16,0.003300,Chile,1


In [8]:
# What questions were the most relevant to professors?
# Total professors
print("Total professors: %s" % total_professors_count)

data = []

for (colCode, colData) in professors_df.iteritems():
    total_answered = total_professors_count - pd.isnull(colData.values).sum()
    percentage = total_answered/total_professors_count
    data.append((colCode, percentage))
    
most_relevant_questions_df = pd.DataFrame(data, columns=["Code", "Percentage"])
filter = (most_relevant_questions_df["Percentage"] < 1) & (most_relevant_questions_df["Percentage"] > 0)
sorted_most_relevant_questions_df = most_relevant_questions_df[filter].sort_values(by=["Percentage"], ascending=False)

top_30_codes_sorted_df = sorted_most_relevant_questions_df[0:30]
top_30_codes = top_30_codes_sorted_df["Code"].values
top_30_percentages = top_30_codes_sorted_df["Percentage"].values
print("%s \n" % top_30_codes_sorted_df)

for code in top_30_codes:
    print("%s: %s  \n" % (code, code_to_question_map[code]))
    

Total professors: 303
      Code  Percentage
4      1.5    0.993399
8      1.9    0.980198
9        2    0.980198
6      1.7    0.900990
86   5.1_1    0.768977
74     3.2    0.699670
73     3.1    0.699670
75     3.3    0.683168
76     3.4    0.683168
77     3.5    0.683168
78     3.6    0.683168
115    5.7    0.679868
114    5.6    0.679868
117    5.9    0.679868
101  5.4_1    0.679868
102  5.4_2    0.679868
103  5.4_3    0.679868
104  5.4_4    0.679868
105  5.4_5    0.679868
106  5.4_6    0.679868
107  5.4_7    0.679868
116    5.8    0.679868
109  5.5_2    0.679868
110  5.5_3    0.679868
111  5.5_4    0.679868
112  5.5_5    0.679868
113  5.5_6    0.679868
108  5.5_1    0.679868
118    5.1    0.676568
119   5.11    0.676568 

1.5: The organization I work for is best classified as a:  

1.9: How would you rate your job satisfaction in your current role?  

2: What would cause you to leave your current employer for a new job? Please select the top option besides pay/benefits. - Selected

In [9]:
# Q1.5
current_question_code = "1.5"
print(code_to_question_map[current_question_code])
current_df = professors_df[current_question_code]
total_answers = (total_professors_count - pd.isnull(current_df.values).sum())
print("Total answers: %s" % total_answers)

# Build percentages by item selected
columns = ["Percentage", "Label"]
sortBy = columns[0]

percentage_by_item_df = build_percentage_chosen_by_item_chart(current_df, columns, sortBy)   
export_to_excel(percentage_by_item_df, chart_output_dir, "Organization Type", chart_output_filename)
percentage_by_item_df

The organization I work for is best classified as a:
Total answers: 301


Unnamed: 0,Percentage,Label,Count
1,0.807309,Educational institution,243
0,0.069767,Commercial (for-profit) entity,21
3,0.063123,Not-for-profit organization,19
2,0.059801,Government agency,18


In [10]:
# Q1.9
current_question_code = "1.9"
print(code_to_question_map[current_question_code])
current_df = professors_df[current_question_code]
total_answers = (total_professors_count - pd.isnull(current_df.values).sum())
print("Total answers: %s" % total_answers)

# Build percentages by item selected
columns = ["Percentage", "Label"]
sortBy = columns[0]

percentage_by_item_df = build_percentage_chosen_by_item_chart(current_df, columns, sortBy)   
export_to_excel(percentage_by_item_df, chart_output_dir, "Role Satisfaction", chart_output_filename)
percentage_by_item_df

How would you rate your job satisfaction in your current role?
Total answers: 297


Unnamed: 0,Percentage,Label,Count
1,0.353535,Very satisfied,105
0,0.326599,Moderately satisfied,97
3,0.158249,Extremely satisfied,47
2,0.124579,Slightly satisfied,37
4,0.037037,Not at all satisfied,11


In [11]:
# Q2
current_question_code = "2"
print(code_to_question_map[current_question_code])
current_df = professors_df[current_question_code]
total_answers = (total_professors_count - pd.isnull(current_df.values).sum())
print("Total answers: %s" % total_answers)

# Build percentages by item selected
columns = ["Percentage", "Label"]
sortBy = columns[0]

percentage_by_item_df = build_percentage_chosen_by_item_chart(current_df, columns, sortBy)   
export_to_excel(percentage_by_item_df, chart_output_dir, "Cause To Leave", chart_output_filename)
percentage_by_item_df

What would cause you to leave your current employer for a new job? Please select the top option besides pay/benefits. - Selected Choice
Total answers: 297


Unnamed: 0,Percentage,Label,Count
2,0.340067,N/A - I would not leave my current position,101
0,0.306397,More responsibility/opportunity for career adv...,91
1,0.191919,More access to professional training/development,57
4,0.090909,More flexibility with my work hours,27
3,0.070707,Other (please indicate below),21


In [12]:
# Q1.7
current_question_code = "1.7"
print(code_to_question_map[current_question_code])
current_df = professors_df[current_question_code]
total_answers = (total_professors_count - pd.isnull(current_df.values).sum())
print("Total answers: %s" % total_answers)

# Build percentages by item selected
columns = ["Percentage", "Label"]
sortBy = columns[0]

percentage_by_item_df = build_percentage_chosen_by_item_chart(current_df, columns, sortBy)   
export_to_excel(percentage_by_item_df, chart_output_dir, "Tenure", chart_output_filename)
percentage_by_item_df

For how many years have you been in your current role?
Total answers: 273


Unnamed: 0,Percentage,Label,Count
0,0.373626,10+ years,102
1,0.175824,3-4 years,48
3,0.150183,5-6 years,41
2,0.128205,1-2 years,35
4,0.07326,9-10 years,20
5,0.07326,7-8 years,20
6,0.025641,<1 years,7


In [13]:
# Q5.1_1: 5.1_1 through 5.1_14 are Likert scale questions about a specific language
# Build percentages by item selected but for all similar questions 5.1_1 through 5.1_14
for x in range(1, 15):
    current_question_code = f"5.1_{x}"
    print(code_to_question_map[current_question_code])
    current_df = professors_df[current_question_code]
    total_answers = total_professors_count - pd.isnull(professors_df[current_question_code].values).sum()
    print("Total answers: %s" % (total_answers))

    columns = ["Percentage", "Label"]
    sortBy = columns[0]

    percentage_by_item_df = build_percentage_chosen_by_item_chart(current_df, columns, sortBy)  
    
    # get just the specific topic
    txt = code_to_question_map[current_question_code]
    regex_search = re.search(r"[-] (.+)", txt)
    topic = regex_search.group().lstrip("- ")
    if topic == "C/C++":
        topic = "C CPlusPlus"
        
    if topic == "HTML/CSS":
        topic = "HTML CSS"
        
    if topic == "Bash/Shell":
        topic = "Bash Shell"
    
    export_to_excel(percentage_by_item_df, chart_output_dir, f"Usage {topic}", chart_output_filename)
    print(percentage_by_item_df)

How often do you use the following languages? - Python
Total answers: 233
   Percentage       Label  Count
0    0.326180  Frequently     76
1    0.248927      Always     58
3    0.210300   Sometimes     49
4    0.145923      Rarely     34
2    0.068670       Never     16
How often do you use the following languages? - R
Total answers: 202
   Percentage       Label  Count
4    0.282178      Rarely     57
0    0.252475   Sometimes     51
3    0.212871       Never     43
2    0.183168  Frequently     37
1    0.069307      Always     14
How often do you use the following languages? - Java
Total answers: 195
   Percentage       Label  Count
2    0.364103       Never     71
1    0.230769      Rarely     45
0    0.210256   Sometimes     41
3    0.123077  Frequently     24
4    0.071795      Always     14
How often do you use the following languages? - JavaScript
Total answers: 193
   Percentage       Label  Count
2    0.326425       Never     63
1    0.253886      Rarely     49
0    0.238342 

In [14]:
# Q3.2
current_question_code = "3.2"
print(code_to_question_map[current_question_code])
current_df = professors_df[current_question_code]
total_answers = (total_professors_count - pd.isnull(current_df.values).sum())
print("Total answers: %s" % total_answers)

# Frequency analysis
unique_selections_set = set()
all_selections_set = []
filters = ["Proprietary tools (ex. Tableau, Google Colab, SAS, MATLAB, etc)"]
columns = ["Frequency", "Label"]
sortBy = columns[0]

frequency_df = build_frequency_analysis_multi_select_chart(current_df, unique_selections_set, all_selections_set, filters, columns, sortBy)
frequency_df["Percentage"] = frequency_df["Frequency"] / total_answers
export_to_excel(frequency_df, chart_output_dir, f"Topics Tools Skills", chart_output_filename)
frequency_df

What topics, tools, or skills is your institution teaching students of data science and machine learning? (Select all that apply).
Total answers: 212


Unnamed: 0,Frequency,Label,Percentage
17,212,"Proprietary tools (ex. Tableau, Google Colab, ...",1.0
13,112,Python,0.528302
2,107,Probability and statistics,0.504717
18,101,Machine learning,0.476415
16,84,Data visualization,0.396226
8,78,Deep learning,0.367925
1,73,Communication skills,0.34434
3,70,Advanced mathematics,0.330189
11,63,Big data management,0.29717
7,60,Development languages in addition to Python/R,0.283019


In [15]:
# Q3.1
current_question_code = "3.1"
print(code_to_question_map[current_question_code])
current_df = professors_df[current_question_code]
total_answers = (total_professors_count - pd.isnull(current_df.values).sum())
print("Total answers: %s" % total_answers)

# Build percentages by item selected
columns = ["Percentage", "Label"]
sortBy = columns[0]

percentage_by_item_df = build_percentage_chosen_by_item_chart(current_df, columns, sortBy)
export_to_excel(percentage_by_item_df, chart_output_dir, f"Specialization", chart_output_filename)
percentage_by_item_df

Which of the following best describes your specialization or field of research?
Total answers: 212


Unnamed: 0,Percentage,Label,Count
0,0.240566,Computer Science and Engineering (technology),51
9,0.221698,Education,47
13,0.075472,Mathematics,16
3,0.061321,Business and Management,13
2,0.04717,Economics,10
5,0.042453,"Engineering (mechanical, electrical)",9
7,0.042453,Other,9
17,0.037736,Physics,8
15,0.033019,Psychology,7
11,0.023585,Medicine,5


In [16]:
# Q3.3
current_question_code = "3.3"
print(code_to_question_map[current_question_code])
current_df = professors_df[current_question_code]
total_answers = (total_professors_count - pd.isnull(current_df.values).sum())
print("Total answers: %s" % total_answers)

# Build percentages by item selected
columns = ["Percentage", "Label"]
sortBy = columns[0]

percentage_by_item_df = build_percentage_chosen_by_item_chart(current_df, columns, sortBy)   
export_to_excel(percentage_by_item_df, chart_output_dir, f"Bias Taught", chart_output_filename)
percentage_by_item_df

How often is bias in AI/ML/data science taught in classes or lectures?
Total answers: 207


Unnamed: 0,Percentage,Label,Count
4,0.2657,"Sometimes, but only if there is a lecture abou...",55
0,0.246377,"Frequently, in multiple classes and lectures.",51
1,0.198068,"Rarely, only if there is a one-off conversatio...",41
3,0.183575,"Often, but only during a specific class about ...",38
2,0.10628,It hasn't come up in any discussions or lectures.,22


In [17]:
# Q3.4
current_question_code = "3.4"
print(code_to_question_map[current_question_code])
current_df = professors_df[current_question_code]
total_answers = (total_professors_count - pd.isnull(current_df.values).sum())
print("Total answers: %s" % total_answers)

# Frequency analysis
unique_selections_set = set()
all_selections_set = []
filters = []
columns = ["Frequency", "Label"]
sortBy = columns[0]

frequency_df = build_frequency_analysis_multi_select_chart(current_df, unique_selections_set, all_selections_set, filters, columns, sortBy)
export_to_excel(frequency_df, chart_output_dir, f"Bias Types Taught", chart_output_filename)
frequency_df

What types of AI/ML/data science bias are taught in classes or lectures? (Select all that apply). - Selected Choice
Total answers: 207


Unnamed: 0,Frequency,Label
7,84,Sample bias
3,77,Algorithmic bias
5,66,Measurement bias
2,53,AI/ML/data science bias is not taught in my cl...
6,49,Confirmation bias
1,41,Outlier bias
4,37,Prejudice bias
0,33,Exclusion bias
8,11,Other (please indicate below)


In [18]:
# Q3.5
current_question_code = "3.5"
print(code_to_question_map[current_question_code])
current_df = professors_df[current_question_code]
total_answers = (total_professors_count - pd.isnull(current_df.values).sum())
print("Total answers: %s" % total_answers)

# Frequency analysis
unique_selections_set = set()
all_selections_set = []
filters = []
columns = ["Frequency", "Label"]
sortBy = columns[0]

frequency_df = build_frequency_analysis_multi_select_chart(current_df, unique_selections_set, all_selections_set, filters, columns, sortBy)
export_to_excel(frequency_df, chart_output_dir, f"Bias Encountered", chart_output_filename)
frequency_df

What types of bias have you personally encountered in your work? (Select all that apply). - Selected Choice
Total answers: 207


Unnamed: 0,Frequency,Label
7,75,Sample bias
3,62,Algorithmic bias
2,53,I have not encountered bias
5,52,Measurement bias
4,45,Prejudice bias
6,45,Confirmation bias
1,42,Outlier bias
0,32,Exclusion bias
8,3,Other (please indicate below)


In [19]:
# Q3.6
current_question_code = "3.6"
print(code_to_question_map[current_question_code])
current_df = professors_df[current_question_code]
total_answers = (total_professors_count - pd.isnull(current_df.values).sum())
print("Total answers: %s" % total_answers)

# Build percentages by item selected
columns = ["Percentage", "Label"]
sortBy = columns[0]

percentage_by_item_df = build_percentage_chosen_by_item_chart(current_df, columns, sortBy)   
export_to_excel(percentage_by_item_df, chart_output_dir, f"Open Source Security", chart_output_filename)
percentage_by_item_df

How often are topics related to open-source security taught in classes or lectures?
Total answers: 207


Unnamed: 0,Percentage,Label,Count
2,0.294686,"Sometimes, when there is a specific lecture",61
4,0.202899,"Often, but only during a specific class",42
3,0.188406,It hasn't come up in any discussions or lectures,39
1,0.15942,"Rarely, only if there is a one-off conversatio...",33
0,0.154589,"Frequently, in multiple classes and lectures",32


In [20]:
# Q5.7
current_question_code = "5.7"
print(code_to_question_map[current_question_code])
current_df = professors_df[current_question_code]
total_answers = (total_professors_count - pd.isnull(current_df.values).sum())
print("Total answers: %s" % total_answers)

# Frequency analysis
unique_selections_set = set()
all_selections_set = []
filters = []
columns = ["Frequency", "Label"]
sortBy = columns[0]

frequency_df = build_frequency_analysis_multi_select_chart(current_df, unique_selections_set, all_selections_set, filters, columns, sortBy)
frequency_df["Percentage"] = frequency_df["Frequency"] / total_answers
export_to_excel(frequency_df, chart_output_dir, f"Resources Lacking", chart_output_filename)
frequency_df

What tools and resources do you feel are lacking for data scientists who want to learn and develop their skills? (Select all that apply). - Selected Choice
Total answers: 206


Unnamed: 0,Frequency,Label,Percentage
3,101,Hands-on projects,0.490291
0,95,Mentorship opportunities,0.461165
2,94,Tailored learning paths,0.456311
1,72,Community engagement and learning platforms,0.349515
4,8,Other (please indicate below),0.038835


In [21]:
# Q5.6
current_question_code = "5.6"
print(code_to_question_map[current_question_code])
current_df = professors_df[current_question_code]
total_answers = (total_professors_count - pd.isnull(current_df.values).sum())
print("Total answers: %s" % total_answers)

# Build percentages by item selected
columns = ["Percentage", "Label"]
sortBy = columns[0]

percentage_by_item_df = build_percentage_chosen_by_item_chart(current_df, columns, sortBy)   
export_to_excel(percentage_by_item_df, chart_output_dir, f"Biggest Problem", chart_output_filename)
percentage_by_item_df

What do you think is the biggest problem in the data science/AI/ML space today? - Selected Choice
Total answers: 206


Unnamed: 0,Percentage,Label,Count
0,0.339806,Social impacts from bias in data and models,70
4,0.208738,Impacts to individual privacy,43
2,0.150485,Lack of diversity and inclusion in the profession,31
5,0.135922,Advanced information warfare,28
3,0.101942,A reduction in job opportunities caused by aut...,21
1,0.063107,Other (please indicate below),13


In [22]:
# Q5.9
current_question_code = "5.9"
print(code_to_question_map[current_question_code])
current_df = professors_df[current_question_code]
total_answers = (total_professors_count - pd.isnull(current_df.values).sum())
print("Total answers: %s" % total_answers)

# Build percentages by item selected
columns = ["Percentage", "Label"]
sortBy = columns[0]

percentage_by_item_df = build_percentage_chosen_by_item_chart(current_df, columns, sortBy)  
export_to_excel(percentage_by_item_df, chart_output_dir, f"Hoping to See", chart_output_filename)
percentage_by_item_df

What are you most hoping to see from the data science industry this year? - Selected Choice
Total answers: 206


Unnamed: 0,Percentage,Label,Count
1,0.373786,Further innovation in the open-source data sci...,77
3,0.23301,More resources given to data science teams wit...,48
0,0.218447,New optimized models that allow for more compl...,45
4,0.11165,More specialized data science hardware,23
2,0.033981,Other (please indicate below),7
5,0.029126,Further innovation in vendor-specific data sci...,6


In [23]:
# Q5.4_1 - Questions go together 5.4_1 through 5.1_7 and make up a 100% about importance of X topic
# Get averages for each question's responses for 5.4_1 through 5.1_7

def get_average_by_topic_tuple(current_question_code, current_df):
    total_answers = (total_professors_count - pd.isnull(current_df.values).sum())   
    sum_of_values = current_df.sum()
    average = sum_of_values/total_answers
    
    # get just the specific topic
    txt = code_to_question_map[current_question_code]
    regex_search = re.search(r"[-] (\w.+)", txt)
    topic = regex_search.group().lstrip("- ") 
    
    # return built data tuple
    data = (average, topic)
    return data

# Use function to get each topic's average as a tuple of (average, topic)
data = []
for x in range(1, 8):
    current_question_code = f"5.4_{x}"  
    current_df = professors_df[current_question_code]
    data_tuple = get_average_by_topic_tuple(current_question_code, current_df)
    data.append(data_tuple)

# Display
whole_question = code_to_question_map[current_question_code]
regex_search = re.search(r"^(\w.+)\?", whole_question)
question_part = regex_search.group()

total_answers = total_professors_count - pd.isnull(professors_df[current_question_code].values).sum()
print("Total answers: %s" % (total_answers))
print("Question: %s" % (question_part))

# Chart
topic_average_df = pd.DataFrame(data, columns = ["Average", "Topic"])
topic_average_df = topic_average_df.sort_values(by=["Average"], ascending=False)
export_to_excel(topic_average_df, chart_output_dir, f"Open Source Valued", chart_output_filename)
topic_average_df

Total answers: 206
Question: What do you value most about open-source technology?


Unnamed: 0,Average,Topic
1,22.849515,Most economical option
0,19.063107,Speed of innovation
2,18.81068,Most useful tools for my needs
4,14.563107,Product quality
3,11.723301,Avoid vendor lock-in
5,10.878641,Offers security
6,2.11165,Other (please indicate)


In [24]:
# Q5.8
current_question_code = "5.8"
print(code_to_question_map[current_question_code])
current_df = professors_df[current_question_code]
total_answers = (total_professors_count - pd.isnull(current_df.values).sum())
print("Total answers: %s" % total_answers)

# Frequency analysis
unique_selections_set = set()
all_selections_set = []
filters = ["Reading technical books, blogs, newsletters, and papers", "Paid online courses (e.g. Coursera, EdX)"]
columns = ["Frequency", "Label"]
sortBy = columns[0]

frequency_df = build_frequency_analysis_multi_select_chart(current_df, unique_selections_set, all_selections_set, filters, columns, sortBy)
export_to_excel(frequency_df, chart_output_dir, f"Learning Strategy", chart_output_filename)
frequency_df

How do you typically learn about new tools and topics relevant to your role? (Select all that apply). - Selected Choice
Total answers: 206


Unnamed: 0,Frequency,Label
1,206,"Reading technical books, blogs, newsletters, a..."
4,206,"Paid online courses (e.g. Coursera, EdX)"
2,134,Free video content (e.g. YouTube)
0,105,Community platforms and forums
3,49,Paid university courses
5,5,Other (please indicate below)


In [25]:
# Q5.5_1 - Questions go together 5.5_1 through 5.5_6 and ranked in importance 1 = most important
# Get frequency for each question's responses

def get_frequency_by_topic_tuple(current_question_code, current_df):
    unique_answers = [x for x in current_df.unique() if not pd.isnull(x)]
    all_answers = [x for x in current_df if not pd.isnull(x)]
    
    answer_frequency_data = []
    for answer in unique_answers:
        count = all_answers.count(answer)
        answer_frequency_data.append((count, answer))
        
    return answer_frequency_data

# Display
whole_question = code_to_question_map["5.5_1"]
regex_search = re.search(r"^(\w.+)\?", whole_question)
question_part = regex_search.group()
print("%s (1 = most important, 6 = least important)" % question_part)
    
# Use function to get each topic's frequency of ratings
for x in range(1, 7):
    current_question_code = f"5.5_{x}"  
    current_df = professors_df[current_question_code]
    data = get_frequency_by_topic_tuple(current_question_code, current_df)
    
    # get just the specific topic
    txt = code_to_question_map[current_question_code]
    regex_search = re.search(r"[-] (\w.+)", txt)
    topic = regex_search.group().lstrip("- ") 
    print(topic)
    
    total_answers = total_professors_count - pd.isnull(current_df.values).sum()
    print("Total answers: %s" % total_answers)

    frequency_by_topic_df = pd.DataFrame(data, columns=["Frequency","Importance"])
    frequency_by_topic_df = frequency_by_topic_df.sort_values(by=["Frequency"], ascending=False)
    export_to_excel(frequency_by_topic_df, chart_output_dir, f"AutoML {current_question_code}", chart_output_filename)
    print(frequency_by_topic_df)

What should an AutoML tool do for data scientists? (1 = most important, 6 = least important)
Enable non-experts to train machine learning models
Total answers: 206
   Frequency  Importance
2         85         1.0
3         38         5.0
0         32         2.0
1         27         4.0
4         23         3.0
5          1         6.0
Quickly and efficiently tune very many hyperparameters
Total answers: 206
   Frequency  Importance
1         68         2.0
0         52         3.0
4         39         4.0
3         33         1.0
2         14         5.0
Help choose the best model types to solve specific problems
Total answers: 206
   Frequency  Importance
2         63         2.0
3         54         3.0
0         49         1.0
1         27         4.0
4         13         5.0
Speed up the ML pipeline by automating certain workflows (data cleaning, etc.)
Total answers: 206
   Frequency  Importance
3         57         4.0
1         51         3.0
0         39         5.0
2         

In [26]:
# Q5.1
current_question_code = "5.1"
print(code_to_question_map[current_question_code])
current_df = professors_df[current_question_code]
total_answers = (total_professors_count - pd.isnull(current_df.values).sum())
print("Total answers: %s" % total_answers)

# Build percentages by item selected
columns = ["Percentage", "Label"]
sortBy = columns[0]

percentage_by_item_df = build_percentage_chosen_by_item_chart(current_df, columns, sortBy)   
export_to_excel(percentage_by_item_df, chart_output_dir, f"Open Source Challenge", chart_output_filename)
percentage_by_item_df

What do you believe is the biggest challenge in the open-source community today? - Selected Choice
Total answers: 205


Unnamed: 0,Percentage,Label,Count
2,0.287805,Public trust,59
3,0.287805,Security vulnerabilities,59
1,0.190244,Undermanagement,39
0,0.156098,Talent shortage,32
4,0.063415,Other (please indicate below),13
5,0.014634,,3


In [27]:
# Q5.11
current_question_code = "5.11"
print(code_to_question_map[current_question_code])
current_df = professors_df[current_question_code]
total_answers = (total_professors_count - pd.isnull(current_df.values).sum())
print("Total answers: %s" % total_answers)

# Build percentages by item selected
columns = ["Percentage", "Label"]
sortBy = columns[0]

percentage_by_item_df = build_percentage_chosen_by_item_chart(current_df, columns, sortBy) 
export_to_excel(percentage_by_item_df, chart_output_dir, f"Supply Chain Impact", chart_output_filename)
percentage_by_item_df

Have supply chain disruption problems, such as the ongoing chip shortage, impacted your access to computing resources?
Total answers: 205


Unnamed: 0,Percentage,Label,Count
0,0.565854,No,116
1,0.239024,I'm not sure,49
2,0.195122,Yes,40
