In [1]:
# importing the libraries

# importing the libraries
import pandas as pd
import numpy as np
import sys
import datetime

from IPython.display import display, HTML

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from math import pi

import scipy.sparse as sp
import scipy.stats
from scipy.stats import chi2

import warnings
warnings.filterwarnings('ignore')

# pandas setting
pd.set_option('display.max_columns', None)

In [2]:
# reading the files
# file paths
ksm_fp = '../input/new-kaggle-survey-2018-2022-modified/kaggle_survey_2018_2022_melted1.parquet'
ks_fp = '../input/new-kaggle-survey-2018-2022-modified/kaggle_survey_2018_2022_modifed1.parquet'

ksm = pd.read_parquet(ksm_fp)
ks = pd.read_parquet(ks_fp)

In [3]:
# Helping functions and colors

# -------------CONTI. COLORS-----------

colorscale01=[[0.0, "rgb(251, 133, 0)"],
            [0.1111111111111111, "rgb(253, 194, 128)"],
            [0.2222222222222222, "rgb(254, 225, 192)"],
            [0.3333333333333333, "rgb(255, 240, 224)"],
            [0.4444444444444444, "rgb(255, 255, 255)"],
            [0.5555555555555556, "rgb(242, 251, 255)"],
            [0.6666666666666666, "rgb(228, 247, 255)"],
            [0.7777777777777778, "rgb(200, 239, 255)"],
            [0.8888888888888888, "rgb(144, 223, 255)"],
            [1.0, "rgb(32, 190, 255)"]]

colorscale02=[[0.0, "rgb(255, 255, 255)"],
            [0.1111111111111111, "rgb(200, 239, 255)"],
            [0.2222222222222222, "rgb(172, 231, 255)"],
            [0.3333333333333333, "rgb(158, 227, 255)"],
            [0.4444444444444444, "rgb(144, 223, 255)"],
            [0.5555555555555556, "rgb(130, 219, 255)"],
            [0.6666666666666666, "rgb(116, 215, 255)"],
            [0.7777777777777778, "rgb(88, 207, 255)"],
            [0.8888888888888888, "rgb(60, 199, 255)"],
            [1.0, "rgb(32, 190, 255)"]]

colorscale03=[[0.0, "rgb(214, 243, 255)"],
            [0.1111111111111111, "rgb(193, 237, 255)"],
            [0.2222222222222222, "rgb(172, 231, 255)"],
            [0.3333333333333333, "rgb(158, 227, 255)"],
            [0.4444444444444444, "rgb(144, 223, 255)"],
            [0.5555555555555556, "rgb(130, 219, 255)"],
            [0.6666666666666666, "rgb(116, 215, 255)"],
            [0.7777777777777778, "rgb(88, 207, 255)"],
            [0.8888888888888888, "rgb(60, 199, 255)"],
            [1.0, "rgb(32, 190, 255)"]]

colorscale04=[[0.0, "rgb(255, 255, 255)"],
            [0.1111111111111111, "rgb(200, 239, 255)"],
            [0.2222222222222222, "rgb(144, 223, 255)"],
            [0.3333333333333333, "rgb(88, 207, 255)"],
            [0.4444444444444444, "rgb(32, 190, 255)"],
            [0.5555555555555556, "rgb(31, 174, 233)"],
            [0.6666666666666666, "rgb(30, 158, 211)"],
            [0.7777777777777778, "rgb(27, 126, 166)"],
            [0.8888888888888888, "rgb(24, 94, 122)"],
            [1.0, "rgb(21, 61, 77)"]]

# Methodology

First, let's look at the questions in the survey and try to understand what questions were asked. After reading all the questions, I understood that this question could be classified into different categories, such as:
- Based on the kind of question asked: Single choice or Multiple choice
- Based on the idea behind the question: learning, machine learning, etc.
There are 18 single-choice questions, and the rest are multiple-choice. From these 18 single-choice questions, I have selected a few variables which describe the person who took the survey.

Select questions which describe the respondent as follows:
- Q2: What is your age? - Tell us about the respondent's age
- Q3: What is your gender? - Tell us about the respondent's gender
- Q4: In which country do you currently reside? - Tell us about the respondent's country
- Q5: Are you a student?
- Q23: Select the title most similar to your current role.

Note: Q5 and Q23 tell us about their current role. I have merged these two columns into one named 'role.'
Other questions in the dataset also describe the respondent, such as education level, current industry, etc. But to keep the analysis simple, I have selected age, gender, country, and role as the variables which describe the respondent.
The reason behind selecting these variables is that they will help us filter the data and understand it on the micro level.

After this, I classified all the questions based on the idea behind them; the classification may vary from person to person. 
I have mainly classified questions into as follows:
About, Learning, Professional, Programming, Cloud, Hardware, Tools.
I will paste the link here once I complete the analysis.

After this, I have done data cleaning. Here is the notebook where I have done data cleaning. The output of the data cleaning step is two tables.
Table 1: The original table format of the Kaggle survey with the past five years data.
Table 2: With melted format having question and answer as rows for each respondent.

In this notebook, I have done the macro-level analysis of 40 questions using the heatmap.
The process is simple for analysis.
- If the question is the single choice
    - Check the distribution using the heatmap
- If the question is multiple-choice. We will analyze it in three directions.
    - How many products/services/platforms etc did the respondent mention
    - Which products/services/platforms etc did the respondent mention together (by creating an occurrence matrix)
    - Checking the distribution using heatmap.

# About the respondent

In [4]:
pt01 = pd.DataFrame(ks['Year'].value_counts()).reset_index()
pt01.rename(columns={'index':'Year','Year':'Respondent'},inplace=True)
pt01 = pt01.sort_values(by=['Year'],ascending=True).reset_index(drop=True)
pt01 = pt01.set_index('Year')
pt01['% change'] = pt01.pct_change() * 100
pt01['% change'] = pt01['% change'].round(2)
pt01 = pt01.fillna(0)
pt02 = pt01.copy()
pt03 = pt02.div(pt01.max(), axis=1)

trace = go.Heatmap(
    z = pt03.values,
    x = pt01.columns,
    y = pt01.index,
    hoverinfo = "none",
    text = pt01.values,
    texttemplate="<span style='color:#153d4d; font-family:Helvetica'>%{text}</span>",
    textfont={"size":12},
    colorscale=colorscale02,
    showscale=True,
    colorbar_orientation='v',
    colorbar_tickvals = [np.min(np.nan_to_num(pt03.values,nan=0.0)),np.max(np.nan_to_num(pt03.values,nan=0.0))],
    colorbar_ticktext = ['min','max'],
    xgap = 1,
    ygap = 1
)

plt_title = "<span style='font-size:24px; font-family:Helvetica'>How many people responded to the survey?</span>"
plt_caption = '''<span style='font-size:14px; font-family:Helvetica'>
<br>
</span>'''

layout = dict(
    title = plt_title + "<br><br>" + plt_caption,
    margin = dict(t=120, pad=5),
    title_x=0.5,
    width = 500,
    height= 400,
    plot_bgcolor = "#fff",
    yaxis = dict(
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
        autorange='reversed',
        ),
    xaxis =  dict(
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
        tickmode='linear',
        constrain="domain",
        )
    )

fig = go.Figure(data = trace, layout = layout)

fig.show()

Around 23997 people responded to the survey in 2022, which is 7.6 per cent less than last year.

## Distribution of respondent: Based on Age

In [5]:
pt01 = ks.pivot_table(values='sid',index=['Year'],columns='age',aggfunc='count')
pt01['total'] = pt01.sum(axis=1)
pt02 = pt01.div(pt01.total, axis=0).fillna(0)
# pt01 = pt01.drop('total', axis=1)
pt02 = pt02.drop('total', axis=1)

pt03 = pt02.div(pt02.max(axis=1), axis=0)

trace = go.Heatmap(
    z = pt03.values,
    x = pt02.columns,
    y = pt02.index,
    hoverinfo = "none",
    text = pt02.values,
    texttemplate="<span style='color:#153d4d; font-family:Helvetica'>%{text:.2f}</span>",
    textfont={"size":12},
    colorscale=colorscale01,
    showscale=True,
    colorbar_orientation='v',
    colorbar_tickvals = [np.min(np.nan_to_num(pt03.values,nan=0.0)),np.max(np.nan_to_num(pt03.values,nan=0.0))],
    colorbar_ticktext = ['min','max'],
    xgap = 1,
    ygap = 1
)

plt_title = "<span style='font-size:24px;font-weight:bold;font-family:Helvetica'>Distribution of Age</span>"
plt_caption = '''<span style='font-size:14px; font-family:Helvetica'>
Independent density normalization<br>
Note: Colorscale is based on row (Max value in row is blue and Min value in row is orange)
</span>'''

x_axis_title = "<span style='color:#153d4d;font-size:14px;font-family:Helvetica'>Age</span>"
y_axis_title = "<span style='color:#153d4d;font-size:14px; font-family:Helvetica'>Survey year</span>"

layout = dict(
    title = plt_title + "<br><br>" + plt_caption,
    title_x=0.5,
    margin = dict(t=180, pad=5),
#     width = 800,
    height= 500,
    plot_bgcolor = "#fff",
    yaxis = dict(
        scaleanchor = 'x',
        title_text=y_axis_title,
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
        autorange='reversed',
#         constrain="domain"
        ),
    xaxis =  dict(
        title_text = x_axis_title,
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
        tickmode='linear',
        # range=[-1.5,11.5],
        constrain="domain",
        )
    )

fig = go.Figure(data = trace, layout = layout)

fig.show()

Insight
- People aged 18 to 29 constitute more than 50 per cent of total responses yearly.
- The participation goes on decreasing as we move up in the age group.
- Participation of people from the age group 18-21 and 40-44 is increasing each year. Similarly, the involvement of people from the age group 25-29 and 30-34 is decreasing.

## Distribution of respondent: Based on Gender

In [6]:
pt01 = ks.pivot_table(values='sid',index=['Year'],columns='gender',aggfunc='count')
pt01['total'] = pt01.sum(axis=1)
pt02 = pt01.div(pt01.total, axis=0).fillna(0)
# pt01 = pt01.drop('total', axis=1)
pt02 = pt02.drop('total', axis=1)

pt03 = pt02.div(pt02.max(axis=1), axis=0)

trace = go.Heatmap(
    z = pt03.values,
    x = pt02.columns,
    y = pt02.index,
    hoverinfo = "none",
    text = pt02.values,
    texttemplate="<span style='color:#153d4d; font-family:Helvetica'>%{text:.2f}</span>",
    textfont={"size":12},
    colorscale=colorscale03,
    showscale=True,
    colorbar_orientation='v',
    colorbar_tickvals = [np.min(np.nan_to_num(pt03.values,nan=0.0)),np.max(np.nan_to_num(pt03.values,nan=0.0))],
    colorbar_ticktext = ['min','max'],
    xgap = 1,
    ygap = 1
)

plt_title = "<span style='font-size:24px;font-weight:bold;font-family:Helvetica'>Distribution of Gender</span>"
plt_caption = '''<span style='font-size:14px; font-family:Helvetica'>
Independent density normalization<br>
Note: Colorscale is based on row (Max value in row is blue and Min value in row is light blue)
</span>'''

x_axis_title = "<span style='color:#153d4d;font-size:14px;font-family:Helvetica'>Gender</span>"
y_axis_title = "<span style='color:#153d4d;font-size:14px; font-family:Helvetica'>Survey year</span>"

layout = dict(
    title = plt_title + "<br><br>" + plt_caption,
    title_x=0.5,
    margin = dict(t=180, pad=5),
#     width = 800,
    height= 500,
    plot_bgcolor = "#fff",
    yaxis = dict(
        scaleanchor = 'x',
        title_text=y_axis_title,
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
        autorange='reversed',
#         constrain="domain"
        ),
    xaxis =  dict(
        title_text = x_axis_title,
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
        tickmode='linear',
        # range=[-1.5,11.5],
        constrain="domain",
        )
    )

fig = go.Figure(data = trace, layout = layout)

fig.show()

Insight
- We can see that the male category has the highest participation every year, though their participation decreases yearly.
- The female category's participation in the survey has been increasing for the last four years.

## Distribution of respondent: Based on role

In [7]:
pt01 = ks.pivot_table(values='sid',index=['Year'],columns='role',aggfunc='count')
pt01['total'] = pt01.sum(axis=1)
pt02 = pt01.div(pt01.total, axis=0).fillna(0)
# pt01 = pt01.drop('total', axis=1)
pt02 = pt02.drop('total', axis=1)

pt03 = pt02.div(pt02.max(axis=1), axis=0)

trace = go.Heatmap(
    z = pt03.values,
    x = pt02.columns,
    y = pt02.index,
    hoverinfo = "none",
    text = pt02.values,
    texttemplate="<span style='color:#153d4d; font-family:Helvetica'>%{text:.2f}</span>",
    textfont={"size":12},
    colorscale=colorscale02,
    showscale=True,
    colorbar_orientation='v',
    colorbar_tickvals = [np.min(np.nan_to_num(pt03.values,nan=0.0)),np.max(np.nan_to_num(pt03.values,nan=0.0))],
    colorbar_ticktext = ['min','max'],
    xgap = 1,
    ygap = 1
)

plt_title = "<span style='font-size:24px;font-weight:bold;font-family:Helvetica'>Distribution of Role</span>"
plt_caption = '''<span style='font-size:14px; font-family:Helvetica'>
Independent density normalization<br>
Note: Colorscale is based on row (Max value in row is blue and Min value in row is white)
</span>'''

x_axis_title = "<span style='color:#153d4d;font-size:14px;font-family:Helvetica'>Role</span>"
y_axis_title = "<span style='color:#153d4d;font-size:14px; font-family:Helvetica'>Survey year</span>"

layout = dict(
    title = plt_title + "<br><br>" + plt_caption,
    title_x=0.5,
    margin = dict(t=180, pad=5),
#     width = 800,
    height= 600,
    plot_bgcolor = "#fff",
    yaxis = dict(
        scaleanchor = 'x',
        title_text=y_axis_title,
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
        autorange='reversed',
#         constrain="domain"
        ),
    xaxis =  dict(
#         title_text = x_axis_title,
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
        tickmode='linear',
        tickangle = 90,
        # range=[-1.5,11.5],
        constrain="domain",
        )
    )

fig = go.Figure(data = trace, layout = layout)

fig.show()

Insight
- Student representation in the survey has increased from 26 percent to 50 percent, whereas participation of other roles has decreased compared to the last year.

## Distribution of respondent: Based on country

In [8]:
pt01 = ks.pivot_table(values='sid',index=['Year'],columns='country',aggfunc='count')
pt01['total'] = pt01.sum(axis=1)
pt02 = pt01.div(pt01.total, axis=0).fillna(0)
# pt01 = pt01.drop('total', axis=1)
pt02 = pt02.drop('total', axis=1)

pt03 = pt02.div(pt02.max(axis=1), axis=0)

trace = go.Heatmap(
    z = pt03.values,
    x = pt02.columns,
    y = pt02.index,
    hoverinfo = "none",
#     text = pt02.values,
#     texttemplate="<span style='color:#153d4d; font-family:Helvetica; font-size:6'>%{text:.2f}</span>",
#     textfont={"size":12},
    colorscale=colorscale03,
    showscale=False,
    colorbar_orientation='v',
    colorbar_tickvals = [np.min(np.nan_to_num(pt03.values,nan=0.0)),np.max(np.nan_to_num(pt03.values,nan=0.0))],
    colorbar_ticktext = ['min','max'],
    xgap = 1,
    ygap = 1
)

plt_title = "<span style='font-size:24px;font-weight:bold;font-family:Helvetica'>Distribution of Country</span>"
plt_caption = '''<span style='font-size:14px; font-family:Helvetica'>
Independent density normalization<br>
Note: Colorscale is based on row (Max value in row is blue and Min value in row is light blue)
</span>'''

x_axis_title = "<span style='color:#153d4d;font-size:14px;font-family:Helvetica'>Country</span>"
# y_axis_title = "<span style='color:#153d4d;font-size:14px; font-family:Helvetica'>Survey year</span>"

layout = dict(
    title = plt_title + "<br><br>" + plt_caption,
    title_x=0.5,
    margin = dict(t=180, pad=5),
#     width = 800,
    height= 600,
    plot_bgcolor = "#fff",
    yaxis = dict(
        scaleanchor = 'x',
#         title_text=y_axis_title,
        tickfont=dict(color='#153d4d',family='Helvetica',size=9),
        autorange='reversed',
#         constrain="domain"
        ),
    xaxis =  dict(
        title_text = x_axis_title,
        tickfont=dict(color='#153d4d',family='Helvetica',size=9),
        tickmode='linear',
        # range=[-1.5,11.5],
        constrain="domain",
        )
    )

fig = go.Figure(data = trace, layout = layout)

fig.show()

Insight
- The participation of countries like India, Argentina, Nigeria, Ghana, and Pakistan in the survey is increasing yearly.
- The participation of countries like Australia, Belarus, Belgium, Canada, France, the UK of Great Britain and Northern Ireland, Germany, Italy, Russia, Sweden, Switzerland, and Ukraine in the survey is decreasing yearly.

# Learning

Questions to analysis
- Q6: On which platforms have you begun or completed data science courses?
- Q7: What products or platforms did you find to be most helpful when you first started studying data science?
- Q44: Who/what are your favourite media sources that report on data science topics?

## Q6: On which platforms have you begun or completed data science courses?

### Number of platforms used by respondent for data science courses

In [9]:
# facts of the dataset
facts = ['sid', 'Year', 'age', 'gender', 'country', 'role']

# columns belong to the qestion
cols = ['Q6_1', 'Q6_2', 'Q6_3', 'Q6_4', 'Q6_5', 'Q6_6', 'Q6_7',
       'Q6_8', 'Q6_9', 'Q6_10', 'Q6_11', 'Q6_12','Q6_X']

# Pivoting the tables
# columns - sid, year, all platforms used for learning ds
# adding only sid and year in index as the notebook restarts if add any more indexs
# we will merge the remaining columns later

tbl01 = ksm[ksm['question'].str.contains('Q6')].pivot_table(values='question',index=['sid','Year'],columns='answer',aggfunc='count',fill_value=0)
tbl01['pltf_count'] = tbl01.sum(axis=1) - (tbl01['No Answer'] * 2) - tbl01['None']
tbl01.reset_index(inplace=True)

# merging the age,country and role on sid and year
df = pd.merge(ks[facts],tbl01,how='left',on=['sid','Year'])

pt01 = df.pivot_table(values='sid',index='Year',columns='pltf_count',aggfunc='count',fill_value=0)
pt01['total'] = pt01.sum(axis=1)
pt02 = pt01.div(pt01.total, axis=0).fillna(0)
pt01 = pt01.drop('total', axis=1)
pt02 = pt02.drop('total', axis=1)

pt03 = pt02.div(pt02.max(axis=1), axis=0)

pt04 = df.pivot_table(values='sid',index='pltf_count',columns='Year',aggfunc='count',fill_value=0)
pt04 = pt04[[2022,2021,2020,2019,2018]]

# subplot main title
subplot_title = 'Number of platform used for learning data science'
# plot title
plot1_title_custom = 'Distribution of the number of platforms used by the respondent<br>Note: Colorscale is based on row (Max value in row is blue and Min value in row is orange'
plot2_title_custom = 'Boxplot of number of platform used'

# color
primary_color = '#20beff'

# Plot layout custom
paper_bgcolor_custom = '#fff'
plot_bgcolor_custom = '#fff'

# axis title
plot1_xaxes_title = 'Number of platforms used'
plot1_yaxes_title = 'Year'

plot2_xaxes_title = 'Year'
plot2_yaxes_title = 'Number of platforms used'

# axis tick custom
axis_tick_color = '#153d4d'
axis_tick_family = 'Helvetica'
axis_tick_fontsize = 12

# customize the buttons
button_background = '#fff'

# Initialize figure with subplot
plot1_title = "<span style='font-size:14px;font-family:Helvetica'>"+plot1_title_custom+ "</span>"
plot2_title = "<span style='font-size:14px;font-family:Helvetica'>"+plot2_title_custom+ "</span>"

fig = make_subplots(
    rows=2, cols=1, subplot_titles=(plot1_title,plot2_title),
    vertical_spacing=0.2
)

fig.add_trace(go.Heatmap(
    z = pt03.values,
    x = pt02.columns,
    y = pt02.index,
    hoverinfo = "none",
    text = pt02.values,
    texttemplate="<span style='color:#153d4d; font-family:Helvetica'>%{text:.2f}</span>",
    textfont={"size":12},
    colorscale=colorscale01,
    showscale=False,
    xgap = 1,
    ygap = 1), row=1,col=1)

for year in df['Year'].unique():
    temp_df = df[df['Year']==year]
    fig.add_trace(go.Box(
        y = temp_df['pltf_count'],
        name = str(year),
        hoverlabel = dict(bgcolor="white",font_size=10),
        line_color=primary_color,
        line_width=1,
        width=0.3
    ),row=2,col=1)

# updating the x axis of the plot
fig.update_xaxes(
    title_text = "<span style='color:#153d4d;font-size:14px;font-family:Helvetica'>"+ plot1_xaxes_title +"</span>",
    tickfont=dict(color=axis_tick_color,family=axis_tick_family,size=axis_tick_fontsize),tickmode='linear',constrain="domain",
    row=1, col=1)


fig.update_xaxes(
    title_text = "<span style='color:#153d4d;font-size:14px;font-family:Helvetica'>"+ plot2_xaxes_title +"</span>",
    tickfont=dict(color=axis_tick_color,family=axis_tick_family,size=axis_tick_fontsize),tickmode='linear',
    row=2, col=1)

# updating the y axis of the plot
fig.update_yaxes(
    scaleanchor = 'x',
    title_text = "<span style='color:#153d4d;font-size:14px;font-family:Helvetica'>"+ plot1_yaxes_title +"</span>",
    tickfont=dict(color=axis_tick_color,family=axis_tick_family,size=axis_tick_fontsize),autorange='reversed',
    row=1, col=1)


fig.update_yaxes(
    title_text = "<span style='color:#153d4d;font-size:14px;font-family:Helvetica'>"+ plot2_yaxes_title +"</span>",
    tickfont=dict(color=axis_tick_color,family=axis_tick_family,size=axis_tick_fontsize),
    row=2, col=1)

# layout of the plot
fig.update_layout(
    title_text="<span style='font-size:24px;font-weight:bold;font-family:Helvetica'>"+subplot_title+"</span>",
    margin = dict(t=150, pad=5),
    title_x = 0.5,
    paper_bgcolor=paper_bgcolor_custom,
    plot_bgcolor=plot_bgcolor_custom,
    showlegend=False,
    height = 800,
)

fig.add_annotation(
        dict(text="<span style='color:#153d4d;font-size:12px;font-family:Helvetica'>Note:<br>-1 means respondent didn't answer the question</span>",
        align="left",
        showarrow=False,
        x=0,y=0.5,xref="paper",yref="paper"),
)

fig.show()

Insight
- The number of respondents using at least one platform for learning data science has significantly increased compared to the last. The increase may be due to the rise in the percentage of students taking the survey this year, or another reason for this change in distribution of respondent who did not answer this question in the prior years.
- We can see that the upper fence has been reduced back to 6, which is the same as the level before the pandemic. As offices and universities have started working usually, people might want to restrict themselves to fewer platforms.
- 75th percentile is 3 for the last four years. Having at least three platforms for learning is quite normal.

### Which platforms do respondents used together for learning data science

In [10]:
cols = ['Cloud-cert. programs', 'Coursera', 'DataCamp', 'Fast.ai',
       'Kaggle Learn Courses', 'LinkedIn Learning',
       'Other', 'Udacity', 'Udemy', 'Univ. Courses', 'edX']

df1 = pd.DataFrame(columns = ['Cloud-cert. programs', 'Coursera', 'DataCamp', 'Fast.ai',
       'Kaggle Learn Courses', 'LinkedIn Learning',
       'Other', 'Udacity', 'Udemy', 'Univ. Courses', 'edX','Year'])

for year in df['Year'].unique():
    m01 = df[df['Year']==year][cols]
    m02 = sp.csr_matrix(m01.astype(int).values) # convert dataframe to sparse matrix
    m02c = m02.T * m02 # multiply sparse matrix #
    m02c.setdiag(0) # reset diagonal
    m03 = pd.DataFrame(m02c.todense(), columns=m01.columns, index= m01.columns)
    m04 = m03.div(m01.shape[0])
    m04 = m04.multiply(100)
    occ = m04.copy()
    occ['Year'] = year
    frames = [df1,occ]
    df1 = pd.concat(frames)
    
df1['Year'] = df1['Year'].astype('object')

fig = go.Figure()

fig.add_trace(go.Heatmap(
    z = df1.iloc[:,:-1][df1['Year']==2022].values,
    x = df1.iloc[:,:-1].columns,
    y = df1.iloc[:,:-1][df1['Year']==2022].index,
    name = 'Selection1',
    hoverinfo = "none",
    text = df1.iloc[:,:-1][df1['Year']==2022].values,
    texttemplate="<span style='color:#153d4d; font-family:Helvetica'>%{text:.2f}</span>",
    textfont={"size":12},
    colorscale=colorscale01,
    showscale=False,
    colorbar_orientation='v',
    colorbar_tickvals = [np.min(np.nan_to_num(pt03.values,nan=0.0)),np.max(np.nan_to_num(pt03.values,nan=0.0))],
    colorbar_ticktext = ['min','max'],
    xgap = 1,
    ygap = 1
))

# Creating the buttons
updatemenu = []
buttons1 = []

for col in df1['Year'].unique():
    buttons1.append(dict(method='restyle',
                        label=col,
                        visible=True,
                        args=[{'y':[df1.iloc[:,:-1][df1['Year']==col].index],
                               'x':[df1.iloc[:,:-1].columns],
                               'z':[df1.iloc[:,:-1][df1['Year']==col].values],
                               'text':[df1.iloc[:,:-1][df1['Year']==col].values],
                               'type':'heatmap'}, [0]],
                        )
                  )

updatemenu = list([
    dict(
        buttons=buttons1,
        direction='down',
        name = 'Selection1',
        pad={'r':10,'t':10},
        showactive=True,
        font = dict(family = 'Helvetica',size=14,color='#153d4d'),
        bgcolor="#fff",
        x=0.53,
        xanchor='left',
        y=1.12,
        yanchor='top'),
])

fig.update_layout(
    updatemenus=updatemenu,
    height= 600,
    plot_bgcolor = "#fff",
    yaxis = dict(
        scaleanchor = 'x',
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
        constrain="domain"
        ),
    xaxis =  dict(
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
        tickmode='linear',
        constrain="domain",
        )
)

fig.add_annotation(
        dict(text="<span style='color:#153d4d;font-size:16px;font-family:Helvetica'>Select Year:</span>",
        align="left",
        showarrow=False,
        x=0.46,y=1.08,xref="paper",yref="paper"),
)

fig.show()

Insight
- Around 15.32 percent of respondents have used Kaggle, and 14.77 percent have used Udemy with the Coursera platform for learning data science. The trend has been the same for the last four years

### Distribution of platforms used for learning the datascience

In [11]:
pt01 = ksm[ksm['question'].str.contains('Q6')].pivot_table(values='question',index=['Year'],columns='answer',aggfunc='count',fill_value=0)
pt01['total'] = ks['Year'].value_counts()
pt02 = pt01.div(pt01.total, axis=0)
pt01 = pt01.drop('total', axis=1)
pt02 = pt02.drop('total', axis=1)
pt03 = pt02.div(pt02.max(axis=1), axis=0)

trace = go.Heatmap(
    z = pt03.values,
    x = pt02.columns,
    y = pt02.index,
    hoverinfo = "none",
    text = pt02.values,
    texttemplate="<span style='color:#153d4d; font-family:Helvetica'>%{text:.2f}</span>",
    textfont={"size":12},
    colorscale=colorscale01,
    showscale=True,
    colorbar_orientation='v',
    colorbar_tickvals = [np.min(np.nan_to_num(pt03.values,nan=0.0)),np.max(np.nan_to_num(pt03.values,nan=0.0))],
    colorbar_ticktext = ['min','max'],
    xgap = 1,
    ygap = 1
)

plt_title = "<span style='font-size:24px;font-weight:bold;font-family:Helvetica'>Distribution of Platform used for learning Data Science</span>"
plt_caption = '''<span style='font-size:14px; font-family:Helvetica'>
Independent density normalization<br>
Note: Colorscale is based on row (Max value in row is blue and Min value in row is orange)
</span>'''

x_axis_title = "<span style='color:#153d4d;font-size:14px;font-family:Helvetica'>Platform</span>"
y_axis_title = "<span style='color:#153d4d;font-size:14px; font-family:Helvetica'>Survey year</span>"

layout = dict(
    title = plt_title + "<br><br>" + plt_caption,
    title_x=0.5,
    margin = dict(t=180, pad=5),
#     width = 800,
    height= 600,
    plot_bgcolor = "#fff",
    yaxis = dict(
        scaleanchor = 'x',
        title_text=y_axis_title,
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
        autorange='reversed',
#         constrain="domain"
        ),
    xaxis =  dict(
        title_text = x_axis_title,
        tickangle = 90,
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
        tickmode='linear',
        # range=[-1.5,11.5],
        constrain="domain",
        )
    )

fig = go.Figure(data = trace, layout = layout)

fig.show()

Insight
- Coursera has been the most used platform for learning Data science for the last five years.
- LinkedIn Learning and Cloud certification programs are slowly gaining popularity each year. On the other hand, Udacity is gradually losing popularity each year.

## Q7: What products or platforms did you find to be most helpful when you first started studying data science?

In [12]:
# facts of the dataset
facts = ['sid', 'Year', 'age', 'gender', 'country', 'role']

# columns for the questions
cols = ['Q7_1', 'Q7_2', 'Q7_3','Q7_4', 'Q7_5', 'Q7_6', 'Q7_7','Q7_X']

tbl01 = ksm[ksm['question'].str.contains('Q7')].pivot_table(values='question',index=['sid','Year'],columns='answer',aggfunc='count',fill_value=0)

# renaming the columns
tbl01.rename(columns={'Kaggle (notebooks, competitions, etc)':'Kaggle','None / I do not study data science':'None','Online courses (Coursera, EdX, etc)':'Online courses',
                      'Social media platforms (Reddit, Twitter, etc)':'Social media','Video platforms (YouTube, Twitch, etc)':'Video Platform'},inplace=True)

tbl01['pltf_count'] = tbl01.sum(axis=1) - (tbl01['No Answer'] * 2) - tbl01['None']
tbl01.reset_index(inplace=True)

tbl01 = tbl01[tbl01['Year']==2022].reset_index(drop=True)

# merging the age,country and role on sid and year
df = pd.merge(ks[facts],tbl01,how='inner',on=['sid','Year'])

pt01 = df.pivot_table(values='sid',index='Year',columns='pltf_count',aggfunc='count',fill_value=0)
pt01['total'] = pt01.sum(axis=1)
pt02 = pt01.div(pt01.total, axis=0).fillna(0)
pt01 = pt01.drop('total', axis=1)
pt02 = pt02.drop('total', axis=1)

pt03 = pt02.div(pt02.max(axis=1), axis=0)

# Occurance of the platforms
cols = ['Kaggle','Online courses', 'Other', 'Social media','University courses', 'Video Platform']

m01 = df[df['Year']==2022][cols]

m02 = sp.csr_matrix(m01.astype(int).values) # convert dataframe to sparse matrix
m02c = m02.T * m02 # multiply sparse matrix #
m02c.setdiag(0) # reset diagonal
m03 = pd.DataFrame(m02c.todense(), columns=m01.columns, index= m01.columns)
m04 = m03.div(m01.shape[0])
m04 = m04.multiply(100)

# subplot main title
subplot_title = 'Platforms found most useful<br>when they started studying data science'
# plot title
plot1_title_custom = '<b>Distribution of the number of platforms found most useful when started studying data science</b><br>Note: Colorscale is based on row (Max value in row is blue and Min value in row is orange'
plot2_title_custom = '<b>Which platforms respondents mentioned together </b>'

# color
primary_color = '#20beff'

# Plot layout custom
paper_bgcolor_custom = '#fff'
plot_bgcolor_custom = '#fff'

# axis title
plot1_xaxes_title = 'Number of platforms used'
plot1_yaxes_title = 'Year'

plot2_xaxes_title = 'Year'
plot2_yaxes_title = 'Number of platforms used'

# axis tick custom
axis_tick_color = '#153d4d'
axis_tick_family = 'Helvetica'
axis_tick_fontsize = 12

# customize the buttons
button_background = '#fff'

# Initialize figure with subplot
plot1_title = "<span style='font-size:14px;font-family:Helvetica'>"+plot1_title_custom+ "</span>"
plot2_title = "<span style='font-size:14px;font-family:Helvetica'>"+plot2_title_custom+ "</span>"

fig = make_subplots(
    rows=2, cols=1, subplot_titles=(plot1_title,plot2_title),
    row_heights = [0.2,0.8],
    vertical_spacing=0.25
)

fig.add_trace(go.Heatmap(
    z = pt03.values,
    x = pt02.columns,
    y = (pt02.index).astype(str),
    hoverinfo = "none",
    text = pt02.values,
    texttemplate="<span style='color:#153d4d; font-family:Helvetica'>%{text:.2f}</span>",
    textfont={"size":12},
    colorscale=colorscale01,
    showscale=False,
    xgap = 1,
    ygap = 1), row=1,col=1)

fig.add_trace(go.Heatmap(
    z = m04.values,
    x = m04.columns,
    y = m04.index,
    hoverinfo = "none",
    text = m04.values,
    texttemplate="<span style='color:#153d4d; font-family:Helvetica'>%{text:.2f}</span>",
    textfont={"size":12},
    colorscale=colorscale01,
    showscale=False,
    xgap = 1,
    ygap = 1), row=2,col=1)

# updating the x axis of the plot
fig.update_xaxes(
    title_text = "<span style='color:#153d4d;font-size:14px;font-family:Helvetica'>"+ plot1_xaxes_title +"</span>",
    tickfont=dict(color=axis_tick_color,family=axis_tick_family,size=axis_tick_fontsize),constrain="domain",
    row=1, col=1)


fig.update_xaxes(
#     title_text = "<span style='color:#153d4d;font-size:14px;font-family:Helvetica'>"+ plot1_xaxes_title +"</span>",
    tickfont=dict(color=axis_tick_color,family=axis_tick_family,size=axis_tick_fontsize),tickangle=90,constrain="domain",
    row=2, col=1)

# updating the y axis of the plot
fig.update_yaxes(
    scaleanchor = 'x1',
    title_text = "<span style='color:#153d4d;font-size:14px;font-family:Helvetica'>"+ plot1_yaxes_title +"</span>",
    tickfont=dict(color=axis_tick_color,family=axis_tick_family,size=axis_tick_fontsize),autorange='reversed',
    row=1, col=1)


fig.update_yaxes(
    scaleanchor = 'x2',
#     title_text = "<span style='color:#153d4d;font-size:14px;font-family:Helvetica'>"+ plot1_yaxes_title +"</span>",
    tickfont=dict(color=axis_tick_color,family=axis_tick_family,size=axis_tick_fontsize),autorange='reversed',constrain="domain",
    row=2, col=1)

# layout of the plot
fig.update_layout(
    title_text="<span style='font-size:24px;font-weight:bold;font-family:Helvetica'>"+subplot_title+"</span>",
    margin = dict(t=150, pad=5),
    title_x = 0.5,
    paper_bgcolor=paper_bgcolor_custom,
    plot_bgcolor=plot_bgcolor_custom,
    showlegend=False,
    height = 800,
)

fig.add_annotation(
        dict(text="<span style='color:#153d4d;font-size:12px;font-family:Helvetica'>Note:<br>-1 means respondent didn't answer the question</span>",
        align="left",
        showarrow=False,
        x=0,y=0.8,xref="paper",yref="paper"),
)

fig.show()

In [13]:
pt01 = ksm[ksm['question'].str.contains('Q7')].pivot_table(values='question',index=['Year'],columns='answer',aggfunc='count',fill_value=0)
pt01['total'] = ks['Year'].value_counts()
pt02 = pt01.div(pt01.total, axis=0)
pt01 = pt01.drop('total', axis=1)
pt02 = pt02.drop('total', axis=1)
pt03 = pt02.div(pt02.max(axis=1), axis=0)

# renaming the columns
pt02.rename(columns={'Kaggle (notebooks, competitions, etc)':'Kaggle','None / I do not study data science':'None','Online courses (Coursera, EdX, etc)':'Online courses',
                      'Social media platforms (Reddit, Twitter, etc)':'Social media','Video platforms (YouTube, Twitch, etc)':'Video Platform'},inplace=True)

trace = go.Heatmap(
    z = pt03[pt03.index==2022].values,
    x = pt02[pt02.index==2022].columns,
    y = (pt02[pt02.index==2022].index).astype(str),
    hoverinfo = "none",
    text = pt02[pt02.index==2022].values,
    texttemplate="<span style='color:#153d4d; font-family:Helvetica'>%{text:.2f}</span>",
    textfont={"size":12},
    colorscale=colorscale01,
    showscale=False,
    colorbar_orientation='v',
    colorbar_tickvals = [np.min(np.nan_to_num(pt03.values,nan=0.0)),np.max(np.nan_to_num(pt03.values,nan=0.0))],
    colorbar_ticktext = ['min','max'],
    xgap = 1,
    ygap = 1
)

plt_title = "<span style='font-size:24px;font-weight:bold;font-family:Helvetica'>Distribution of mentioned platforms</span>"
plt_caption = '''<span style='font-size:14px; font-family:Helvetica'>
Independent density normalization<br>
Note: Colorscale is based on row (Max value in row is blue and Min value in row is orange)
</span>'''

x_axis_title = "<span style='color:#153d4d;font-size:14px;font-family:Helvetica'>Platform</span>"
y_axis_title = "<span style='color:#153d4d;font-size:14px; font-family:Helvetica'>Survey year</span>"

layout = dict(
    title = plt_title + "<br><br>" + plt_caption,
    title_x=0.5,
    margin = dict(t=180, pad=5),
#     width = 800,
    height= 350,
    plot_bgcolor = "#fff",
    yaxis = dict(
        scaleanchor = 'x',
        title_text=y_axis_title,
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
        autorange='reversed',
#         constrain="domain"
        ),
    xaxis =  dict(
        title_text = x_axis_title,
        tickangle = 90,
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
        tickmode='linear',
        # range=[-1.5,11.5],
        constrain="domain",
        )
    )

fig = go.Figure(data = trace, layout = layout)

fig.show()

Insight
- From respondents who have found Online courses helpful. 33.25 percent of people among them have also mentioned Kaggle with it.
- From respondents who have found Online courses helpful. 33.32 percent of people among them have also mentioned Video Platforms with it.
- More than 50 percent of people found Kaggle, Online courses and Video platforms helpful when they started studying data science.

## Q44: Who/what are your favorite media sources that report on data science topics?

### Number of media sources used by respondent

In [14]:
# facts of the dataset
facts = ['sid', 'Year', 'age', 'gender', 'country', 'role']

# columns for the questions
cols = ['Q44_1', 'Q44_2', 'Q44_3', 'Q44_4', 'Q44_5', 'Q44_6','Q44_7', 'Q44_8', 'Q44_9', 'Q44_10', 'Q44_11', 'Q44_12''Q44_X']

tbl01 = ksm[ksm['question'].str.contains('Q44')].pivot_table(values='question',index=['sid','Year'],columns='answer',aggfunc='count',fill_value=0)

# renaming the columns
tbl01.rename(columns={"Email newsletters (Data Elixir, O'Reilly Data & AI, etc)":"Email newsletters","Slack Communities (ods.ai, kagglenoobs, etc)":"Slack communities"},inplace=True)

tbl01['pltf_count'] = tbl01.sum(axis=1) - (tbl01['No Answer'] * 2) - tbl01['None']
tbl01.reset_index(inplace=True)

# merging the age,country and role on sid and year
df = pd.merge(ks[facts],tbl01,how='inner',on=['sid','Year'])

pt01 = df.pivot_table(values='sid',index='Year',columns='pltf_count',aggfunc='count',fill_value=0)
pt01['total'] = pt01.sum(axis=1)
pt02 = pt01.div(pt01.total, axis=0).fillna(0)
pt01 = pt01.drop('total', axis=1)
pt02 = pt02.drop('total', axis=1)

pt03 = pt02.div(pt02.max(axis=1), axis=0)

pt04 = df.pivot_table(values='sid',index='pltf_count',columns='Year',aggfunc='count',fill_value=0)
pt04 = pt04[[2022,2021,2020,2019,2018]]

# subplot main title
subplot_title = 'Number of media sources used by respondent'
# plot title
plot1_title_custom = 'Distribution of the number of media sources used by the respondent<br>Note: Colorscale is based on row (Max value in row is blue and Min value in row is orange'
plot2_title_custom = 'Boxplot of number of media sources used'

# color
primary_color = '#20beff'

# Plot layout custom
paper_bgcolor_custom = '#fff'
plot_bgcolor_custom = '#fff'

# axis title
plot1_xaxes_title = 'Number of media sources used'
plot1_yaxes_title = 'Year'

plot2_xaxes_title = 'Year'
plot2_yaxes_title = 'Number of media sources used'

# axis tick custom
axis_tick_color = '#153d4d'
axis_tick_family = 'Helvetica'
axis_tick_fontsize = 12

# customize the buttons
button_background = '#fff'

# Initialize figure with subplot
plot1_title = "<span style='font-size:14px;font-family:Helvetica'>"+plot1_title_custom+ "</span>"
plot2_title = "<span style='font-size:14px;font-family:Helvetica'>"+plot2_title_custom+ "</span>"

fig = make_subplots(
    rows=2, cols=1, subplot_titles=(plot1_title,plot2_title),
    vertical_spacing=0.2
)

fig.add_trace(go.Heatmap(
    z = pt03.values,
    x = pt02.columns,
    y = pt02.index,
    hoverinfo = "none",
    text = pt02.values,
    texttemplate="<span style='color:#153d4d; font-family:Helvetica'>%{text:.2f}</span>",
    textfont={"size":12},
    colorscale=colorscale01,
    showscale=False,
    xgap = 1,
    ygap = 1), row=1,col=1)

for year in df['Year'].unique():
    temp_df = df[df['Year']==year]
    fig.add_trace(go.Box(
        y = temp_df['pltf_count'],
        name = str(year),
        hoverlabel = dict(bgcolor="white",font_size=10),
        line_color=primary_color,
        line_width=1,
        width=0.3
    ),row=2,col=1)

# updating the x axis of the plot
fig.update_xaxes(
    title_text = "<span style='color:#153d4d;font-size:14px;font-family:Helvetica'>"+ plot1_xaxes_title +"</span>",
    tickfont=dict(color=axis_tick_color,family=axis_tick_family,size=axis_tick_fontsize),tickmode='linear',constrain="domain",
    row=1, col=1)


fig.update_xaxes(
    title_text = "<span style='color:#153d4d;font-size:14px;font-family:Helvetica'>"+ plot2_xaxes_title +"</span>",
    tickfont=dict(color=axis_tick_color,family=axis_tick_family,size=axis_tick_fontsize),tickmode='linear',
    row=2, col=1)

# updating the y axis of the plot
fig.update_yaxes(
    scaleanchor = 'x',
    title_text = "<span style='color:#153d4d;font-size:14px;font-family:Helvetica'>"+ plot1_yaxes_title +"</span>",
    tickfont=dict(color=axis_tick_color,family=axis_tick_family,size=axis_tick_fontsize),autorange='reversed',
    row=1, col=1)


fig.update_yaxes(
    title_text = "<span style='color:#153d4d;font-size:14px;font-family:Helvetica'>"+ plot2_yaxes_title +"</span>",
    tickfont=dict(color=axis_tick_color,family=axis_tick_family,size=axis_tick_fontsize),
    row=2, col=1)

# layout of the plot
fig.update_layout(
    title_text="<span style='font-size:24px;font-weight:bold;font-family:Helvetica'>"+subplot_title+"</span>",
    margin = dict(t=150, pad=5),
    title_x = 0.5,
    paper_bgcolor=paper_bgcolor_custom,
    plot_bgcolor=plot_bgcolor_custom,
    showlegend=False,
    height = 800,
)

fig.add_annotation(
        dict(text="<span style='color:#153d4d;font-size:12px;font-family:Helvetica'>Note:<br>-1 means respondent didn't answer the question</span>",
        align="left",
        showarrow=False,
        x=0,y=0.5,xref="paper",yref="paper"),
)

fig.show()

Insight 
- Usage of media sources for data science topics is slowly increasing from last three years.
- More than 50 percent respondents say they use multiple media sources that report on data science topics.

### Which media sources respondent use together

In [15]:
cols = ['Blogs','Course Forums', 'Email newsletters', 'Journal Publications', 'Kaggle',
        'Other', 'Podcasts', 'Reddit', 'Slack communities','Twitter', 'YouTube']

df1 = pd.DataFrame(columns = ['Blogs','Course Forums', 'Email newsletters', 'Journal Publications', 'Kaggle',
        'Other', 'Podcasts', 'Reddit', 'Slack communities','Twitter', 'YouTube','Year'])

for year in df['Year'].unique():
    m01 = df[df['Year']==year][cols]
    m02 = sp.csr_matrix(m01.astype(int).values) # convert dataframe to sparse matrix
    m02c = m02.T * m02 # multiply sparse matrix #
    m02c.setdiag(0) # reset diagonal
    m03 = pd.DataFrame(m02c.todense(), columns=m01.columns, index= m01.columns)
    m04 = m03.div(m01.shape[0])
    m04 = m04.multiply(100)
    occ = m04.copy()
    occ['Year'] = year
    frames = [df1,occ]
    df1 = pd.concat(frames)
    
df1['Year'] = df1['Year'].astype('object')

fig = go.Figure()

fig.add_trace(go.Heatmap(
    z = df1.iloc[:,:-1][df1['Year']==2022].values,
    x = df1.iloc[:,:-1].columns,
    y = df1.iloc[:,:-1][df1['Year']==2022].index,
    name = 'Selection1',
    hoverinfo = "none",
    text = df1.iloc[:,:-1][df1['Year']==2022].values,
    texttemplate="<span style='color:#153d4d; font-family:Helvetica'>%{text:.2f}</span>",
    textfont={"size":12},
    colorscale=colorscale01,
    showscale=False,
    colorbar_orientation='v',
    colorbar_tickvals = [np.min(np.nan_to_num(pt03.values,nan=0.0)),np.max(np.nan_to_num(pt03.values,nan=0.0))],
    colorbar_ticktext = ['min','max'],
    xgap = 1,
    ygap = 1
))

# Creating the buttons
updatemenu = []
buttons1 = []

for col in df1['Year'].unique():
    buttons1.append(dict(method='restyle',
                        label=col,
                        visible=True,
                        args=[{'y':[df1.iloc[:,:-1][df1['Year']==col].index],
                               'x':[df1.iloc[:,:-1].columns],
                               'z':[df1.iloc[:,:-1][df1['Year']==col].values],
                               'text':[df1.iloc[:,:-1][df1['Year']==col].values],
                               'type':'heatmap'}, [0]],
                        )
                  )

updatemenu = list([
    dict(
        buttons=buttons1,
        direction='down',
        name = 'Selection1',
        pad={'r':10,'t':10},
        showactive=True,
        font = dict(family = 'Helvetica',size=14,color='#153d4d'),
        bgcolor="#fff",
        x=0.53,
        xanchor='left',
        y=1.12,
        yanchor='top'),
])

fig.update_layout(
    updatemenus=updatemenu,
    height= 600,
    plot_bgcolor = "#fff",
    yaxis = dict(
        scaleanchor = 'x',
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
#         autorange='reversed',
        constrain="domain"
        ),
    xaxis =  dict(
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
        tickmode='linear',
        constrain="domain",
        )
)

fig.add_annotation(
        dict(text="<span style='color:#153d4d;font-size:16px;font-family:Helvetica'>Select Year:</span>",
        align="left",
        showarrow=False,
        x=0.46,y=1.08,xref="paper",yref="paper"),
)

fig.show()

Insight
- 31.25 respondents mentioned Kaggle with YouTube as their favorite media sources for data science
- 21.54 respondents mentioned Blogs with YouTube as their favorite media sources for data science

### Distribution of media sources

In [16]:
pt01 = ksm[ksm['question'].str.contains('Q44')].pivot_table(values='question',index=['Year'],columns='answer',aggfunc='count',fill_value=0)
# renaming the columns
pt01.rename(columns={"Email newsletters (Data Elixir, O'Reilly Data & AI, etc)":"Email newsletters","Slack Communities (ods.ai, kagglenoobs, etc)":"Slack communities"},inplace=True)

pt01['total'] = ks['Year'].value_counts()
pt02 = pt01.div(pt01.total, axis=0)
pt01 = pt01.drop('total', axis=1)
pt02 = pt02.drop('total', axis=1)
pt03 = pt02.div(pt02.max(axis=1), axis=0)

trace = go.Heatmap(
    z = pt03.values,
    x = pt02.columns,
    y = pt02.index,
    hoverinfo = "none",
    text = pt02.values,
    texttemplate="<span style='color:#153d4d; font-family:Helvetica'>%{text:.2f}</span>",
    textfont={"size":12},
    colorscale=colorscale01,
    showscale=True,
    colorbar_orientation='v',
    colorbar_tickvals = [np.min(np.nan_to_num(pt03.values,nan=0.0)),np.max(np.nan_to_num(pt03.values,nan=0.0))],
    colorbar_ticktext = ['min','max'],
    xgap = 1,
    ygap = 1
)

plt_title = "<span style='font-size:24px;font-weight:bold;font-family:Helvetica'>Distribution of media sources</span>"
plt_caption = '''<span style='font-size:14px; font-family:Helvetica'>
Independent density normalization<br>
Note: Colorscale is based on row (Max value in row is blue and Min value in row is orange)
</span>'''

x_axis_title = "<span style='color:#153d4d;font-size:14px;font-family:Helvetica'>media source</span>"
y_axis_title = "<span style='color:#153d4d;font-size:14px; font-family:Helvetica'>Survey year</span>"

layout = dict(
    title = plt_title + "<br><br>" + plt_caption,
    title_x=0.5,
    margin = dict(t=180, pad=5),
#     width = 800,
    height= 600,
    plot_bgcolor = "#fff",
    yaxis = dict(
        scaleanchor = 'x',
        title_text=y_axis_title,
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
        autorange='reversed',
#         constrain="domain"
        ),
    xaxis =  dict(
        title_text = x_axis_title,
        tickangle = 90,
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
        tickmode='linear',
        # range=[-1.5,11.5],
        constrain="domain",
        )
    )

fig = go.Figure(data = trace, layout = layout)

fig.show()

Insight
- YouTube, Kaggle, and Blogs are the respondents' top 3 media sources.  The trend has been the same for the last four years.
- There is an increase in the percentage of people mentioning YouTube as their favorite media source; this year, around 50 percent of respondents have mentioned it as their favorite.

# Programming

- Q11: For how many years have you been writing code and/or programming?
- Q12: What programming languages do you use on a regular basis?
- Q13: Which of the following integrated development environments (IDE's) do you use on a regular basis?
- Q14: Do you use any of the following hosted notebook products?
- Q15: Do you use any of the following data visualization libraries on a regular basis?

## Q11: For how many years have you been writing code and/or programming?

### Distirbution of coding experience

In [17]:
# Cleaning the data
ksm['answer'][(ksm['question'].str.contains('Q11'))&(ksm['answer']=='No coding')] = 'Never coded'

pt01 = ksm[ksm['question'].str.contains('Q11')].pivot_table(values='question',index=['Year'],columns='answer',aggfunc='count',fill_value=0)
pt01['total'] = pt01.sum(axis=1)
pt02 = pt01.div(pt01.total, axis=0).fillna(0)
pt02 = pt02.drop('total', axis=1)
pt03 = pt02.div(pt02.max(axis=1), axis=0)

# ploting the graph
trace = go.Heatmap(
    z = pt03.values,
    x = pt02.columns,
    y = pt02.index,
    hoverinfo = "none",
    text = pt02.values,
    texttemplate="<span style='color:#153d4d; font-family:Helvetica'>%{text:.2f}</span>",
    textfont={"size":12},
    colorscale=colorscale01,
    showscale=True,
    colorbar_orientation='v',
    colorbar_tickvals = [np.min(np.nan_to_num(pt03.values,nan=0.0)),np.max(np.nan_to_num(pt03.values,nan=0.0))],
    colorbar_ticktext = ['min','max'],
    xgap = 1,
    ygap = 1
)

plt_title = "<span style='font-size:24px;font-weight:bold;font-family:Helvetica'>Distribution of coding experience</span>"
plt_caption = '''<span style='font-size:14px; font-family:Helvetica'>
Independent density normalization<br>
Note: Colorscale is based on row (Max value in row is blue and Min value in row is orange)
</span>'''

x_axis_title = "<span style='color:#153d4d;font-size:14px;font-family:Helvetica'>Coding experience</span>"
y_axis_title = "<span style='color:#153d4d;font-size:14px; font-family:Helvetica'>Survey year</span>"

layout = dict(
    title = plt_title + "<br><br>" + plt_caption,
    title_x=0.5,
    margin = dict(t=180, pad=5),
#     width = 800,
    height= 500,
    plot_bgcolor = "#fff",
    yaxis = dict(
        scaleanchor = 'x',
        title_text=y_axis_title,
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
        autorange='reversed',
#         constrain="domain"
        ),
    xaxis =  dict(
        title_text = x_axis_title,
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
#         tickmode='linear',
        # range=[-1.5,11.5],
        constrain="domain",
        categoryorder='array',
        categoryarray =['No Answer','Never coded','< 1 year','1-2 years', '1-3 years','3-5 years','5-10 years','10-20 years', '20+ years']
        )
    )

fig = go.Figure(data = trace, layout = layout)

fig.show()

Note:
- The option in the supplementary notes is 1-2 years, But the option in the CSV file is 1-3 years. So I haven't changed the answer. For analysis, we can assume this as 1-2 years.

Insight:
- We can see an increase in the percentage of respondents having coding experience of fewer than two years, contributing to almost 50 per cent of the overall response in the last two years. This is because the number of students taking the survey increases yearly.

## Q12: What programming languages do you use on a regular basis.

### Number of programming language used by respondent

In [18]:
# facts of the dataset
facts = ['sid', 'Year', 'age', 'gender', 'country', 'role',]

# columns for the questions
cols = ['Q12_1', 'Q12_2', 'Q12_3', 'Q12_4', 'Q12_5',
       'Q12_6', 'Q12_7', 'Q12_8', 'Q12_9', 'Q12_10', 'Q12_11', 'Q12_12',
       'Q12_13', 'Q12_14', 'Q12_15','Q12_X']

tbl01 = ksm[ksm['question'].str.contains('Q12')].pivot_table(values='question',index=['sid','Year'],columns='answer',aggfunc='count',fill_value=0)

tbl01['pltf_count'] = tbl01.sum(axis=1) - (tbl01['No Answer'] * 2) - tbl01['None']
tbl01.reset_index(inplace=True)

# merging the age,country and role on sid and year
df = pd.merge(ks[facts],tbl01,how='inner',on=['sid','Year'])

pt01 = df.pivot_table(values='sid',index='Year',columns='pltf_count',aggfunc='count',fill_value=0)
pt01['total'] = pt01.sum(axis=1)
pt02 = pt01.div(pt01.total, axis=0).fillna(0)
pt01 = pt01.drop('total', axis=1)
pt02 = pt02.drop('total', axis=1)

pt03 = pt02.div(pt02.max(axis=1), axis=0)

# subplot main title
subplot_title = 'Number of language used on regular basis'
# plot title
plot1_title_custom = 'Distribution of the number of language used by the respondent<br>Note: Colorscale is based on row (Max value in row is blue and Min value in row is orange'
plot2_title_custom = 'Boxplot of number of language used'

# color
primary_color = '#20beff'

# Plot layout custom
paper_bgcolor_custom = '#fff'
plot_bgcolor_custom = '#fff'

# axis title
plot1_xaxes_title = 'Number of language used'
plot1_yaxes_title = 'Year'

plot2_xaxes_title = 'Year'
plot2_yaxes_title = 'Number of language used'

# axis tick custom
axis_tick_color = '#153d4d'
axis_tick_family = 'Helvetica'
axis_tick_fontsize = 12

# customize the buttons
button_background = '#fff'

# Initialize figure with subplot
plot1_title = "<span style='font-size:14px;font-family:Helvetica'>"+plot1_title_custom+ "</span>"
plot2_title = "<span style='font-size:14px;font-family:Helvetica'>"+plot2_title_custom+ "</span>"

fig = make_subplots(
    rows=2, cols=1, subplot_titles=(plot1_title,plot2_title),
    vertical_spacing=0.2
)

fig.add_trace(go.Heatmap(
    z = pt03.values,
    x = pt02.columns,
    y = pt02.index,
    hoverinfo = "none",
    text = pt02.values,
    texttemplate="<span style='color:#153d4d; font-family:Helvetica'>%{text:.2f}</span>",
    textfont={"size":12},
    colorscale=colorscale01,
    showscale=False,
    xgap = 1,
    ygap = 1), row=1,col=1)

for year in df['Year'].unique():
    temp_df = df[df['Year']==year]
    fig.add_trace(go.Box(
        y = temp_df['pltf_count'],
        name = str(year),
        hoverlabel = dict(bgcolor="white",font_size=10),
        line_color=primary_color,
        line_width=1,
        width=0.3
    ),row=2,col=1)

# updating the x axis of the plot
fig.update_xaxes(
    title_text = "<span style='color:#153d4d;font-size:14px;font-family:Helvetica'>"+ plot1_xaxes_title +"</span>",
    tickfont=dict(color=axis_tick_color,family=axis_tick_family,size=axis_tick_fontsize),tickmode='linear',constrain="domain",
    row=1, col=1)


fig.update_xaxes(
    title_text = "<span style='color:#153d4d;font-size:14px;font-family:Helvetica'>"+ plot2_xaxes_title +"</span>",
    tickfont=dict(color=axis_tick_color,family=axis_tick_family,size=axis_tick_fontsize),tickmode='linear',
    row=2, col=1)

# updating the y axis of the plot
fig.update_yaxes(
    scaleanchor = 'x',
    title_text = "<span style='color:#153d4d;font-size:14px;font-family:Helvetica'>"+ plot1_yaxes_title +"</span>",
    tickfont=dict(color=axis_tick_color,family=axis_tick_family,size=axis_tick_fontsize),autorange='reversed',
    row=1, col=1)


fig.update_yaxes(
    title_text = "<span style='color:#153d4d;font-size:14px;font-family:Helvetica'>"+ plot2_yaxes_title +"</span>",
    tickfont=dict(color=axis_tick_color,family=axis_tick_family,size=axis_tick_fontsize),
    row=2, col=1)

# layout of the plot
fig.update_layout(
    title_text="<span style='font-size:24px;font-weight:bold;font-family:Helvetica'>"+subplot_title+"</span>",
    margin = dict(t=150, pad=5),
    title_x = 0.5,
    paper_bgcolor=paper_bgcolor_custom,
    plot_bgcolor=plot_bgcolor_custom,
    showlegend=False,
    height = 800,
)

fig.add_annotation(
        dict(text="<span style='color:#153d4d;font-size:12px;font-family:Helvetica'>Note:<br>-1 means respondent didn't answer the question</span>",
        align="left",
        showarrow=False,
        x=0,y=0.5,xref="paper",yref="paper"),
)

fig.show()

Insight:
- Every year more than 50 percent of the respondents have mentioned that they use 1 to 3 languages regularly.
- Every year most respondents mentioned that they use two languages regularly.

### Which language people use together on regular basis

In [19]:
cols = ['Bash', 'C', 'C#','C++', 'Go', 'Java', 'Javascript', 'Julia', 'MATLAB','Other', 'PHP', 'Python', 'R', 'SQL']

df1 = pd.DataFrame(columns = ['Bash', 'C', 'C#','C++', 'Go', 'Java', 'Javascript', 'Julia', 'MATLAB','Other', 'PHP', 'Python', 'R', 'SQL','Year'])

for year in df['Year'].unique():
    m01 = df[df['Year']==year][cols]
    m02 = sp.csr_matrix(m01.astype(int).values) # convert dataframe to sparse matrix
    m02c = m02.T * m02 # multiply sparse matrix #
    m02c.setdiag(0) # reset diagonal
    m03 = pd.DataFrame(m02c.todense(), columns=m01.columns, index= m01.columns)
    m04 = m03.div(m01.shape[0])
    m04 = m04.multiply(100)
    occ = m04.copy()
    occ['Year'] = year
    frames = [df1,occ]
    df1 = pd.concat(frames)
    
df1['Year'] = df1['Year'].astype('object')

fig = go.Figure()

fig.add_trace(go.Heatmap(
    z = df1.iloc[:,:-1][df1['Year']==2022].values,
    x = df1.iloc[:,:-1].columns,
    y = df1.iloc[:,:-1][df1['Year']==2022].index,
    name = 'Selection1',
    hoverinfo = "none",
    text = df1.iloc[:,:-1][df1['Year']==2022].values,
    texttemplate="<span style='color:#153d4d; font-family:Helvetica'>%{text:.2f}</span>",
    textfont={"size":12},
    colorscale=colorscale01,
    showscale=False,
    colorbar_orientation='v',
    colorbar_tickvals = [np.min(np.nan_to_num(pt03.values,nan=0.0)),np.max(np.nan_to_num(pt03.values,nan=0.0))],
    colorbar_ticktext = ['min','max'],
    xgap = 1,
    ygap = 1
))

# Creating the buttons
updatemenu = []
buttons1 = []

for col in df1['Year'].unique():
    buttons1.append(dict(method='restyle',
                        label=col,
                        visible=True,
                        args=[{'y':[df1.iloc[:,:-1][df1['Year']==col].index],
                               'x':[df1.iloc[:,:-1].columns],
                               'z':[df1.iloc[:,:-1][df1['Year']==col].values],
                               'text':[df1.iloc[:,:-1][df1['Year']==col].values],
                               'type':'heatmap'}, [0]],
                        )
                  )

updatemenu = list([
    dict(
        buttons=buttons1,
        direction='down',
        name = 'Selection1',
        pad={'r':10,'t':10},
        showactive=True,
        font = dict(family = 'Helvetica',size=14,color='#153d4d'),
        bgcolor="#fff",
        x=0.53,
        xanchor='left',
        y=1.12,
        yanchor='top'),
])

fig.update_layout(
    updatemenus=updatemenu,
    height= 650,
    plot_bgcolor = "#fff",
    yaxis = dict(
        scaleanchor = 'x',
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
#         autorange='reversed',
        constrain="domain"
        ),
    xaxis =  dict(
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
        tickmode='linear',
        constrain="domain",
        )
)

fig.add_annotation(
        dict(text="<span style='color:#153d4d;font-size:16px;font-family:Helvetica'>Select Year:</span>",
        align="left",
        showarrow=False,
        x=0.46,y=1.08,xref="paper",yref="paper"),
)

fig.show()

Insight
- Around 36.23 percent of respondents used Python and SQL together regularly. SQL securing first place for Python users. The trend has been the same for the last five years.
- 17.13 percent of respondents used C++ with Python. Positioning C++ at second place after SQL.
- R is the second most-used language for the respondents who use SQL regularly.

### Distribution of programming language used by respondent on regular basis

In [20]:
pt01 = ksm[ksm['question'].str.contains('Q12')].pivot_table(values='question',index=['Year'],columns='answer',aggfunc='count',fill_value=0)
pt01['total'] = ks['Year'].value_counts()
pt02 = pt01.div(pt01.total, axis=0).fillna(0)
pt02 = pt02.drop('total', axis=1)
pt03 = pt02.div(pt02.max(axis=1), axis=0)

# ploting the graph
trace = go.Heatmap(
    z = pt03.values,
    x = pt02.columns,
    y = pt02.index,
    hoverinfo = "none",
    text = pt02.values,
    texttemplate="<span style='color:#153d4d; font-family:Helvetica'>%{text:.2f}</span>",
    textfont={"size":12},
    colorscale=colorscale01,
    showscale=True,
    colorbar_orientation='v',
    colorbar_tickvals = [np.min(np.nan_to_num(pt03.values,nan=0.0)),np.max(np.nan_to_num(pt03.values,nan=0.0))],
    colorbar_ticktext = ['min','max'],
    xgap = 1,
    ygap = 1
)

plt_title = "<span style='font-size:24px;font-weight:bold;font-family:Helvetica'>Distribution of programming language</span>"
plt_caption = '''<span style='font-size:14px; font-family:Helvetica'>
Independent density normalization<br>
Note: Colorscale is based on row (Max value in row is blue and Min value in row is orange)
</span>'''

x_axis_title = "<span style='color:#153d4d;font-size:14px;font-family:Helvetica'>Programming Language</span>"
y_axis_title = "<span style='color:#153d4d;font-size:14px; font-family:Helvetica'>Survey year</span>"

layout = dict(
    title = plt_title + "<br><br>" + plt_caption,
    title_x=0.5,
    margin = dict(t=180, pad=5),
#     width = 800,
    height= 500,
    plot_bgcolor = "#fff",
    yaxis = dict(
        scaleanchor = 'x',
        title_text=y_axis_title,
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
        autorange='reversed',
        ),
    xaxis =  dict(
        title_text = x_axis_title,
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
        constrain="domain",
        )
    )

fig = go.Figure(data = trace, layout = layout)

fig.show()

Insight
- Python is the most used language. The trend has been the same for the last five years.
- SQL remained the second most used language for the last five years.
- The percentage of people using R is reducing every year.

## Q13: Which of the following integrated development environments (IDEs) do you use regularly?

### Number of IDEs used by respondents regularly

In [21]:
# facts of the dataset
facts = ['sid', 'Year', 'age', 'gender', 'country', 'role',]

# columns for the questions
cols = ['Q13_1', 'Q13_2', 'Q13_3', 'Q13_4',
       'Q13_5', 'Q13_6', 'Q13_7', 'Q13_8', 'Q13_9', 'Q13_10', 'Q13_11',
       'Q13_12', 'Q13_13', 'Q13_14','Q13_X']

tbl01 = ksm[ksm['question'].str.contains('Q13')].pivot_table(values='question',index=['sid','Year'],columns='answer',aggfunc='count',fill_value=0)

tbl01['pltf_count'] = tbl01.sum(axis=1) - (tbl01['No Answer'] * 2) - tbl01['None']
tbl01.reset_index(inplace=True)

# merging the age,country and role on sid and year
df = pd.merge(ks[facts],tbl01,how='inner',on=['sid','Year'])

pt01 = df.pivot_table(values='sid',index='Year',columns='pltf_count',aggfunc='count',fill_value=0)
pt01['total'] = pt01.sum(axis=1)
pt02 = pt01.div(pt01.total, axis=0).fillna(0)
pt01 = pt01.drop('total', axis=1)
pt02 = pt02.drop('total', axis=1)

pt03 = pt02.div(pt02.max(axis=1), axis=0)

# subplot main title
subplot_title = 'Number of IDEs used on regular basis'
# plot title
plot1_title_custom = 'Distribution of the number of IDEs used by the respondent<br>Note: Colorscale is based on row (Max value in row is blue and Min value in row is orange'
plot2_title_custom = 'Boxplot of number of IDEs used'

# color
primary_color = '#20beff'

# Plot layout custom
paper_bgcolor_custom = '#fff'
plot_bgcolor_custom = '#fff'

# axis title
plot1_xaxes_title = 'Number of IDEs used'
plot1_yaxes_title = 'Year'

plot2_xaxes_title = 'Year'
plot2_yaxes_title = 'Number of IDEs used'

# axis tick custom
axis_tick_color = '#153d4d'
axis_tick_family = 'Helvetica'
axis_tick_fontsize = 12

# customize the buttons
button_background = '#fff'

# Initialize figure with subplot
plot1_title = "<span style='font-size:14px;font-family:Helvetica'>"+plot1_title_custom+ "</span>"
plot2_title = "<span style='font-size:14px;font-family:Helvetica'>"+plot2_title_custom+ "</span>"

fig = make_subplots(
    rows=2, cols=1, subplot_titles=(plot1_title,plot2_title),
    vertical_spacing=0.2
)

fig.add_trace(go.Heatmap(
    z = pt03.values,
    x = pt02.columns,
    y = pt02.index,
    hoverinfo = "none",
    text = pt02.values,
    texttemplate="<span style='color:#153d4d; font-family:Helvetica'>%{text:.2f}</span>",
    textfont={"size":12},
    colorscale=colorscale01,
    showscale=False,
    xgap = 1,
    ygap = 1), row=1,col=1)

for year in df['Year'].unique():
    temp_df = df[df['Year']==year]
    fig.add_trace(go.Box(
        y = temp_df['pltf_count'],
        name = str(year),
        hoverlabel = dict(bgcolor="white",font_size=10),
        line_color=primary_color,
        line_width=1,
        width=0.3
    ),row=2,col=1)

# updating the x axis of the plot
fig.update_xaxes(
    title_text = "<span style='color:#153d4d;font-size:14px;font-family:Helvetica'>"+ plot1_xaxes_title +"</span>",
    tickfont=dict(color=axis_tick_color,family=axis_tick_family,size=axis_tick_fontsize),tickmode='linear',constrain="domain",
    row=1, col=1)


fig.update_xaxes(
    title_text = "<span style='color:#153d4d;font-size:14px;font-family:Helvetica'>"+ plot2_xaxes_title +"</span>",
    tickfont=dict(color=axis_tick_color,family=axis_tick_family,size=axis_tick_fontsize),tickmode='linear',
    row=2, col=1)

# updating the y axis of the plot
fig.update_yaxes(
    scaleanchor = 'x',
    title_text = "<span style='color:#153d4d;font-size:14px;font-family:Helvetica'>"+ plot1_yaxes_title +"</span>",
    tickfont=dict(color=axis_tick_color,family=axis_tick_family,size=axis_tick_fontsize),autorange='reversed',
    row=1, col=1)


fig.update_yaxes(
    title_text = "<span style='color:#153d4d;font-size:14px;font-family:Helvetica'>"+ plot2_yaxes_title +"</span>",
    tickfont=dict(color=axis_tick_color,family=axis_tick_family,size=axis_tick_fontsize),
    row=2, col=1)

# layout of the plot
fig.update_layout(
    title_text="<span style='font-size:24px;font-weight:bold;font-family:Helvetica'>"+subplot_title+"</span>",
    margin = dict(t=150, pad=5),
    title_x = 0.5,
    paper_bgcolor=paper_bgcolor_custom,
    plot_bgcolor=plot_bgcolor_custom,
    showlegend=False,
    height = 800,
)

fig.add_annotation(
        dict(text="<span style='color:#153d4d;font-size:12px;font-family:Helvetica'>Note:<br>-1 means respondent didn't answer the question</span>",
        align="left",
        showarrow=False,
        x=0,y=0.5,xref="paper",yref="paper"),
)

fig.show()

### Which IDEs respondents used together

In [22]:
cols = ['IntelliJ',
       'Jupyter Notebook', 'JupyterLab', 'MATLAB',
       'Notepad++', 'Other', 'PyCharm', 'RStudio', 'Spyder', 'Sublime Text',
       'Vim / Emacs', 'Visual Studio', 'Visual Studio Code (VSCode)']

df1 = pd.DataFrame(columns = ['IntelliJ',
       'Jupyter Notebook', 'JupyterLab', 'MATLAB',
       'Notepad++', 'Other', 'PyCharm', 'RStudio', 'Spyder', 'Sublime Text',
       'Vim / Emacs', 'Visual Studio', 'Visual Studio Code (VSCode)','Year'])

for year in df['Year'].unique():
    m01 = df[df['Year']==year][cols]
    m02 = sp.csr_matrix(m01.astype(int).values) # convert dataframe to sparse matrix
    m02c = m02.T * m02 # multiply sparse matrix #
    m02c.setdiag(0) # reset diagonal
    m03 = pd.DataFrame(m02c.todense(), columns=m01.columns, index= m01.columns)
    m04 = m03.div(m01.shape[0])
    m04 = m04.multiply(100)
    occ = m04.copy()
    occ['Year'] = year
    frames = [df1,occ]
    df1 = pd.concat(frames)
    
df1['Year'] = df1['Year'].astype('object')

fig = go.Figure()

fig.add_trace(go.Heatmap(
    z = df1.iloc[:,:-1][df1['Year']==2022].values,
    x = df1.iloc[:,:-1].columns,
    y = df1.iloc[:,:-1][df1['Year']==2022].index,
    name = 'Selection1',
    hoverinfo = "none",
    text = df1.iloc[:,:-1][df1['Year']==2022].values,
    texttemplate="<span style='color:#153d4d; font-family:Helvetica'>%{text:.2f}</span>",
    textfont={"size":10},
    colorscale=colorscale01,
    showscale=False,
    colorbar_orientation='v',
    colorbar_tickvals = [np.min(np.nan_to_num(pt03.values,nan=0.0)),np.max(np.nan_to_num(pt03.values,nan=0.0))],
    colorbar_ticktext = ['min','max'],
    xgap = 1,
    ygap = 1
))

# Creating the buttons
updatemenu = []
buttons1 = []

for col in df1['Year'].unique():
    buttons1.append(dict(method='restyle',
                        label=col,
                        visible=True,
                        args=[{'y':[df1.iloc[:,:-1][df1['Year']==col].index],
                               'x':[df1.iloc[:,:-1].columns],
                               'z':[df1.iloc[:,:-1][df1['Year']==col].values],
                               'text':[df1.iloc[:,:-1][df1['Year']==col].values],
                               'type':'heatmap'}, [0]],
                        )
                  )

updatemenu = list([
    dict(
        buttons=buttons1,
        direction='down',
        name = 'Selection1',
        pad={'r':10,'t':10},
        showactive=True,
        font = dict(family = 'Helvetica',size=14,color='#153d4d'),
        bgcolor="#fff",
        x=0.53,
        xanchor='left',
        y=1.12,
        yanchor='top'),
])

fig.update_layout(
    updatemenus=updatemenu,
    height= 650,
    plot_bgcolor = "#fff",
    yaxis = dict(
        scaleanchor = 'x',
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
#         autorange='reversed',
        constrain="domain"
        ),
    xaxis =  dict(
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
        tickmode='linear',
        constrain="domain",
        )
)

fig.add_annotation(
        dict(text="<span style='color:#153d4d;font-size:16px;font-family:Helvetica'>Select Year:</span>",
        align="left",
        showarrow=False,
        x=0.46,y=1.08,xref="paper",yref="paper"),
)

fig.show()

Insight
- 28.84 percent of respondents who used VSCode also mentioned Jupyter Notebooks.

### Distribution of IDEs used by respondents regularly

In [23]:
pt01 = ksm[ksm['question'].str.contains('Q13')].pivot_table(values='question',index=['Year'],columns='answer',aggfunc='count',fill_value=0)
pt01['total'] = ks['Year'].value_counts()
pt02 = pt01.div(pt01.total, axis=0).fillna(0)
pt02 = pt02.drop('total', axis=1)
pt03 = pt02.div(pt02.max(axis=1), axis=0)

# ploting the graph
trace = go.Heatmap(
    z = pt03.values,
    x = pt02.columns,
    y = pt02.index,
    hoverinfo = "none",
    text = pt02.values,
    texttemplate="<span style='color:#153d4d; font-family:Helvetica'>%{text:.2f}</span>",
    textfont={"size":12},
    colorscale=colorscale01,
    showscale=True,
    colorbar_orientation='v',
    colorbar_tickvals = [np.min(np.nan_to_num(pt03.values,nan=0.0)),np.max(np.nan_to_num(pt03.values,nan=0.0))],
    colorbar_ticktext = ['min','max'],
    xgap = 1,
    ygap = 1
)

plt_title = "<span style='font-size:24px;font-weight:bold;font-family:Helvetica'>Distribution of IDEs</span>"
plt_caption = '''<span style='font-size:14px; font-family:Helvetica'>
Independent density normalization<br>
Note: Colorscale is based on row (Max value in row is blue and Min value in row is orange)
</span>'''

x_axis_title = "<span style='color:#153d4d;font-size:14px;font-family:Helvetica'>IDEs</span>"
y_axis_title = "<span style='color:#153d4d;font-size:14px; font-family:Helvetica'>Survey year</span>"

layout = dict(
    title = plt_title + "<br><br>" + plt_caption,
    title_x=0.5,
    margin = dict(t=180, pad=5),
#     width = 800,
    height= 500,
    plot_bgcolor = "#fff",
    yaxis = dict(
        scaleanchor = 'x',
        title_text=y_axis_title,
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
        autorange='reversed',
        ),
    xaxis =  dict(
        title_text = x_axis_title,
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
        constrain="domain",
        )
    )

fig = go.Figure(data = trace, layout = layout)

fig.show()

Note:
There was a slight difference in options provided in previous surveys. We can see that the Jupyter Notebook was the most mentioned IDE followed by the VSCode.

## Q14: Do you use any of the following hosted notebook products?

### Number of hosted notebook products used by respondents

In [24]:
# facts of the dataset
facts = ['sid', 'Year', 'age', 'gender', 'country', 'role',]

# columns for the questions
cols = ['Q14_1', 'Q14_2', 'Q14_3', 'Q14_4',
       'Q14_5', 'Q14_6', 'Q14_7', 'Q14_8', 'Q14_9', 'Q14_10', 'Q14_11',
       'Q14_12', 'Q14_13', 'Q14_14', 'Q14_15', 'Q14_16','Q14_X']

tbl01 = ksm[ksm['question'].str.contains('Q14')].pivot_table(values='question',index=['sid','Year'],columns='answer',aggfunc='count',fill_value=0)

tbl01['pltf_count'] = tbl01.sum(axis=1) - (tbl01['No Answer'] * 2) - tbl01['None']
tbl01.reset_index(inplace=True)

# merging the age,country and role on sid and year
df = pd.merge(ks[facts],tbl01,how='inner',on=['sid','Year'])

pt01 = df.pivot_table(values='sid',index='Year',columns='pltf_count',aggfunc='count',fill_value=0)
pt01['total'] = pt01.sum(axis=1)
pt02 = pt01.div(pt01.total, axis=0).fillna(0)
pt01 = pt01.drop('total', axis=1)
pt02 = pt02.drop('total', axis=1)

pt03 = pt02.div(pt02.max(axis=1), axis=0)

# ploting the graph
trace = go.Heatmap(
    z = pt03.values,
    x = pt02.columns,
    y = pt02.index,
    hoverinfo = "none",
    text = pt02.values,
    texttemplate="<span style='color:#153d4d; font-family:Helvetica'>%{text:.2f}</span>",
    textfont={"size":12},
    colorscale=colorscale01,
    showscale=True,
    colorbar_orientation='v',
    colorbar_tickvals = [np.min(np.nan_to_num(pt03.values,nan=0.0)),np.max(np.nan_to_num(pt03.values,nan=0.0))],
    colorbar_ticktext = ['min','max'],
    xgap = 1,
    ygap = 1
)

plt_title = "<span style='font-size:24px;font-weight:bold;font-family:Helvetica'>Distribution of number of hosted notebook used</span>"
plt_caption = '''<span style='font-size:14px; font-family:Helvetica'>
Independent density normalization<br>
Note: Colorscale is based on row (Max value in row is blue and Min value in row is orange)
</span>'''

x_axis_title = "<span style='color:#153d4d;font-size:14px;font-family:Helvetica'>Hosted notebook</span>"
y_axis_title = "<span style='color:#153d4d;font-size:14px; font-family:Helvetica'>Survey year</span>"

layout = dict(
    title = plt_title + "<br><br>" + plt_caption,
    title_x=0.5,
    margin = dict(t=180, pad=5),
#     width = 800,
    height= 500,
    plot_bgcolor = "#fff",
    yaxis = dict(
        scaleanchor = 'x',
        title_text=y_axis_title,
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
        autorange='reversed',
        ),
    xaxis =  dict(
        title_text = x_axis_title,
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
        tickmode='linear',
        constrain="domain",
        )
    )

fig = go.Figure(data = trace, layout = layout)

fig.show

<bound method BaseFigure.show of Figure({
    'data': [{'colorbar': {'orientation': 'v', 'ticktext': ['min', 'max'], 'tickvals': [0.0, 1.0]},
              'colorscale': [[0.0, 'rgb(251, 133, 0)'], [0.1111111111111111,
                             'rgb(253, 194, 128)'], [0.2222222222222222, 'rgb(254,
                             225, 192)'], [0.3333333333333333, 'rgb(255, 240,
                             224)'], [0.4444444444444444, 'rgb(255, 255, 255)'],
                             [0.5555555555555556, 'rgb(242, 251, 255)'],
                             [0.6666666666666666, 'rgb(228, 247, 255)'],
                             [0.7777777777777778, 'rgb(200, 239, 255)'],
                             [0.8888888888888888, 'rgb(144, 223, 255)'], [1.0,
                             'rgb(32, 190, 255)']],
              'hoverinfo': 'none',
              'showscale': True,
              'text': array([[3.17071126e-01, 3.18412339e-01, 2.57177585e-01, 9.45974265e-02,
                           

### Which hosted notebook products respondents used together

In [25]:
cols = ['Amazon EMR Notebooks', 'Amazon Sagemaker Studio',
       'Amazon Sagemaker Studio Lab', 'Azure Notebooks', 'Code Ocean',
       'Colab Notebooks', 'Databricks Collaborative Notebooks',
       'Deepnote Notebooks', 'Google Cloud Vertex AI Workbench',
       'Gradient Notebooks', 'Hex Workspaces', 'IBM Watson Studio',
       'Kaggle Notebooks','Noteable Notebooks', 'Other']

df1 = pd.DataFrame(columns = ['Amazon EMR Notebooks', 'Amazon Sagemaker Studio',
       'Amazon Sagemaker Studio Lab', 'Azure Notebooks', 'Code Ocean',
       'Colab Notebooks', 'Databricks Collaborative Notebooks',
       'Deepnote Notebooks', 'Google Cloud Vertex AI Workbench',
       'Gradient Notebooks', 'Hex Workspaces', 'IBM Watson Studio',
       'Kaggle Notebooks','Noteable Notebooks', 'Other','Year'])

for year in df['Year'].unique():
    m01 = df[df['Year']==year][cols]
    m02 = sp.csr_matrix(m01.astype(int).values) # convert dataframe to sparse matrix
    m02c = m02.T * m02 # multiply sparse matrix #
    m02c.setdiag(0) # reset diagonal
    m03 = pd.DataFrame(m02c.todense(), columns=m01.columns, index= m01.columns)
    m04 = m03.div(m01.shape[0])
    m04 = m04.multiply(100)
    occ = m04.copy()
    occ['Year'] = year
    frames = [df1,occ]
    df1 = pd.concat(frames)
    
df1['Year'] = df1['Year'].astype('object')

fig = go.Figure()

fig.add_trace(go.Heatmap(
    z = df1.iloc[:,:-1][df1['Year']==2022].values,
    x = df1.iloc[:,:-1].columns,
    y = df1.iloc[:,:-1][df1['Year']==2022].index,
    name = 'Selection1',
    hoverinfo = "none",
    text = df1.iloc[:,:-1][df1['Year']==2022].values,
    texttemplate="<span style='color:#153d4d; font-family:Helvetica'>%{text:.2f}</span>",
    textfont={"size":8},
    colorscale=colorscale01,
    showscale=False,
    colorbar_orientation='v',
    colorbar_tickvals = [np.min(np.nan_to_num(pt03.values,nan=0.0)),np.max(np.nan_to_num(pt03.values,nan=0.0))],
    colorbar_ticktext = ['min','max'],
    xgap = 1,
    ygap = 1
))

# Creating the buttons
updatemenu = []
buttons1 = []

for col in df1['Year'].unique():
    buttons1.append(dict(method='restyle',
                        label=col,
                        visible=True,
                        args=[{'y':[df1.iloc[:,:-1][df1['Year']==col].index],
                               'x':[df1.iloc[:,:-1].columns],
                               'z':[df1.iloc[:,:-1][df1['Year']==col].values],
                               'text':[df1.iloc[:,:-1][df1['Year']==col].values],
                               'type':'heatmap'}, [0]],
                        )
                  )

updatemenu = list([
    dict(
        buttons=buttons1,
        direction='down',
        name = 'Selection1',
        pad={'r':10,'t':10},
        showactive=True,
        font = dict(family = 'Helvetica',size=14,color='#153d4d'),
        bgcolor="#fff",
        x=0.53,
        xanchor='left',
        y=1.12,
        yanchor='top'),
])

fig.update_layout(
    updatemenus=updatemenu,
    height= 700,
    plot_bgcolor = "#fff",
    yaxis = dict(
        scaleanchor = 'x',
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
#         autorange='reversed',
        constrain="domain"
        ),
    xaxis =  dict(
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
        tickmode='linear',
        constrain="domain",
        )
)

fig.add_annotation(
        dict(text="<span style='color:#153d4d;font-size:16px;font-family:Helvetica'>Select Year:</span>",
        align="left",
        showarrow=False,
        x=0.46,y=1.08,xref="paper",yref="paper"),
)

fig.show()

Insight
- 18.79 percent of respondents who used Kaggle notebooks also mentioned Colab notebooks.

### Distribution of hosted notebook products

In [26]:
pt01 = ksm[ksm['question'].str.contains('Q14')].pivot_table(values='question',index=['Year'],columns='answer',aggfunc='count',fill_value=0)
pt01['total'] = ks['Year'].value_counts()
pt02 = pt01.div(pt01.total, axis=0).fillna(0)
pt02 = pt02.drop('total', axis=1)
pt03 = pt02.div(pt02.max(axis=1), axis=0)

# ploting the graph
trace = go.Heatmap(
    z = pt03.values,
    x = pt02.columns,
    y = pt02.index,
    hoverinfo = "none",
    text = pt02.values,
    texttemplate="<span style='color:#153d4d; font-family:Helvetica'>%{text:.2f}</span>",
    textfont={"size":12},
    colorscale=colorscale01,
    showscale=True,
    colorbar_orientation='v',
    colorbar_tickvals = [np.min(np.nan_to_num(pt03.values,nan=0.0)),np.max(np.nan_to_num(pt03.values,nan=0.0))],
    colorbar_ticktext = ['min','max'],
    xgap = 1,
    ygap = 1
)

plt_title = "<span style='font-size:24px;font-weight:bold;font-family:Helvetica'>Distribution of hosted notebooks</span>"
plt_caption = '''<span style='font-size:14px; font-family:Helvetica'>
Independent density normalization<br>
Note: Colorscale is based on row (Max value in row is blue and Min value in row is orange)
</span>'''

x_axis_title = "<span style='color:#153d4d;font-size:14px;font-family:Helvetica'>hosted notebooks</span>"
y_axis_title = "<span style='color:#153d4d;font-size:14px; font-family:Helvetica'>Survey year</span>"

layout = dict(
    title = plt_title + "<br><br>" + plt_caption,
    title_x=0.5,
    margin = dict(t=180, pad=5),
#     width = 800,
    height= 500,
    plot_bgcolor = "#fff",
    yaxis = dict(
        scaleanchor = 'x',
        title_text=y_axis_title,
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
        constrain="domain",
        autorange='reversed',
        ),
    xaxis =  dict(
        title_text = x_axis_title,
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
        constrain="domain",
        )
    )

fig = go.Figure(data = trace, layout = layout)

fig.show()

Insight
- Colab Notebooks are the most used notebooks. The percentage of people mentioning Colab Notebooks was increasing last four years. Though there sight reduction of 1 percent this year.
- The percentage of people mentioning Kaggle Notebooks was increasing last four years. Though this year there is a 6 percent reduction.

## Q15: Do you use any of the following data visualization libraries on a regular basis?

### Number of data visualization libraries used regularly

In [27]:
# facts of the dataset
facts = ['sid', 'Year', 'age', 'gender', 'country', 'role',]

# columns for the questions
cols = ['Q15_1', 'Q15_2','Q15_3', 'Q15_4', 'Q15_5', 'Q15_6', 'Q15_7', 'Q15_8', 'Q15_9',
       'Q15_10', 'Q15_11', 'Q15_12', 'Q15_13', 'Q15_14', 'Q15_15','Q15_X']

tbl01 = ksm[ksm['question'].str.contains('Q15')].pivot_table(values='question',index=['sid','Year'],columns='answer',aggfunc='count',fill_value=0)

tbl01['pltf_count'] = tbl01.sum(axis=1) - (tbl01['No Answer'] * 2) - tbl01['None']
tbl01.reset_index(inplace=True)

# merging the age,country and role on sid and year
df = pd.merge(ks[facts],tbl01,how='inner',on=['sid','Year'])

pt01 = df.pivot_table(values='sid',index='Year',columns='pltf_count',aggfunc='count',fill_value=0)
pt01['total'] = pt01.sum(axis=1)
pt02 = pt01.div(pt01.total, axis=0).fillna(0)
pt01 = pt01.drop('total', axis=1)
pt02 = pt02.drop('total', axis=1)

pt03 = pt02.div(pt02.max(axis=1), axis=0)

# ploting the graph
trace = go.Heatmap(
    z = pt03.values,
    x = pt02.columns,
    y = pt02.index,
    hoverinfo = "none",
    text = pt02.values,
    texttemplate="<span style='color:#153d4d; font-family:Helvetica'>%{text:.2f}</span>",
    textfont={"size":12},
    colorscale=colorscale01,
    showscale=True,
    colorbar_orientation='v',
    colorbar_tickvals = [np.min(np.nan_to_num(pt03.values,nan=0.0)),np.max(np.nan_to_num(pt03.values,nan=0.0))],
    colorbar_ticktext = ['min','max'],
    xgap = 1,
    ygap = 1
)

plt_title = "<span style='font-size:24px;font-weight:bold;font-family:Helvetica'>Distribution of number of data visualization libraries</span>"
plt_caption = '''<span style='font-size:14px; font-family:Helvetica'>
Independent density normalization<br>
Note: Colorscale is based on row (Max value in row is blue and Min value in row is orange)
</span>'''

x_axis_title = "<span style='color:#153d4d;font-size:14px;font-family:Helvetica'>Data Visualization libraries</span>"
y_axis_title = "<span style='color:#153d4d;font-size:14px; font-family:Helvetica'>Survey year</span>"

layout = dict(
    title = plt_title + "<br><br>" + plt_caption,
    title_x=0.5,
    margin = dict(t=180, pad=5),
#     width = 800,
    height= 500,
    plot_bgcolor = "#fff",
    yaxis = dict(
        scaleanchor = 'x',
        title_text=y_axis_title,
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
        autorange='reversed',
        constrain="domain"
        ),
    xaxis =  dict(
        title_text = x_axis_title,
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
        constrain="domain",
        tickmode ="linear"
        )
    )

fig = go.Figure(data = trace, layout = layout)

fig.show()

Insight
- The percentage of people mentioning one data visualization library was increasing from 2018 to 2021, But this year there is a reduction of 5 percent.
- The percentage of people mentioning two data visualization libraries was increasing from 2018 to 2021, But this year there is a reduction of 4 percent.
- For the last 3 years, we can see most respondents are mentioning they use two data visualization libraries.

### Which data visualization library do respondents use together

In [28]:
cols = [ 'Altair', 'Bokeh',
       'D3 js', 'Dygraphs', 'Geoplotlib', 'Ggplot / ggplot2', 'Highcharter',
       'Leaflet / Folium', 'Matplotlib','Other',
       'Plotly / Plotly Express', 'Pygal', 'Seaborn', 'Shiny',]

df1 = pd.DataFrame(columns = [ 'Altair', 'Bokeh',
       'D3 js', 'Dygraphs', 'Geoplotlib', 'Ggplot / ggplot2', 'Highcharter',
       'Leaflet / Folium', 'Matplotlib','Other',
       'Plotly / Plotly Express', 'Pygal', 'Seaborn', 'Shiny','Year'])

for year in df['Year'].unique():
    m01 = df[df['Year']==year][cols]
    m02 = sp.csr_matrix(m01.astype(int).values) # convert dataframe to sparse matrix
    m02c = m02.T * m02 # multiply sparse matrix #
    m02c.setdiag(0) # reset diagonal
    m03 = pd.DataFrame(m02c.todense(), columns=m01.columns, index= m01.columns)
    m04 = m03.div(m01.shape[0])
    m04 = m04.multiply(100)
    occ = m04.copy()
    occ['Year'] = year
    frames = [df1,occ]
    df1 = pd.concat(frames)
    
df1['Year'] = df1['Year'].astype('object')

fig = go.Figure()

fig.add_trace(go.Heatmap(
    z = df1.iloc[:,:-1][df1['Year']==2022].values,
    x = df1.iloc[:,:-1].columns,
    y = df1.iloc[:,:-1][df1['Year']==2022].index,
    name = 'Selection1',
    hoverinfo = "none",
    text = df1.iloc[:,:-1][df1['Year']==2022].values,
    texttemplate="<span style='color:#153d4d; font-family:Helvetica'>%{text:.2f}</span>",
    textfont={"size":10},
    colorscale=colorscale01,
    showscale=False,
    colorbar_orientation='v',
    colorbar_tickvals = [np.min(np.nan_to_num(pt03.values,nan=0.0)),np.max(np.nan_to_num(pt03.values,nan=0.0))],
    colorbar_ticktext = ['min','max'],
    xgap = 1,
    ygap = 1
))

# Creating the buttons
updatemenu = []
buttons1 = []

for col in df1['Year'].unique():
    buttons1.append(dict(method='restyle',
                        label=col,
                        visible=True,
                        args=[{'y':[df1.iloc[:,:-1][df1['Year']==col].index],
                               'x':[df1.iloc[:,:-1].columns],
                               'z':[df1.iloc[:,:-1][df1['Year']==col].values],
                               'text':[df1.iloc[:,:-1][df1['Year']==col].values],
                               'type':'heatmap'}, [0]],
                        )
                  )

updatemenu = list([
    dict(
        buttons=buttons1,
        direction='down',
        name = 'Selection1',
        pad={'r':10,'t':10},
        showactive=True,
        font = dict(family = 'Helvetica',size=14,color='#153d4d'),
        bgcolor="#fff",
        x=0.53,
        xanchor='left',
        y=1.12,
        yanchor='top'),
])

fig.update_layout(
    updatemenus=updatemenu,
    height= 650,
    plot_bgcolor = "#fff",
    yaxis = dict(
        scaleanchor = 'x',
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
#         autorange='reversed',
        constrain="domain"
        ),
    xaxis =  dict(
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
        tickmode='linear',
        constrain="domain",
        )
)

fig.add_annotation(
        dict(text="<span style='color:#153d4d;font-size:16px;font-family:Helvetica'>Select Year:</span>",
        align="left",
        showarrow=False,
        x=0.46,y=1.08,xref="paper",yref="paper"),
)

fig.show()

### Distribution of data visualization libraries

In [29]:
pt01 = ksm[ksm['question'].str.contains('Q15')].pivot_table(values='question',index=['Year'],columns='answer',aggfunc='count',fill_value=0)
pt01['total'] = ks['Year'].value_counts()
pt02 = pt01.div(pt01.total, axis=0).fillna(0)
pt02 = pt02.drop('total', axis=1)
pt03 = pt02.div(pt02.max(axis=1), axis=0)

# ploting the graph
trace = go.Heatmap(
    z = pt03.values,
    x = pt02.columns,
    y = pt02.index,
    hoverinfo = "none",
    text = pt02.values,
    texttemplate="<span style='color:#153d4d; font-family:Helvetica'>%{text:.2f}</span>",
    textfont={"size":12},
    colorscale=colorscale01,
    showscale=True,
    colorbar_orientation='v',
    colorbar_tickvals = [np.min(np.nan_to_num(pt03.values,nan=0.0)),np.max(np.nan_to_num(pt03.values,nan=0.0))],
    colorbar_ticktext = ['min','max'],
    xgap = 1,
    ygap = 1
)

plt_title = "<span style='font-size:24px;font-weight:bold;font-family:Helvetica'>Distribution of data visualization libraries</span>"
plt_caption = '''<span style='font-size:14px; font-family:Helvetica'>
Independent density normalization<br>
Note: Colorscale is based on row (Max value in row is blue and Min value in row is orange)
</span>'''

x_axis_title = "<span style='color:#153d4d;font-size:14px;font-family:Helvetica'>Data visualization libraries</span>"
y_axis_title = "<span style='color:#153d4d;font-size:14px; font-family:Helvetica'>Survey year</span>"

layout = dict(
    title = plt_title + "<br><br>" + plt_caption,
    title_x=0.5,
    margin = dict(t=180, pad=5),
#     width = 800,
    height= 500,
    plot_bgcolor = "#fff",
    yaxis = dict(
        scaleanchor = 'x',
        title_text=y_axis_title,
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
        autorange='reversed',
        ),
    xaxis =  dict(
        title_text = x_axis_title,
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
        constrain="domain",
        )
    )

fig = go.Figure(data = trace, layout = layout)

fig.show()

Insight
- Matplotlib remained the most mentioned data visualization library for the last five years followed by Seaborn, Plotly and Ggplot.
- There is a decrease in the percentage of people mentioning ggplot for last five years

# Machine learning

- Q16: For how many years have you used machine learning methods?
- Q17: Which of the following machine learning frameworks do you use on a regular basis?
- Q18: Which of the following ML algorithms do you use on a regular basis?
- Q19: Which categories of computer vision methods do you use on a regular basis?
- Q20: Which of the following natural language processing (NLP) methods do you use on a regular basis?
- Q21: Do you download pre-trained model weights from any of the following services?
- Q22: Which of the following ML model hubs/repositories do you use most often? 

## Q16: For how many years have you used machine learning methods?

### Distribution of ML methods experience

In [30]:
pt01 = ksm[ksm['question'].str.contains('Q16')].pivot_table(values='question',index=['Year'],columns='answer',aggfunc='count',fill_value=0)
pt01['total'] = pt01.sum(axis=1)
pt02 = pt01.div(pt01.total, axis=0).fillna(0)
pt02 = pt02.drop('total', axis=1)
pt03 = pt02.div(pt02.max(axis=1), axis=0)

# ploting the graph
trace = go.Heatmap(
    z = pt03.values,
    x = pt02.columns,
    y = pt02.index,
    hoverinfo = "none",
    text = pt02.values,
    texttemplate="<span style='color:#153d4d; font-family:Helvetica'>%{text:.2f}</span>",
    textfont={"size":12},
    colorscale=colorscale01,
    showscale=True,
    colorbar_orientation='v',
    colorbar_tickvals = [np.min(np.nan_to_num(pt03.values,nan=0.0)),np.max(np.nan_to_num(pt03.values,nan=0.0))],
    colorbar_ticktext = ['min','max'],
    xgap = 1,
    ygap = 1
)

plt_title = "<span style='font-size:24px;font-weight:bold;font-family:Helvetica'>Distribution of ML methods experience</span>"
plt_caption = '''<span style='font-size:14px; font-family:Helvetica'>
Independent density normalization<br>
Note: Colorscale is based on row (Max value in row is blue and Min value in row is orange)
</span>'''

x_axis_title = "<span style='color:#153d4d;font-size:14px;font-family:Helvetica'>ML methods experience</span>"
y_axis_title = "<span style='color:#153d4d;font-size:14px; font-family:Helvetica'>Survey year</span>"

layout = dict(
    title = plt_title + "<br><br>" + plt_caption,
    title_x=0.5,
    margin = dict(t=180, pad=5),
#     width = 800,
    height= 500,
    plot_bgcolor = "#fff",
    yaxis = dict(
        scaleanchor = 'x',
        title_text=y_axis_title,
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
        autorange='reversed',
        constrain="domain"
        ),
    xaxis =  dict(
        title_text = x_axis_title,
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
#         tickmode='linear',
        # range=[-1.5,11.5],
        constrain="domain",
        categoryorder='array',
        categoryarray =['No Answer','Do not use ML','< 1 year','1-2 years','2-3 years','3-4 years','4-5 years','5-10 years','10-15 years','10-20 years', '20+ years']
        )
    )

fig = go.Figure(data = trace, layout = layout)

fig.show()

Insight
- Most of the respondent taking the survey has less than 2 years of experience.
- There is a decrease in the participation of people as the Machine learning experience increases. Around 8-9 percent of respondents are from the 2-3 category. Around 4-5 percent of respondents are from 3-4, 4-5 and 5-10 years categories every year.

## Q17: Which of the following machine learning frameworks do you use on a regular basis?

### Number of ML frameworks used regularly

In [31]:
# facts of the dataset
facts = ['sid', 'Year', 'age', 'gender', 'country', 'role',]

# columns for the questions
cols = ['Q17_1', 'Q17_2', 'Q17_3', 'Q17_4', 'Q17_5', 'Q17_6', 'Q17_7',
       'Q17_8', 'Q17_9', 'Q17_10', 'Q17_11', 'Q17_12', 'Q17_13', 'Q17_14',
       'Q17_15','Q17_X']

tbl01 = ksm[ksm['question'].str.contains('Q17')].pivot_table(values='question',index=['sid','Year'],columns='answer',aggfunc='count',fill_value=0)

tbl01['pltf_count'] = tbl01.sum(axis=1) - (tbl01['No Answer'] * 2) - tbl01['None']
tbl01.reset_index(inplace=True)

# merging the age,country and role on sid and year
df = pd.merge(ks[facts],tbl01,how='inner',on=['sid','Year'])

pt01 = df.pivot_table(values='sid',index='Year',columns='pltf_count',aggfunc='count',fill_value=0)
pt01['total'] = pt01.sum(axis=1)
pt02 = pt01.div(pt01.total, axis=0).fillna(0)
pt01 = pt01.drop('total', axis=1)
pt02 = pt02.drop('total', axis=1)

pt03 = pt02.div(pt02.max(axis=1), axis=0)

# ploting the graph
trace = go.Heatmap(
    z = pt03.values,
    x = pt02.columns,
    y = pt02.index,
    hoverinfo = "none",
    text = pt02.values,
    texttemplate="<span style='color:#153d4d; font-family:Helvetica'>%{text:.2f}</span>",
    textfont={"size":12},
    colorscale=colorscale01,
    showscale=True,
    colorbar_orientation='v',
    colorbar_tickvals = [np.min(np.nan_to_num(pt03.values,nan=0.0)),np.max(np.nan_to_num(pt03.values,nan=0.0))],
    colorbar_ticktext = ['min','max'],
    xgap = 1,
    ygap = 1
)

plt_title = "<span style='font-size:24px;font-weight:bold;font-family:Helvetica'>Distribution of Number of ML frameworks used regularly</span>"
plt_caption = '''<span style='font-size:14px; font-family:Helvetica'>
Independent density normalization<br>
Note: Colorscale is based on row (Max value in row is blue and Min value in row is orange)
</span>'''

x_axis_title = "<span style='color:#153d4d;font-size:14px;font-family:Helvetica'>Number of ML frameworks</span>"
y_axis_title = "<span style='color:#153d4d;font-size:14px; font-family:Helvetica'>Survey year</span>"

layout = dict(
    title = plt_title + "<br><br>" + plt_caption,
    title_x=0.5,
    margin = dict(t=180, pad=5),
#     width = 800,
    height= 500,
    plot_bgcolor = "#fff",
    yaxis = dict(
        scaleanchor = 'x',
        title_text=y_axis_title,
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
        autorange='reversed',
        constrain="domain"
        ),
    xaxis =  dict(
        title_text = x_axis_title,
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
        tickmode='linear',
        # range=[-1.5,11.5],
        constrain="domain",
        )
    )

fig = go.Figure(data = trace, layout = layout)

fig.show()

Insight
- Most respondents did not answer the question. 32 percent of respondents did not answer the question. If we neglect this, most respondents used one ML framework. Though the difference between 1,2 and 3 is small.

### Which ML frameworks do respondents use together

In [32]:
cols = ['Caret', 'CatBoost',
       'Fast.ai', 'Huggingface', 'JAX', 'Keras', 'LightGBM','Other', 'PyTorch', 'PyTorch Lightning', 'Scikit-learn',
       'TensorFlow', 'Tidymodels', 'Xgboost',]

df1 = pd.DataFrame(columns = ['Caret', 'CatBoost',
       'Fast.ai', 'Huggingface', 'JAX', 'Keras', 'LightGBM','Other', 'PyTorch', 'PyTorch Lightning', 'Scikit-learn',
       'TensorFlow', 'Tidymodels', 'Xgboost','Year'])

for year in df['Year'].unique():
    m01 = df[df['Year']==year][cols]
    m02 = sp.csr_matrix(m01.astype(int).values) # convert dataframe to sparse matrix
    m02c = m02.T * m02 # multiply sparse matrix #
    m02c.setdiag(0) # reset diagonal
    m03 = pd.DataFrame(m02c.todense(), columns=m01.columns, index= m01.columns)
    m04 = m03.div(m01.shape[0])
    m04 = m04.multiply(100)
    occ = m04.copy()
    occ['Year'] = year
    frames = [df1,occ]
    df1 = pd.concat(frames)
    
df1['Year'] = df1['Year'].astype('object')

fig = go.Figure()

fig.add_trace(go.Heatmap(
    z = df1.iloc[:,:-1][df1['Year']==2022].values,
    x = df1.iloc[:,:-1].columns,
    y = df1.iloc[:,:-1][df1['Year']==2022].index,
    name = 'Selection1',
    hoverinfo = "none",
    text = df1.iloc[:,:-1][df1['Year']==2022].values,
    texttemplate="<span style='color:#153d4d; font-family:Helvetica'>%{text:.2f}</span>",
    textfont={"size":12},
    colorscale=colorscale01,
    showscale=False,
    colorbar_orientation='v',
    colorbar_tickvals = [np.min(np.nan_to_num(pt03.values,nan=0.0)),np.max(np.nan_to_num(pt03.values,nan=0.0))],
    colorbar_ticktext = ['min','max'],
    xgap = 1,
    ygap = 1
))

# Creating the buttons
updatemenu = []
buttons1 = []

for col in df1['Year'].unique():
    buttons1.append(dict(method='restyle',
                        label=col,
                        visible=True,
                        args=[{'y':[df1.iloc[:,:-1][df1['Year']==col].index],
                               'x':[df1.iloc[:,:-1].columns],
                               'z':[df1.iloc[:,:-1][df1['Year']==col].values],
                               'text':[df1.iloc[:,:-1][df1['Year']==col].values],
                               'type':'heatmap'}, [0]],
                        )
                  )

updatemenu = list([
    dict(
        buttons=buttons1,
        direction='down',
        name = 'Selection1',
        pad={'r':10,'t':10},
        showactive=True,
        font = dict(family = 'Helvetica',size=14,color='#153d4d'),
        bgcolor="#fff",
        x=0.53,
        xanchor='left',
        y=1.12,
        yanchor='top'),
])

fig.update_layout(
    updatemenus=updatemenu,
    height= 650,
    plot_bgcolor = "#fff",
    yaxis = dict(
        scaleanchor = 'x',
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
#         autorange='reversed',
        constrain="domain"
        ),
    xaxis =  dict(
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
        tickmode='linear',
        constrain="domain",
        )
)

fig.add_annotation(
        dict(text="<span style='color:#153d4d;font-size:16px;font-family:Helvetica'>Select Year:</span>",
        align="left",
        showarrow=False,
        x=0.46,y=1.08,xref="paper",yref="paper"),
)

fig.show()

Insight
- 26.62 percent of respondents who used sklearn mentioned that they use Tensorflow.
- 23.31 percent of respondents who used sklearn mentioned that they use Keras.
- 16.73 percent of respondents who used sklearn mentioned that they use Pytorch.

### Distribution of ML frameworks

In [33]:
pt01 = ksm[ksm['question'].str.contains('Q17')].pivot_table(values='question',index=['Year'],columns='answer',aggfunc='count',fill_value=0)
pt01['total'] = ks['Year'].value_counts()
pt02 = pt01.div(pt01.total, axis=0).fillna(0)
pt02 = pt02.drop('total', axis=1)
pt03 = pt02.div(pt02.max(axis=1), axis=0)

# ploting the graph
trace = go.Heatmap(
    z = pt03.values,
    x = pt02.columns,
    y = pt02.index,
    hoverinfo = "none",
    text = pt02.values,
    texttemplate="<span style='color:#153d4d; font-family:Helvetica'>%{text:.2f}</span>",
    textfont={"size":12},
    colorscale=colorscale01,
    showscale=True,
    colorbar_orientation='v',
    colorbar_tickvals = [np.min(np.nan_to_num(pt03.values,nan=0.0)),np.max(np.nan_to_num(pt03.values,nan=0.0))],
    colorbar_ticktext = ['min','max'],
    xgap = 1,
    ygap = 1
)

plt_title = "<span style='font-size:24px;font-weight:bold;font-family:Helvetica'>Distribution of ML frameworks</span>"
plt_caption = '''<span style='font-size:14px; font-family:Helvetica'>
Independent density normalization<br>
Note: Colorscale is based on row (Max value in row is blue and Min value in row is orange)
</span>'''

x_axis_title = "<span style='color:#153d4d;font-size:14px;font-family:Helvetica'>ML framework</span>"
y_axis_title = "<span style='color:#153d4d;font-size:14px; font-family:Helvetica'>Survey year</span>"

layout = dict(
    title = plt_title + "<br><br>" + plt_caption,
    title_x=0.5,
    margin = dict(t=180, pad=5),
#     width = 800,
    height= 500,
    plot_bgcolor = "#fff",
    yaxis = dict(
        scaleanchor = 'x',
        title_text=y_axis_title,
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
        autorange='reversed',
        ),
    xaxis =  dict(
        title_text = x_axis_title,
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
        constrain="domain",
        )
    )

fig = go.Figure(data = trace, layout = layout)

fig.show()

Insight
- Sklearn is the most used ML framework every year followed by Tensorflow, Keras and Pytorch.
- The percentage of people mentioning Pytorch was increasing from 2018 to 2021, But this year is a slight decrease in the percentage.

## Q18: Which of the following ML algorithms do you use on a regular basis?

### Number of ML algorithms used regularly

In [34]:
# facts of the dataset
facts = ['sid', 'Year', 'age', 'gender', 'country', 'role',]

# columns for the questions
cols = ['Q18_1', 'Q18_2', 'Q18_3', 'Q18_4', 'Q18_5', 'Q18_6',
       'Q18_7', 'Q18_8', 'Q18_9', 'Q18_10', 'Q18_11', 'Q18_12', 'Q18_13',
       'Q18_14','Q18_X']

tbl01 = ksm[ksm['question'].str.contains('Q18')].pivot_table(values='question',index=['sid','Year'],columns='answer',aggfunc='count',fill_value=0)

tbl01['pltf_count'] = tbl01.sum(axis=1) - (tbl01['No Answer'] * 2) - tbl01['None']
tbl01.reset_index(inplace=True)

# merging the age,country and role on sid and year
df = pd.merge(ks[facts],tbl01,how='inner',on=['sid','Year'])

pt01 = df.pivot_table(values='sid',index='Year',columns='pltf_count',aggfunc='count',fill_value=0)
pt01['total'] = pt01.sum(axis=1)
pt02 = pt01.div(pt01.total, axis=0).fillna(0)
pt01 = pt01.drop('total', axis=1)
pt02 = pt02.drop('total', axis=1)

pt03 = pt02.div(pt02.max(axis=1), axis=0)

# ploting the graph
trace = go.Heatmap(
    z = pt03.values,
    x = pt02.columns,
    y = pt02.index,
    hoverinfo = "none",
    text = pt02.values,
    texttemplate="<span style='color:#153d4d; font-family:Helvetica'>%{text:.2f}</span>",
    textfont={"size":12},
    colorscale=colorscale01,
    showscale=True,
    colorbar_orientation='v',
    colorbar_tickvals = [np.min(np.nan_to_num(pt03.values,nan=0.0)),np.max(np.nan_to_num(pt03.values,nan=0.0))],
    colorbar_ticktext = ['min','max'],
    xgap = 1,
    ygap = 1
)

plt_title = "<span style='font-size:24px;font-weight:bold;font-family:Helvetica'>Distribution of Number of ML algorithms used regularly</span>"
plt_caption = '''<span style='font-size:14px; font-family:Helvetica'>
Independent density normalization<br>
Note: Colorscale is based on row (Max value in row is blue and Min value in row is orange)
</span>'''

x_axis_title = "<span style='color:#153d4d;font-size:14px;font-family:Helvetica'>Number of ML algorithms</span>"
y_axis_title = "<span style='color:#153d4d;font-size:14px; font-family:Helvetica'>Survey year</span>"

layout = dict(
    title = plt_title + "<br><br>" + plt_caption,
    title_x=0.5,
    margin = dict(t=180, pad=5),
#     width = 800,
    height= 500,
    plot_bgcolor = "#fff",
    yaxis = dict(
        scaleanchor = 'x',
        title_text=y_axis_title,
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
        autorange='reversed',
        constrain="domain"
        ),
    xaxis =  dict(
        title_text = x_axis_title,
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
        tickmode='linear',
        # range=[-1.5,11.5],
        constrain="domain",
        )
    )

fig = go.Figure(data = trace, layout = layout)

fig.show()

Insight
- Most respondents did not answer the question. If we neglect them we can see the percentage of people using 2 to 3 ML algorithms is higher and the percentage keeps reducing as the number of algorithms increases.

### Which ML algorithms do respondents use together

In [35]:
cols = ['Autoencoder Networks (DAE, VAE, etc)', 'Bayesian Approaches',
       'Convolutional Neural Networks', 'Decision Trees or Random Forests',
       'Dense Neural Networks (MLPs, etc)', 'Evolutionary Approaches',
       'Generative Adversarial Networks',
       'Gradient Boosting Machines (xgboost, lightgbm, etc)',
       'Graph Neural Networks', 'Linear or Logistic Regression','Other', 'Recurrent Neural Networks', 'Transformer Networks',]

df1 = pd.DataFrame(columns = ['Autoencoder Networks (DAE, VAE, etc)', 'Bayesian Approaches',
       'Convolutional Neural Networks', 'Decision Trees or Random Forests',
       'Dense Neural Networks (MLPs, etc)', 'Evolutionary Approaches',
       'Generative Adversarial Networks',
       'Gradient Boosting Machines (xgboost, lightgbm, etc)',
       'Graph Neural Networks', 'Linear or Logistic Regression','Other', 'Recurrent Neural Networks', 'Transformer Networks','Year'])

for year in df['Year'].unique():
    m01 = df[df['Year']==year][cols]
    m02 = sp.csr_matrix(m01.astype(int).values) # convert dataframe to sparse matrix
    m02c = m02.T * m02 # multiply sparse matrix #
    m02c.setdiag(0) # reset diagonal
    m03 = pd.DataFrame(m02c.todense(), columns=m01.columns, index= m01.columns)
    m04 = m03.div(m01.shape[0])
    m04 = m04.multiply(100)
    occ = m04.copy()
    occ['Year'] = year
    frames = [df1,occ]
    df1 = pd.concat(frames)
    
df1['Year'] = df1['Year'].astype('object')

fig = go.Figure()

fig.add_trace(go.Heatmap(
    z = df1.iloc[:,:-1][df1['Year']==2022].values,
    x = df1.iloc[:,:-1].columns,
    y = df1.iloc[:,:-1][df1['Year']==2022].index,
    name = 'Selection1',
    hoverinfo = "none",
    text = df1.iloc[:,:-1][df1['Year']==2022].values,
    texttemplate="<span style='color:#153d4d; font-family:Helvetica'>%{text:.2f}</span>",
    textfont={"size":12},
    colorscale=colorscale01,
    showscale=False,
    colorbar_orientation='v',
    colorbar_tickvals = [np.min(np.nan_to_num(pt03.values,nan=0.0)),np.max(np.nan_to_num(pt03.values,nan=0.0))],
    colorbar_ticktext = ['min','max'],
    xgap = 1,
    ygap = 1
))

# Creating the buttons
updatemenu = []
buttons1 = []

for col in df1['Year'].unique():
    buttons1.append(dict(method='restyle',
                        label=col,
                        visible=True,
                        args=[{'y':[df1.iloc[:,:-1][df1['Year']==col].index],
                               'x':[df1.iloc[:,:-1].columns],
                               'z':[df1.iloc[:,:-1][df1['Year']==col].values],
                               'text':[df1.iloc[:,:-1][df1['Year']==col].values],
                               'type':'heatmap'}, [0]],
                        )
                  )

updatemenu = list([
    dict(
        buttons=buttons1,
        direction='down',
        name = 'Selection1',
        pad={'r':10,'t':10},
        showactive=True,
        font = dict(family = 'Helvetica',size=14,color='#153d4d'),
        bgcolor="#fff",
        x=0.53,
        xanchor='left',
        y=1.12,
        yanchor='top'),
])

fig.update_layout(
    updatemenus=updatemenu,
    height= 800,
    plot_bgcolor = "#fff",
    yaxis = dict(
        scaleanchor = 'x',
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
#         autorange='reversed',
        constrain="domain"
        ),
    xaxis =  dict(
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
        tickmode='linear',
        constrain="domain",
        )
)

fig.add_annotation(
        dict(text="<span style='color:#153d4d;font-size:16px;font-family:Helvetica'>Select Year:</span>",
        align="left",
        showarrow=False,
        x=0.46,y=1.08,xref="paper",yref="paper"),
)

fig.show()

Insight
- 33.84 percent of people who used linear/logistic regression mentioned they also use Decision Trees or Random forests. The trend is the same for the last four years.

### Distribution of ML algorithm

In [36]:
pt01 = ksm[ksm['question'].str.contains('Q18')].pivot_table(values='question',index=['Year'],columns='answer',aggfunc='count',fill_value=0)
pt01['total'] = ks['Year'].value_counts()
pt02 = pt01.div(pt01.total, axis=0).fillna(0)
pt02 = pt02.drop('total', axis=1)
pt03 = pt02.div(pt02.max(axis=1), axis=0)

# ploting the graph
trace = go.Heatmap(
    z = pt03.values,
    x = pt02.columns,
    y = pt02.index,
    hoverinfo = "none",
    text = pt02.values,
    texttemplate="<span style='color:#153d4d; font-family:Helvetica'>%{text:.2f}</span>",
    textfont={"size":12},
    colorscale=colorscale01,
    showscale=True,
    colorbar_orientation='v',
    colorbar_tickvals = [np.min(np.nan_to_num(pt03.values,nan=0.0)),np.max(np.nan_to_num(pt03.values,nan=0.0))],
    colorbar_ticktext = ['min','max'],
    xgap = 1,
    ygap = 1
)

plt_title = "<span style='font-size:24px;font-weight:bold;font-family:Helvetica'>Distribution of ML Algorithm</span>"
plt_caption = '''<span style='font-size:14px; font-family:Helvetica'>
Independent density normalization<br>
Note: Colorscale is based on row (Max value in row is blue and Min value in row is orange)
</span>'''

x_axis_title = "<span style='color:#153d4d;font-size:14px;font-family:Helvetica'>ML Algorithm</span>"
y_axis_title = "<span style='color:#153d4d;font-size:14px; font-family:Helvetica'>Survey year</span>"

layout = dict(
    title = plt_title + "<br><br>" + plt_caption,
    title_x=0.5,
    margin = dict(t=180, pad=5),
#     width = 800,
    height= 600,
    plot_bgcolor = "#fff",
    yaxis = dict(
        scaleanchor = 'x',
        title_text=y_axis_title,
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
        autorange='reversed',
        ),
    xaxis =  dict(
        title_text = x_axis_title,
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
        constrain="domain",
        )
    )

fig = go.Figure(data = trace, layout = layout)

fig.show()

Insight
- Linear/Logistic regression is the most used ML algorithm followed by Decision Trees or Random forests, CNN and Gradient Boosting machines.

## Q19: Which categories of computer vision methods do you use on a regular basis?

### Number of computer vision methods used regularly

In [37]:
# facts of the dataset
facts = ['sid', 'Year', 'age', 'gender', 'country', 'role',]

# columns for the questions
cols = ['Q19_1', 'Q19_2', 'Q19_3', 'Q19_4', 'Q19_5', 'Q19_6',
       'Q19_7', 'Q19_8','Q19_X']

tbl01 = ksm[ksm['question'].str.contains('Q19')].pivot_table(values='question',index=['sid','Year'],columns='answer',aggfunc='count',fill_value=0)

# renaming the columns
tbl01.rename(columns={'General purpose image/video tools (PIL, cv2, skimage, etc)':'General purpose image/video tools ',
       'Generative Networks (GAN, VAE, etc)':'Generative Networks',
       'Image classification and other general purpose networks (VGG, Inception, ResNet, ResNeXt, NASNet, EfficientNet, etc)':'Image classification and other general purpose networks',
       'Image segmentation methods (U-Net, Mask R-CNN, etc)':'Image segmentation methods',
       'Vision transformer networks (ViT, DeiT, BiT, BEiT, Swin, etc)':'Vision transformer networks'},inplace=True)

tbl01['pltf_count'] = tbl01.sum(axis=1) - (tbl01['No Answer'] * 2) - tbl01['None']
tbl01.reset_index(inplace=True)

# merging the age,country and role on sid and year
df = pd.merge(ks[facts],tbl01,how='inner',on=['sid','Year'])

pt01 = df.pivot_table(values='sid',index='Year',columns='pltf_count',aggfunc='count',fill_value=0)
pt01['total'] = pt01.sum(axis=1)
pt02 = pt01.div(pt01.total, axis=0).fillna(0)
pt01 = pt01.drop('total', axis=1)
pt02 = pt02.drop('total', axis=1)

pt03 = pt02.div(pt02.max(axis=1), axis=0)

# ploting the graph
trace = go.Heatmap(
    z = pt03.values,
    x = pt02.columns,
    y = pt02.index,
    hoverinfo = "none",
    text = pt02.values,
    texttemplate="<span style='color:#153d4d; font-family:Helvetica'>%{text:.2f}</span>",
    textfont={"size":12},
    colorscale=colorscale01,
    showscale=True,
    colorbar_orientation='v',
    colorbar_tickvals = [np.min(np.nan_to_num(pt03.values,nan=0.0)),np.max(np.nan_to_num(pt03.values,nan=0.0))],
    colorbar_ticktext = ['min','max'],
    xgap = 1,
    ygap = 1
)

plt_title = "<span style='font-size:24px;font-weight:bold;font-family:Helvetica'>Distribution of Number of computer vision methods used regularly</span>"
plt_caption = '''<span style='font-size:14px; font-family:Helvetica'>
Independent density normalization<br>
Note: Colorscale is based on row (Max value in row is blue and Min value in row is orange)
</span>'''

x_axis_title = "<span style='color:#153d4d;font-size:14px;font-family:Helvetica'>Number of computer vision</span>"
y_axis_title = "<span style='color:#153d4d;font-size:14px; font-family:Helvetica'>Survey year</span>"

layout = dict(
    title = plt_title + "<br><br>" + plt_caption,
    title_x=0.5,
    margin = dict(t=180, pad=5),
#     width = 800,
    height= 500,
    plot_bgcolor = "#fff",
    yaxis = dict(
        scaleanchor = 'x',
        title_text=y_axis_title,
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
        autorange='reversed',
        constrain="domain"
        ),
    xaxis =  dict(
        title_text = x_axis_title,
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
        tickmode='linear',
        # range=[-1.5,11.5],
        constrain="domain",
        )
    )

fig = go.Figure(data = trace, layout = layout)

fig.show()

Note
- If we neglect the people who did not answered the question. We can see that the percentage decreases as the number of computer vision methods increases.

### Which computer vision methods are used together

In [38]:
cols = ['General purpose image/video tools ', 'Generative Networks',
       'Image classification and other general purpose networks',
       'Image segmentation methods',
       'Object detection methods', 'Other', 'Vision transformer networks',]

df1 = pd.DataFrame(columns = ['General purpose image/video tools ', 'Generative Networks',
       'Image classification and other general purpose networks',
       'Image segmentation methods',
       'Object detection methods', 'Other', 'Vision transformer networks','Year'])

for year in df['Year'].unique():
    m01 = df[df['Year']==year][cols]
    m02 = sp.csr_matrix(m01.astype(int).values) # convert dataframe to sparse matrix
    m02c = m02.T * m02 # multiply sparse matrix #
    m02c.setdiag(0) # reset diagonal
    m03 = pd.DataFrame(m02c.todense(), columns=m01.columns, index= m01.columns)
    m04 = m03.div(m01.shape[0])
    m04 = m04.multiply(100)
    occ = m04.copy()
    occ['Year'] = year
    frames = [df1,occ]
    df1 = pd.concat(frames)
    
df1['Year'] = df1['Year'].astype('object')

fig = go.Figure()

fig.add_trace(go.Heatmap(
    z = df1.iloc[:,:-1][df1['Year']==2022].values,
    x = df1.iloc[:,:-1].columns,
    y = df1.iloc[:,:-1][df1['Year']==2022].index,
    name = 'Selection1',
    hoverinfo = "none",
    text = df1.iloc[:,:-1][df1['Year']==2022].values,
    texttemplate="<span style='color:#153d4d; font-family:Helvetica'>%{text:.2f}</span>",
    textfont={"size":12},
    colorscale=colorscale01,
    showscale=False,
    colorbar_orientation='v',
    colorbar_tickvals = [np.min(np.nan_to_num(pt03.values,nan=0.0)),np.max(np.nan_to_num(pt03.values,nan=0.0))],
    colorbar_ticktext = ['min','max'],
    xgap = 1,
    ygap = 1
))

# Creating the buttons
updatemenu = []
buttons1 = []

for col in df1['Year'].unique():
    buttons1.append(dict(method='restyle',
                        label=col,
                        visible=True,
                        args=[{'y':[df1.iloc[:,:-1][df1['Year']==col].index],
                               'x':[df1.iloc[:,:-1].columns],
                               'z':[df1.iloc[:,:-1][df1['Year']==col].values],
                               'text':[df1.iloc[:,:-1][df1['Year']==col].values],
                               'type':'heatmap'}, [0]],
                        )
                  )

updatemenu = list([
    dict(
        buttons=buttons1,
        direction='down',
        name = 'Selection1',
        pad={'r':10,'t':10},
        showactive=True,
        font = dict(family = 'Helvetica',size=14,color='#153d4d'),
        bgcolor="#fff",
        x=0.53,
        xanchor='left',
        y=1.12,
        yanchor='top'),
])

fig.update_layout(
    updatemenus=updatemenu,
    height= 700,
    plot_bgcolor = "#fff",
    yaxis = dict(
        scaleanchor = 'x',
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
#         autorange='reversed',
        constrain="domain"
        ),
    xaxis =  dict(
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
        tickmode='linear',
        constrain="domain",
        )
)

fig.add_annotation(
        dict(text="<span style='color:#153d4d;font-size:16px;font-family:Helvetica'>Select Year:</span>",
        align="left",
        showarrow=False,
        x=0.46,y=1.08,xref="paper",yref="paper"),
)

fig.show()

Insight
- 7.63 percent people who used Image Classification and other general purpose networks also mentioned the Image segmentation methods.
- 7.87 percent people who used Image Classification and other general purpose networks also mentioned the Object detection methods.
- 6.76 percent people who used Image Classification and other general purpose networks also mentioned the General purpose Image/video tools.

### Distribution of computer vision methods

In [39]:
pt01 = ksm[ksm['question'].str.contains('Q19')].pivot_table(values='question',index=['Year'],columns='answer',aggfunc='count',fill_value=0)
pt01.rename(columns={'General purpose image/video tools (PIL, cv2, skimage, etc)':'General purpose image/video tools ',
       'Generative Networks (GAN, VAE, etc)':'Generative Networks',
       'Image classification and other general purpose networks (VGG, Inception, ResNet, ResNeXt, NASNet, EfficientNet, etc)':'Image classification',
       'Image segmentation methods (U-Net, Mask R-CNN, etc)':'Image segmentation methods',
       'Vision transformer networks (ViT, DeiT, BiT, BEiT, Swin, etc)':'Vision transformer networks'},inplace=True)
pt01['total'] = ks['Year'].value_counts()
pt02 = pt01.div(pt01.total, axis=0).fillna(0)
pt02 = pt02.drop('total', axis=1)
pt03 = pt02.div(pt02.max(axis=1), axis=0)

# ploting the graph
trace = go.Heatmap(
    z = pt03.values,
    x = pt02.columns,
    y = pt02.index,
    hoverinfo = "none",
    text = pt02.values,
    texttemplate="<span style='color:#153d4d; font-family:Helvetica'>%{text:.2f}</span>",
    textfont={"size":12},
    colorscale=colorscale01,
    showscale=True,
    colorbar_orientation='v',
    colorbar_tickvals = [np.min(np.nan_to_num(pt03.values,nan=0.0)),np.max(np.nan_to_num(pt03.values,nan=0.0))],
    colorbar_ticktext = ['min','max'],
    xgap = 1,
    ygap = 1
)

plt_title = "<span style='font-size:24px;font-weight:bold;font-family:Helvetica'>Distribution of computer vision methods</span>"
plt_caption = '''<span style='font-size:14px; font-family:Helvetica'>
Independent density normalization<br>
Note: Colorscale is based on row (Max value in row is blue and Min value in row is orange)
</span>'''

x_axis_title = "<span style='color:#153d4d;font-size:14px;font-family:Helvetica'>computer vision methods</span>"
y_axis_title = "<span style='color:#153d4d;font-size:14px; font-family:Helvetica'>Survey year</span>"

layout = dict(
    title = plt_title + "<br><br>" + plt_caption,
    title_x=0.5,
    margin = dict(t=180, pad=5),
#     width = 800,
    height= 500,
    plot_bgcolor = "#fff",
    yaxis = dict(
        scaleanchor = 'x',
        title_text=y_axis_title,
        tickfont=dict(color='#153d4d',family='Helvetica',size=11),
        autorange='reversed',
        ),
    xaxis =  dict(
        title_text = x_axis_title,
        tickfont=dict(color='#153d4d',family='Helvetica',size=11),
        constrain="domain",
        )
    )

fig = go.Figure(data = trace, layout = layout)

fig.show()

Insight
- Image classification and other general purpose networks is most used method followed by Object detection, Image segmentation and General purpose image/video tools

## Q20: Which of the following natural language processing (NLP) methods do you use on a regular basis?

### Number of NLP methods used regularly

In [40]:
# facts of the dataset
facts = ['sid', 'Year', 'age', 'gender', 'country', 'role',]

# columns for the questions
cols = ['Q20_1', 'Q20_2', 'Q20_3', 'Q20_4', 'Q20_5','Q20_6','Q20_X']

tbl01 = ksm[ksm['question'].str.contains('Q20')].pivot_table(values='question',index=['sid','Year'],columns='answer',aggfunc='count',fill_value=0)

tbl01.rename(columns={'Contextualized embeddings (ELMo, CoVe)':'Contextualized embeddings',
       'Encoder-decoder models (seq2seq, vanilla transformers)':'Encoder-decoder models',
       'Word embeddings/vectors (GLoVe, fastText, word2vec)':'Word embeddings/vectors'},inplace=True)

tbl01['pltf_count'] = tbl01.sum(axis=1) - (tbl01['No Answer'] * 2) - tbl01['None']
tbl01.reset_index(inplace=True)

# merging the age,country and role on sid and year
df = pd.merge(ks[facts],tbl01,how='inner',on=['sid','Year'])

pt01 = df.pivot_table(values='sid',index='Year',columns='pltf_count',aggfunc='count',fill_value=0)
pt01['total'] = pt01.sum(axis=1)
pt02 = pt01.div(pt01.total, axis=0).fillna(0)
pt01 = pt01.drop('total', axis=1)
pt02 = pt02.drop('total', axis=1)

pt03 = pt02.div(pt02.max(axis=1), axis=0)

# ploting the graph
trace = go.Heatmap(
    z = pt03.values,
    x = pt02.columns,
    y = pt02.index,
    hoverinfo = "none",
    text = pt02.values,
    texttemplate="<span style='color:#153d4d; font-family:Helvetica'>%{text:.2f}</span>",
    textfont={"size":12},
    colorscale=colorscale01,
    showscale=True,
    colorbar_orientation='v',
    colorbar_tickvals = [np.min(np.nan_to_num(pt03.values,nan=0.0)),np.max(np.nan_to_num(pt03.values,nan=0.0))],
    colorbar_ticktext = ['min','max'],
    xgap = 1,
    ygap = 1
)

plt_title = "<span style='font-size:24px;font-weight:bold;font-family:Helvetica'>Distribution of Number of NLP methods used regularly</span>"
plt_caption = '''<span style='font-size:14px; font-family:Helvetica'>
Independent density normalization<br>
Note: Colorscale is based on row (Max value in row is blue and Min value in row is orange)
</span>'''

x_axis_title = "<span style='color:#153d4d;font-size:14px;font-family:Helvetica'>Number of NLP methods</span>"
y_axis_title = "<span style='color:#153d4d;font-size:14px; font-family:Helvetica'>Survey year</span>"

layout = dict(
    title = plt_title + "<br><br>" + plt_caption,
    title_x=0.5,
    margin = dict(t=180, pad=5),
#     width = 800,
    height= 500,
    plot_bgcolor = "#fff",
    yaxis = dict(
        scaleanchor = 'x',
        title_text=y_axis_title,
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
        autorange='reversed',
        constrain="domain"
        ),
    xaxis =  dict(
        title_text = x_axis_title,
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
        tickmode='linear',
        # range=[-1.5,11.5],
        constrain="domain",
        )
    )

fig = go.Figure(data = trace, layout = layout)

fig.show()

Note:

Most respondent did not answer the question. This year 82 percent respondent did not answer the question. If we neglect them we can see people mentioned 1-2 NLP methods they use regularly. The percentage keep reducing as the Number of methods increases.

### Which NLP methods are used together

In [41]:
cols = ['Contextualized embeddings', 'Encoder-decoder models','Other', 'Transformer language models',
       'Word embeddings/vectors']

df1 = pd.DataFrame(columns = ['Contextualized embeddings', 'Encoder-decoder models', 'Other', 'Transformer language models',
       'Word embeddings/vectors','Year'])

for year in df['Year'].unique():
    m01 = df[df['Year']==year][cols]
    m02 = sp.csr_matrix(m01.astype(int).values) # convert dataframe to sparse matrix
    m02c = m02.T * m02 # multiply sparse matrix #
    m02c.setdiag(0) # reset diagonal
    m03 = pd.DataFrame(m02c.todense(), columns=m01.columns, index= m01.columns)
    m04 = m03.div(m01.shape[0])
    m04 = m04.multiply(100)
    occ = m04.copy()
    occ['Year'] = year
    frames = [df1,occ]
    df1 = pd.concat(frames)
    
df1['Year'] = df1['Year'].astype('object')

fig = go.Figure()

fig.add_trace(go.Heatmap(
    z = df1.iloc[:,:-1][df1['Year']==2022].values,
    x = df1.iloc[:,:-1].columns,
    y = df1.iloc[:,:-1][df1['Year']==2022].index,
    name = 'Selection1',
    hoverinfo = "none",
    text = df1.iloc[:,:-1][df1['Year']==2022].values,
    texttemplate="<span style='color:#153d4d; font-family:Helvetica'>%{text:.2f}</span>",
    textfont={"size":12},
    colorscale=colorscale01,
    showscale=False,
    colorbar_orientation='v',
    colorbar_tickvals = [np.min(np.nan_to_num(pt03.values,nan=0.0)),np.max(np.nan_to_num(pt03.values,nan=0.0))],
    colorbar_ticktext = ['min','max'],
    xgap = 1,
    ygap = 1
))

# Creating the buttons
updatemenu = []
buttons1 = []

for col in df1['Year'].unique():
    buttons1.append(dict(method='restyle',
                        label=col,
                        visible=True,
                        args=[{'y':[df1.iloc[:,:-1][df1['Year']==col].index],
                               'x':[df1.iloc[:,:-1].columns],
                               'z':[df1.iloc[:,:-1][df1['Year']==col].values],
                               'text':[df1.iloc[:,:-1][df1['Year']==col].values],
                               'type':'heatmap'}, [0]],
                        )
                  )

updatemenu = list([
    dict(
        buttons=buttons1,
        direction='down',
        name = 'Selection1',
        pad={'r':10,'t':10},
        showactive=True,
        font = dict(family = 'Helvetica',size=14,color='#153d4d'),
        bgcolor="#fff",
        x=0.53,
        xanchor='left',
        y=1.12,
        yanchor='top'),
])

fig.update_layout(
    updatemenus=updatemenu,
    height= 600,
    plot_bgcolor = "#fff",
    yaxis = dict(
        scaleanchor = 'x',
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
#         autorange='reversed',
        constrain="domain"
        ),
    xaxis =  dict(
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
        tickmode='linear',
        constrain="domain",
        )
)

fig.add_annotation(
        dict(text="<span style='color:#153d4d;font-size:16px;font-family:Helvetica'>Select Year:</span>",
        align="left",
        showarrow=False,
        x=0.46,y=1.08,xref="paper",yref="paper"),
)

fig.show()

Insight
- 6.1 percent people who used the Word embeddings/Vectors mentioned Transformer language models
- 5.04 percent people who used the Word embeddings/Vectors mentioned Encoder-decoder models

### Distribution of NLP methods

In [42]:
pt01 = ksm[ksm['question'].str.contains('Q20')].pivot_table(values='question',index=['Year'],columns='answer',aggfunc='count',fill_value=0)
pt01.rename(columns={'Contextualized embeddings (ELMo, CoVe)':'Contextualized embeddings',
       'Encoder-decoder models (seq2seq, vanilla transformers)':'Encoder-decoder models',
       'Word embeddings/vectors (GLoVe, fastText, word2vec)':'Word embeddings/vectors'},inplace=True)
pt01['total'] = ks['Year'].value_counts()
pt02 = pt01.div(pt01.total, axis=0).fillna(0)
pt02 = pt02.drop('total', axis=1)
pt03 = pt02.div(pt02.max(axis=1), axis=0)

# ploting the graph
trace = go.Heatmap(
    z = pt03.values,
    x = pt02.columns,
    y = pt02.index,
    hoverinfo = "none",
    text = pt02.values,
    texttemplate="<span style='color:#153d4d; font-family:Helvetica'>%{text:.2f}</span>",
    textfont={"size":12},
    colorscale=colorscale01,
    showscale=True,
    colorbar_orientation='v',
    colorbar_tickvals = [np.min(np.nan_to_num(pt03.values,nan=0.0)),np.max(np.nan_to_num(pt03.values,nan=0.0))],
    colorbar_ticktext = ['min','max'],
    xgap = 1,
    ygap = 1
)

plt_title = "<span style='font-size:24px;font-weight:bold;font-family:Helvetica'>Distribution of NLP methods</span>"
plt_caption = '''<span style='font-size:14px; font-family:Helvetica'>
Independent density normalization<br>
Note: Colorscale is based on row (Max value in row is blue and Min value in row is orange)
</span>'''

x_axis_title = "<span style='color:#153d4d;font-size:14px;font-family:Helvetica'>NLP methods</span>"
y_axis_title = "<span style='color:#153d4d;font-size:14px; font-family:Helvetica'>Survey year</span>"

layout = dict(
    title = plt_title + "<br><br>" + plt_caption,
    title_x=0.5,
    margin = dict(t=180, pad=5),
#     width = 800,
    height= 600,
    plot_bgcolor = "#fff",
    yaxis = dict(
        scaleanchor = 'x',
        title_text=y_axis_title,
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
        autorange='reversed',
        constrain="domain"
        ),
    xaxis =  dict(
        title_text = x_axis_title,
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
        constrain="domain",
        )
    )

fig = go.Figure(data = trace, layout = layout)

fig.show()

Insight
- Transformer language models and Word embeddings/vectors are the most used methods followed by Encoder-decoder models
- The percent of people mentioning the Transformer language models is increasing every year, whereas the percentage of people mentioning word embeddings/vectors is decreasing.

## Q21: Do you download pre-trained model weights from any of the following services?

### Number of services used for pre-trained model weights

In [43]:
# facts of the dataset
facts = ['sid', 'Year', 'age', 'gender', 'country', 'role']

# columns for the questions
cols = ['Q21_1', 'Q21_2', 'Q21_3', 'Q21_4', 'Q21_5', 'Q21_6',
       'Q21_7', 'Q21_8', 'Q21_9', 'Q21_10','Q21_X']

tbl01 = ksm[ksm['question'].str.contains('Q21')].pivot_table(values='question',index=['sid','Year'],columns='answer',aggfunc='count',fill_value=0)
# renaming the columns
tbl01.rename(columns={'No, I do not download pre-trained model weights on a regular basis':'None',
                     'Other storage services (i.e. google drive)':'Other'},inplace=True)

tbl01['pltf_count'] = tbl01.sum(axis=1) - (tbl01['No Answer'] * 2) - (tbl01['None'])
tbl01.reset_index(inplace=True)

tbl01 = tbl01[tbl01['Year']==2022].reset_index(drop=True)

# merging the age,country and role on sid and year
df = pd.merge(ks[facts],tbl01,how='inner',on=['sid','Year'])

pt01 = df.pivot_table(values='sid',index='Year',columns='pltf_count',aggfunc='count',fill_value=0)
pt01['total'] = pt01.sum(axis=1)
pt02 = pt01.div(pt01.total, axis=0).fillna(0)
pt01 = pt01.drop('total', axis=1)
pt02 = pt02.drop('total', axis=1)

pt03 = pt02.div(pt02.max(axis=1), axis=0)

trace = go.Heatmap(
    z = pt03[pt03.index==2022].values,
    x = (pt02[pt02.index==2022].columns).astype(str),
    y = (pt02[pt02.index==2022].index).astype(str),
    hoverinfo = "none",
    text = pt02[pt02.index==2022].values,
    texttemplate="<span style='color:#153d4d; font-family:Helvetica'>%{text:.2f}</span>",
    textfont={"size":12},
    colorscale=colorscale01,
    showscale=False,
    colorbar_orientation='v',
    colorbar_tickvals = [np.min(np.nan_to_num(pt03.values,nan=0.0)),np.max(np.nan_to_num(pt03.values,nan=0.0))],
    colorbar_ticktext = ['min','max'],
    xgap = 1,
    ygap = 1
)

plt_title = "<span style='font-size:24px;font-weight:bold;font-family:Helvetica'>Distribution of number of services used for pre-trained model weights </span>"
plt_caption = '''<span style='font-size:14px; font-family:Helvetica'>
Independent density normalization<br>
Note: Colorscale is based on row (Max value in row is blue and Min value in row is orange)
</span>'''

x_axis_title = "<span style='color:#153d4d;font-size:14px;font-family:Helvetica'>Number of services used for pre trained model weights </span>"
y_axis_title = "<span style='color:#153d4d;font-size:14px; font-family:Helvetica'>Survey year</span>"

layout = dict(
    title = plt_title + "<br><br>" + plt_caption,
    title_x=0.5,
    margin = dict(t=180, pad=5),
#     width = 800,
    height= 350,
    plot_bgcolor = "#fff",
    yaxis = dict(
        scaleanchor = 'x',
        title_text=y_axis_title,
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
        autorange='reversed',
#         constrain="domain"
        ),
    xaxis =  dict(
        title_text = x_axis_title,
        tickangle = 90,
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
        tickmode='linear',
#         range=[-1.5,9.5],
        constrain="domain",
        )
    )

fig = go.Figure(data = trace, layout = layout)

fig.show()

Insight
- 19 percent people mentioned they use 1 service for pre-trained model weights and the percent decreases as the number of services increases.
- 30 percent people taking the survey did not use the pre-trained model weight service.

Note
- -1 means people did not answer the question

### Which services do respondents used together for pre-trained model weights

In [44]:
# Occurance of the platforms
cols = ['Huggingface Models',
       'Jumpstart', 'Kaggle datasets', 'NVIDIA NGC models','ONNX models', 'Other', 'PyTorch Hub', 'TensorFlow Hub', 'Timm',]

m01 = df[df['Year']==2022][cols]

m02 = sp.csr_matrix(m01.astype(int).values) # convert dataframe to sparse matrix
m02c = m02.T * m02 # multiply sparse matrix #
m02c.setdiag(0) # reset diagonal
m03 = pd.DataFrame(m02c.todense(), columns=m01.columns, index= m01.columns)
m04 = m03.div(m01.shape[0])
m04 = m04.multiply(100)



trace = go.Heatmap(
    z = m04.values,
    x = m04.columns,
    y = m04.index,
    hoverinfo = "none",
    text = m04.values,
    texttemplate="<span style='color:#153d4d; font-family:Helvetica'>%{text:.2f}</span>",
    textfont={"size":12},
    colorscale=colorscale01,
    showscale=False,
    colorbar_orientation='v',
    colorbar_tickvals = [np.min(np.nan_to_num(pt03.values,nan=0.0)),np.max(np.nan_to_num(pt03.values,nan=0.0))],
    colorbar_ticktext = ['min','max'],
    xgap = 1,
    ygap = 1
)

plt_title = "<span style='font-size:24px;font-weight:bold;font-family:Helvetica'>Services do respondents used together for pre-trained model weights </span>"
plt_caption = '''<span style='font-size:14px; font-family:Helvetica'>
Year 2022
</span>'''

x_axis_title = "<span style='color:#153d4d;font-size:14px;font-family:Helvetica'>Services </span>"
y_axis_title = "<span style='color:#153d4d;font-size:14px; font-family:Helvetica'>Services</span>"

layout = dict(
    title = plt_title + "<br><br>" + plt_caption,
    title_x=0.5,
    margin = dict(t=180, pad=5),
#     width = 800,
    height= 700,
    plot_bgcolor = "#fff",
    yaxis = dict(
        scaleanchor = 'x',
        title_text=y_axis_title,
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
        autorange='reversed',
#         constrain="domain"
        ),
    xaxis =  dict(
        title_text = x_axis_title,
        tickangle = 90,
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
        tickmode='linear',
#         range=[-1.5,9.5],
        constrain="domain",
        )
    )

fig = go.Figure(data = trace, layout = layout)

fig.show()

Insight
- 7.6 percent people who use the Kaggle dataset also mentioned the TensorFlow Hub.
- 4.59 percent people who use the Kaggle dataset also mentioned the Pytorch Hub.

### Distribution of pre-trained model weights

In [45]:
pt01 = ksm[ksm['question'].str.contains('Q21')].pivot_table(values='question',index=['Year'],columns='answer',aggfunc='count',fill_value=0)

# renaming the columns
pt01.rename(columns={'No, I do not download pre-trained model weights on a regular basis':'None',
                     'Other storage services (i.e. google drive)':'Other'},inplace=True)

pt01['total'] = ks['Year'].value_counts()
pt02 = pt01.div(pt01.total, axis=0)
pt01 = pt01.drop('total', axis=1)
pt02 = pt02.drop('total', axis=1)
pt03 = pt02.div(pt02.max(axis=1), axis=0)

trace = go.Heatmap(
    z = pt03[pt03.index==2022].values,
    x = pt02[pt02.index==2022].columns,
    y = (pt02[pt02.index==2022].index).astype(str),
    hoverinfo = "none",
    text = pt02[pt02.index==2022].values,
    texttemplate="<span style='color:#153d4d; font-family:Helvetica'>%{text:.2f}</span>",
    textfont={"size":12},
    colorscale=colorscale01,
    showscale=False,
    colorbar_orientation='v',
    colorbar_tickvals = [np.min(np.nan_to_num(pt03.values,nan=0.0)),np.max(np.nan_to_num(pt03.values,nan=0.0))],
    colorbar_ticktext = ['min','max'],
    xgap = 1,
    ygap = 1
)

plt_title = "<span style='font-size:24px;font-weight:bold;font-family:Helvetica'>Distribution of services used for pre-trained model weights</span>"
plt_caption = '''<span style='font-size:14px; font-family:Helvetica'>
Independent density normalization<br>
Note: Colorscale is based on row (Max value in row is blue and Min value in row is orange)
</span>'''

x_axis_title = "<span style='color:#153d4d;font-size:14px;font-family:Helvetica'>Services</span>"
y_axis_title = "<span style='color:#153d4d;font-size:14px; font-family:Helvetica'>Survey year</span>"

layout = dict(
    title = plt_title + "<br><br>" + plt_caption,
    title_x=0.5,
    margin = dict(t=180, pad=5),
#     width = 800,
    height= 350,
    plot_bgcolor = "#fff",
    yaxis = dict(
        scaleanchor = 'x',
        title_text=y_axis_title,
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
        autorange='reversed',
#         constrain="domain"
        ),
    xaxis =  dict(
        title_text = x_axis_title,
        tickangle = 90,
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
        tickmode='linear',
        # range=[-1.5,11.5],
        constrain="domain",
        )
    )

fig = go.Figure(data = trace, layout = layout)

fig.show()

Insight
- Kaggle datasets is the most used pre-trained model weight service followed by TensorFlow Hub, Pytorch Hub and Huggingface models

## Q22: Which of the following ML model hubs/repositories do you use most often? 

### Distribution of ML model hubs/repositories

In [46]:
pt01 = ksm[ksm['question'].str.contains('Q22')].pivot_table(values='question',index=['Year'],columns='answer',aggfunc='count',fill_value=0)

# renaming the columns
pt01.rename(columns={'Other storage services (i.e. google drive)':'Other'},inplace=True)

pt01['total'] = ks['Year'].value_counts()
pt02 = pt01.div(pt01.total, axis=0)
pt01 = pt01.drop('total', axis=1)
pt02 = pt02.drop('total', axis=1)
pt03 = pt02.div(pt02.max(axis=1), axis=0)

trace = go.Heatmap(
    z = pt03[pt03.index==2022].values,
    x = pt02[pt02.index==2022].columns,
    y = (pt02[pt02.index==2022].index).astype(str),
    hoverinfo = "none",
    text = pt02[pt02.index==2022].values,
    texttemplate="<span style='color:#153d4d; font-family:Helvetica'>%{text:.2f}</span>",
    textfont={"size":12},
    colorscale=colorscale01,
    showscale=False,
    colorbar_orientation='v',
    colorbar_tickvals = [np.min(np.nan_to_num(pt03.values,nan=0.0)),np.max(np.nan_to_num(pt03.values,nan=0.0))],
    colorbar_ticktext = ['min','max'],
    xgap = 1,
    ygap = 1
)

plt_title = "<span style='font-size:24px;font-weight:bold;font-family:Helvetica'>Distribution of ML models hubs/repo used most often</span>"
plt_caption = '''<span style='font-size:14px; font-family:Helvetica'>
Independent density normalization<br>
Note: Colorscale is based on row (Max value in row is blue and Min value in row is orange)
</span>'''

x_axis_title = "<span style='color:#153d4d;font-size:14px;font-family:Helvetica'>Hubs/Repo</span>"
y_axis_title = "<span style='color:#153d4d;font-size:14px; font-family:Helvetica'>Survey year</span>"

layout = dict(
    title = plt_title + "<br><br>" + plt_caption,
    title_x=0.5,
    margin = dict(t=180, pad=5),
#     width = 800,
    height= 350,
    plot_bgcolor = "#fff",
    yaxis = dict(
        scaleanchor = 'x',
        title_text=y_axis_title,
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
        autorange='reversed',
        constrain="domain"
        ),
    xaxis =  dict(
        title_text = x_axis_title,
        tickangle = 90,
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
        tickmode='linear',
        # range=[-1.5,11.5],
        constrain="domain",
        )
    )

fig = go.Figure(data = trace, layout = layout)

fig.show()

Insight
- 85 percent respondents did not answer the question. If we neglect them, Kaggle datasets is most used ML models hub/repo, around 7 percent respondent mentioned Kaggle datasets.

# Tools

- Q35: Do you use any of the following data products?
- Q36: Do you use any of the following business intelligence tools?
- Q37: Do you use any of the following managed machine learning products regularly?
- Q38: Do you use any of the following automated machine-learning tools?
- Q39: Do you use any of the following products to serve your machine learning models? 
- Q40: Do you use any tools to help monitor your machine-learning models and/or experiments?
- Q41: Do you use any of the following responsible or ethical AI products in your machine learning practices?

## Q35: Do you use any of the following data products?

### Number of data products used

In [47]:
# facts of the dataset
facts = ['sid', 'Year', 'age', 'gender', 'country', 'role',]

# columns for the questions
cols = ['Q35_1', 'Q35_2', 'Q35_3', 'Q35_4', 'Q35_5', 'Q35_6',
       'Q35_7', 'Q35_8', 'Q35_9', 'Q35_10', 'Q35_11', 'Q35_12', 'Q35_13',
       'Q35_14', 'Q35_15', 'Q35_16','Q35_X']

tbl01 = ksm[ksm['question'].str.contains('Q35')].pivot_table(values='question',index=['sid','Year'],columns='answer',aggfunc='count',fill_value=0)

tbl01['pltf_count'] = tbl01.sum(axis=1) - (tbl01['No Answer'] * 2) - tbl01['None']
tbl01.reset_index(inplace=True)

# merging the age,country and role on sid and year
df = pd.merge(ks[facts],tbl01,how='inner',on=['sid','Year'])

pt01 = df.pivot_table(values='sid',index='Year',columns='pltf_count',aggfunc='count',fill_value=0)
pt01['total'] = pt01.sum(axis=1)
pt02 = pt01.div(pt01.total, axis=0).fillna(0)
pt01 = pt01.drop('total', axis=1)
pt02 = pt02.drop('total', axis=1)

pt03 = pt02.div(pt02.max(axis=1), axis=0)

# ploting the graph
trace = go.Heatmap(
    z = pt03.values,
    x = (pt02.columns).astype(str),
    y = pt02.index,
    hoverinfo = "none",
    text = pt02.values,
    texttemplate="<span style='color:#153d4d; font-family:Helvetica'>%{text:.2f}</span>",
    textfont={"size":12},
    colorscale=colorscale01,
    showscale=True,
    colorbar_orientation='v',
    colorbar_tickvals = [np.min(np.nan_to_num(pt03.values,nan=0.0)),np.max(np.nan_to_num(pt03.values,nan=0.0))],
    colorbar_ticktext = ['min','max'],
    xgap = 1,
    ygap = 1
)

plt_title = "<span style='font-size:24px;font-weight:bold;font-family:Helvetica'>Distribution of Number of data products used regularly</span>"
plt_caption = '''<span style='font-size:14px; font-family:Helvetica'>
Independent density normalization<br>
Note: Colorscale is based on row (Max value in row is blue and Min value in row is orange)
</span>'''

x_axis_title = "<span style='color:#153d4d;font-size:14px;font-family:Helvetica'>Number of data products</span>"
y_axis_title = "<span style='color:#153d4d;font-size:14px; font-family:Helvetica'>Survey year</span>"

layout = dict(
    title = plt_title + "<br><br>" + plt_caption,
    title_x=0.5,
    margin = dict(t=180, pad=5),
#     width = 800,
    height= 500,
    plot_bgcolor = "#fff",
    yaxis = dict(
        scaleanchor = 'x',
        title_text=y_axis_title,
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
        autorange='reversed',
        constrain="domain"
        ),
    xaxis =  dict(
        title_text = x_axis_title,
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
        tickmode='linear',
        # range=[-1.5,11.5],
        constrain="domain",
        )
    )

fig = go.Figure(data = trace, layout = layout)

fig.show()

Note:

- 78 percent people did not answer the question.
- Percentage people using the 1,2 or 3 data products is reducting slightly every year.

### Which data products do respondents use together

In [48]:
cols = ['Amazon DynamoDB',
       'Amazon RDS', 'Amazon Redshift', 'Google Cloud BigQuery',
       'Google Cloud SQL', 'IBM Db2', 'Microsoft Azure SQL Database',
       'Microsoft SQL Server', 'MongoDB', 'MySQL',
       'Oracle Database', 'Other', 'PostgreSQL', 'SQLite', 'Snowflake',]

df1 = pd.DataFrame(columns = ['Amazon DynamoDB',
       'Amazon RDS', 'Amazon Redshift', 'Google Cloud BigQuery',
       'Google Cloud SQL', 'IBM Db2', 'Microsoft Azure SQL Database',
       'Microsoft SQL Server', 'MongoDB', 'MySQL',
       'Oracle Database', 'Other', 'PostgreSQL', 'SQLite', 'Snowflake','Year'])

for year in df['Year'].unique():
    m01 = df[df['Year']==year][cols]
    m02 = sp.csr_matrix(m01.astype(int).values) # convert dataframe to sparse matrix
    m02c = m02.T * m02 # multiply sparse matrix #
    m02c.setdiag(0) # reset diagonal
    m03 = pd.DataFrame(m02c.todense(), columns=m01.columns, index= m01.columns)
    m04 = m03.div(m01.shape[0])
    m04 = m04.multiply(100)
    occ = m04.copy()
    occ['Year'] = year
    frames = [df1,occ]
    df1 = pd.concat(frames)
    
df1['Year'] = df1['Year'].astype('object')

fig = go.Figure()

fig.add_trace(go.Heatmap(
    z = df1.iloc[:,:-1][df1['Year']==2022].values,
    x = df1.iloc[:,:-1].columns,
    y = df1.iloc[:,:-1][df1['Year']==2022].index,
    name = 'Selection1',
    hoverinfo = "none",
    text = df1.iloc[:,:-1][df1['Year']==2022].values,
    texttemplate="<span style='color:#153d4d; font-family:Helvetica'>%{text:.2f}</span>",
    textfont={"size":10},
    colorscale=colorscale01,
    showscale=False,
    colorbar_orientation='v',
    colorbar_tickvals = [np.min(np.nan_to_num(pt03.values,nan=0.0)),np.max(np.nan_to_num(pt03.values,nan=0.0))],
    colorbar_ticktext = ['min','max'],
    xgap = 1,
    ygap = 1
))

# Creating the buttons
updatemenu = []
buttons1 = []

for col in df1['Year'].unique():
    buttons1.append(dict(method='restyle',
                        label=col,
                        visible=True,
                        args=[{'y':[df1.iloc[:,:-1][df1['Year']==col].index],
                               'x':[df1.iloc[:,:-1].columns],
                               'z':[df1.iloc[:,:-1][df1['Year']==col].values],
                               'text':[df1.iloc[:,:-1][df1['Year']==col].values],
                               'type':'heatmap'}, [0]],
                        )
                  )

updatemenu = list([
    dict(
        buttons=buttons1,
        direction='down',
        name = 'Selection1',
        pad={'r':10,'t':10},
        showactive=True,
        font = dict(family = 'Helvetica',size=14,color='#153d4d'),
        bgcolor="#fff",
        x=0.53,
        xanchor='left',
        y=1.12,
        yanchor='top'),
])

fig.update_layout(
    updatemenus=updatemenu,
    height= 700,
    plot_bgcolor = "#fff",
    yaxis = dict(
        scaleanchor = 'x',
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
#         autorange='reversed',
        constrain="domain"
        ),
    xaxis =  dict(
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
        tickmode='linear',
        constrain="domain",
        )
)

fig.add_annotation(
        dict(text="<span style='color:#153d4d;font-size:16px;font-family:Helvetica'>Select Year:</span>",
        align="left",
        showarrow=False,
        x=0.46,y=1.08,xref="paper",yref="paper"),
)

fig.show()

Insight
- 3.6 percent people mentioned MySQL and PostgreSQL together.
- 3.19 percent people mentioned MySQL and SQLite together.

### Distribution of data products

In [49]:
pt01 = ksm[ksm['question'].str.contains('Q35')].pivot_table(values='question',index=['Year'],columns='answer',aggfunc='count',fill_value=0)

pt01['total'] = ks['Year'].value_counts()
pt02 = pt01.div(pt01.total, axis=0).fillna(0)
pt02 = pt02.drop('total', axis=1)
pt03 = pt02.div(pt02.max(axis=1), axis=0)

# ploting the graph
trace = go.Heatmap(
    z = pt03.values,
    x = pt02.columns,
    y = pt02.index,
    hoverinfo = "none",
    text = pt02.values,
    texttemplate="<span style='color:#153d4d; font-family:Helvetica'>%{text:.2f}</span>",
    textfont={"size":12},
    colorscale=colorscale01,
    showscale=True,
    colorbar_orientation='v',
    colorbar_tickvals = [np.min(np.nan_to_num(pt03.values,nan=0.0)),np.max(np.nan_to_num(pt03.values,nan=0.0))],
    colorbar_ticktext = ['min','max'],
    xgap = 1,
    ygap = 1
)

plt_title = "<span style='font-size:24px;font-weight:bold;font-family:Helvetica'>Distribution of data products</span>"
plt_caption = '''<span style='font-size:14px; font-family:Helvetica'>
Independent density normalization<br>
Note: Colorscale is based on row (Max value in row is blue and Min value in row is orange)
</span>'''

x_axis_title = "<span style='color:#153d4d;font-size:14px;font-family:Helvetica'>Data products</span>"
y_axis_title = "<span style='color:#153d4d;font-size:14px; font-family:Helvetica'>Survey year</span>"

layout = dict(
    title = plt_title + "<br><br>" + plt_caption,
    title_x=0.5,
    margin = dict(t=180, pad=5),
#     width = 800,
    height= 600,
    plot_bgcolor = "#fff",
    yaxis = dict(
        scaleanchor = 'x',
        title_text=y_axis_title,
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
        autorange='reversed',
        constrain="domain"
        ),
    xaxis =  dict(
        title_text = x_axis_title,
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
        constrain="domain",
        )
    )

fig = go.Figure(data = trace, layout = layout)

fig.show()

Insight
- MySQL is the most used data product followed by PostgreSQL,SQLite and Microsoft SQL server.

## Q36: Do you use any of the following business intelligence tools?

### Number of business intelligence tools used

In [50]:
# facts of the dataset
facts = ['sid', 'Year', 'age', 'gender', 'country', 'role',]

# columns for the questions
cols = ['Q36_1', 'Q36_2', 'Q36_3', 'Q36_4',
       'Q36_5', 'Q36_6', 'Q36_7', 'Q36_8', 'Q36_9', 'Q36_10', 'Q36_11',
       'Q36_12', 'Q36_13', 'Q36_14', 'Q36_15','Q36_X']

tbl01 = ksm[ksm['question'].str.contains('Q36')].pivot_table(values='question',index=['sid','Year'],columns='answer',aggfunc='count',fill_value=0)

# tbl01.rename(columns={'Contextualized embeddings (ELMo, CoVe)':'Contextualized embeddings',
#        'Encoder-decoder models (seq2seq, vanilla transformers)':'Encoder-decoder models',
#        'Word embeddings/vectors (GLoVe, fastText, word2vec)':'Word embeddings/vectors'},inplace=True)

tbl01['pltf_count'] = tbl01.sum(axis=1) - (tbl01['No Answer'] * 2) - tbl01['None']
tbl01.reset_index(inplace=True)

# merging the age,country and role on sid and year
df = pd.merge(ks[facts],tbl01,how='inner',on=['sid','Year'])

pt01 = df.pivot_table(values='sid',index='Year',columns='pltf_count',aggfunc='count',fill_value=0)
pt01['total'] = pt01.sum(axis=1)
pt02 = pt01.div(pt01.total, axis=0).fillna(0)
pt01 = pt01.drop('total', axis=1)
pt02 = pt02.drop('total', axis=1)

pt03 = pt02.div(pt02.max(axis=1), axis=0)

# ploting the graph
trace = go.Heatmap(
    z = pt03.values,
    x = (pt02.columns).astype(str),
    y = pt02.index,
    hoverinfo = "none",
    text = pt02.values,
    texttemplate="<span style='color:#153d4d; font-family:Helvetica'>%{text:.2f}</span>",
    textfont={"size":12},
    colorscale=colorscale01,
    showscale=True,
    colorbar_orientation='v',
    colorbar_tickvals = [np.min(np.nan_to_num(pt03.values,nan=0.0)),np.max(np.nan_to_num(pt03.values,nan=0.0))],
    colorbar_ticktext = ['min','max'],
    xgap = 1,
    ygap = 1
)

plt_title = "<span style='font-size:24px;font-weight:bold;font-family:Helvetica'>Distribution of Number of BI tools used regularly</span>"
plt_caption = '''<span style='font-size:14px; font-family:Helvetica'>
Independent density normalization<br>
Note: Colorscale is based on row (Max value in row is blue and Min value in row is orange)
</span>'''

x_axis_title = "<span style='color:#153d4d;font-size:14px;font-family:Helvetica'>Number of BI tools</span>"
y_axis_title = "<span style='color:#153d4d;font-size:14px; font-family:Helvetica'>Survey year</span>"

layout = dict(
    title = plt_title + "<br><br>" + plt_caption,
    title_x=0.5,
    margin = dict(t=180, pad=5),
#     width = 800,
    height= 500,
    plot_bgcolor = "#fff",
    yaxis = dict(
        scaleanchor = 'x',
        title_text=y_axis_title,
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
        autorange='reversed',
        constrain="domain"
        ),
    xaxis =  dict(
        title_text = x_axis_title,
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
        tickmode='linear',
        # range=[-1.5,11.5],
        constrain="domain",
        )
    )

fig = go.Figure(data = trace, layout = layout)

fig.show()

Insight
- If we neglect the people how did not answer the question. The percentage is high at 0 means most respondent who answer the question did not used BI tools.The percentage keeps on reducing as the Number of BI tools increases.

### Which business intelligence tools do respondents use together

In [51]:
cols = ['Alteryx',
       'Amazon QuickSight', 'Domo', 'Google Data Studio', 'Looker',
       'Microsoft Azure Synapse', 'Microsoft Power BI',
       'Other', 'Qlik Sense', 'SAP Analytics Cloud', 'Sisense',
       'TIBCO Spotfire', 'Tableau', 'Thoughtspot',]

df1 = pd.DataFrame(columns = ['Alteryx',
       'Amazon QuickSight', 'Domo', 'Google Data Studio', 'Looker',
       'Microsoft Azure Synapse', 'Microsoft Power BI',
       'Other', 'Qlik Sense', 'SAP Analytics Cloud', 'Sisense',
       'TIBCO Spotfire', 'Tableau', 'Thoughtspot','Year'])

for year in df['Year'].unique():
    m01 = df[df['Year']==year][cols]
    m02 = sp.csr_matrix(m01.astype(int).values) # convert dataframe to sparse matrix
    m02c = m02.T * m02 # multiply sparse matrix #
    m02c.setdiag(0) # reset diagonal
    m03 = pd.DataFrame(m02c.todense(), columns=m01.columns, index= m01.columns)
    m04 = m03.div(m01.shape[0])
    m04 = m04.multiply(100)
    occ = m04.copy()
    occ['Year'] = year
    frames = [df1,occ]
    df1 = pd.concat(frames)
    
df1['Year'] = df1['Year'].astype('object')

fig = go.Figure()

fig.add_trace(go.Heatmap(
    z = df1.iloc[:,:-1][df1['Year']==2022].values,
    x = df1.iloc[:,:-1].columns,
    y = df1.iloc[:,:-1][df1['Year']==2022].index,
    name = 'Selection1',
    hoverinfo = "none",
    text = df1.iloc[:,:-1][df1['Year']==2022].values,
    texttemplate="<span style='color:#153d4d; font-family:Helvetica'>%{text:.2f}</span>",
    textfont={"size":10},
    colorscale=colorscale01,
    showscale=False,
    colorbar_orientation='v',
    colorbar_tickvals = [np.min(np.nan_to_num(pt03.values,nan=0.0)),np.max(np.nan_to_num(pt03.values,nan=0.0))],
    colorbar_ticktext = ['min','max'],
    xgap = 1,
    ygap = 1
))

# Creating the buttons
updatemenu = []
buttons1 = []

for col in df1['Year'].unique():
    buttons1.append(dict(method='restyle',
                        label=col,
                        visible=True,
                        args=[{'y':[df1.iloc[:,:-1][df1['Year']==col].index],
                               'x':[df1.iloc[:,:-1].columns],
                               'z':[df1.iloc[:,:-1][df1['Year']==col].values],
                               'text':[df1.iloc[:,:-1][df1['Year']==col].values],
                               'type':'heatmap'}, [0]],
                        )
                  )

updatemenu = list([
    dict(
        buttons=buttons1,
        direction='down',
        name = 'Selection1',
        pad={'r':10,'t':10},
        showactive=True,
        font = dict(family = 'Helvetica',size=14,color='#153d4d'),
        bgcolor="#fff",
        x=0.53,
        xanchor='left',
        y=1.12,
        yanchor='top'),
])

fig.update_layout(
    updatemenus=updatemenu,
    height= 700,
    plot_bgcolor = "#fff",
    yaxis = dict(
        scaleanchor = 'x',
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
#         autorange='reversed',
        constrain="domain"
        ),
    xaxis =  dict(
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
        tickmode='linear',
        constrain="domain",
        )
)

fig.add_annotation(
        dict(text="<span style='color:#153d4d;font-size:16px;font-family:Helvetica'>Select Year:</span>",
        align="left",
        showarrow=False,
        x=0.46,y=1.08,xref="paper",yref="paper"),
)

fig.show()

Insight
- From people who answer the question, 3.43 percent people mentioned the Microsoft Power BI and Tableau together.

### Distribution of business intelligence tools

In [52]:
pt01 = ksm[ksm['question'].str.contains('Q36')].pivot_table(values='question',index=['Year'],columns='answer',aggfunc='count',fill_value=0)

pt01['total'] = ks['Year'].value_counts()
pt02 = pt01.div(pt01.total, axis=0).fillna(0)
pt02 = pt02.drop('total', axis=1)
pt03 = pt02.div(pt02.max(axis=1), axis=0)

# ploting the graph
trace = go.Heatmap(
    z = pt03.values,
    x = pt02.columns,
    y = pt02.index,
    hoverinfo = "none",
    text = pt02.values,
    texttemplate="<span style='color:#153d4d; font-family:Helvetica'>%{text:.2f}</span>",
    textfont={"size":12},
    colorscale=colorscale01,
    showscale=True,
    colorbar_orientation='v',
    colorbar_tickvals = [np.min(np.nan_to_num(pt03.values,nan=0.0)),np.max(np.nan_to_num(pt03.values,nan=0.0))],
    colorbar_ticktext = ['min','max'],
    xgap = 1,
    ygap = 1
)

plt_title = "<span style='font-size:24px;font-weight:bold;font-family:Helvetica'>Distribution of BI tools</span>"
plt_caption = '''<span style='font-size:14px; font-family:Helvetica'>
Independent density normalization<br>
Note: Colorscale is based on row (Max value in row is blue and Min value in row is orange)
</span>'''

x_axis_title = "<span style='color:#153d4d;font-size:14px;font-family:Helvetica'>BI tools</span>"
y_axis_title = "<span style='color:#153d4d;font-size:14px; font-family:Helvetica'>Survey year</span>"

layout = dict(
    title = plt_title + "<br><br>" + plt_caption,
    title_x=0.5,
    margin = dict(t=180, pad=5),
#     width = 800,
    height= 600,
    plot_bgcolor = "#fff",
    yaxis = dict(
        scaleanchor = 'x',
        title_text=y_axis_title,
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
        autorange='reversed',
        constrain="domain"
        ),
    xaxis =  dict(
        title_text = x_axis_title,
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
        constrain="domain",
        )
    )

fig = go.Figure(data = trace, layout = layout)

fig.show()

Insight
- Microsoft Power BI is the most used BI tool followed by Tableau among the people who use BI tools.

## Q37: Do you use any of the following managed machine learning products regularly?

### Number of managed ML products used

In [53]:
# facts of the dataset
facts = ['sid', 'Year', 'age', 'gender', 'country', 'role']

# columns for the questions
cols = ['Q37_1', 'Q37_2', 'Q37_3',
       'Q37_4', 'Q37_5', 'Q37_6', 'Q37_7', 'Q37_8', 'Q37_9', 'Q37_10',
       'Q37_11', 'Q37_12', 'Q37_13','Q37_X']

tbl01 = ksm[ksm['question'].str.contains('Q37')].pivot_table(values='question',index=['sid','Year'],columns='answer',aggfunc='count',fill_value=0)
# # renaming the columns
# tbl01.rename(columns={'No, I do not download pre-trained model weights on a regular basis':'None',
#                      'Other storage services (i.e. google drive)':'Other'},inplace=True)

tbl01['pltf_count'] = tbl01.sum(axis=1) - (tbl01['No Answer'] * 2) - (tbl01['None'])
tbl01.reset_index(inplace=True)

tbl01 = tbl01[tbl01['Year']==2022].reset_index(drop=True)

# merging the age,country and role on sid and year
df = pd.merge(ks[facts],tbl01,how='inner',on=['sid','Year'])

pt01 = df.pivot_table(values='sid',index='Year',columns='pltf_count',aggfunc='count',fill_value=0)
pt01['total'] = pt01.sum(axis=1)
pt02 = pt01.div(pt01.total, axis=0).fillna(0)
pt01 = pt01.drop('total', axis=1)
pt02 = pt02.drop('total', axis=1)

pt03 = pt02.div(pt02.max(axis=1), axis=0)

trace = go.Heatmap(
    z = pt03[pt03.index==2022].values,
    x = (pt02[pt02.index==2022].columns).astype(str),
    y = (pt02[pt02.index==2022].index).astype(str),
    hoverinfo = "none",
    text = pt02[pt02.index==2022].values,
    texttemplate="<span style='color:#153d4d; font-family:Helvetica'>%{text:.2f}</span>",
    textfont={"size":12},
    colorscale=colorscale01,
    showscale=False,
    colorbar_orientation='v',
    colorbar_tickvals = [np.min(np.nan_to_num(pt03.values,nan=0.0)),np.max(np.nan_to_num(pt03.values,nan=0.0))],
    colorbar_ticktext = ['min','max'],
    xgap = 1,
    ygap = 1
)

plt_title = "<span style='font-size:24px;font-weight:bold;font-family:Helvetica'>Distribution of number of managed ML products used</span>"
plt_caption = '''<span style='font-size:14px; font-family:Helvetica'>
Independent density normalization<br>
Note: Colorscale is based on row (Max value in row is blue and Min value in row is orange)
</span>'''

x_axis_title = "<span style='color:#153d4d;font-size:14px;font-family:Helvetica'>Number of managed ML products used</span>"
y_axis_title = "<span style='color:#153d4d;font-size:14px; font-family:Helvetica'>Survey year</span>"

layout = dict(
    title = plt_title + "<br><br>" + plt_caption,
    title_x=0.5,
    margin = dict(t=180, pad=5),
#     width = 800,
    height= 350,
    plot_bgcolor = "#fff",
    yaxis = dict(
        scaleanchor = 'x',
        title_text=y_axis_title,
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
        autorange='reversed',
#         constrain="domain"
        ),
    xaxis =  dict(
        title_text = x_axis_title,
#         tickangle = 90,
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
        tickmode='linear',
#         range=[-1.5,9.5],
        constrain="domain",
        )
    )

fig = go.Figure(data = trace, layout = layout)

fig.show()

Insight
- 13 percent said the did not use ML products and 5 percent people mentioned they use only 1 ML product.

### Which managed ML products do respondents use together

In [54]:
# Occurance of the platforms
cols = ['Alteryx',
       'Amazon SageMaker', 'Azure Machine Learning Studio', 'C3.ai',
       'DataRobot', 'Databricks', 'Dataiku', 'Domino Data Lab',
       'Google Cloud Vertex AI', 'H2O AI Cloud','Other',
       'Rapidminer',]

m01 = df[df['Year']==2022][cols]

m02 = sp.csr_matrix(m01.astype(int).values) # convert dataframe to sparse matrix
m02c = m02.T * m02 # multiply sparse matrix #
m02c.setdiag(0) # reset diagonal
m03 = pd.DataFrame(m02c.todense(), columns=m01.columns, index= m01.columns)
m04 = m03.div(m01.shape[0])
m04 = m04.multiply(100)



trace = go.Heatmap(
    z = m04.values,
    x = m04.columns,
    y = m04.index,
    hoverinfo = "none",
    text = m04.values,
    texttemplate="<span style='color:#153d4d; font-family:Helvetica'>%{text:.2f}</span>",
    textfont={"size":10},
    colorscale=colorscale01,
    showscale=False,
    colorbar_orientation='v',
    colorbar_tickvals = [np.min(np.nan_to_num(pt03.values,nan=0.0)),np.max(np.nan_to_num(pt03.values,nan=0.0))],
    colorbar_ticktext = ['min','max'],
    xgap = 1,
    ygap = 1
)

plt_title = "<span style='font-size:24px;font-weight:bold;font-family:Helvetica'>Which managed ML products do respondents mention together </span>"
plt_caption = '''<span style='font-size:14px; font-family:Helvetica'>
Year 2022
</span>'''

x_axis_title = "<span style='color:#153d4d;font-size:14px;font-family:Helvetica'>Products</span>"
y_axis_title = "<span style='color:#153d4d;font-size:14px; font-family:Helvetica'>Products</span>"

layout = dict(
    title = plt_title + "<br><br>" + plt_caption,
    title_x=0.5,
    margin = dict(t=180, pad=5),
#     width = 800,
    height= 700,
    plot_bgcolor = "#fff",
    yaxis = dict(
        scaleanchor = 'x',
        title_text=y_axis_title,
        tickfont=dict(color='#153d4d',family='Helvetica',size=10),
        autorange='reversed',
#         constrain="domain"
        ),
    xaxis =  dict(
        title_text = x_axis_title,
        tickangle = 90,
        tickfont=dict(color='#153d4d',family='Helvetica',size=10),
        tickmode='linear',
#         range=[-1.5,9.5],
        constrain="domain",
        )
    )

fig = go.Figure(data = trace, layout = layout)

fig.show()

### Distribution of managed ML products

In [55]:
pt01 = ksm[ksm['question'].str.contains('Q37')].pivot_table(values='question',index=['Year'],columns='answer',aggfunc='count',fill_value=0)

pt01['total'] = ks['Year'].value_counts()
pt02 = pt01.div(pt01.total, axis=0)
pt01 = pt01.drop('total', axis=1)
pt02 = pt02.drop('total', axis=1)
pt03 = pt02.div(pt02.max(axis=1), axis=0)

trace = go.Heatmap(
    z = pt03[pt03.index==2022].values,
    x = pt02[pt02.index==2022].columns,
    y = (pt02[pt02.index==2022].index).astype(str),
    hoverinfo = "none",
    text = pt02[pt02.index==2022].values,
    texttemplate="<span style='color:#153d4d; font-family:Helvetica'>%{text:.2f}</span>",
    textfont={"size":12},
    colorscale=colorscale01,
    showscale=False,
    colorbar_orientation='v',
    colorbar_tickvals = [np.min(np.nan_to_num(pt03.values,nan=0.0)),np.max(np.nan_to_num(pt03.values,nan=0.0))],
    colorbar_ticktext = ['min','max'],
    xgap = 1,
    ygap = 1
)

plt_title = "<span style='font-size:24px;font-weight:bold;font-family:Helvetica'>Distribution of managed ML products regularly</span>"
plt_caption = '''<span style='font-size:14px; font-family:Helvetica'>
Independent density normalization<br>
Note: Colorscale is based on row (Max value in row is blue and Min value in row is orange)
</span>'''

x_axis_title = "<span style='color:#153d4d;font-size:14px;font-family:Helvetica'>Products</span>"
y_axis_title = "<span style='color:#153d4d;font-size:14px; font-family:Helvetica'>Survey year</span>"

layout = dict(
    title = plt_title + "<br><br>" + plt_caption,
    title_x=0.5,
    margin = dict(t=180, pad=5),
#     width = 800,
    height= 350,
    plot_bgcolor = "#fff",
    yaxis = dict(
        scaleanchor = 'x',
        title_text=y_axis_title,
        tickfont=dict(color='#153d4d',family='Helvetica',size=10),
        autorange='reversed',
#         constrain="domain"
        ),
    xaxis =  dict(
        title_text = x_axis_title,
        tickangle = 90,
        tickfont=dict(color='#153d4d',family='Helvetica',size=10),
        tickmode='linear',
        # range=[-1.5,11.5],
        constrain="domain",
        )
    )

fig = go.Figure(data = trace, layout = layout)

fig.show()

Insight
- 81 percent respondent did not answered the question
- Databricks, Amazon Sagemaker, Azure ML Studio are the most used ML products among people who answered the question.

## Q38: Do you use any of the following automated machine-learning tools?

### Number of automated ML tools used

In [56]:
# facts of the dataset
facts = ['sid', 'Year', 'age', 'gender', 'country', 'role',]

# columns for the questions
cols = ['Q38_1', 'Q38_2', 'Q38_3', 'Q38_4',
       'Q38_5', 'Q38_6', 'Q38_7', 'Q38_8','Q38_X']

tbl01 = ksm[ksm['question'].str.contains('Q38')].pivot_table(values='question',index=['sid','Year'],columns='answer',aggfunc='count',fill_value=0)

tbl01['pltf_count'] = tbl01.sum(axis=1) - (tbl01['No Answer'] * 2) - tbl01['None']
tbl01.reset_index(inplace=True)

# merging the age,country and role on sid and year
df = pd.merge(ks[facts],tbl01,how='inner',on=['sid','Year'])

pt01 = df.pivot_table(values='sid',index='Year',columns='pltf_count',aggfunc='count',fill_value=0)
pt01['total'] = pt01.sum(axis=1)
pt02 = pt01.div(pt01.total, axis=0).fillna(0)
pt01 = pt01.drop('total', axis=1)
pt02 = pt02.drop('total', axis=1)

pt03 = pt02.div(pt02.max(axis=1), axis=0)

# ploting the graph
trace = go.Heatmap(
    z = pt03.values,
    x = pt02.columns,
    y = pt02.index,
    hoverinfo = "none",
    text = pt02.values,
    texttemplate="<span style='color:#153d4d; font-family:Helvetica'>%{text:.2f}</span>",
    textfont={"size":12},
    colorscale=colorscale01,
    showscale=True,
    colorbar_orientation='v',
    colorbar_tickvals = [np.min(np.nan_to_num(pt03.values,nan=0.0)),np.max(np.nan_to_num(pt03.values,nan=0.0))],
    colorbar_ticktext = ['min','max'],
    xgap = 1,
    ygap = 1
)

plt_title = "<span style='font-size:24px;font-weight:bold;font-family:Helvetica'>Distribution of number of automated ML tools</span>"
plt_caption = '''<span style='font-size:14px; font-family:Helvetica'>
Independent density normalization<br>
Note: Colorscale is based on row (Max value in row is blue and Min value in row is orange)
</span>'''

x_axis_title = "<span style='color:#153d4d;font-size:14px;font-family:Helvetica'>Number of automated ML tools</span>"
y_axis_title = "<span style='color:#153d4d;font-size:14px; font-family:Helvetica'>Survey year</span>"

layout = dict(
    title = plt_title + "<br><br>" + plt_caption,
    title_x=0.5,
    margin = dict(t=180, pad=5),
#     width = 800,
    height= 500,
    plot_bgcolor = "#fff",
    yaxis = dict(
        scaleanchor = 'x',
        title_text=y_axis_title,
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
        autorange='reversed',
        constrain="domain"
        ),
    xaxis =  dict(
        title_text = x_axis_title,
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
        tickmode='linear',
        # range=[-1.5,11.5],
        constrain="domain",
        )
    )

fig = go.Figure(data = trace, layout = layout)

fig.show()

Insight
- 15 percent of respondent said they did not used any automated ML tools.
- Around 3 percent people said they used 1 automated ML tool.

### Which automated ML tools do respondents use together

In [57]:
cols = ['Amazon Sagemaker Autopilot', 'Azure Automated Machine Learning',
       'DataRobot', 'DataRobot AutoML', 'Databricks AutoML',
       'Google Cloud AutoML', 'H2O Driverless AI',
       'Other',]

df1 = pd.DataFrame(columns = ['Amazon Sagemaker Autopilot', 'Azure Automated Machine Learning',
       'DataRobot', 'DataRobot AutoML', 'Databricks AutoML',
       'Google Cloud AutoML', 'H2O Driverless AI',
       'Other','Year'])

for year in df['Year'].unique():
    m01 = df[df['Year']==year][cols]
    m02 = sp.csr_matrix(m01.astype(int).values) # convert dataframe to sparse matrix
    m02c = m02.T * m02 # multiply sparse matrix #
    m02c.setdiag(0) # reset diagonal
    m03 = pd.DataFrame(m02c.todense(), columns=m01.columns, index= m01.columns)
    m04 = m03.div(m01.shape[0])
    m04 = m04.multiply(100)
    occ = m04.copy()
    occ['Year'] = year
    frames = [df1,occ]
    df1 = pd.concat(frames)
    
df1['Year'] = df1['Year'].astype('object')

fig = go.Figure()

fig.add_trace(go.Heatmap(
    z = df1.iloc[:,:-1][df1['Year']==2022].values,
    x = df1.iloc[:,:-1].columns,
    y = df1.iloc[:,:-1][df1['Year']==2022].index,
    name = 'Selection1',
    hoverinfo = "none",
    text = df1.iloc[:,:-1][df1['Year']==2022].values,
    texttemplate="<span style='color:#153d4d; font-family:Helvetica'>%{text:.2f}</span>",
    textfont={"size":12},
    colorscale=colorscale01,
    showscale=False,
    colorbar_orientation='v',
    colorbar_tickvals = [np.min(np.nan_to_num(pt03.values,nan=0.0)),np.max(np.nan_to_num(pt03.values,nan=0.0))],
    colorbar_ticktext = ['min','max'],
    xgap = 1,
    ygap = 1
))

# Creating the buttons
updatemenu = []
buttons1 = []

for col in df1['Year'].unique():
    buttons1.append(dict(method='restyle',
                        label=col,
                        visible=True,
                        args=[{'y':[df1.iloc[:,:-1][df1['Year']==col].index],
                               'x':[df1.iloc[:,:-1].columns],
                               'z':[df1.iloc[:,:-1][df1['Year']==col].values],
                               'text':[df1.iloc[:,:-1][df1['Year']==col].values],
                               'type':'heatmap'}, [0]],
                        )
                  )

updatemenu = list([
    dict(
        buttons=buttons1,
        direction='down',
        name = 'Selection1',
        pad={'r':10,'t':10},
        showactive=True,
        font = dict(family = 'Helvetica',size=14,color='#153d4d'),
        bgcolor="#fff",
        x=0.53,
        xanchor='left',
        y=1.12,
        yanchor='top'),
])

fig.update_layout(
    updatemenus=updatemenu,
    height= 650,
    plot_bgcolor = "#fff",
    yaxis = dict(
        scaleanchor = 'x',
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
#         autorange='reversed',
        constrain="domain"
        ),
    xaxis =  dict(
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
        tickmode='linear',
        constrain="domain",
        )
)

fig.add_annotation(
        dict(text="<span style='color:#153d4d;font-size:16px;font-family:Helvetica'>Select Year:</span>",
        align="left",
        showarrow=False,
        x=0.46,y=1.08,xref="paper",yref="paper"),
)

fig.show()

### Distribution of automated ML tools

In [58]:
pt01 = ksm[ksm['question'].str.contains('Q38')].pivot_table(values='question',index=['Year'],columns='answer',aggfunc='count',fill_value=0)
pt01['total'] = ks['Year'].value_counts()
pt02 = pt01.div(pt01.total, axis=0).fillna(0)
pt02 = pt02.drop('total', axis=1)
pt03 = pt02.div(pt02.max(axis=1), axis=0)

# ploting the graph
trace = go.Heatmap(
    z = pt03.values,
    x = pt02.columns,
    y = pt02.index,
    hoverinfo = "none",
    text = pt02.values,
    texttemplate="<span style='color:#153d4d; font-family:Helvetica'>%{text:.2f}</span>",
    textfont={"size":12},
    colorscale=colorscale01,
    showscale=True,
    colorbar_orientation='v',
    colorbar_tickvals = [np.min(np.nan_to_num(pt03.values,nan=0.0)),np.max(np.nan_to_num(pt03.values,nan=0.0))],
    colorbar_ticktext = ['min','max'],
    xgap = 1,
    ygap = 1
)

plt_title = "<span style='font-size:24px;font-weight:bold;font-family:Helvetica'>Distribution of automated ML tools</span>"
plt_caption = '''<span style='font-size:14px; font-family:Helvetica'>
Independent density normalization<br>
Note: Colorscale is based on row (Max value in row is blue and Min value in row is orange)
</span>'''

x_axis_title = "<span style='color:#153d4d;font-size:14px;font-family:Helvetica'>automated ML tools</span>"
y_axis_title = "<span style='color:#153d4d;font-size:14px; font-family:Helvetica'>Survey year</span>"

layout = dict(
    title = plt_title + "<br><br>" + plt_caption,
    title_x=0.5,
    margin = dict(t=180, pad=5),
#     width = 800,
    height= 500,
    plot_bgcolor = "#fff",
    yaxis = dict(
        scaleanchor = 'x',
        title_text=y_axis_title,
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
        autorange='reversed',
        ),
    xaxis =  dict(
        title_text = x_axis_title,
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
        constrain="domain",
        )
    )

fig = go.Figure(data = trace, layout = layout)

fig.show()

Insight
- Google Cloud AutoML remained the most used tool among the people who use automated ML tools

## Q39: Do you use any of the following products to serve your machine learning models?

### Number of products used to serve ML models

In [59]:
# facts of the dataset
facts = ['sid', 'Year', 'age', 'gender', 'country', 'role']

# columns for the questions
cols = ['Q39_1', 'Q39_2', 'Q39_3',
       'Q39_4', 'Q39_5', 'Q39_6', 'Q39_7', 'Q39_8', 'Q39_9', 'Q39_10',
       'Q39_11', 'Q39_12','Q39_X']

tbl01 = ksm[ksm['question'].str.contains('Q39')].pivot_table(values='question',index=['sid','Year'],columns='answer',aggfunc='count',fill_value=0)
# # renaming the columns
# tbl01.rename(columns={'No, I do not download pre-trained model weights on a regular basis':'None',
#                      'Other storage services (i.e. google drive)':'Other'},inplace=True)

tbl01['pltf_count'] = tbl01.sum(axis=1) - (tbl01['No Answer'] * 2) - (tbl01['None'])
tbl01.reset_index(inplace=True)

tbl01 = tbl01[tbl01['Year']==2022].reset_index(drop=True)

# merging the age,country and role on sid and year
df = pd.merge(ks[facts],tbl01,how='inner',on=['sid','Year'])

pt01 = df.pivot_table(values='sid',index='Year',columns='pltf_count',aggfunc='count',fill_value=0)
pt01['total'] = pt01.sum(axis=1)
pt02 = pt01.div(pt01.total, axis=0).fillna(0)
pt01 = pt01.drop('total', axis=1)
pt02 = pt02.drop('total', axis=1)

pt03 = pt02.div(pt02.max(axis=1), axis=0)

trace = go.Heatmap(
    z = pt03[pt03.index==2022].values,
    x = (pt02[pt02.index==2022].columns).astype(str),
    y = (pt02[pt02.index==2022].index).astype(str),
    hoverinfo = "none",
    text = pt02[pt02.index==2022].values,
    texttemplate="<span style='color:#153d4d; font-family:Helvetica'>%{text:.2f}</span>",
    textfont={"size":12},
    colorscale=colorscale01,
    showscale=False,
    colorbar_orientation='v',
    colorbar_tickvals = [np.min(np.nan_to_num(pt03.values,nan=0.0)),np.max(np.nan_to_num(pt03.values,nan=0.0))],
    colorbar_ticktext = ['min','max'],
    xgap = 1,
    ygap = 1
)

plt_title = "<span style='font-size:24px;font-weight:bold;font-family:Helvetica'>Distribution of number of products used to serve ML models</span>"
plt_caption = '''<span style='font-size:14px; font-family:Helvetica'>
Independent density normalization<br>
Note: Colorscale is based on row (Max value in row is blue and Min value in row is orange)
</span>'''

x_axis_title = "<span style='color:#153d4d;font-size:14px;font-family:Helvetica'>Number of products</span>"
y_axis_title = "<span style='color:#153d4d;font-size:14px; font-family:Helvetica'>Survey year</span>"

layout = dict(
    title = plt_title + "<br><br>" + plt_caption,
    title_x=0.5,
    margin = dict(t=180, pad=5),
#     width = 800,
    height= 350,
    plot_bgcolor = "#fff",
    yaxis = dict(
        scaleanchor = 'x',
        title_text=y_axis_title,
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
        autorange='reversed',
#         constrain="domain"
        ),
    xaxis =  dict(
        title_text = x_axis_title,
#         tickangle = 90,
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
        tickmode='linear',
#         range=[-1.5,9.5],
        constrain="domain",
        )
    )

fig = go.Figure(data = trace, layout = layout)

fig.show()

Insight
- 4 percent people mentioned they used only 1 product to serve ML model.

### Which products do respondents use together to serve ML models

In [60]:
# Occurance of the platforms
cols = ['BentoML', 'KServe',
       'MLflow', 'Multi Model Server (MMS)',
       'ONNX Runtime', 'OpenVINO Model Server', 'Other', 'Seldon Core',
       'TensorFlow Extended (TFX)', 'TorchServe', 'Triton Inference Server',]

m01 = df[df['Year']==2022][cols]

m02 = sp.csr_matrix(m01.astype(int).values) # convert dataframe to sparse matrix
m02c = m02.T * m02 # multiply sparse matrix #
m02c.setdiag(0) # reset diagonal
m03 = pd.DataFrame(m02c.todense(), columns=m01.columns, index= m01.columns)
m04 = m03.div(m01.shape[0])
m04 = m04.multiply(100)



trace = go.Heatmap(
    z = m04.values,
    x = m04.columns,
    y = m04.index,
    hoverinfo = "none",
    text = m04.values,
    texttemplate="<span style='color:#153d4d; font-family:Helvetica'>%{text:.2f}</span>",
    textfont={"size":12},
    colorscale=colorscale01,
    showscale=False,
    colorbar_orientation='v',
    colorbar_tickvals = [np.min(np.nan_to_num(pt03.values,nan=0.0)),np.max(np.nan_to_num(pt03.values,nan=0.0))],
    colorbar_ticktext = ['min','max'],
    xgap = 1,
    ygap = 1
)

plt_title = "<span style='font-size:24px;font-weight:bold;font-family:Helvetica'>Which products do respondents mentioned together to serve ML models</span>"
plt_caption = '''<span style='font-size:14px; font-family:Helvetica'>
Year 2022
</span>'''

x_axis_title = "<span style='color:#153d4d;font-size:14px;font-family:Helvetica'>Products</span>"
y_axis_title = "<span style='color:#153d4d;font-size:14px; font-family:Helvetica'>Products</span>"

layout = dict(
    title = plt_title + "<br><br>" + plt_caption,
    title_x=0.5,
    margin = dict(t=180, pad=5),
#     width = 800,
    height= 700,
    plot_bgcolor = "#fff",
    yaxis = dict(
        scaleanchor = 'x',
        title_text=y_axis_title,
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
        autorange='reversed',
#         constrain="domain"
        ),
    xaxis =  dict(
        title_text = x_axis_title,
        tickangle = 90,
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
        tickmode='linear',
#         range=[-1.5,9.5],
        constrain="domain",
        )
    )

fig = go.Figure(data = trace, layout = layout)

fig.show()

### Distribution of products used to serve ML models

In [61]:
pt01 = ksm[ksm['question'].str.contains('Q39')].pivot_table(values='question',index=['Year'],columns='answer',aggfunc='count',fill_value=0)

pt01['total'] = ks['Year'].value_counts()
pt02 = pt01.div(pt01.total, axis=0)
pt01 = pt01.drop('total', axis=1)
pt02 = pt02.drop('total', axis=1)
pt03 = pt02.div(pt02.max(axis=1), axis=0)

trace = go.Heatmap(
    z = pt03[pt03.index==2022].values,
    x = pt02[pt02.index==2022].columns,
    y = (pt02[pt02.index==2022].index).astype(str),
    hoverinfo = "none",
    text = pt02[pt02.index==2022].values,
    texttemplate="<span style='color:#153d4d; font-family:Helvetica'>%{text:.2f}</span>",
    textfont={"size":12},
    colorscale=colorscale01,
    showscale=False,
    colorbar_orientation='v',
    colorbar_tickvals = [np.min(np.nan_to_num(pt03.values,nan=0.0)),np.max(np.nan_to_num(pt03.values,nan=0.0))],
    colorbar_ticktext = ['min','max'],
    xgap = 1,
    ygap = 1
)

plt_title = "<span style='font-size:24px;font-weight:bold;font-family:Helvetica'>Distribution of products used to serve ML models</span>"
plt_caption = '''<span style='font-size:14px; font-family:Helvetica'>
Independent density normalization<br>
Note: Colorscale is based on row (Max value in row is blue and Min value in row is orange)
</span>'''

x_axis_title = "<span style='color:#153d4d;font-size:14px;font-family:Helvetica'>Products</span>"
y_axis_title = "<span style='color:#153d4d;font-size:14px; font-family:Helvetica'>Survey year</span>"

layout = dict(
    title = plt_title + "<br><br>" + plt_caption,
    title_x=0.5,
    margin = dict(t=180, pad=5),
#     width = 800,
    height= 400,
    plot_bgcolor = "#fff",
    yaxis = dict(
        scaleanchor = 'x',
        title_text=y_axis_title,
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
        autorange='reversed',
#         constrain="domain"
        ),
    xaxis =  dict(
        title_text = x_axis_title,
        tickangle = 90,
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
        tickmode='linear',
        # range=[-1.5,11.5],
        constrain="domain",
        )
    )

fig = go.Figure(data = trace, layout = layout)

fig.show()

Insight
- 14 percent people said they did not used any product to serve ML model.
- 3 percent people mentioned MLflow they use to serve ML model.

## Q40: Do you use any tools to help monitor your machine-learning models and/or experiments?

### Number of tools used to monitor ML models

In [62]:
# facts of the dataset
facts = ['sid', 'Year', 'age', 'gender', 'country', 'role']

# columns for the questions
cols = ['Q40_1', 'Q40_2', 'Q40_3', 'Q40_4', 'Q40_5',
       'Q40_6', 'Q40_7', 'Q40_8', 'Q40_9', 'Q40_10', 'Q40_11', 'Q40_12',
       'Q40_13', 'Q40_14', 'Q40_15','Q40_X']

tbl01 = ksm[ksm['question'].str.contains('Q40')].pivot_table(values='question',index=['sid','Year'],columns='answer',aggfunc='count',fill_value=0)
# # renaming the columns
# tbl01.rename(columns={'No, I do not download pre-trained model weights on a regular basis':'None',
#                      'Other storage services (i.e. google drive)':'Other'},inplace=True)

tbl01['pltf_count'] = tbl01.sum(axis=1) - (tbl01['No Answer'] * 2) - (tbl01['None'])
tbl01.reset_index(inplace=True)

tbl01 = tbl01[tbl01['Year']==2022].reset_index(drop=True)

# merging the age,country and role on sid and year
df = pd.merge(ks[facts],tbl01,how='inner',on=['sid','Year'])

pt01 = df.pivot_table(values='sid',index='Year',columns='pltf_count',aggfunc='count',fill_value=0)
pt01['total'] = pt01.sum(axis=1)
pt02 = pt01.div(pt01.total, axis=0).fillna(0)
pt01 = pt01.drop('total', axis=1)
pt02 = pt02.drop('total', axis=1)

pt03 = pt02.div(pt02.max(axis=1), axis=0)

trace = go.Heatmap(
    z = pt03[pt03.index==2022].values,
    x = (pt02[pt02.index==2022].columns).astype(str),
    y = (pt02[pt02.index==2022].index).astype(str),
    hoverinfo = "none",
    text = pt02[pt02.index==2022].values,
    texttemplate="<span style='color:#153d4d; font-family:Helvetica'>%{text:.2f}</span>",
    textfont={"size":12},
    colorscale=colorscale01,
    showscale=False,
    colorbar_orientation='v',
    colorbar_tickvals = [np.min(np.nan_to_num(pt03.values,nan=0.0)),np.max(np.nan_to_num(pt03.values,nan=0.0))],
    colorbar_ticktext = ['min','max'],
    xgap = 1,
    ygap = 1
)

plt_title = "<span style='font-size:24px;font-weight:bold;font-family:Helvetica'>Distribution of number of tools used to monitor ML models</span>"
plt_caption = '''<span style='font-size:14px; font-family:Helvetica'>
Independent density normalization<br>
Note: Colorscale is based on row (Max value in row is blue and Min value in row is orange)
</span>'''

x_axis_title = "<span style='color:#153d4d;font-size:14px;font-family:Helvetica'>Number of tools</span>"
y_axis_title = "<span style='color:#153d4d;font-size:14px; font-family:Helvetica'>Survey year</span>"

layout = dict(
    title = plt_title + "<br><br>" + plt_caption,
    title_x=0.5,
    margin = dict(t=180, pad=5),
#     width = 800,
    height= 350,
    plot_bgcolor = "#fff",
    yaxis = dict(
        scaleanchor = 'x',
        title_text=y_axis_title,
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
        autorange='reversed',
#         constrain="domain"
        ),
    xaxis =  dict(
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
        tickmode='linear',
#         range=[-1.5,9.5],
        constrain="domain",
        )
    )

fig = go.Figure(data = trace, layout = layout)

fig.show()

Insight
- 5 percent people said they used one tool to monitor ML models

### Which tools do respondents mention together

In [63]:
# Occurance of the platforms
cols = ['Aporia', 'Arize',
       'ClearML', 'Comet.ml', 'DVC', 'Evidently AI', 'Fiddler', 'Guild.ai',
       'MLflow', 'Neptune.ai','Other', 'TensorBoard',
       'Weights & Biases', 'WhyLabs',]

m01 = df[df['Year']==2022][cols]

m02 = sp.csr_matrix(m01.astype(int).values) # convert dataframe to sparse matrix
m02c = m02.T * m02 # multiply sparse matrix #
m02c.setdiag(0) # reset diagonal
m03 = pd.DataFrame(m02c.todense(), columns=m01.columns, index= m01.columns)
m04 = m03.div(m01.shape[0])
m04 = m04.multiply(100)



trace = go.Heatmap(
    z = m04.values,
    x = m04.columns,
    y = m04.index,
    hoverinfo = "none",
    text = m04.values,
    texttemplate="<span style='color:#153d4d; font-family:Helvetica'>%{text:.2f}</span>",
    textfont={"size":10},
    colorscale=colorscale01,
    showscale=False,
    colorbar_orientation='v',
    colorbar_tickvals = [np.min(np.nan_to_num(pt03.values,nan=0.0)),np.max(np.nan_to_num(pt03.values,nan=0.0))],
    colorbar_ticktext = ['min','max'],
    xgap = 1,
    ygap = 1
)

plt_title = "<span style='font-size:24px;font-weight:bold;font-family:Helvetica'>Which tools do respondents mention together</span>"
plt_caption = '''<span style='font-size:14px; font-family:Helvetica'>
Year 2022
</span>'''

# x_axis_title = "<span style='color:#153d4d;font-size:14px;font-family:Helvetica'>Services </span>"
# y_axis_title = "<span style='color:#153d4d;font-size:14px; font-family:Helvetica'>Services</span>"

layout = dict(
    title = plt_title + "<br><br>" + plt_caption,
    title_x=0.5,
    margin = dict(t=180, pad=5),
#     width = 800,
    height= 700,
    plot_bgcolor = "#fff",
    yaxis = dict(
        scaleanchor = 'x',
#         title_text=y_axis_title,
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
        autorange='reversed',
#         constrain="domain"
        ),
    xaxis =  dict(
#         title_text = x_axis_title,
#         tickangle = 90,
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
        tickmode='linear',
#         range=[-1.5,9.5],
        constrain="domain",
        )
    )

fig = go.Figure(data = trace, layout = layout)

fig.show()

### Distribution of tools used to monitor ML models

In [64]:
pt01 = ksm[ksm['question'].str.contains('Q40')].pivot_table(values='question',index=['Year'],columns='answer',aggfunc='count',fill_value=0)

pt01['total'] = ks['Year'].value_counts()
pt02 = pt01.div(pt01.total, axis=0)
pt01 = pt01.drop('total', axis=1)
pt02 = pt02.drop('total', axis=1)
pt03 = pt02.div(pt02.max(axis=1), axis=0)

trace = go.Heatmap(
    z = pt03[pt03.index==2022].values,
    x = pt02[pt02.index==2022].columns,
    y = (pt02[pt02.index==2022].index).astype(str),
    hoverinfo = "none",
    text = pt02[pt02.index==2022].values,
    texttemplate="<span style='color:#153d4d; font-family:Helvetica'>%{text:.2f}</span>",
    textfont={"size":12},
    colorscale=colorscale01,
    showscale=False,
    colorbar_orientation='v',
    colorbar_tickvals = [np.min(np.nan_to_num(pt03.values,nan=0.0)),np.max(np.nan_to_num(pt03.values,nan=0.0))],
    colorbar_ticktext = ['min','max'],
    xgap = 1,
    ygap = 1
)

plt_title = "<span style='font-size:24px;font-weight:bold;font-family:Helvetica'>Distribution of tools used to monitor ML models</span>"
plt_caption = '''<span style='font-size:14px; font-family:Helvetica'>
Independent density normalization<br>
Note: Colorscale is based on row (Max value in row is blue and Min value in row is orange)
</span>'''

x_axis_title = "<span style='color:#153d4d;font-size:14px;font-family:Helvetica'>Tools</span>"
y_axis_title = "<span style='color:#153d4d;font-size:14px; font-family:Helvetica'>Survey year</span>"

layout = dict(
    title = plt_title + "<br><br>" + plt_caption,
    title_x=0.5,
    margin = dict(t=180, pad=5),
#     width = 800,
    height= 350,
    plot_bgcolor = "#fff",
    yaxis = dict(
        scaleanchor = 'x',
        title_text=y_axis_title,
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
        autorange='reversed',
#         constrain="domain"
        ),
    xaxis =  dict(
        title_text = x_axis_title,
        tickangle = 90,
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
        tickmode='linear',
        # range=[-1.5,11.5],
        constrain="domain",
        )
    )

fig = go.Figure(data = trace, layout = layout)

fig.show()

Insight
- 12 percent mentioned they did not used any tool to monitor ML models
- 4 percent people mentioned the TensorBoard as tool they use to monitor ML models.

## Q41: Do you use any of the following responsible or ethical AI products in your machine learning practices?

### Number of responsible/ethical AI products mentioned

In [65]:
# facts of the dataset
facts = ['sid', 'Year', 'age', 'gender', 'country', 'role']

# columns for the questions
cols = ['Q41_1', 'Q41_2', 'Q41_3', 'Q41_4',
       'Q41_5', 'Q41_6', 'Q41_7', 'Q41_8', 'Q41_9','Q41_X']

tbl01 = ksm[ksm['question'].str.contains('Q41')].pivot_table(values='question',index=['sid','Year'],columns='answer',aggfunc='count',fill_value=0)

tbl01.rename(columns={'Amazon AI Ethics Tools (Clarify, A2I, etc)':'Amazon AI Ethics Tools',
       'Google Responsible AI Toolkit (LIT, What-if, Fairness Indicator, etc)':'Google Responsible AI Toolkit',
       'IBM AI Ethics tools (AI Fairness 360, Adversarial Robustness Toolbox, etc':'IBM AI Ethics tools',
       'Microsoft Responsible AI Resources (Fairlearn, Counterfit, InterpretML, etc)':'Microsoft Responsible AI Resources'},inplace=True)

tbl01['pltf_count'] = tbl01.sum(axis=1) - (tbl01['No Answer'] * 2) - (tbl01['None'])
tbl01.reset_index(inplace=True)

tbl01 = tbl01[tbl01['Year']==2022].reset_index(drop=True)

# merging the age,country and role on sid and year
df = pd.merge(ks[facts],tbl01,how='inner',on=['sid','Year'])

pt01 = df.pivot_table(values='sid',index='Year',columns='pltf_count',aggfunc='count',fill_value=0)
pt01['total'] = pt01.sum(axis=1)
pt02 = pt01.div(pt01.total, axis=0).fillna(0)
pt01 = pt01.drop('total', axis=1)
pt02 = pt02.drop('total', axis=1)

pt03 = pt02.div(pt02.max(axis=1), axis=0)

trace = go.Heatmap(
    z = pt03[pt03.index==2022].values,
    x = (pt02[pt02.index==2022].columns).astype(str),
    y = (pt02[pt02.index==2022].index).astype(str),
    hoverinfo = "none",
    text = pt02[pt02.index==2022].values,
    texttemplate="<span style='color:#153d4d; font-family:Helvetica'>%{text:.2f}</span>",
    textfont={"size":12},
    colorscale=colorscale01,
    showscale=False,
    colorbar_orientation='v',
    colorbar_tickvals = [np.min(np.nan_to_num(pt03.values,nan=0.0)),np.max(np.nan_to_num(pt03.values,nan=0.0))],
    colorbar_ticktext = ['min','max'],
    xgap = 1,
    ygap = 1
)

plt_title = "<span style='font-size:24px;font-weight:bold;font-family:Helvetica'>Distribution of number of responsible/ethical AI products mentioned</span>"
plt_caption = '''<span style='font-size:14px; font-family:Helvetica'>
Independent density normalization<br>
Note: Colorscale is based on row (Max value in row is blue and Min value in row is orange)
</span>'''

x_axis_title = "<span style='color:#153d4d;font-size:14px;font-family:Helvetica'>Number of Products</span>"
y_axis_title = "<span style='color:#153d4d;font-size:14px; font-family:Helvetica'>Survey year</span>"

layout = dict(
    title = plt_title + "<br><br>" + plt_caption,
    title_x=0.5,
    margin = dict(t=180, pad=5),
#     width = 800,
    height= 350,
    plot_bgcolor = "#fff",
    yaxis = dict(
        scaleanchor = 'x',
        title_text=y_axis_title,
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
        autorange='reversed',
#         constrain="domain"
        ),
    xaxis =  dict(
        title_text = x_axis_title,
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
        tickmode='linear',
#         range=[-1.5,9.5],
        constrain="domain",
        )
    )

fig = go.Figure(data = trace, layout = layout)

fig.show()

### Which products do respondents mention together

In [66]:
# Occurance of the platforms
cols = ['Aequitas',
       'Amazon AI Ethics Tools', 'Audit-AI', 'Google Responsible AI Toolkit',
       'IBM AI Ethics tools', 'Microsoft Responsible AI Resources','Other', 'The LinkedIn Fairness Toolkit (LiFT)']

m01 = df[df['Year']==2022][cols]

m02 = sp.csr_matrix(m01.astype(int).values) # convert dataframe to sparse matrix
m02c = m02.T * m02 # multiply sparse matrix #
m02c.setdiag(0) # reset diagonal
m03 = pd.DataFrame(m02c.todense(), columns=m01.columns, index= m01.columns)
m04 = m03.div(m01.shape[0])
m04 = m04.multiply(100)



trace = go.Heatmap(
    z = m04.values,
    x = m04.columns,
    y = m04.index,
    hoverinfo = "none",
    text = m04.values,
    texttemplate="<span style='color:#153d4d; font-family:Helvetica'>%{text:.2f}</span>",
    textfont={"size":10},
    colorscale=colorscale01,
    showscale=False,
    colorbar_orientation='v',
    colorbar_tickvals = [np.min(np.nan_to_num(pt03.values,nan=0.0)),np.max(np.nan_to_num(pt03.values,nan=0.0))],
    colorbar_ticktext = ['min','max'],
    xgap = 1,
    ygap = 1
)

plt_title = "<span style='font-size:24px;font-weight:bold;font-family:Helvetica'>Which tools do respondents mention together</span>"
plt_caption = '''<span style='font-size:14px; font-family:Helvetica'>
Year 2022
</span>'''

# x_axis_title = "<span style='color:#153d4d;font-size:14px;font-family:Helvetica'>Services </span>"
# y_axis_title = "<span style='color:#153d4d;font-size:14px; font-family:Helvetica'>Services</span>"

layout = dict(
    title = plt_title + "<br><br>" + plt_caption,
    title_x=0.5,
    margin = dict(t=180, pad=5),
#     width = 800,
    height= 700,
    plot_bgcolor = "#fff",
    yaxis = dict(
        scaleanchor = 'x',
#         title_text=y_axis_title,
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
        autorange='reversed',
#         constrain="domain"
        ),
    xaxis =  dict(
#         title_text = x_axis_title,
#         tickangle = 90,
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
        tickmode='linear',
#         range=[-1.5,9.5],
        constrain="domain",
        )
    )

fig = go.Figure(data = trace, layout = layout)

fig.show()

### Distribution of responsible/ethical AI products

In [67]:
pt01 = ksm[ksm['question'].str.contains('Q41')].pivot_table(values='question',index=['Year'],columns='answer',aggfunc='count',fill_value=0)

pt01.rename(columns={'Amazon AI Ethics Tools (Clarify, A2I, etc)':'Amazon AI Ethics Tools',
       'Google Responsible AI Toolkit (LIT, What-if, Fairness Indicator, etc)':'Google Responsible AI Toolkit',
       'IBM AI Ethics tools (AI Fairness 360, Adversarial Robustness Toolbox, etc':'IBM AI Ethics tools',
       'Microsoft Responsible AI Resources (Fairlearn, Counterfit, InterpretML, etc)':'Microsoft Responsible AI Resources'},inplace=True)

pt01['total'] = ks['Year'].value_counts()
pt02 = pt01.div(pt01.total, axis=0)
pt01 = pt01.drop('total', axis=1)
pt02 = pt02.drop('total', axis=1)
pt03 = pt02.div(pt02.max(axis=1), axis=0)

trace = go.Heatmap(
    z = pt03[pt03.index==2022].values,
    x = pt02[pt02.index==2022].columns,
    y = (pt02[pt02.index==2022].index).astype(str),
    hoverinfo = "none",
    text = pt02[pt02.index==2022].values,
    texttemplate="<span style='color:#153d4d; font-family:Helvetica'>%{text:.2f}</span>",
    textfont={"size":12},
    colorscale=colorscale01,
    showscale=False,
    colorbar_orientation='v',
    colorbar_tickvals = [np.min(np.nan_to_num(pt03.values,nan=0.0)),np.max(np.nan_to_num(pt03.values,nan=0.0))],
    colorbar_ticktext = ['min','max'],
    xgap = 1,
    ygap = 1
)

plt_title = "<span style='font-size:24px;font-weight:bold;font-family:Helvetica'>Distribution of responsible/ethical AI products</span>"
plt_caption = '''<span style='font-size:14px; font-family:Helvetica'>
Independent density normalization<br>
Note: Colorscale is based on row (Max value in row is blue and Min value in row is orange)
</span>'''

x_axis_title = "<span style='color:#153d4d;font-size:14px;font-family:Helvetica'>Tools</span>"
y_axis_title = "<span style='color:#153d4d;font-size:14px; font-family:Helvetica'>Survey year</span>"

layout = dict(
    title = plt_title + "<br><br>" + plt_caption,
    title_x=0.5,
    margin = dict(t=180, pad=5),
#     width = 800,
    height= 400,
    plot_bgcolor = "#fff",
    yaxis = dict(
        scaleanchor = 'x',
        title_text=y_axis_title,
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
        autorange='reversed',
#         constrain="domain"
        ),
    xaxis =  dict(
        title_text = x_axis_title,
        tickangle = 90,
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
        tickmode='linear',
        # range=[-1.5,11.5],
        constrain="domain",
        )
    )

fig = go.Figure(data = trace, layout = layout)

fig.show()

Insight
- 16 percent respondent mentioned they did not use ethical AI products

# Cloud

- Q31: Which of the following cloud computing platforms do you use?
- Q32: Of the cloud platforms that you are familiar with, which has the best developer experience?
- Q33: Do you use any of the following cloud computing products?
- Q35: Do you use any of the following data storage products?

## Q31: Which of the following cloud computing platforms do you use?

### Number of cloud computing platforms used

In [68]:
# facts of the dataset
facts = ['sid', 'Year', 'age', 'gender', 'country', 'role',]

# columns for the questions
cols = ['Q31_1', 'Q31_2', 'Q31_3', 'Q31_4',
       'Q31_5', 'Q31_6', 'Q31_7', 'Q31_8', 'Q31_9', 'Q31_10', 'Q31_11',
       'Q31_12','Q31_X']

tbl01 = ksm[ksm['question'].str.contains('Q31')].pivot_table(values='question',index=['sid','Year'],columns='answer',aggfunc='count',fill_value=0)

tbl01['pltf_count'] = tbl01.sum(axis=1) - (tbl01['No Answer'] * 2) - tbl01['None']
tbl01.reset_index(inplace=True)

# merging the age,country and role on sid and year
df = pd.merge(ks[facts],tbl01,how='inner',on=['sid','Year'])

pt01 = df.pivot_table(values='sid',index='Year',columns='pltf_count',aggfunc='count',fill_value=0)
pt01['total'] = pt01.sum(axis=1)
pt02 = pt01.div(pt01.total, axis=0).fillna(0)
pt01 = pt01.drop('total', axis=1)
pt02 = pt02.drop('total', axis=1)

pt03 = pt02.div(pt02.max(axis=1), axis=0)

# ploting the graph
trace = go.Heatmap(
    z = pt03.values,
    x = pt02.columns,
    y = pt02.index,
    hoverinfo = "none",
    text = pt02.values,
    texttemplate="<span style='color:#153d4d; font-family:Helvetica'>%{text:.2f}</span>",
    textfont={"size":12},
    colorscale=colorscale01,
    showscale=True,
    colorbar_orientation='v',
    colorbar_tickvals = [np.min(np.nan_to_num(pt03.values,nan=0.0)),np.max(np.nan_to_num(pt03.values,nan=0.0))],
    colorbar_ticktext = ['min','max'],
    xgap = 1,
    ygap = 1
)

plt_title = "<span style='font-size:24px;font-weight:bold;font-family:Helvetica'>Distribution of Number of cloud computing platform used</span>"
plt_caption = '''<span style='font-size:14px; font-family:Helvetica'>
Independent density normalization<br>
Note: Colorscale is based on row (Max value in row is blue and Min value in row is orange)
</span>'''

x_axis_title = "<span style='color:#153d4d;font-size:14px;font-family:Helvetica'>Number of cloud computing platform</span>"
y_axis_title = "<span style='color:#153d4d;font-size:14px; font-family:Helvetica'>Survey year</span>"

layout = dict(
    title = plt_title + "<br><br>" + plt_caption,
    title_x=0.5,
    margin = dict(t=180, pad=5),
#     width = 800,
    height= 500,
    plot_bgcolor = "#fff",
    yaxis = dict(
        scaleanchor = 'x',
        title_text=y_axis_title,
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
        autorange='reversed',
        constrain="domain"
        ),
    xaxis =  dict(
        title_text = x_axis_title,
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
        tickmode='linear',
        # range=[-1.5,11.5],
        constrain="domain",
        )
    )

fig = go.Figure(data = trace, layout = layout)

fig.show()

Insight
- 9 percent respondent said they use single cloud computing platform. The percent decreases as the number of platform increases.

### Which platforms do respondents mention together

In [69]:
cols = ['Alibaba Cloud',
       'Amazon Web Services (AWS)', 'Google Cloud Platform (GCP)',
       'Huawei Cloud', 'IBM Cloud / Red Hat', 'Microsoft Azure','Oracle Cloud', 'Other', 'SAP Cloud', 'Tencent Cloud',
       'VMware Cloud',]

df1 = pd.DataFrame(columns = ['Alibaba Cloud',
       'Amazon Web Services (AWS)', 'Google Cloud Platform (GCP)',
       'Huawei Cloud', 'IBM Cloud / Red Hat', 'Microsoft Azure','Oracle Cloud', 'Other', 'SAP Cloud', 'Tencent Cloud',
       'VMware Cloud','Year'])

for year in df['Year'].unique():
    m01 = df[df['Year']==year][cols]
    m02 = sp.csr_matrix(m01.astype(int).values) # convert dataframe to sparse matrix
    m02c = m02.T * m02 # multiply sparse matrix #
    m02c.setdiag(0) # reset diagonal
    m03 = pd.DataFrame(m02c.todense(), columns=m01.columns, index= m01.columns)
    m04 = m03.div(m01.shape[0])
    m04 = m04.multiply(100)
    occ = m04.copy()
    occ['Year'] = year
    frames = [df1,occ]
    df1 = pd.concat(frames)
    
df1['Year'] = df1['Year'].astype('object')

fig = go.Figure()

fig.add_trace(go.Heatmap(
    z = df1.iloc[:,:-1][df1['Year']==2022].values,
    x = df1.iloc[:,:-1].columns,
    y = df1.iloc[:,:-1][df1['Year']==2022].index,
    name = 'Selection1',
    hoverinfo = "none",
    text = df1.iloc[:,:-1][df1['Year']==2022].values,
    texttemplate="<span style='color:#153d4d; font-family:Helvetica'>%{text:.2f}</span>",
    textfont={"size":12},
    colorscale=colorscale01,
    showscale=False,
    colorbar_orientation='v',
    colorbar_tickvals = [np.min(np.nan_to_num(pt03.values,nan=0.0)),np.max(np.nan_to_num(pt03.values,nan=0.0))],
    colorbar_ticktext = ['min','max'],
    xgap = 1,
    ygap = 1
))

# Creating the buttons
updatemenu = []
buttons1 = []

for col in df1['Year'].unique():
    buttons1.append(dict(method='restyle',
                        label=col,
                        visible=True,
                        args=[{'y':[df1.iloc[:,:-1][df1['Year']==col].index],
                               'x':[df1.iloc[:,:-1].columns],
                               'z':[df1.iloc[:,:-1][df1['Year']==col].values],
                               'text':[df1.iloc[:,:-1][df1['Year']==col].values],
                               'type':'heatmap'}, [0]],
                        )
                  )

updatemenu = list([
    dict(
        buttons=buttons1,
        direction='down',
        name = 'Selection1',
        pad={'r':10,'t':10},
        showactive=True,
        font = dict(family = 'Helvetica',size=14,color='#153d4d'),
        bgcolor="#fff",
        x=0.53,
        xanchor='left',
        y=1.12,
        yanchor='top'),
])

fig.update_layout(
    updatemenus=updatemenu,
    height= 650,
    plot_bgcolor = "#fff",
    yaxis = dict(
        scaleanchor = 'x',
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
#         autorange='reversed',
        constrain="domain"
        ),
    xaxis =  dict(
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
        tickmode='linear',
        constrain="domain",
        )
)

fig.add_annotation(
        dict(text="<span style='color:#153d4d;font-size:16px;font-family:Helvetica'>Select Year:</span>",
        align="left",
        showarrow=False,
        x=0.46,y=1.08,xref="paper",yref="paper"),
)

fig.show()

Insight
- 4.4 percent people mentioned the Google Cloud Platform and AWS together.
- 2.59 percent people mentioned the GCP and Microsoft Azure together.

### Distribution of cloud computing platforms

In [70]:
pt01 = ksm[ksm['question'].str.contains('Q31')].pivot_table(values='question',index=['Year'],columns='answer',aggfunc='count',fill_value=0)
pt01['total'] = ks['Year'].value_counts()
pt02 = pt01.div(pt01.total, axis=0).fillna(0)
pt02 = pt02.drop('total', axis=1)
pt03 = pt02.div(pt02.max(axis=1), axis=0)

# ploting the graph
trace = go.Heatmap(
    z = pt03.values,
    x = pt02.columns,
    y = pt02.index,
    hoverinfo = "none",
    text = pt02.values,
    texttemplate="<span style='color:#153d4d; font-family:Helvetica'>%{text:.2f}</span>",
    textfont={"size":12},
    colorscale=colorscale01,
    showscale=True,
    colorbar_orientation='v',
    colorbar_tickvals = [np.min(np.nan_to_num(pt03.values,nan=0.0)),np.max(np.nan_to_num(pt03.values,nan=0.0))],
    colorbar_ticktext = ['min','max'],
    xgap = 1,
    ygap = 1
)

plt_title = "<span style='font-size:24px;font-weight:bold;font-family:Helvetica'>Distribution of cloud computing platform</span>"
plt_caption = '''<span style='font-size:14px; font-family:Helvetica'>
Independent density normalization<br>
Note: Colorscale is based on row (Max value in row is blue and Min value in row is orange)
</span>'''

x_axis_title = "<span style='color:#153d4d;font-size:14px;font-family:Helvetica'>Cloud computing platform</span>"
y_axis_title = "<span style='color:#153d4d;font-size:14px; font-family:Helvetica'>Survey year</span>"

layout = dict(
    title = plt_title + "<br><br>" + plt_caption,
    title_x=0.5,
    margin = dict(t=180, pad=5),
#     width = 800,
    height= 500,
    plot_bgcolor = "#fff",
    yaxis = dict(
        scaleanchor = 'x',
        title_text=y_axis_title,
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
        autorange='reversed',
        ),
    xaxis =  dict(
        title_text = x_axis_title,
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
        constrain="domain",
        )
    )

fig = go.Figure(data = trace, layout = layout)

fig.show()

Insight
- AWS is the most used cloud computing platform followed by GCP and Microsoft Azure.

## Q32: Of the cloud platforms that you are familiar with, which has the best developer experience?

In [71]:
pt01 = ksm[ksm['question'].str.contains('Q32')].pivot_table(values='question',index=['Year'],columns='answer',aggfunc='count',fill_value=0)

pt01['total'] = ks['Year'].value_counts()
pt02 = pt01.div(pt01.total, axis=0).fillna(0)
pt02 = pt02.drop('total', axis=1)
pt03 = pt02.div(pt02.max(axis=1), axis=0)

# ploting the graph
trace = go.Heatmap(
    z = pt03.values,
    x = pt02.columns,
    y = pt02.index,
    hoverinfo = "none",
    text = pt02.values,
    texttemplate="<span style='color:#153d4d; font-family:Helvetica'>%{text:.2f}</span>",
    textfont={"size":12},
    colorscale=colorscale01,
    showscale=True,
    colorbar_orientation='v',
    colorbar_tickvals = [np.min(np.nan_to_num(pt03.values,nan=0.0)),np.max(np.nan_to_num(pt03.values,nan=0.0))],
    colorbar_ticktext = ['min','max'],
    xgap = 1,
    ygap = 1
)

plt_title = "<span style='font-size:24px;font-weight:bold;font-family:Helvetica'>Distribution of cloud platform based on best developer experience</span>"
plt_caption = '''<span style='font-size:14px; font-family:Helvetica'>
Independent density normalization<br>
Note: Colorscale is based on row (Max value in row is blue and Min value in row is orange)
</span>'''

x_axis_title = "<span style='color:#153d4d;font-size:14px;font-family:Helvetica'>Cloud Platform</span>"
y_axis_title = "<span style='color:#153d4d;font-size:14px; font-family:Helvetica'>Survey year</span>"

layout = dict(
    title = plt_title + "<br><br>" + plt_caption,
    title_x=0.5,
    margin = dict(t=180, pad=5),
#     width = 800,
    height= 600,
    plot_bgcolor = "#fff",
    yaxis = dict(
        scaleanchor = 'x',
        title_text=y_axis_title,
        tickfont=dict(color='#153d4d',family='Helvetica',size=11),
        autorange='reversed',
        constrain="domain"
        ),
    xaxis =  dict(
        title_text = x_axis_title,
        tickfont=dict(color='#153d4d',family='Helvetica',size=11),
        constrain="domain",
        )
    )

fig = go.Figure(data = trace, layout = layout)

fig.show()

Insight
- 2 percent people mention AWS and 2 percent people mention GCP.
- 2 percent people said they all had a similar developer experience.

## Q33: Do you use any of the following cloud computing products?

### Number of cloud computing products used

In [72]:
# facts of the dataset
facts = ['sid', 'Year', 'age', 'gender', 'country', 'role',]

# columns for the questions
cols = ['Q33_1', 'Q33_2', 'Q33_3', 'Q33_4', 'Q33_5','Q33_X']

tbl01 = ksm[ksm['question'].str.contains('Q33')].pivot_table(values='question',index=['sid','Year'],columns='answer',aggfunc='count',fill_value=0)
# # renaming the columns
tbl01.rename(columns={'No / None':'None'},inplace=True)

tbl01['pltf_count'] = tbl01.sum(axis=1) - (tbl01['No Answer'] * 2) - (tbl01['None'])
tbl01.reset_index(inplace=True)

tbl01 = tbl01[tbl01['Year']==2022].reset_index(drop=True)

# merging the age,country and role on sid and year
df = pd.merge(ks[facts],tbl01,how='inner',on=['sid','Year'])

pt01 = df.pivot_table(values='sid',index='Year',columns='pltf_count',aggfunc='count',fill_value=0)
pt01['total'] = pt01.sum(axis=1)
pt02 = pt01.div(pt01.total, axis=0).fillna(0)
pt01 = pt01.drop('total', axis=1)
pt02 = pt02.drop('total', axis=1)

pt03 = pt02.div(pt02.max(axis=1), axis=0)

trace = go.Heatmap(
    z = pt03[pt03.index==2022].values,
    x = (pt02[pt02.index==2022].columns).astype(str),
    y = (pt02[pt02.index==2022].index).astype(str),
    hoverinfo = "none",
    text = pt02[pt02.index==2022].values,
    texttemplate="<span style='color:#153d4d; font-family:Helvetica'>%{text:.2f}</span>",
    textfont={"size":12},
    colorscale=colorscale01,
    showscale=False,
    colorbar_orientation='v',
    colorbar_tickvals = [np.min(np.nan_to_num(pt03.values,nan=0.0)),np.max(np.nan_to_num(pt03.values,nan=0.0))],
    colorbar_ticktext = ['min','max'],
    xgap = 1,
    ygap = 1
)

plt_title = "<span style='font-size:24px;font-weight:bold;font-family:Helvetica'>Distribution of number of cloud computing products</span>"
plt_caption = '''<span style='font-size:14px; font-family:Helvetica'>
Independent density normalization<br>
Note: Colorscale is based on row (Max value in row is blue and Min value in row is orange)
</span>'''

x_axis_title = "<span style='color:#153d4d;font-size:14px;font-family:Helvetica'>Number of cloud computing products</span>"
y_axis_title = "<span style='color:#153d4d;font-size:14px; font-family:Helvetica'>Survey year</span>"

layout = dict(
    title = plt_title + "<br><br>" + plt_caption,
    title_x=0.5,
    margin = dict(t=180, pad=5),
#     width = 800,
    height= 350,
    plot_bgcolor = "#fff",
    yaxis = dict(
        scaleanchor = 'x',
        title_text=y_axis_title,
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
        autorange='reversed',
#         constrain="domain"
        ),
    xaxis =  dict(
        title_text = x_axis_title,
#         tickangle = 90,
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
        tickmode='linear',
#         range=[-1.5,9.5],
        constrain="domain",
        )
    )

fig = go.Figure(data = trace, layout = layout)

fig.show()

Insight
- 8 percent people mentioned the use one cloud computing products.

### Which products do respondents mention together

In [73]:
# Occurance of the platforms
cols = ['Amazon Elastic Compute Cloud (EC2)', 'Google Cloud Compute Engine',
       'Microsoft Azure Virtual Machines','Other',]

m01 = df[df['Year']==2022][cols]

m02 = sp.csr_matrix(m01.astype(int).values) # convert dataframe to sparse matrix
m02c = m02.T * m02 # multiply sparse matrix #
m02c.setdiag(0) # reset diagonal
m03 = pd.DataFrame(m02c.todense(), columns=m01.columns, index= m01.columns)
m04 = m03.div(m01.shape[0])
m04 = m04.multiply(100)



trace = go.Heatmap(
    z = m04.values,
    x = m04.columns,
    y = m04.index,
    hoverinfo = "none",
    text = m04.values,
    texttemplate="<span style='color:#153d4d; font-family:Helvetica'>%{text:.2f}</span>",
    textfont={"size":10},
    colorscale=colorscale01,
    showscale=False,
    colorbar_orientation='v',
    colorbar_tickvals = [np.min(np.nan_to_num(pt03.values,nan=0.0)),np.max(np.nan_to_num(pt03.values,nan=0.0))],
    colorbar_ticktext = ['min','max'],
    xgap = 1,
    ygap = 1
)

plt_title = "<span style='font-size:24px;font-weight:bold;font-family:Helvetica'>Which products do respondents mention together</span>"
plt_caption = '''<span style='font-size:14px; font-family:Helvetica'>
Year 2022
</span>'''

x_axis_title = "<span style='color:#153d4d;font-size:14px;font-family:Helvetica'>Products</span>"
y_axis_title = "<span style='color:#153d4d;font-size:14px; font-family:Helvetica'>Products</span>"

layout = dict(
    title = plt_title + "<br><br>" + plt_caption,
    title_x=0.5,
    margin = dict(t=180, pad=5),
#     width = 800,
    height= 700,
    plot_bgcolor = "#fff",
    yaxis = dict(
        scaleanchor = 'x',
        title_text=y_axis_title,
        tickfont=dict(color='#153d4d',family='Helvetica',size=10),
        autorange='reversed',
#         constrain="domain"
        ),
    xaxis =  dict(
        title_text = x_axis_title,
        tickangle = 90,
        tickfont=dict(color='#153d4d',family='Helvetica',size=10),
        tickmode='linear',
#         range=[-1.5,9.5],
        constrain="domain",
        )
    )

fig = go.Figure(data = trace, layout = layout)

fig.show()

### Distribution of cloud computing products

In [74]:
pt01 = ksm[ksm['question'].str.contains('Q33')].pivot_table(values='question',index=['Year'],columns='answer',aggfunc='count',fill_value=0)

pt01['total'] = ks['Year'].value_counts()
pt02 = pt01.div(pt01.total, axis=0)
pt01 = pt01.drop('total', axis=1)
pt02 = pt02.drop('total', axis=1)
pt03 = pt02.div(pt02.max(axis=1), axis=0)

trace = go.Heatmap(
    z = pt03[pt03.index==2022].values,
    x = pt02[pt02.index==2022].columns,
    y = (pt02[pt02.index==2022].index).astype(str),
    hoverinfo = "none",
    text = pt02[pt02.index==2022].values,
    texttemplate="<span style='color:#153d4d; font-family:Helvetica'>%{text:.2f}</span>",
    textfont={"size":12},
    colorscale=colorscale01,
    showscale=False,
    colorbar_orientation='v',
    colorbar_tickvals = [np.min(np.nan_to_num(pt03.values,nan=0.0)),np.max(np.nan_to_num(pt03.values,nan=0.0))],
    colorbar_ticktext = ['min','max'],
    xgap = 1,
    ygap = 1
)

plt_title = "<span style='font-size:24px;font-weight:bold;font-family:Helvetica'>Distribution of cloud computing products</span>"
plt_caption = '''<span style='font-size:14px; font-family:Helvetica'>
Independent density normalization<br>
Note: Colorscale is based on row (Max value in row is blue and Min value in row is orange)
</span>'''

x_axis_title = "<span style='color:#153d4d;font-size:14px;font-family:Helvetica'>Products</span>"
y_axis_title = "<span style='color:#153d4d;font-size:14px; font-family:Helvetica'>Survey year</span>"

layout = dict(
    title = plt_title + "<br><br>" + plt_caption,
    title_x=0.5,
    margin = dict(t=180, pad=5),
#     width = 800,
    height= 400,
    plot_bgcolor = "#fff",
    yaxis = dict(
        scaleanchor = 'x',
        title_text=y_axis_title,
        tickfont=dict(color='#153d4d',family='Helvetica',size=10),
        autorange='reversed',
#         constrain="domain"
        ),
    xaxis =  dict(
        title_text = x_axis_title,
        tickangle = 90,
        tickfont=dict(color='#153d4d',family='Helvetica',size=10),
        tickmode='linear',
        # range=[-1.5,11.5],
        constrain="domain",
        )
    )

fig = go.Figure(data = trace, layout = layout)

fig.show()

Insight
- Amazon Elastic compute Cloud (EC2) is the most used cloud computing product

## Q34: Do you use any of the following data storage products?

### Number of data storage products used

In [75]:
# facts of the dataset
facts = ['sid', 'Year', 'age', 'gender', 'country', 'role']

# columns for the questions
cols = ['Q34_1', 'Q34_2', 'Q34_3', 'Q34_4', 'Q34_5', 'Q34_6', 'Q34_7',
       'Q34_8','Q34_X']

tbl01 = ksm[ksm['question'].str.contains('Q34')].pivot_table(values='question',index=['sid','Year'],columns='answer',aggfunc='count',fill_value=0)
# # renaming the columns
tbl01.rename(columns={'No / None':'None'},inplace=True)

tbl01['pltf_count'] = tbl01.sum(axis=1) - (tbl01['No Answer'] * 2) - (tbl01['None'])
tbl01.reset_index(inplace=True)

tbl01 = tbl01[tbl01['Year']==2022].reset_index(drop=True)

# merging the age,country and role on sid and year
df = pd.merge(ks[facts],tbl01,how='inner',on=['sid','Year'])

pt01 = df.pivot_table(values='sid',index='Year',columns='pltf_count',aggfunc='count',fill_value=0)
pt01['total'] = pt01.sum(axis=1)
pt02 = pt01.div(pt01.total, axis=0).fillna(0)
pt01 = pt01.drop('total', axis=1)
pt02 = pt02.drop('total', axis=1)

pt03 = pt02.div(pt02.max(axis=1), axis=0)

trace = go.Heatmap(
    z = pt03[pt03.index==2022].values,
    x = (pt02[pt02.index==2022].columns).astype(str),
    y = (pt02[pt02.index==2022].index).astype(str),
    hoverinfo = "none",
    text = pt02[pt02.index==2022].values,
    texttemplate="<span style='color:#153d4d; font-family:Helvetica'>%{text:.2f}</span>",
    textfont={"size":12},
    colorscale=colorscale01,
    showscale=False,
    colorbar_orientation='v',
    colorbar_tickvals = [np.min(np.nan_to_num(pt03.values,nan=0.0)),np.max(np.nan_to_num(pt03.values,nan=0.0))],
    colorbar_ticktext = ['min','max'],
    xgap = 1,
    ygap = 1
)

plt_title = "<span style='font-size:24px;font-weight:bold;font-family:Helvetica'>Distribution of number of data storage products used</span>"
plt_caption = '''<span style='font-size:14px; font-family:Helvetica'>
Independent density normalization<br>
Note: Colorscale is based on row (Max value in row is blue and Min value in row is orange)
</span>'''

x_axis_title = "<span style='color:#153d4d;font-size:14px;font-family:Helvetica'>Number of data storage products used</span>"
y_axis_title = "<span style='color:#153d4d;font-size:14px; font-family:Helvetica'>Survey year</span>"

layout = dict(
    title = plt_title + "<br><br>" + plt_caption,
    title_x=0.5,
    margin = dict(t=180, pad=5),
#     width = 800,
    height= 350,
    plot_bgcolor = "#fff",
    yaxis = dict(
        scaleanchor = 'x',
        title_text=y_axis_title,
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
        autorange='reversed',
#         constrain="domain"
        ),
    xaxis =  dict(
        title_text = x_axis_title,
#         tickangle = 90,
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
        tickmode='linear',
#         range=[-1.5,9.5],
        constrain="domain",
        )
    )

fig = go.Figure(data = trace, layout = layout)

fig.show()

### Which products do respondents mention together

In [76]:
# Occurance of the platforms
cols = ['Amazon Elastic File System (EFS)',
       'Amazon Simple Storage Service (S3)', 'Google Cloud Filestore',
       'Google Cloud Storage (GCS)', 'Microsoft Azure Blob Storage',
       'Microsoft Azure Files','Other',]

m01 = df[df['Year']==2022][cols]

m02 = sp.csr_matrix(m01.astype(int).values) # convert dataframe to sparse matrix
m02c = m02.T * m02 # multiply sparse matrix #
m02c.setdiag(0) # reset diagonal
m03 = pd.DataFrame(m02c.todense(), columns=m01.columns, index= m01.columns)
m04 = m03.div(m01.shape[0])
m04 = m04.multiply(100)



trace = go.Heatmap(
    z = m04.values,
    x = m04.columns,
    y = m04.index,
    hoverinfo = "none",
    text = m04.values,
    texttemplate="<span style='color:#153d4d; font-family:Helvetica'>%{text:.2f}</span>",
    textfont={"size":10},
    colorscale=colorscale01,
    showscale=False,
    colorbar_orientation='v',
    colorbar_tickvals = [np.min(np.nan_to_num(pt03.values,nan=0.0)),np.max(np.nan_to_num(pt03.values,nan=0.0))],
    colorbar_ticktext = ['min','max'],
    xgap = 1,
    ygap = 1
)

plt_title = "<span style='font-size:24px;font-weight:bold;font-family:Helvetica'>Which data storage products respondents mention together </span>"
plt_caption = '''<span style='font-size:14px; font-family:Helvetica'>
Year 2022
</span>'''

x_axis_title = "<span style='color:#153d4d;font-size:14px;font-family:Helvetica'>Products</span>"
y_axis_title = "<span style='color:#153d4d;font-size:14px; font-family:Helvetica'>Products</span>"

layout = dict(
    title = plt_title + "<br><br>" + plt_caption,
    title_x=0.5,
    margin = dict(t=180, pad=5),
#     width = 800,
    height= 700,
    plot_bgcolor = "#fff",
    yaxis = dict(
        scaleanchor = 'x',
        title_text=y_axis_title,
        tickfont=dict(color='#153d4d',family='Helvetica',size=10),
        autorange='reversed',
#         constrain="domain"
        ),
    xaxis =  dict(
        title_text = x_axis_title,
        tickangle = 90,
        tickfont=dict(color='#153d4d',family='Helvetica',size=10),
        tickmode='linear',
#         range=[-1.5,9.5],
        constrain="domain",
        )
    )

fig = go.Figure(data = trace, layout = layout)

fig.show()

Insight
- 2.02 people mentioned Amazon Simple storage service and Google cloud storage together.

### Distribution of data storage products

In [77]:
pt01 = ksm[ksm['question'].str.contains('Q34')].pivot_table(values='question',index=['Year'],columns='answer',aggfunc='count',fill_value=0)

pt01['total'] = ks['Year'].value_counts()
pt02 = pt01.div(pt01.total, axis=0)
pt01 = pt01.drop('total', axis=1)
pt02 = pt02.drop('total', axis=1)
pt03 = pt02.div(pt02.max(axis=1), axis=0)

trace = go.Heatmap(
    z = pt03[pt03.index==2022].values,
    x = pt02[pt02.index==2022].columns,
    y = (pt02[pt02.index==2022].index).astype(str),
    hoverinfo = "none",
    text = pt02[pt02.index==2022].values,
    texttemplate="<span style='color:#153d4d; font-family:Helvetica'>%{text:.2f}</span>",
    textfont={"size":12},
    colorscale=colorscale01,
    showscale=False,
    colorbar_orientation='v',
    colorbar_tickvals = [np.min(np.nan_to_num(pt03.values,nan=0.0)),np.max(np.nan_to_num(pt03.values,nan=0.0))],
    colorbar_ticktext = ['min','max'],
    xgap = 1,
    ygap = 1
)

plt_title = "<span style='font-size:24px;font-weight:bold;font-family:Helvetica'>Distribution of data storage products</span>"
plt_caption = '''<span style='font-size:14px; font-family:Helvetica'>
Independent density normalization<br>
Note: Colorscale is based on row (Max value in row is blue and Min value in row is orange)
</span>'''

x_axis_title = "<span style='color:#153d4d;font-size:14px;font-family:Helvetica'>Products</span>"
y_axis_title = "<span style='color:#153d4d;font-size:14px; font-family:Helvetica'>Survey year</span>"

layout = dict(
    title = plt_title + "<br><br>" + plt_caption,
    title_x=0.5,
    margin = dict(t=180, pad=5),
#     width = 800,
    height= 400,
    plot_bgcolor = "#fff",
    yaxis = dict(
        scaleanchor = 'x',
        title_text=y_axis_title,
        tickfont=dict(color='#153d4d',family='Helvetica',size=10),
        autorange='reversed',
#         constrain="domain"
        ),
    xaxis =  dict(
        title_text = x_axis_title,
        tickangle = 90,
        tickfont=dict(color='#153d4d',family='Helvetica',size=10),
        tickmode='linear',
        # range=[-1.5,11.5],
        constrain="domain",
        )
    )

fig = go.Figure(data = trace, layout = layout)

fig.show()

# Hardware

## Q43: Approximately how many times have you used a TPU (tensor processing unit)?


In [78]:
pt01 = ksm[ksm['question'].str.contains('Q43')].pivot_table(values='question',index=['Year'],columns='answer',aggfunc='count',fill_value=0)
pt01['total'] = pt01.sum(axis=1)
pt02 = pt01.div(pt01.total, axis=0).fillna(0)
pt02 = pt02.drop('total', axis=1)
pt03 = pt02.div(pt02.max(axis=1), axis=0)

# ploting the graph
trace = go.Heatmap(
    z = pt03.values,
    x = pt02.columns,
    y = pt02.index,
    hoverinfo = "none",
    text = pt02.values,
    texttemplate="<span style='color:#153d4d; font-family:Helvetica'>%{text:.2f}</span>",
    textfont={"size":12},
    colorscale=colorscale01,
    showscale=True,
    colorbar_orientation='v',
    colorbar_tickvals = [np.min(np.nan_to_num(pt03.values,nan=0.0)),np.max(np.nan_to_num(pt03.values,nan=0.0))],
    colorbar_ticktext = ['min','max'],
    xgap = 1,
    ygap = 1
)

plt_title = "<span style='font-size:24px;font-weight:bold;font-family:Helvetica'>Distribution of TPU usage</span>"
plt_caption = '''<span style='font-size:14px; font-family:Helvetica'>
Independent density normalization<br>
Note: Colorscale is based on row (Max value in row is blue and Min value in row is orange)
</span>'''

x_axis_title = "<span style='color:#153d4d;font-size:14px;font-family:Helvetica'>TPU usage</span>"
y_axis_title = "<span style='color:#153d4d;font-size:14px; font-family:Helvetica'>Survey year</span>"

layout = dict(
    title = plt_title + "<br><br>" + plt_caption,
    title_x=0.5,
    margin = dict(t=180, pad=5),
#     width = 800,
    height= 500,
    plot_bgcolor = "#fff",
    yaxis = dict(
        scaleanchor = 'x',
        title_text=y_axis_title,
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
        autorange='reversed',
        constrain="domain"
        ),
    xaxis =  dict(
        title_text = x_axis_title,
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
#         tickmode='linear',
        # range=[-1.5,11.5],
        constrain="domain")
    )

fig = go.Figure(data = trace, layout = layout)

fig.show()

## Q42: Do you use any of the following types of specialized hardware when training machine learning models?

### Number of specialized hardware used for training models

In [79]:
# facts of the dataset
facts = ['sid', 'Year', 'age', 'gender', 'country', 'role']

# columns for the questions
cols = ['Q42_1', 'Q42_2',
       'Q42_3', 'Q42_4', 'Q42_5', 'Q42_6', 'Q42_7', 'Q42_8', 'Q42_9','Q42_X']

tbl01 = ksm[ksm['question'].str.contains('Q42')].pivot_table(values='question',index=['sid','Year'],columns='answer',aggfunc='count',fill_value=0)
# # renaming the columns
# tbl01.rename(columns={'No, I do not download pre-trained model weights on a regular basis':'None',
#                      'Other storage services (i.e. google drive)':'Other'},inplace=True)

tbl01['pltf_count'] = tbl01.sum(axis=1) - (tbl01['No Answer'] * 2) - (tbl01['None'])
tbl01.reset_index(inplace=True)

tbl01 = tbl01[tbl01['Year']==2022].reset_index(drop=True)

# merging the age,country and role on sid and year
df = pd.merge(ks[facts],tbl01,how='inner',on=['sid','Year'])

pt01 = df.pivot_table(values='sid',index='Year',columns='pltf_count',aggfunc='count',fill_value=0)
pt01['total'] = pt01.sum(axis=1)
pt02 = pt01.div(pt01.total, axis=0).fillna(0)
pt01 = pt01.drop('total', axis=1)
pt02 = pt02.drop('total', axis=1)

pt03 = pt02.div(pt02.max(axis=1), axis=0)

trace = go.Heatmap(
    z = pt03[pt03.index==2022].values,
    x = (pt02[pt02.index==2022].columns).astype(str),
    y = (pt02[pt02.index==2022].index).astype(str),
    hoverinfo = "none",
    text = pt02[pt02.index==2022].values,
    texttemplate="<span style='color:#153d4d; font-family:Helvetica'>%{text:.2f}</span>",
    textfont={"size":12},
    colorscale=colorscale01,
    showscale=False,
    colorbar_orientation='v',
    colorbar_tickvals = [np.min(np.nan_to_num(pt03.values,nan=0.0)),np.max(np.nan_to_num(pt03.values,nan=0.0))],
    colorbar_ticktext = ['min','max'],
    xgap = 1,
    ygap = 1
)

plt_title = "<span style='font-size:24px;font-weight:bold;font-family:Helvetica'>Distribution of number of specialized hardware used</span>"
plt_caption = '''<span style='font-size:14px; font-family:Helvetica'>
Independent density normalization<br>
Note: Colorscale is based on row (Max value in row is blue and Min value in row is orange)
</span>'''

x_axis_title = "<span style='color:#153d4d;font-size:14px;font-family:Helvetica'>Number of specialized hardware used</span>"
y_axis_title = "<span style='color:#153d4d;font-size:14px; font-family:Helvetica'>Survey year</span>"

layout = dict(
    title = plt_title + "<br><br>" + plt_caption,
    title_x=0.5,
    margin = dict(t=180, pad=5),
#     width = 800,
    height= 350,
    plot_bgcolor = "#fff",
    yaxis = dict(
        scaleanchor = 'x',
        title_text=y_axis_title,
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
        autorange='reversed',
#         constrain="domain"
        ),
    xaxis =  dict(
        title_text = x_axis_title,
#         tickangle = 90,
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
        tickmode='linear',
#         range=[-1.5,9.5],
        constrain="domain",
        )
    )

fig = go.Figure(data = trace, layout = layout)

fig.show()

### Which hardware do respondents mention together

In [80]:
# Occurance of the platforms
cols = ['GPUs', 'IPUs',
       'Inferentia Chips','Other', 'RDUs', 'TPUs',
       'Trainium Chips', 'WSEs',]

m01 = df[df['Year']==2022][cols]

m02 = sp.csr_matrix(m01.astype(int).values) # convert dataframe to sparse matrix
m02c = m02.T * m02 # multiply sparse matrix #
m02c.setdiag(0) # reset diagonal
m03 = pd.DataFrame(m02c.todense(), columns=m01.columns, index= m01.columns)
m04 = m03.div(m01.shape[0])
m04 = m04.multiply(100)



trace = go.Heatmap(
    z = m04.values,
    x = m04.columns,
    y = m04.index,
    hoverinfo = "none",
    text = m04.values,
    texttemplate="<span style='color:#153d4d; font-family:Helvetica'>%{text:.2f}</span>",
    textfont={"size":10},
    colorscale=colorscale01,
    showscale=False,
    colorbar_orientation='v',
    colorbar_tickvals = [np.min(np.nan_to_num(pt03.values,nan=0.0)),np.max(np.nan_to_num(pt03.values,nan=0.0))],
    colorbar_ticktext = ['min','max'],
    xgap = 1,
    ygap = 1
)

plt_title = "<span style='font-size:24px;font-weight:bold;font-family:Helvetica'>Which hardwares do respondents mention together </span>"
plt_caption = '''<span style='font-size:14px; font-family:Helvetica'>
Year 2022
</span>'''

x_axis_title = "<span style='color:#153d4d;font-size:14px;font-family:Helvetica'>Products</span>"
y_axis_title = "<span style='color:#153d4d;font-size:14px; font-family:Helvetica'>Products</span>"

layout = dict(
    title = plt_title + "<br><br>" + plt_caption,
    title_x=0.5,
    margin = dict(t=180, pad=5),
#     width = 800,
    height= 700,
    plot_bgcolor = "#fff",
    yaxis = dict(
        scaleanchor = 'x',
        title_text=y_axis_title,
        tickfont=dict(color='#153d4d',family='Helvetica',size=10),
        autorange='reversed',
#         constrain="domain"
        ),
    xaxis =  dict(
        title_text = x_axis_title,
        tickangle = 90,
        tickfont=dict(color='#153d4d',family='Helvetica',size=10),
        tickmode='linear',
#         range=[-1.5,9.5],
        constrain="domain",
        )
    )

fig = go.Figure(data = trace, layout = layout)

fig.show()

### Distribution of specialized hardware used

In [81]:
pt01 = ksm[ksm['question'].str.contains('Q42')].pivot_table(values='question',index=['Year'],columns='answer',aggfunc='count',fill_value=0)

pt01['total'] = ks['Year'].value_counts()
pt02 = pt01.div(pt01.total, axis=0)
pt01 = pt01.drop('total', axis=1)
pt02 = pt02.drop('total', axis=1)
pt03 = pt02.div(pt02.max(axis=1), axis=0)

trace = go.Heatmap(
    z = pt03[pt03.index==2022].values,
    x = pt02[pt02.index==2022].columns,
    y = (pt02[pt02.index==2022].index).astype(str),
    hoverinfo = "none",
    text = pt02[pt02.index==2022].values,
    texttemplate="<span style='color:#153d4d; font-family:Helvetica'>%{text:.2f}</span>",
    textfont={"size":12},
    colorscale=colorscale01,
    showscale=False,
    colorbar_orientation='v',
    colorbar_tickvals = [np.min(np.nan_to_num(pt03.values,nan=0.0)),np.max(np.nan_to_num(pt03.values,nan=0.0))],
    colorbar_ticktext = ['min','max'],
    xgap = 1,
    ygap = 1
)

plt_title = "<span style='font-size:24px;font-weight:bold;font-family:Helvetica'>Distribution of specialized hardware</span>"
plt_caption = '''<span style='font-size:14px; font-family:Helvetica'>
Independent density normalization<br>
Note: Colorscale is based on row (Max value in row is blue and Min value in row is orange)
</span>'''

x_axis_title = "<span style='color:#153d4d;font-size:14px;font-family:Helvetica'>Specialized hardware</span>"
y_axis_title = "<span style='color:#153d4d;font-size:14px; font-family:Helvetica'>Survey year</span>"

layout = dict(
    title = plt_title + "<br><br>" + plt_caption,
    title_x=0.5,
    margin = dict(t=180, pad=5),
#     width = 800,
    height= 350,
    plot_bgcolor = "#fff",
    yaxis = dict(
        scaleanchor = 'x',
        title_text=y_axis_title,
        tickfont=dict(color='#153d4d',family='Helvetica',size=10),
        autorange='reversed',
#         constrain="domain"
        ),
    xaxis =  dict(
        title_text = x_axis_title,
        tickangle = 90,
        tickfont=dict(color='#153d4d',family='Helvetica',size=10),
        tickmode='linear',
        # range=[-1.5,11.5],
        constrain="domain",
        )
    )

fig = go.Figure(data = trace, layout = layout)

fig.show()

# Professional

- Q8: What is the highest level of formal education that you have attained or plan to attain within the next 2 years?
- Q9: Have you ever published any academic research (papers, preprints, conference proceedings, etc)?
- Q10: Did your research make use of machine learning?
- Q24: In what industry is your current employer/contract (or your most recent employer if retired)?
- Q25: What is the size of the company where you are employed?
- Q26: Approximately how many individuals are responsible for data science workloads at your place of business?
- Q27: Does your current employer incorporate machine learning methods into their business?
- Q28: Select any activities that make up an important part of your role at work: 

## Q8: What is the highest level of formal education that you have attained or plan to attain within the next 2 years?

In [82]:
pt01 = ksm[ksm['question'].str.contains('Q8')].pivot_table(values='question',index=['Year'],columns='answer',aggfunc='count',fill_value=0)
pt01['total'] = pt01.sum(axis=1)
pt02 = pt01.div(pt01.total, axis=0).fillna(0)
pt02 = pt02.drop('total', axis=1)
pt03 = pt02.div(pt02.max(axis=1), axis=0)

# ploting the graph
trace = go.Heatmap(
    z = pt03.values,
    x = pt02.columns,
    y = pt02.index,
    hoverinfo = "none",
    text = pt02.values,
    texttemplate="<span style='color:#153d4d; font-family:Helvetica'>%{text:.2f}</span>",
    textfont={"size":12},
    colorscale=colorscale01,
    showscale=True,
    colorbar_orientation='v',
    colorbar_tickvals = [np.min(np.nan_to_num(pt03.values,nan=0.0)),np.max(np.nan_to_num(pt03.values,nan=0.0))],
    colorbar_ticktext = ['min','max'],
    xgap = 1,
    ygap = 1
)

plt_title = "<span style='font-size:24px;font-weight:bold;font-family:Helvetica'>Distribution of respondents education</span>"
plt_caption = '''<span style='font-size:14px; font-family:Helvetica'>
Independent density normalization<br>
Note: Colorscale is based on row (Max value in row is blue and Min value in row is orange)
</span>'''

x_axis_title = "<span style='color:#153d4d;font-size:14px;font-family:Helvetica'>Education</span>"
y_axis_title = "<span style='color:#153d4d;font-size:14px; font-family:Helvetica'>Survey year</span>"

layout = dict(
    title = plt_title + "<br><br>" + plt_caption,
    title_x=0.5,
    margin = dict(t=180, pad=5),
#     width = 800,
    height= 600,
    plot_bgcolor = "#fff",
    yaxis = dict(
        scaleanchor = 'x',
        title_text=y_axis_title,
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
        autorange='reversed',
        constrain="domain"
        ),
    xaxis =  dict(
        title_text = x_axis_title,
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
#         tickmode='linear',
        # range=[-1.5,11.5],
        constrain="domain",)
    )

fig = go.Figure(data = trace, layout = layout)

fig.show()

## Q24: In what industry is your current employer/contract (or your most recent employer if retired)?

In [83]:
pt01 = ksm[ksm['question'].str.contains('Q24')].pivot_table(values='question',index=['Year'],columns='answer',aggfunc='count',fill_value=0)
pt01['total'] = pt01.sum(axis=1)
pt02 = pt01.div(pt01.total, axis=0).fillna(0)
pt02 = pt02.drop('total', axis=1)
pt03 = pt02.div(pt02.max(axis=1), axis=0)

# ploting the graph
trace = go.Heatmap(
    z = pt03.values,
    x = pt02.columns,
    y = pt02.index,
    hoverinfo = "none",
    text = pt02.values,
    texttemplate="<span style='color:#153d4d; font-family:Helvetica'>%{text:.2f}</span>",
    textfont={"size":12},
    colorscale=colorscale01,
    showscale=True,
    colorbar_orientation='v',
    colorbar_tickvals = [np.min(np.nan_to_num(pt03.values,nan=0.0)),np.max(np.nan_to_num(pt03.values,nan=0.0))],
    colorbar_ticktext = ['min','max'],
    xgap = 1,
    ygap = 1
)

plt_title = "<span style='font-size:24px;font-weight:bold;font-family:Helvetica'>Distribution of respondents Industry</span>"
plt_caption = '''<span style='font-size:14px; font-family:Helvetica'>
Independent density normalization<br>
Note: Colorscale is based on row (Max value in row is blue and Min value in row is orange)
</span>'''

x_axis_title = "<span style='color:#153d4d;font-size:14px;font-family:Helvetica'>Industry</span>"
y_axis_title = "<span style='color:#153d4d;font-size:14px; font-family:Helvetica'>Survey year</span>"

layout = dict(
    title = plt_title + "<br><br>" + plt_caption,
    title_x=0.5,
    margin = dict(t=180, pad=5),
#     width = 800,
    height= 600,
    plot_bgcolor = "#fff",
    yaxis = dict(
        scaleanchor = 'x',
        title_text=y_axis_title,
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
        autorange='reversed',
        constrain="domain"
        ),
    xaxis =  dict(
        title_text = x_axis_title,
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
#         tickmode='linear',
        # range=[-1.5,11.5],
        constrain="domain",)
    )

fig = go.Figure(data = trace, layout = layout)

fig.show()

## Q25: What is the size of the company where you are employed?

In [84]:
pt01 = ksm[ksm['question'].str.contains('Q25')].pivot_table(values='question',index=['Year'],columns='answer',aggfunc='count',fill_value=0)
pt01['total'] = pt01.sum(axis=1)
pt02 = pt01.div(pt01.total, axis=0).fillna(0)
pt02 = pt02.drop('total', axis=1)
pt03 = pt02.div(pt02.max(axis=1), axis=0)

# ploting the graph
trace = go.Heatmap(
    z = pt03.values,
    x = pt02.columns,
    y = pt02.index,
    hoverinfo = "none",
    text = pt02.values,
    texttemplate="<span style='color:#153d4d; font-family:Helvetica'>%{text:.2f}</span>",
    textfont={"size":12},
    colorscale=colorscale01,
    showscale=True,
    colorbar_orientation='v',
    colorbar_tickvals = [np.min(np.nan_to_num(pt03.values,nan=0.0)),np.max(np.nan_to_num(pt03.values,nan=0.0))],
    colorbar_ticktext = ['min','max'],
    xgap = 1,
    ygap = 1
)

plt_title = "<span style='font-size:24px;font-weight:bold;font-family:Helvetica'>Distribution of respondents company size</span>"
plt_caption = '''<span style='font-size:14px; font-family:Helvetica'>
Independent density normalization<br>
Note: Colorscale is based on row (Max value in row is blue and Min value in row is orange)
</span>'''

x_axis_title = "<span style='color:#153d4d;font-size:14px;font-family:Helvetica'>company size</span>"
y_axis_title = "<span style='color:#153d4d;font-size:14px; font-family:Helvetica'>Survey year</span>"

layout = dict(
    title = plt_title + "<br><br>" + plt_caption,
    title_x=0.5,
    margin = dict(t=180, pad=5),
#     width = 800,
    height= 600,
    plot_bgcolor = "#fff",
    yaxis = dict(
        scaleanchor = 'x',
        title_text=y_axis_title,
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
        autorange='reversed',
        constrain="domain"
        ),
    xaxis =  dict(
        title_text = x_axis_title,
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
#         tickmode='linear',
        # range=[-1.5,11.5],
        constrain="domain",)
    )

fig = go.Figure(data = trace, layout = layout)

fig.show()

## Q26: Approximately how many individuals are responsible for data science workloads at your place of business?

In [85]:
pt01 = ksm[ksm['question'].str.contains('Q26')].pivot_table(values='question',index=['Year'],columns='answer',aggfunc='count',fill_value=0)
pt01['total'] = pt01.sum(axis=1)
pt02 = pt01.div(pt01.total, axis=0).fillna(0)
pt02 = pt02.drop('total', axis=1)
pt03 = pt02.div(pt02.max(axis=1), axis=0)

# ploting the graph
trace = go.Heatmap(
    z = pt03.values,
    x = pt02.columns,
    y = pt02.index,
    hoverinfo = "none",
    text = pt02.values,
    texttemplate="<span style='color:#153d4d; font-family:Helvetica'>%{text:.2f}</span>",
    textfont={"size":12},
    colorscale=colorscale01,
    showscale=True,
    colorbar_orientation='v',
    colorbar_tickvals = [np.min(np.nan_to_num(pt03.values,nan=0.0)),np.max(np.nan_to_num(pt03.values,nan=0.0))],
    colorbar_ticktext = ['min','max'],
    xgap = 1,
    ygap = 1
)

plt_title = "<span style='font-size:24px;font-weight:bold;font-family:Helvetica'>Distribution of respondents education</span>"
plt_caption = '''<span style='font-size:14px; font-family:Helvetica'>
Independent density normalization<br>
Note: Colorscale is based on row (Max value in row is blue and Min value in row is orange)
</span>'''

x_axis_title = "<span style='color:#153d4d;font-size:14px;font-family:Helvetica'>Education</span>"
y_axis_title = "<span style='color:#153d4d;font-size:14px; font-family:Helvetica'>Survey year</span>"

layout = dict(
    title = plt_title + "<br><br>" + plt_caption,
    title_x=0.5,
    margin = dict(t=180, pad=5),
#     width = 800,
    height= 600,
    plot_bgcolor = "#fff",
    yaxis = dict(
        scaleanchor = 'x',
        title_text=y_axis_title,
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
        autorange='reversed',
        constrain="domain"
        ),
    xaxis =  dict(
        title_text = x_axis_title,
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
#         tickmode='linear',
        # range=[-1.5,11.5],
        constrain="domain",)
    )

fig = go.Figure(data = trace, layout = layout)

fig.show()

## Q28: Select any activities that make up an important part of your role at work:

### Number of activities that make an important part of the work

- List of activities
    1. 'Analyze and understand data to influence product or business decisions',
    2. 'Build and/or run a machine learning service that operationally improves my product or workflows',
    3. 'Build and/or run the data infrastructure that my business uses for storing, analyzing, and operationalizing data',
    4. 'Build prototypes to explore applying machine learning to new areas',
    5. 'Do research that advances the state of the art of machine learning',
    6. 'Experimentation and iteration to improve existing ML models',
    7. 'No Answer',
    8. 'None of these activities are an important part of my role at work',
    9. 'Other'

In [86]:
# facts of the dataset
facts = ['sid', 'Year', 'age', 'gender', 'country', 'role',]

# columns for the questions
cols = ['Q28_1', 'Q28_2', 'Q28_3', 'Q28_4', 'Q28_5', 'Q28_6',
       'Q28_7', 'Q28_8','Q28_X']

tbl01 = ksm[ksm['question'].str.contains('Q28')].pivot_table(values='question',index=['sid','Year'],columns='answer',aggfunc='count',fill_value=0)

tbl01.rename(columns={
    'Analyze and understand data to influence product or business decisions':'1',
    'Build and/or run a machine learning service that operationally improves my product or workflows':'2',
    'Build and/or run the data infrastructure that my business uses for storing, analyzing, and operationalizing data':'3',
    'Build prototypes to explore applying machine learning to new areas':'4',
    'Do research that advances the state of the art of machine learning':'5',
    'Experimentation and iteration to improve existing ML models':'6',
    'No Answer':'7',
    'None of these activities are an important part of my role at work':'8',
    'Other':'9'},inplace=True)

tbl01['pltf_count'] = tbl01.sum(axis=1) - (tbl01["7"] * 2) - tbl01["8"]
tbl01.reset_index(inplace=True)

# merging the age,country and role on sid and year
df = pd.merge(ks[facts],tbl01,how='inner',on=['sid','Year'])

pt01 = df.pivot_table(values='sid',index='Year',columns='pltf_count',aggfunc='count',fill_value=0)
pt01['total'] = pt01.sum(axis=1)
pt02 = pt01.div(pt01.total, axis=0).fillna(0)
pt01 = pt01.drop('total', axis=1)
pt02 = pt02.drop('total', axis=1)

pt03 = pt02.div(pt02.max(axis=1), axis=0)

# ploting the graph
trace = go.Heatmap(
    z = pt03.values,
    x = pt02.columns,
    y = pt02.index,
    hoverinfo = "none",
    text = pt02.values,
    texttemplate="<span style='color:#153d4d; font-family:Helvetica'>%{text:.2f}</span>",
    textfont={"size":12},
    colorscale=colorscale01,
    showscale=True,
    colorbar_orientation='v',
    colorbar_tickvals = [np.min(np.nan_to_num(pt03.values,nan=0.0)),np.max(np.nan_to_num(pt03.values,nan=0.0))],
    colorbar_ticktext = ['min','max'],
    xgap = 1,
    ygap = 1
)

plt_title = "<span style='font-size:24px;font-weight:bold;font-family:Helvetica'>Distribution of Number of activities</span>"
plt_caption = '''<span style='font-size:14px; font-family:Helvetica'>
Independent density normalization<br>
Note: Colorscale is based on row (Max value in row is blue and Min value in row is orange)
</span>'''

x_axis_title = "<span style='color:#153d4d;font-size:14px;font-family:Helvetica'>Number of activities</span>"
y_axis_title = "<span style='color:#153d4d;font-size:14px; font-family:Helvetica'>Survey year</span>"

layout = dict(
    title = plt_title + "<br><br>" + plt_caption,
    title_x=0.5,
    margin = dict(t=180, pad=5),
#     width = 800,
    height= 500,
    plot_bgcolor = "#fff",
    yaxis = dict(
        scaleanchor = 'x',
        title_text=y_axis_title,
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
        autorange='reversed',
        constrain="domain"
        ),
    xaxis =  dict(
        title_text = x_axis_title,
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
        tickmode='linear',
        # range=[-1.5,11.5],
        constrain="domain",
        )
    )

fig = go.Figure(data = trace, layout = layout)

fig.show()

### Which activities do respondents mention together

In [87]:
cols = ['1', '2', '3', '4','5', '6', '7', '8', '9']

df1 = pd.DataFrame(columns = ['1', '2', '3', '4',
       '5', '6', '7', '8', '9','Year'])

for year in df['Year'].unique():
    m01 = df[df['Year']==year][cols]
    m02 = sp.csr_matrix(m01.astype(int).values) # convert dataframe to sparse matrix
    m02c = m02.T * m02 # multiply sparse matrix #
    m02c.setdiag(0) # reset diagonal
    m03 = pd.DataFrame(m02c.todense(), columns=m01.columns, index= m01.columns)
    m04 = m03.div(m01.shape[0])
    m04 = m04.multiply(100)
    occ = m04.copy()
    occ['Year'] = year
    frames = [df1,occ]
    df1 = pd.concat(frames)
    
df1['Year'] = df1['Year'].astype('object')

fig = go.Figure()

fig.add_trace(go.Heatmap(
    z = df1.iloc[:,:-1][df1['Year']==2022].values,
    x = df1.iloc[:,:-1].columns,
    y = df1.iloc[:,:-1][df1['Year']==2022].index,
    name = 'Selection1',
    hoverinfo = "none",
    text = df1.iloc[:,:-1][df1['Year']==2022].values,
    texttemplate="<span style='color:#153d4d; font-family:Helvetica'>%{text:.2f}</span>",
    textfont={"size":12},
    colorscale=colorscale01,
    showscale=False,
    colorbar_orientation='v',
    colorbar_tickvals = [np.min(np.nan_to_num(pt03.values,nan=0.0)),np.max(np.nan_to_num(pt03.values,nan=0.0))],
    colorbar_ticktext = ['min','max'],
    xgap = 1,
    ygap = 1
))

# Creating the buttons
updatemenu = []
buttons1 = []

for col in df1['Year'].unique():
    buttons1.append(dict(method='restyle',
                        label=col,
                        visible=True,
                        args=[{'y':[df1.iloc[:,:-1][df1['Year']==col].index],
                               'x':[df1.iloc[:,:-1].columns],
                               'z':[df1.iloc[:,:-1][df1['Year']==col].values],
                               'text':[df1.iloc[:,:-1][df1['Year']==col].values],
                               'type':'heatmap'}, [0]],
                        )
                  )

updatemenu = list([
    dict(
        buttons=buttons1,
        direction='down',
        name = 'Selection1',
        pad={'r':10,'t':10},
        showactive=True,
        font = dict(family = 'Helvetica',size=14,color='#153d4d'),
        bgcolor="#fff",
        x=0.53,
        xanchor='left',
        y=1.12,
        yanchor='top'),
])

fig.update_layout(
    updatemenus=updatemenu,
    height= 650,
    plot_bgcolor = "#fff",
    yaxis = dict(
        scaleanchor = 'x',
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
#         autorange='reversed',
        constrain="domain"
        ),
    xaxis =  dict(
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
        tickmode='linear',
        constrain="domain",
        )
)

fig.add_annotation(
        dict(text="<span style='color:#153d4d;font-size:16px;font-family:Helvetica'>Select Year:</span>",
        align="left",
        showarrow=False,
        x=0.46,y=1.08,xref="paper",yref="paper"),
)

fig.show()

### Distribution of the activities at work

In [88]:
pt01 = ksm[ksm['question'].str.contains('Q28')].pivot_table(values='question',index=['Year'],columns='answer',aggfunc='count',fill_value=0)

pt01.rename(columns={
    'Analyze and understand data to influence product or business decisions':'1',
    'Build and/or run a machine learning service that operationally improves my product or workflows':'2',
    'Build and/or run the data infrastructure that my business uses for storing, analyzing, and operationalizing data':'3',
    'Build prototypes to explore applying machine learning to new areas':'4',
    'Do research that advances the state of the art of machine learning':'5',
    'Experimentation and iteration to improve existing ML models':'6',
    'No Answer':'7',
    'None of these activities are an important part of my role at work':'8',
    'Other':'9'},inplace=True)

pt01['total'] = ks['Year'].value_counts()
pt02 = pt01.div(pt01.total, axis=0).fillna(0)
pt02 = pt02.drop('total', axis=1)
pt03 = pt02.div(pt02.max(axis=1), axis=0)

# ploting the graph
trace = go.Heatmap(
    z = pt03.values,
    x = pt02.columns,
    y = pt02.index,
    hoverinfo = "none",
    text = pt02.values,
    texttemplate="<span style='color:#153d4d; font-family:Helvetica'>%{text:.2f}</span>",
    textfont={"size":12},
    colorscale=colorscale01,
    showscale=True,
    colorbar_orientation='v',
    colorbar_tickvals = [np.min(np.nan_to_num(pt03.values,nan=0.0)),np.max(np.nan_to_num(pt03.values,nan=0.0))],
    colorbar_ticktext = ['min','max'],
    xgap = 1,
    ygap = 1
)

plt_title = "<span style='font-size:24px;font-weight:bold;font-family:Helvetica'>Distribution of activities</span>"
plt_caption = '''<span style='font-size:14px; font-family:Helvetica'>
Independent density normalization<br>
Note: Colorscale is based on row (Max value in row is blue and Min value in row is orange)
</span>'''

x_axis_title = "<span style='color:#153d4d;font-size:14px;font-family:Helvetica'>Activities</span>"
y_axis_title = "<span style='color:#153d4d;font-size:14px; font-family:Helvetica'>Survey year</span>"

layout = dict(
    title = plt_title + "<br><br>" + plt_caption,
    title_x=0.5,
    margin = dict(t=180, pad=5),
#     width = 800,
    height= 500,
    plot_bgcolor = "#fff",
    yaxis = dict(
        scaleanchor = 'x',
        title_text=y_axis_title,
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
        autorange='reversed',
        ),
    xaxis =  dict(
        title_text = x_axis_title,
        tickfont=dict(color='#153d4d',family='Helvetica',size=12),
        constrain="domain",
        )
    )

fig = go.Figure(data = trace, layout = layout)

fig.show()

# Thanks

First of all, I would to thank Kaggle for hosting the competition, I learned so many things while creating the notebook.

I would also like to thank [all the previous winners of the competition ](https://www.kaggle.com/competitions/kaggle-survey-2022/discussion/359064)for creating so many awesome books.

Thank you Andrada Olteanu for making the [aggregated dataset ](https://www.kaggle.com/datasets/andradaolteanu/kaggle-data-science-survey-20172021)of all previous surveys public. The dataset saved a lot of my time while data preparation for this year.

Thank you for reading this notebook. If there is any suggestion or mistake please let me know. This is my first Kaggle survey competition and I enjoyed analyzing it.