While writing this notebook, the data was stored locally in my computer therefore to follow along with this note book you will need to download the data from here : https://insights.stackoverflow.com/survey/

##### Process followed:
STEP 1. Count unique elements in a column

        Each column has a list of options that a user picked out seperated by a semi-colon (;).
        So the in each row the list is split and elements are added to a list.
        Then unique items in the list are counted and the data recorded in a dataframe with index as unique items in the list.
    
STEP 2. Repeat the first step for each year
STEP 3. Merge all the dataframes into one table

Each question will have a table that answers it therefore we repeat the above process four times.

In [1]:
# Imports
import pandas as pd

In [2]:
# Loading in data using kedro's catalog attribute
# See https://kedro.readthedocs.io/en/stable/tools_integration/ipython.html for more details

df2018 = catalog.load('2018_dataset')
df2019 = catalog.load('2019_dataset')
df2020 = catalog.load('2020_dataset')
df2021 = catalog.load('2021_dataset')

## Transforming the data

In [4]:
dataframes = [df2018, df2019, df2020, df2021]
[print(f"{df['year'].iloc[0]} dataset has {df.shape[0]} rows and {df.shape[1]} columns.") for df in dataframes]

2018 dataset has 98855 rows and 5 columns.
2019 dataset has 88883 rows and 5 columns.
2020 dataset has 64461 rows and 5 columns.
2021 dataset has 83439 rows and 5 columns.


In [13]:
from collections import Counter

def count_unique_items_in_column(df: pd.DataFrame, column_name: str) -> pd.DataFrame:
    """Counts unique elements in dataframe column. Column must have semicolon separated values or nan values in column

    Args:
        df (pd.DataFrame): dataframe to be modified
        column_name (str): column name in dataframe

    Returns:
        pd.DataFrame: new dataframe contain value and count of value in df
    
    Raises:
        ValueError: if the column passed does not exist in dataframe
    """
    if column_name not in df.columns:
        raise ValueError(f"No column named {column_name} in dataframe.")

    column_as_list = df[column_name].tolist()

    new_list = []
    for list_item in column_as_list: 

        # for nan values
        if isinstance(list_item, type(None)):
            new_list.append(list_item)

        if isinstance(list_item, str): 
            new_list.extend(list_item.split(";"))

    # find the number of occurances of a item in a list
    occ = Counter(new_list)
    language = []
    count = []
    for x in occ:
        key = x
        value = occ[key]
        language.append(key)
        count.append(value)

    df_temp = pd.DataFrame(list(zip(language, count)), columns = [column_name, 'count'])
    df_temp.set_index(column_name, inplace=True)
    df_temp.sort_values(by='count', ascending=False, inplace=True)
    return df_temp


def _merge(dataset_18: pd.DataFrame, dataset_19: pd.DataFrame,
                       dataset_20: pd.DataFrame, dataset_21: pd.DataFrame, column_names: list) -> pd.DataFrame:
    """Merges dataframes on index.

    Args:
        dataset_18 (pd.DataFrame): 2018 dataset
        dataset_19 (pd.DataFrame): 2019 dataset
        dataset_20 (pd.DataFrame): 2020 dataset
        dataset_21 (pd.DataFrame): 2021 dataset
        column_names (list): column names to use in the resulting dataframe.

    Returns:
        pd.DataFrame: a dataframe with column names 
    """
    df18_19 = pd.merge(dataset_18, dataset_19, left_index=True, right_index=True)
    df20_21 = pd.merge(dataset_20, dataset_21, left_index=True, right_index=True)
    dfs_merged = pd.merge(df18_19, df20_21, left_index=True, right_index=True)
    dfs_merged.columns = column_names
    dfs_merged.reset_index(inplace=True)
    
    return dfs_merged

In [18]:
def display_index_values(df_list: list, column_name: str):
    for i in df_list:
        df_temp = count_unique_items_in_column(i, column_name)
        yield df_temp.index.values
        
j = display_index_values(dataframes, 'web_frameworks')

for i in j:
    print(i)

[None 'Node.js' 'Angular' 'React' '.NET Core' 'Spring' 'Django' 'Cordova'
 'TensorFlow' 'Xamarin' 'Spark' 'Hadoop' 'Torch/PyTorch']
['jQuery' None 'React.js' 'Angular/Angular.js' 'ASP.NET' 'Express'
 'Spring' 'Vue.js' 'Django' 'Flask' 'Laravel' 'Other(s):' 'Ruby on Rails'
 'Drupal']
[None 'jQuery' 'React.js' 'Angular' 'ASP.NET' 'Express' 'ASP.NET Core'
 'Vue.js' 'Spring' 'Angular.js' 'Django' 'Flask' 'Laravel' 'Ruby on Rails'
 'Symfony' 'Gatsby' 'Drupal']
['React.js' None 'jQuery' 'Express' 'Angular' 'Vue.js' 'ASP.NET Core '
 'Flask' 'ASP.NET' 'Django' 'Spring' 'Angular.js' 'Laravel'
 'Ruby on Rails' 'Gatsby' 'FastAPI' 'Symfony' 'Svelte' 'Drupal']


In [20]:
# Web Frameworks
# rename React in 2018 dataset to React.js
# rename 'Angular' in 2018 dataset to Angular.js
# rename 'Angular/Angular.js' to Angular.js in 2019 dataset
# add ASP.NET to 2018 dataset and set it to zero
# add jQuery to 2018 dataset and set it to zero
# add Vue.js to 2018 dataset and set it to zero
# add Flask to 2018 dataset and set it to zero
# add Laravel to 2018 dataset and set it to zero
# add Express to 2018 dataset and set it to zero
# add Ruby on Rails to 2018 dataset and set it to zero
# add Drupal on Rails to 2018 dataset and set it to zero
df18 = count_unique_items_in_column(df2018, 'web_frameworks').rename(index={'React': 'React.js', 'Angular': 'Angular.js', 'Angular/Angular.js': 'Angular.js'})
for i in ['ASP.NET', 'jQuery', 'Vue.js', 'Flask', 'Laravel',  'Express', 'Ruby on Rails', 'Drupal']:
    if i not in df18.index.values:
        df18.loc[i] = 0
df19 = count_unique_items_in_column(df2019, 'web_frameworks')
df20 = count_unique_items_in_column(df2020, 'web_frameworks')
df21 = count_unique_items_in_column(df2021, 'web_frameworks')

web_frameworks = _merge(df18, df19, df20, df21, ['2018', '2019', '2020', '2021'])
web_frameworks

Unnamed: 0,web_frameworks,2018,2019,2020,2021
0,,47235,23861,22182,21732
1,React.js,14337,19877,15167,25296
2,Spring,9079,10298,6941,9177
3,Django,6723,8249,6014,9446
4,ASP.NET,0,16699,9258,9918
5,jQuery,0,30981,18316,21693
6,Vue.js,0,9671,7322,11954
7,Flask,0,7694,6005,10174
8,Laravel,0,6679,4680,6377
9,Express,0,12522,8961,15010


In [21]:
j = display_index_values(dataframes, 'platforms')

for i in j:
    print(i)

[None 'Linux' 'Windows Desktop or Server' 'Android' 'AWS' 'Mac OS'
 'Raspberry Pi' 'WordPress' 'iOS' 'Firebase' 'Azure' 'Arduino' 'Heroku'
 'Google Cloud Platform/App Engine' 'Serverless' 'Drupal' 'Amazon Echo'
 'Windows Phone' 'SharePoint' 'ESP8266' 'Salesforce'
 'Apple Watch or Apple TV' 'IBM Cloud or Watson' 'Google Home'
 'Gaming console' 'Mainframe' 'Predix']
['Linux' 'Windows' 'Docker' 'Android' 'AWS' 'MacOS' 'Slack' 'Raspberry Pi'
 'WordPress' 'iOS' 'Google Cloud Platform' 'Microsoft Azure' 'Arduino'
 'Heroku' None 'Kubernetes' 'Other(s):' 'IBM Cloud or Watson']
['Linux' 'Windows' 'Docker' 'AWS' 'Android' 'MacOS' None 'Raspberry Pi'
 'Microsoft Azure' 'WordPress' 'Google Cloud Platform' 'iOS' 'Kubernetes'
 'Heroku' 'Arduino' 'Slack Apps and Integrations' 'IBM Cloud or Watson']
[None 'AWS' 'Google Cloud Platform' 'Microsoft Azure' 'DigitalOcean'
 'Heroku' 'IBM Cloud or Watson' 'Oracle Cloud Infrastructure']


In [22]:
# Platforms
# Only dealing with cloud platforms : AWS Microsoft Azure Heroku  Google Cloud Platform	 IBM Cloud or Watson
# change 'Google Cloud Platform/App Engine' to 'Google Cloud Platform'
# change 'Azure' in 2018 dataset to 'Microsoft Azure'
df18 = count_unique_items_in_column(df2018, 'platforms').rename(index={'Google Cloud Platform/App Engine': 'Google Cloud Platform', 'Azure': 'Microsoft Azure'})
df19 = count_unique_items_in_column(df2019, 'platforms')
df20 = count_unique_items_in_column(df2020, 'platforms')
df21 = count_unique_items_in_column(df2021, 'platforms')

platforms = _merge(df18, df19, df20, df21, ['2018', '2019', '2020', '2021'])
platforms

Unnamed: 0,platforms,2018,2019,2020,2021
0,,32856,8169,10618,41820
1,AWS,15927,21304,14389,26295
2,Microsoft Azure,7267,9528,7830,15096
3,Heroku,6913,8527,5974,8182
4,Google Cloud Platform,5302,9928,7569,16228
5,IBM Cloud or Watson,950,1514,876,1768


In [19]:
j = display_index_values(dataframes, 'languages')

for i in j:
    print(i)

['JavaScript' 'HTML' 'CSS' 'SQL' 'Java' 'Bash/Shell' 'Python' 'C#' 'PHP'
 None 'C++' 'C' 'TypeScript' 'Ruby' 'Swift' 'Assembly' 'Go' 'Objective-C'
 'VB.NET' 'R' 'Matlab' 'VBA' 'Kotlin' 'Scala' 'Groovy' 'Perl'
 'Visual Basic 6' 'Lua' 'CoffeeScript' 'Delphi/Object Pascal' 'Haskell'
 'Rust' 'F#' 'Clojure' 'Erlang' 'Cobol' 'Ocaml' 'Julia' 'Hack']
['JavaScript' 'HTML/CSS' 'SQL' 'Python' 'Java' 'Bash/Shell/PowerShell'
 'C#' 'PHP' 'C++' 'TypeScript' 'C' 'Other(s):' 'Ruby' 'Go' 'Assembly'
 'Swift' 'Kotlin' 'R' 'VBA' 'Objective-C' 'Scala' 'Rust' 'Dart' None
 'Elixir' 'Clojure' 'WebAssembly' 'F#' 'Erlang']
['JavaScript' 'HTML/CSS' 'SQL' 'Python' 'Java' 'Bash/Shell/PowerShell'
 'C#' 'PHP' 'TypeScript' 'C++' 'C' None 'Go' 'Kotlin' 'Ruby' 'Assembly'
 'VBA' 'Swift' 'R' 'Rust' 'Objective-C' 'Dart' 'Scala' 'Perl' 'Haskell'
 'Julia']
['JavaScript' 'HTML/CSS' 'Python' 'SQL' 'Java' 'Node.js' 'TypeScript' 'C#'
 'Bash/Shell' 'C++' 'PHP' 'C' 'PowerShell' 'Go' 'Kotlin' 'Rust' 'Ruby'
 'Dart' 'Assembly' 'Swift

In [16]:
# Languages
df18 = count_unique_items_in_column(df2018, 'languages')
df19 = count_unique_items_in_column(df2019, 'languages')
df20 = count_unique_items_in_column(df2020, 'languages')
df21 = count_unique_items_in_column(df2021, 'languages')

languages = _merge(df18, df19, df20, df21, ['2018', '2019', '2020', '2021'])
languages

Unnamed: 0,languages,2018,2019,2020,2021
0,JavaScript,54686,59219,38822,53587
1,SQL,44670,47544,31413,38835
2,Java,35521,35917,23074,29162
3,Python,30359,36443,25287,39792
4,C#,26954,27097,18041,22984
5,PHP,24071,23030,15007,18130
6,,20521,1314,7083,1082
7,C++,19872,20524,13707,20057
8,C,18042,18017,12487,17329
9,TypeScript,13626,18523,14578,24909


In [23]:
j = display_index_values(dataframes, 'databases')

for i in j:
    print(i)

['MySQL' None 'SQL Server' 'PostgreSQL' 'MongoDB' 'SQLite' 'Redis'
 'Elasticsearch' 'MariaDB' 'Oracle'
 'Microsoft Azure (Tables, CosmosDB, SQL, etc)' 'Google Cloud Storage'
 'Memcached' 'Amazon DynamoDB' 'Amazon RDS/Aurora' 'Cassandra' 'IBM Db2'
 'Neo4j' 'Amazon Redshift' 'Apache Hive' 'Google BigQuery' 'Apache HBase']
['MySQL' 'PostgreSQL' 'Microsoft SQL Server' 'SQLite' 'MongoDB' 'Redis'
 None 'MariaDB' 'Oracle' 'Elasticsearch' 'Firebase' 'Other(s):' 'DynamoDB'
 'Cassandra' 'Couchbase']
['MySQL' 'PostgreSQL' 'Microsoft SQL Server' 'SQLite' None 'MongoDB'
 'Redis' 'MariaDB' 'Oracle' 'Firebase' 'Elasticsearch' 'DynamoDB'
 'Cassandra' 'IBM DB2' 'Couchbase']
['MySQL' 'PostgreSQL' 'SQLite' 'MongoDB' 'Microsoft SQL Server' 'Redis'
 None 'MariaDB' 'Firebase' 'Elasticsearch' 'Oracle' 'DynamoDB' 'Cassandra'
 'IBM DB2' 'Couchbase']


In [26]:
# Databases
df18 = count_unique_items_in_column(df2018, 'databases').rename(index={'SQL Server': 'Microsoft SQL Server', 'IBM Db2': 'IBM DB2', 'Amazon DynamoDB': 'DynamoDB'})
for i in ['Cassandra', 'Couchbase', 'Firebase']:
    if i not in df18.index.values:
        df18.loc[i] = 0
df19 = count_unique_items_in_column(df2019, 'databases')
df20 = count_unique_items_in_column(df2020, 'databases')
df21 = count_unique_items_in_column(df2021, 'databases')

databases = _merge(df18, df19, df20, df21, ['2018', '2019', '2020', '2021'])
databases

Unnamed: 0,databases,2018,2019,2020,2021
0,MySQL,38909,40537,27559,35289
1,,32585,12857,14924,13893
2,Microsoft SQL Server,27293,24590,16336,18896
3,PostgreSQL,21776,25758,17892,28424
4,MongoDB,17183,19100,13086,19479
5,SQLite,13036,23713,15434,22634
6,Redis,11944,13971,9056,14552
7,Elasticsearch,9312,10720,6817,9331
8,MariaDB,8853,12401,8312,12088
9,Oracle,7376,12353,8155,8868
