While writing this notebook, the data was stored locally on my computer. To follow along with this notebook, you will need to download the data from here: https://insights.stackoverflow.com/survey/

##### Process followed:
STEP 1. Count unique elements in a column

        Each column has a list of options that a user picked, separated by semicolons (;).
        So in each row the value is split and its elements are added to a list.
        Then the unique items in the list are counted and the counts are recorded in a dataframe whose index is the unique items in the list.
    
STEP 2. Repeat the first step for each year
STEP 3. Merge all the dataframes into one table

Each question has a table that answers it; therefore, we repeat the above process four times.

In [1]:
# Imports
import pandas as pd

In [2]:
# Load the four yearly survey datasets through kedro's `catalog` object.
# See https://kedro.readthedocs.io/en/stable/tools_integration/ipython.html for more details

df2018, df2019, df2020, df2021 = (
    catalog.load(dataset_name)
    for dataset_name in ('2018_dataset', '2019_dataset', '2020_dataset', '2021_dataset')
)

## Transforming the data

In [None]:
# Collect the yearly dataframes in one list (reused by later cells) and
# report the size of each one. The year label is read from the first row of
# the dataframe's 'year' column.
# A plain loop is used because `print` is called only for its side effect;
# wrapping it in a list comprehension would build a throwaway list of `None`
# values that the notebook then echoes as the cell output.
dataframes = [df2018, df2019, df2020, df2021]
for frame in dataframes:
    n_rows, n_columns = frame.shape
    print(f"{frame['year'].iloc[0]} dataset has {n_rows} rows and {n_columns} columns.")

In [None]:
from collections import Counter

def count_unique_items_in_column(df: pd.DataFrame, column_name: str) -> pd.DataFrame:
    """Count how often each item occurs in a semicolon-separated column.

    Every string cell is split on ``;`` and each resulting piece is tallied.
    ``None`` cells are tallied under the key ``None``. Cells of any other
    type (such as a float ``nan``) are skipped.

    Args:
        df (pd.DataFrame): dataframe holding the column; it is not modified
        column_name (str): column name in dataframe

    Returns:
        pd.DataFrame: new dataframe indexed by item (the index is named
            ``column_name``) with a single ``count`` column, sorted by
            ``count`` in descending order

    Raises:
        ValueError: if the column passed does not exist in dataframe
    """
    if column_name not in df.columns:
        raise ValueError(f"No column named {column_name} in dataframe.")

    # Tally items cell by cell; Counter keeps keys in first-seen order.
    occurrences = Counter()
    for cell in df[column_name].tolist():
        if cell is None:
            occurrences[cell] += 1
        elif isinstance(cell, str):
            occurrences.update(cell.split(";"))

    # One (item, count) row per distinct item, most frequent first.
    return (
        pd.DataFrame(list(occurrences.items()), columns=[column_name, 'count'])
        .set_index(column_name)
        .sort_values(by='count', ascending=False)
    )


def _merge(dataframe_list: list) -> pd.DataFrame:
    """Merges dataframes on index

    Args:
        dataframe_list (list): a list of dataframes to merge

    Returns:
        pd.DataFrame: merged dataframe
        
    Raises:
        ValueError: if the list of dataframes passed is not equal to four
    """
    if len(dataframe_list) != 4:
        raise ValueError("List of dataframes must be equal to four(4)")
        
    df18_19 = pd.merge(dataframe_list[0], dataframe_list[1], left_index=True, right_index=True)
    df20_21 = pd.merge(dataframe_list[2], dataframe_list[3], left_index=True, right_index=True)
    dfs_merged = pd.merge(df18_19, df20_21, left_index=True, right_index=True)
    dfs_merged.columns = ['2018', '2019', '2020', '2021']
    
    return dfs_merged

In [None]:
def display_index_values(df_list: list, column_name: str):
    """Lazily yield the distinct item labels of a column for each dataframe.

    For every dataframe in ``df_list`` the items of ``column_name`` are
    counted with ``count_unique_items_in_column`` and the index values of
    that result are yielded.

    Args:
        df_list (list): dataframes to inspect, processed in order
        column_name (str): column whose items are counted

    Yields:
        The ``index.values`` of the counts dataframe, one per input dataframe.
    """
    yield from (
        count_unique_items_in_column(frame, column_name).index.values
        for frame in df_list
    )
        
# Print the distinct 'web_frameworks' labels found in each yearly dataframe.
for yearly_labels in display_index_values(dataframes, 'web_frameworks'):
    print(yearly_labels)

In [None]:
# Web Frameworks
# Framework labels differ between survey years and the yearly counts are
# merged on their index, so the labels are normalised first:
#   * 2018 dataset: 'React' -> 'React.js', 'Angular' -> 'Angular.js'
#   * 2019 dataset: 'Angular/Angular.js' -> 'Angular.js'
# The frameworks below are added to the 2018 counts with a value of zero
# when the 2018 counts have no row for them:
#   ASP.NET, jQuery, Vue.js, Flask, Laravel, Express, Ruby on Rails, Drupal
df18 = count_unique_items_in_column(df2018, 'web_frameworks').rename(
    index={'React': 'React.js', 'Angular': 'Angular.js'}
)
for framework in ['ASP.NET', 'jQuery', 'Vue.js', 'Flask', 'Laravel', 'Express', 'Ruby on Rails', 'Drupal']:
    if framework not in df18.index.values:
        df18.loc[framework] = 0

# The combined 'Angular/Angular.js' label is renamed on the 2019 counts.
# Renaming it on the 2018 counts instead would leave the 2019 label
# unchanged, so it would not line up with 'Angular.js' in the other years.
df19 = count_unique_items_in_column(df2019, 'web_frameworks').rename(
    index={'Angular/Angular.js': 'Angular.js'}
)
df20 = count_unique_items_in_column(df2020, 'web_frameworks')
df21 = count_unique_items_in_column(df2021, 'web_frameworks')

web_frameworks = _merge([df18, df19, df20, df21])

In [None]:
# Print the distinct 'platforms' labels found in each yearly dataframe.
for yearly_labels in display_index_values(dataframes, 'platforms'):
    print(yearly_labels)

In [None]:
# Platforms
# Only the cloud platforms are of interest here:
#   AWS, Microsoft Azure, Heroku, Google Cloud Platform, IBM Cloud or Watson
# Two labels of the 2018 dataset are renamed before merging:
#   'Google Cloud Platform/App Engine' -> 'Google Cloud Platform'
#   'Azure' -> 'Microsoft Azure'
df18 = count_unique_items_in_column(df2018, 'platforms').rename(
    index={
        'Google Cloud Platform/App Engine': 'Google Cloud Platform',
        'Azure': 'Microsoft Azure',
    }
)
df19, df20, df21 = (
    count_unique_items_in_column(frame, 'platforms')
    for frame in (df2019, df2020, df2021)
)

platforms = _merge([df18, df19, df20, df21])

In [None]:
# Print the distinct 'languages' labels found in each yearly dataframe.
for yearly_labels in display_index_values(dataframes, 'languages'):
    print(yearly_labels)

In [None]:
# Languages
# No labels are renamed here; the yearly counts are merged as they are.
df18, df19, df20, df21 = (
    count_unique_items_in_column(frame, 'languages')
    for frame in (df2018, df2019, df2020, df2021)
)

languages = _merge([df18, df19, df20, df21])

In [None]:
# Print the distinct 'databases' labels found in each yearly dataframe.
for yearly_labels in display_index_values(dataframes, 'databases'):
    print(yearly_labels)

In [None]:
# Databases
# Three labels of the 2018 dataset are renamed before merging:
#   'SQL Server' -> 'Microsoft SQL Server'
#   'IBM Db2' -> 'IBM DB2'
#   'Amazon DynamoDB' -> 'DynamoDB'
df18 = count_unique_items_in_column(df2018, 'databases').rename(
    index={
        'SQL Server': 'Microsoft SQL Server',
        'IBM Db2': 'IBM DB2',
        'Amazon DynamoDB': 'DynamoDB',
    }
)
# Databases without a row in the 2018 counts are added with a count of zero.
for database in ('Cassandra', 'Couchbase', 'Firebase'):
    if database in df18.index.values:
        continue
    df18.loc[database] = 0

df19, df20, df21 = (
    count_unique_items_in_column(frame, 'databases')
    for frame in (df2019, df2020, df2021)
)

databases = _merge([df18, df19, df20, df21])
