In [None]:
import holoviews as hv
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xarray as xa

from typing import List

from IPython.core.display import display, HTML

# Notebook-wide display preferences.
# pandas: never elide columns when rendering a wide frame.
pd.options.display.max_columns = None
# numpy: allow up to 200 characters per printed line before wrapping.
np.set_printoptions(linewidth=200)
# Inject CSS that forces elements with class "container" to 100% width.
display(HTML(data="<style>.container { width:100% !important; }</style>"))

In [None]:
# In order to make the analysis easier, we can filter the dataframe into smaller ones 
# which will contain the data of each multiple choice question.
# To this end, defining a function can be helpful
def filter_df(df, question_index):
    """Build a boolean "who selected what" frame for one multiple-choice question.

    The columns whose name starts with ``Q{question_index}_`` are selected and
    every cell is turned into ``True`` (an answer is present) or ``False``
    (the cell is missing). Each output column is renamed to the stripped
    answer text found in that column.

    Parameters
    ----------
    df : pandas.DataFrame
        Survey frame with one column per answer option. Rows are sliced with
        ``df.loc[1:]``, i.e. everything before index label 1 is dropped
        (presumably the question-text row of the raw export -- confirm that
        callers pass the raw frame and not one that was already re-indexed).
    question_index : int or str
        Question identifier used to build the ``Q{question_index}_`` prefix.

    Returns
    -------
    pandas.DataFrame
        Boolean frame with the same index as ``df.loc[1:]`` and one column per
        answer option.

    Notes
    -----
    * A column in which every respondent selected the option (no missing
      values) is handled; it simply becomes all ``True``.
    * A column that nobody selected has no answer text to use as a label, so
      it keeps its original column name and becomes all ``False``.
    """
    prefix = f"Q{question_index}_"
    columns = [col for col in df.columns if col.startswith(prefix)]

    # Label-based slice: keeps the rows from index label 1 onwards.
    answers = df.loc[1:, columns]

    # A cell is "selected" exactly when it is not missing.
    selected = answers.notna()

    labels = []
    for column in columns:
        present = answers[column].dropna()
        if len(present):
            # First answer text seen in the column, without surrounding whitespace.
            labels.append(str(present.iloc[0]).strip())
        else:
            # Never-selected option: fall back to the raw column name.
            labels.append(column)

    selected.columns = labels
    return selected

# We will usually want to join one or more of the filtered dataframes to the background one:
def join_dfs(*dataframes: pd.DataFrame) -> pd.DataFrame:
    """Concatenate the given dataframes side by side.

    The frames are combined column-wise (``axis=1``); rows are aligned on the
    index, and the original index labels are kept.

    Parameters
    ----------
    *dataframes : pandas.DataFrame
        The frames to join, in the order their columns should appear.

    Returns
    -------
    pandas.DataFrame
        A single frame holding the columns of every input frame.

    Raises
    ------
    ValueError
        Raised by ``pandas.concat`` when no dataframe is passed.
    """
    return pd.concat(dataframes, axis=1, ignore_index=False)

In [None]:
# Read the raw survey export.
# low_memory=False asks pandas not to process the file in chunks, which avoids
# mixed-type inference within a column.
orig = pd.read_csv(
    "../data/kaggle_survey_2020_responses.csv",
    low_memory=False,
)
orig.head(n=2)

In [None]:
# The first row contains the question text.
# It is useful to read, but it is not actual data, so drop it and renumber
# the remaining rows from 0.
df = orig.loc[1:].reset_index(drop=True)

# Only the last expression of a cell is rendered automatically; display both
# ends of the frame explicitly so the head is not silently discarded.
display(df.head(2), df.tail(2))

In [None]:
## Normalize data
#
# Strip whitespace from all the answers.
# Note: initially every column is of dtype "object", so we could simply do:
#           df = df.apply(lambda x: x.str.strip())
#       However, "duration" is cast to a numeric dtype further down, so that
#       one-liner would fail when this cell is re-executed.
#       Restricting the strip to the object-dtype columns keeps the cell re-runnable.
string_cols = df.select_dtypes(object).columns
df[string_cols] = df[string_cols].apply(lambda x: x.str.strip())

# Remove the "$"/">" symbols and the "," thousands separators from the salary
# ranges, so that every value has the plain "<low>-<high>" form.
df["Q24"] = (
    df["Q24"]
    .replace({"$0-999": "0-999", "> $500,000": "500,000-1,000,000"})
    .str.replace(",", "")
)

# Column "Time from Start to Finish (seconds)" contains integers.
# Rename it to something more convenient and cast it (vectorised) to int64.
df = df.rename(columns={"Time from Start to Finish (seconds)": "duration"})
df["duration"] = df["duration"].astype("int64")

# Only the last expression of a cell is rendered automatically; display both
# ends of the frame explicitly so the head is not silently discarded.
display(df.head(2), df.tail(2))

In [None]:
## Investigate duration
#
# Some participants completed the survey suspiciously fast and their answers
# should arguably be ignored. Picking a reasonable cut-off threshold is not
# easy, though, so start by looking at the 200 shortest durations (ties kept).
df["duration"].nsmallest(n=200, keep="all")

In [None]:
# Some participants did not answer a single non-demographic question.
# Build a boolean mask (one entry per row) that identifies them.
#
# Note: the non-demographic questions start at Q7, hence the column slice
#     df.iloc[:, 7:]
# (this relies on positions 0-6 holding the duration and Q1-Q6 columns;
#  verify against the CSV layout).
temp_df = df.iloc[:, 7:]
# A cell counts as unanswered when it is missing or holds the literal string 'None'.
participants_who_did_not_answer_any_real_questions = (
    temp_df.isnull() | temp_df.eq('None')
).all(axis="columns")
participants_who_did_not_answer_any_real_questions

In [None]:
# Drop the participants flagged by the mask.
# The mask and the frame have the same length only while nothing has been
# dropped yet; a mismatch is reported as "already dropped", so re-running this
# cell does not filter the frame a second time.
if len(participants_who_did_not_answer_any_real_questions) == len(df):
    df = df.loc[~participants_who_did_not_answer_any_real_questions].reset_index(drop=True)
else:
    print("rows already dropped!")
df.tail(2)

In [None]:
# With the empty responses gone, the shortest durations look far more reasonable.
# A threshold could still make some sense, but it is probably not worth
# investigating further. Show the 500 shortest durations (ties kept).
df["duration"].nsmallest(n=500, keep="all")

In [None]:
# Create the demographics dataframe: Q1-Q6 plus Q15, under readable names.
demo = df[["Q1", "Q2", "Q3", "Q4", "Q5", "Q6", 'Q15']]
demo.columns = ["age", "gender", "country", "education", "role", "code_exp", "ml_exp"]
demo = demo.assign(
    # Salary: keep the upper bound of the "<low>-<high>" range. The detour via
    # float lets missing answers (NaN) through before the cast to nullable Int64.
    salary=df.Q24.str.split("-").str[-1].astype(float).astype(pd.Int64Dtype()),
    # Coding experience in years, encoded as the upper bound of each range.
    # NOTE(review): "10-20 years" and "20+ years" both map to 20, so the two
    # categories become indistinguishable -- confirm this is intended.
    code_exp=df.Q6.replace({
        "< 1 years": 1,
        "1-2 years": 2,
        "3-5 years": 5,
        "5-10 years": 10,
        "10-20 years": 20,
        "20+ years": 20,
        "I have never written code": 0,
    }).astype(pd.Int64Dtype()),
    # Machine-learning experience in years, same upper-bound encoding.
    # NOTE(review): "10-20 years" and "20 or more years" both map to 20 here too.
    # NOTE(review): "I have never written code" looks like a Q6 answer; verify
    # whether it can actually occur in Q15.
    ml_exp=df.Q15.replace({
        "I do not use machine learning methods": 0,
        "Under 1 year": 1,
        "1-2 years": 2,
        "2-3 years": 3,
        "3-4 years": 4,
        "4-5 years": 5,
        "5-10 years": 10,
        "10-20 years": 20,
        "20 or more years": 20,
        "I have never written code": 0,
    }).astype(pd.Int64Dtype()),
)

# demo.info() prints and returns None, so a bare `demo.tail(2)` before it would
# never be rendered; display it explicitly.
display(demo.tail(2))
demo.info()