In [None]:
import pandas as pd

# Raise pandas' display limits so long value_counts() tables and long
# question strings are shown in full instead of being truncated.
pd.set_option("display.max_rows", 999)
pd.set_option("display.max_colwidth", 999)

# Catalogue of survey files keyed by year (as a string).  Every entry has the
# same three keys: "data" (responses CSV), "readme" and "schema"; the latter
# two are None for years where no such file is listed.
surveys = {
    # 2011-2015: only a responses CSV is listed.
    **{
        year: {"data": csv_path, "readme": None, "schema": None}
        for year, csv_path in (
            ("2011", "2011 Stack Overflow Survey Results/2011 Stack Overflow Survey Results.csv"),
            ("2012", "2012 Stack Overflow Survey Results/2012 Stack Overflow Survey Results.csv"),
            ("2013", "2013 Stack Overflow Survey Responses/2013 Stack Overflow Survey Responses.csv"),
            ("2014", "2014 Stack Overflow Survey Responses/2014 Stack Overflow Survey Responses.csv"),
            ("2015", "2015 Stack Overflow Developer Survey Responses/2015 Stack Overflow Developer Survey Responses.csv"),
        )
    },
    # 2016: responses plus a readme, still no schema file.
    "2016": {
        "data": "2016 Stack Overflow Survey Results/2016 Stack Overflow Survey Results/2016 Stack Overflow Survey Responses.csv",
        "readme": "2016 Stack Overflow Survey Results/2016 Stack Overflow Survey Results/READ_ME_-_The_Public_2016_Stack_Overflow_Developer_Survey_Results.txt",
        "schema": None,
    },
    # 2017-2020: identical directory layout, differing only by the year.
    **{
        year: {
            "data": f"developer_survey_{year}/survey_results_public.csv",
            "readme": f"developer_survey_{year}/README_{year}.txt",
            "schema": f"developer_survey_{year}/survey_results_schema.csv",
        }
        for year in ("2017", "2018", "2019", "2020")
    },
}

# Column-rename table: original column header -> harmonised feature name.
# Written grouped by target name; several source headers may share a target.
mapping = {
    source_column: feature
    for feature, source_columns in (
        ("Country", ("What Country or Region do you live in?",)),
        ("Age", ("How old are you?", "age_midpoint")),
        ("YearsCode", ("How many years of IT/Programming experience do you have?",)),
        (
            "OrgSize",
            (
                "Which best describes the size of your company?",
                "How many developers are employed at your company?",
                "company_size_range",
            ),
        ),
        ("Employment", ("Which of the following best describes your occupation?",)),
        ("DevType", ("What type of project are you developing?",)),
        ("ConvertedComp", ("Including bonus, what is your annual compensation in USD?",)),
    )
    for source_column in source_columns
}

def search_schema(df, string):
    """Return the schema rows whose question text matches ``string``.

    The text is read from the 'Question' column when it exists, otherwise
    from 'QuestionText'.  The match is a case-insensitive regular-expression
    search anywhere in the text (a substring match, not a whole-word match).

    Args:
        df (pandas.DataFrame): Schema table with a 'Question' or
            'QuestionText' column.
        string (str): Text to look for; interpreted as a regular expression.
    Returns:
        pandas.DataFrame: The rows of ``df`` whose question matches.  Rows
        with a missing question are treated as non-matching.
    Raises:
        KeyError: If ``df`` has neither a 'Question' nor a 'QuestionText'
            column.
    """
    question_column = 'Question' if 'Question' in df.columns else 'QuestionText'
    # Raw strings keep the backslash-W escape valid on current Python.  The
    # optional non-word runs on either side can match zero characters, so
    # the pattern behaves like a plain search for `string`.
    pattern = r'\W*' + string + r'\W*'
    # na=False: a missing question would otherwise put NaN in the mask, and
    # boolean indexing refuses masks that contain NaN.
    matches = df[question_column].str.contains(pattern, case=False, regex=True, na=False)
    return df[matches]

def search_columns(df, string):
    """Return the column names of ``df`` that match ``string``.

    The match is a case-insensitive regular-expression search anywhere in
    the column name (a substring match, not a whole-word match).

    Args:
        df (pandas.DataFrame): Frame whose column labels are searched.
        string (str): Text to look for; interpreted as a regular expression.
    Returns:
        list: Matching column names, in their original column order.
    """
    # Raw strings keep the backslash-W escape valid on current Python.  The
    # optional non-word runs on either side can match zero characters, so
    # the pattern behaves like a plain search for `string`.
    pattern = r'\W*' + string + r'\W*'
    # na=False keeps the mask strictly boolean even if a label yields NaN.
    hits = df.columns.str.contains(pattern, case=False, regex=True, na=False)
    return df.columns[hits].tolist()

# Load the 2011 survey responses and preview the first rows.
survey_2011 = pd.read_csv(surveys['2011']['data'])
survey_2011.head()

# List every column header; the lookups below use full question text as column names.
survey_2011.columns.tolist()

# Answer frequencies for each candidate question, one question per cell.
# value_counts() leaves out missing answers by default.
survey_2011['What Country or Region do you live in?'].value_counts()

survey_2011['How many years of IT/Programming experience do you have?'].value_counts()

survey_2011['How would you best describe the industry you work in?'].value_counts()

survey_2011['How old are you?'].value_counts()

survey_2011['Which best describes the size of your company?'].value_counts()

survey_2011['Which of the following best describes your occupation?'].value_counts()

survey_2011['Please rate your job/career satisfaction'].value_counts()

survey_2011['Including bonus, what is your annual compensation in USD?'].value_counts()

survey_2011['What type of project are you developing?'].value_counts()

# 2011 questions kept as features for comparison with later survey years.
# Bound to `survey_2011_features`, the name the following cells read, and
# stored as a list: an ordered list is a valid DataFrame column indexer,
# whereas a set is rejected by pandas 2.x.  (`survey_2011_map` is assigned
# separately, as a dict, further down.)
survey_2011_features = [
    "What Country or Region do you live in?",
    "How old are you?",
    "How many years of IT/Programming experience do you have?",
    # "How would you best describe the industry you work in?" is deliberately left out.
    "Which best describes the size of your company?",
    "Which of the following best describes your occupation?",
    "What type of project are you developing?",
    "Including bonus, what is your annual compensation in USD?",
]

# Non-null counts and dtypes for just the selected feature columns.
survey_2011[survey_2011_features].info()

# Load the 2012 survey and summarise every column whose header does not
# contain 'Unnamed'.
survey_2012 = pd.read_csv(surveys['2012']['data'])
named_2012_columns = [name for name in survey_2012.columns if 'Unnamed' not in name]
survey_2012[named_2012_columns].info()

# Print each 2011 feature question that has no identically named 2012 column.
for col in survey_2011_features:
    if col in survey_2012.columns:
        continue
    print(col)

# The 2012 feature list starts as a copy of the 2011 one.
# NOTE(review): an earlier draft swapped the 2011 "industry" question for
# 'How would you best describe the industry you currently work in?'; that
# swap is disabled, presumably because the question is no longer a feature.
survey_2012_features = survey_2011_features.copy()

survey_2012[survey_2012_features].info()

# Print a column summary for every survey year, each under its own header.
for k, v in surveys.items():
    current_survey = pd.read_csv(v['data'], low_memory=False)
    exploring_columns = [col for col in current_survey.columns if 'Unnamed' not in col]
    print(f"Survey: {k}")
    # DataFrame.info() writes its report itself and returns None, so it is
    # called as a statement; passing it to print() would emit the report
    # before the header, followed by a literal "None".
    current_survey[exploring_columns].info()
    print("=" * 50)

# Load the 2017 responses and preview the first rows.
survey_2017 = pd.read_csv(surveys['2017']['data'])
survey_2017.head()

# Load the 2017 schema file and display it in full.
survey_2017_schema = pd.read_csv(surveys['2017']['schema'])
survey_2017_schema

# Schema rows whose question text contains "age" (case-insensitive substring,
# so it also matches inside longer words).  The schema CSV is re-read here.
search_schema(pd.read_csv(surveys['2017']['schema']), 'age')

# Same kind of search against the 2020 schema, for "salary".
search_schema(pd.read_csv(surveys['2020']['schema']), 'salary')

# Harmonised feature names mapped to the column that carries them; every key
# maps to itself here.
# NOTE(review): presumably these are the 2020 survey's own column names — confirm.
survey_2020_features = {
    "Country": "Country",
    "Age": "Age",
    "YearsCode": "YearsCode",
    "OrgSize": "OrgSize",
    "Employment": "Employment",
    "DevType": "DevType",
    "ConvertedComp": "ConvertedComp"
}

# Check that the 2019 survey exposes the same columns.  The dict's keys are
# passed as a list because pandas 2.x raises TypeError for a dict indexer.
survey_2019 = pd.read_csv(surveys['2019']['data'])
survey_2019[list(survey_2020_features)].info()

# Harmonised feature name -> the 2011 column (full question text) that carries it.
# The keys are the same feature names used by survey_2020_features.
survey_2011_map = {
    "Country": "What Country or Region do you live in?",
    "Age": "How old are you?",
    "YearsCode": "How many years of IT/Programming experience do you have?",
    "OrgSize": "Which best describes the size of your company?",
    "Employment": "Which of the following best describes your occupation?",
    "DevType": "What type of project are you developing?",
    "ConvertedComp": "Including bonus, what is your annual compensation in USD?"
}

# NOTE(review): relies on survey_2011_features having been defined by an
# earlier cell — confirm it exists before running this cell.
survey_2011_features

# Load the 2020 responses and look at the answer frequencies of a few columns.
survey_2020 = pd.read_csv(surveys['2020']['data'])
survey_2020['Employment'].value_counts()

survey_2020['MainBranch'].value_counts()

# bins=10 groups the numeric values into 10 intervals; normalize=True reports
# each interval's share of the answers instead of a raw count.
survey_2020['CompTotal'].value_counts(normalize=True, bins=10)

# Dry run: load every survey year and apply the harmonising column renames.
# The renamed frame is not kept; this only confirms each file loads and renames.
for k, v in surveys.items():
    survey = pd.read_csv(v['data'], low_memory=True)
    print(k, "="*50, "\n")
    # columns= is required: without an axis, rename() targets the row index
    # and leaves every column header untouched.
    survey.rename(columns=mapping, inplace=True)

# Stack the harmonised feature columns of the 2017+ surveys into one frame
# whose outer index level is the survey year.

# A sorted list gives a stable column order and is a valid column indexer
# (pandas 2.x raises TypeError when a set is used to index a DataFrame).
feature_columns = sorted(set(mapping.values()))

dfs = []
df_years = []  # year label for each frame in dfs, kept in step with it
for k, v in surveys.items():
    if int(k) <= 2016:
        continue
    survey = pd.read_csv(v['data'], low_memory=True)
    survey.rename(mapping, axis=1,inplace=True)
    print(k)
    # NOTE(review): raises KeyError if a year lacks one of the feature
    # columns; survey.reindex(columns=feature_columns) would fill NaN instead.
    dfs.append(survey[feature_columns])
    df_years.append(k)
# Label each frame with its own year.  Using surveys.keys() here would offer
# labels for all ten years although only the 2017+ frames were collected.
df = pd.concat(dfs, keys=df_years)

list(set(mapping.values()))

# Load the 2014-2018 responses into separate frames.
survey_2014 = pd.read_csv(surveys['2014']['data'])
# skiprows=[0] drops the first line of the 2015 file.
# NOTE(review): presumably an extra row sits above the real header — confirm.
survey_2015 = pd.read_csv(surveys['2015']['data'], skiprows=[0])
survey_2016 = pd.read_csv(surveys['2016']['data'])
survey_2017 = pd.read_csv(surveys['2017']['data'], low_memory=True)
survey_2018 = pd.read_csv(surveys['2018']['data'], low_memory=True)

# 2017 column names containing "old" (case-insensitive substring).
search_columns(survey_2017, 'old')

# Searching for a single space returns every 2017 schema row whose question
# text contains a space.
search_schema(pd.read_csv(surveys['2017']['schema'], low_memory=True), ' ')

It looks like only a few questions can be analyzed across survey years (i.e., over time).
I am abandoning the original idea and will instead look for fun questions in the context of salary, gender, age, country, and time.

# Reload the 2017 responses into a fresh frame.
survey_2017 = pd.read_csv(surveys['2017']['data'], low_memory=True)

In [None]:
# Load the 2018 responses; low_memory=False makes pandas read the file in one
# pass rather than in chunks.  Then print the frame's shape and all column names.
survey_2018 = pd.read_csv(surveys['2018']['data'], low_memory=False)
print(survey_2018.shape, survey_2018.columns.tolist())

# Answer frequencies for selected 2018 columns; dropna=False also counts
# missing answers as their own NaN row.
survey_2018['Exercise'].value_counts(dropna=False)

survey_2018['Age'].value_counts(dropna=False)

survey_2018['CompanySize'].value_counts(dropna=False)

survey_2018['HoursOutside'].value_counts(dropna=False)

survey_2018['YearsCoding'].value_counts(dropna=False)