In [1]:
import holoviews as hv
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xarray as xa

from typing import List

from IPython.core.display import display, HTML

np.set_printoptions(linewidth=200)
pd.options.display.max_columns = None
display(HTML("<style>.container { width:100% !important; }</style>"))

# Part A: Prepare the dataset

## A1. Load the necessary datasetes and make initial improvements

In [2]:
# load the original Kaggle Survey dataset.
orig = pd.read_csv('../data/kaggle_survey_2020_responses.csv', low_memory=False)
# the first row contains the question. This is useful to read but,
questions = orig.loc[0].reset_index(drop=True)
# It can be better read like that:
# questions[21]
# The question is not the data we want to explore,
#so we discard it from the df we will work on.
df = orig.loc[1:].reset_index(drop=True)

In [3]:
# Load countries' income groups from World Bank
country_groups = pd.read_csv('../data/wb_country_income_groups.csv')
country_groups.rename(columns={'Economy': 'country'}, inplace=True)
country_groups.head()

Unnamed: 0,country,Income group
0,Afghanistan,Low income
1,Albania,Upper middle income
2,Algeria,Lower middle income
3,American Samoa,Upper middle income
4,Andorra,High income


In [81]:
# Load mean nominal wage data from International Labor Organization
mean_wage_df = pd.read_csv('../data/ilo_mean_monthly_earnings_data.csv')
# mean_wage_df.head()
mean_wage_df = mean_wage_df[mean_wage_df.unit == 'Currency: U.S. dollars']
mean_wage_df.drop('unit', axis=1, inplace=True)

# https://www.numbeo.com/cost-of-living/country_price_rankings?itemId=105
# set an approximation for India
in_india_wage = pd.DataFrame([['India', int(2019), int(446)]], columns=list(mean_wage_df.columns))
mean_wage_df = mean_wage_df.append(in_india_wage)

mean_wage_df['yearly_earnings'] = mean_wage_df['monthly_earnings'] * 12

# sort by max year and drop duplicates
# df[df.groupby('ID')['date'].transform('max') == df['date']]
# https://stackoverflow.com/questions/53199076/grab-rows-with-max-date-from-pandas-dataframe
mean_wage_df = mean_wage_df.loc[mean_wage_df.groupby('country').year.idxmax()]
# mean_wage_df.head()

## A2. Define useful functions 

In [14]:
# In order to make the analysis easier, we can filter the dataframe into smaller ones 
# which will contain the data of each multiple choice question.
# To this end, defining a function can be helpful
def filter_df(df, question_index):
    columns = [col for col in df.columns if col.startswith(f"Q{question_index}_")]
    df = df.loc[0:][columns].fillna(False)
    unique_values = []
    for column in columns:
        column_unique = list(df[column].unique())
        column_unique.remove(False)
        unique_values.append(column_unique[0])
    filtered_df = df.replace(to_replace=unique_values, value=True)
    filtered_df.columns = [value.strip() for value in unique_values]
    return filtered_df

# We will usually want to join one or more of the filtered dataframes to the background one:
def join_dfs(*dataframes: List[pd.DataFrame]) -> pd.DataFrame:
    df = pd.concat(dataframes, axis=1, ignore_index=False)
    return df

In [15]:
# df.head(2)

In [16]:
## Normalize data format
# Strip whitespace from all the anwers
# Note: Since, initially, all the columns are of dtype "object" We could do this simply with: 
#           df = df.apply(lambda x: x.str.strip())
#       Nevertheless, we want to convert one column to a numeric one, so reexecuting this cell will be failing.
#       Thus we will do this in a bit more elaborate way.
string_cols = df.select_dtypes(object).columns
df[string_cols] = df[string_cols].apply(lambda x: x.str.strip())

In [17]:
df.head(2)

Unnamed: 0,Time from Start to Finish (seconds),Q1,Q2,Q3,Q4,Q5,Q6,Q7_Part_1,Q7_Part_2,Q7_Part_3,Q7_Part_4,Q7_Part_5,Q7_Part_6,Q7_Part_7,Q7_Part_8,Q7_Part_9,Q7_Part_10,Q7_Part_11,Q7_Part_12,Q7_OTHER,Q8,Q9_Part_1,Q9_Part_2,Q9_Part_3,Q9_Part_4,Q9_Part_5,Q9_Part_6,Q9_Part_7,Q9_Part_8,Q9_Part_9,Q9_Part_10,Q9_Part_11,Q9_OTHER,Q10_Part_1,Q10_Part_2,Q10_Part_3,Q10_Part_4,Q10_Part_5,Q10_Part_6,Q10_Part_7,Q10_Part_8,Q10_Part_9,Q10_Part_10,Q10_Part_11,Q10_Part_12,Q10_Part_13,Q10_OTHER,Q11,Q12_Part_1,Q12_Part_2,Q12_Part_3,Q12_OTHER,Q13,Q14_Part_1,Q14_Part_2,Q14_Part_3,Q14_Part_4,Q14_Part_5,Q14_Part_6,Q14_Part_7,Q14_Part_8,Q14_Part_9,Q14_Part_10,Q14_Part_11,Q14_OTHER,Q15,Q16_Part_1,Q16_Part_2,Q16_Part_3,Q16_Part_4,Q16_Part_5,Q16_Part_6,Q16_Part_7,Q16_Part_8,Q16_Part_9,Q16_Part_10,Q16_Part_11,Q16_Part_12,Q16_Part_13,Q16_Part_14,Q16_Part_15,Q16_OTHER,Q17_Part_1,Q17_Part_2,Q17_Part_3,Q17_Part_4,Q17_Part_5,Q17_Part_6,Q17_Part_7,Q17_Part_8,Q17_Part_9,Q17_Part_10,Q17_Part_11,Q17_OTHER,Q18_Part_1,Q18_Part_2,Q18_Part_3,Q18_Part_4,Q18_Part_5,Q18_Part_6,Q18_OTHER,Q19_Part_1,Q19_Part_2,Q19_Part_3,Q19_Part_4,Q19_Part_5,Q19_OTHER,Q20,Q21,Q22,Q23_Part_1,Q23_Part_2,Q23_Part_3,Q23_Part_4,Q23_Part_5,Q23_Part_6,Q23_Part_7,Q23_OTHER,Q24,Q25,Q26_A_Part_1,Q26_A_Part_2,Q26_A_Part_3,Q26_A_Part_4,Q26_A_Part_5,Q26_A_Part_6,Q26_A_Part_7,Q26_A_Part_8,Q26_A_Part_9,Q26_A_Part_10,Q26_A_Part_11,Q26_A_OTHER,Q27_A_Part_1,Q27_A_Part_2,Q27_A_Part_3,Q27_A_Part_4,Q27_A_Part_5,Q27_A_Part_6,Q27_A_Part_7,Q27_A_Part_8,Q27_A_Part_9,Q27_A_Part_10,Q27_A_Part_11,Q27_A_OTHER,Q28_A_Part_1,Q28_A_Part_2,Q28_A_Part_3,Q28_A_Part_4,Q28_A_Part_5,Q28_A_Part_6,Q28_A_Part_7,Q28_A_Part_8,Q28_A_Part_9,Q28_A_Part_10,Q28_A_OTHER,Q29_A_Part_1,Q29_A_Part_2,Q29_A_Part_3,Q29_A_Part_4,Q29_A_Part_5,Q29_A_Part_6,Q29_A_Part_7,Q29_A_Part_8,Q29_A_Part_9,Q29_A_Part_10,Q29_A_Part_11,Q29_A_Part_12,Q29_A_Part_13,Q29_A_Part_14,Q29_A_Part_15,Q29_A_Part_16,Q29_A_Part_17,Q29_A_OTHER,Q30,Q31_A_Part_1,Q31_A_Part_2,Q31_A_Part_3,Q31_A_Part_4,Q31_A_Part_5,Q31_A_Part_6,Q31_A_Part_7,Q31_A_Part_8,Q31_A_Part_9,Q31_A_Part_10,Q31_A_Part_11,Q31_A_Part_12,Q31_A_Part_13,Q31_A_Part_14,Q31_A_OTHER,Q32,Q33_A_Part_1,Q33_A_Part_2,Q33_A_Part_3,Q33_A_Part_4,Q33_A_Part_5,Q33_A_Part_6,Q33_A_Part_7,Q33_A_OTHER,Q34_A_Part_1,Q34_A_Part_2,Q34_A_Part_3,Q34_A_Part_4,Q34_A_Part_5,Q34_A_Part_6,Q34_A_Part_7,Q34_A_Part_8,Q34_A_Part_9,Q34_A_Part_10,Q34_A_Part_11,Q34_A_OTHER,Q35_A_Part_1,Q35_A_Part_2,Q35_A_Part_3,Q35_A_Part_4,Q35_A_Part_5,Q35_A_Part_6,Q35_A_Part_7,Q35_A_Part_8,Q35_A_Part_9,Q35_A_Part_10,Q35_A_OTHER,Q36_Part_1,Q36_Part_2,Q36_Part_3,Q36_Part_4,Q36_Part_5,Q36_Part_6,Q36_Part_7,Q36_Part_8,Q36_Part_9,Q36_OTHER,Q37_Part_1,Q37_Part_2,Q37_Part_3,Q37_Part_4,Q37_Part_5,Q37_Part_6,Q37_Part_7,Q37_Part_8,Q37_Part_9,Q37_Part_10,Q37_Part_11,Q37_OTHER,Q38,Q39_Part_1,Q39_Part_2,Q39_Part_3,Q39_Part_4,Q39_Part_5,Q39_Part_6,Q39_Part_7,Q39_Part_8,Q39_Part_9,Q39_Part_10,Q39_Part_11,Q39_OTHER,Q26_B_Part_1,Q26_B_Part_2,Q26_B_Part_3,Q26_B_Part_4,Q26_B_Part_5,Q26_B_Part_6,Q26_B_Part_7,Q26_B_Part_8,Q26_B_Part_9,Q26_B_Part_10,Q26_B_Part_11,Q26_B_OTHER,Q27_B_Part_1,Q27_B_Part_2,Q27_B_Part_3,Q27_B_Part_4,Q27_B_Part_5,Q27_B_Part_6,Q27_B_Part_7,Q27_B_Part_8,Q27_B_Part_9,Q27_B_Part_10,Q27_B_Part_11,Q27_B_OTHER,Q28_B_Part_1,Q28_B_Part_2,Q28_B_Part_3,Q28_B_Part_4,Q28_B_Part_5,Q28_B_Part_6,Q28_B_Part_7,Q28_B_Part_8,Q28_B_Part_9,Q28_B_Part_10,Q28_B_OTHER,Q29_B_Part_1,Q29_B_Part_2,Q29_B_Part_3,Q29_B_Part_4,Q29_B_Part_5,Q29_B_Part_6,Q29_B_Part_7,Q29_B_Part_8,Q29_B_Part_9,Q29_B_Part_10,Q29_B_Part_11,Q29_B_Part_12,Q29_B_Part_13,Q29_B_Part_14,Q29_B_Part_15,Q29_B_Part_16,Q29_B_Part_17,Q29_B_OTHER,Q31_B_Part_1,Q31_B_Part_2,Q31_B_Part_3,Q31_B_Part_4,Q31_B_Part_5,Q31_B_Part_6,Q31_B_Part_7,Q31_B_Part_8,Q31_B_Part_9,Q31_B_Part_10,Q31_B_Part_11,Q31_B_Part_12,Q31_B_Part_13,Q31_B_Part_14,Q31_B_OTHER,Q33_B_Part_1,Q33_B_Part_2,Q33_B_Part_3,Q33_B_Part_4,Q33_B_Part_5,Q33_B_Part_6,Q33_B_Part_7,Q33_B_OTHER,Q34_B_Part_1,Q34_B_Part_2,Q34_B_Part_3,Q34_B_Part_4,Q34_B_Part_5,Q34_B_Part_6,Q34_B_Part_7,Q34_B_Part_8,Q34_B_Part_9,Q34_B_Part_10,Q34_B_Part_11,Q34_B_OTHER,Q35_B_Part_1,Q35_B_Part_2,Q35_B_Part_3,Q35_B_Part_4,Q35_B_Part_5,Q35_B_Part_6,Q35_B_Part_7,Q35_B_Part_8,Q35_B_Part_9,Q35_B_Part_10,Q35_B_OTHER
0,1838,35-39,Man,Colombia,Doctoral degree,Student,5-10 years,Python,R,SQL,C,,,Javascript,,,,MATLAB,,Other,Python,"Jupyter (JupyterLab, Jupyter Notebooks, etc)",,,Visual Studio Code (VSCode),,Spyder,,,,,,,Kaggle Notebooks,Colab Notebooks,,,,,,,,,,,,,"A cloud computing platform (AWS, Azure, GCP, h...",GPUs,,,,2-5 times,Matplotlib,,,,,,,,Geoplotlib,,,,1-2 years,,TensorFlow,Keras,,,,Xgboost,,,,,,,,,,,Decision Trees or Random Forests,"Gradient Boosting Machines (xgboost, lightgbm,...",Bayesian Approaches,,"Dense Neural Networks (MLPs, etc)",Convolutional Neural Networks,,Recurrent Neural Networks,,,,,,,Image classification and other general purpose...,,,,"Word embeddings/vectors (GLoVe, fastText, word...",,"Contextualized embeddings (ELMo, CoVe)","Transformer language models (GPT-3, BERT, XLne...",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Coursera,,Kaggle Learn Courses,,,,,,,University Courses (resulting in a university ...,,,"Basic statistical software (Microsoft Excel, G...",,,,"Kaggle (notebooks, forums, etc)",,,,,"Journal Publications (peer-reviewed journals, ...",,,,Amazon Web Services (AWS),Microsoft Azure,Google Cloud Platform (GCP),IBM Cloud / Red Hat,,SAP Cloud,,,,,,,,,,Azure Cloud Services,Microsoft Azure Container Instances,Azure Functions,Google Cloud Compute Engine,Google Cloud Functions,Google Cloud Run,Google Cloud App Engine,,,Amazon SageMaker,Amazon Forecast,Amazon Rekognition,Azure Machine Learning Studio,Azure Cognitive Services,Google Cloud AI Platform / Google Cloud ML Engine,Google Cloud Video AI,Google Cloud Natural Language,Google Cloud Vision AI,,,,,,,MongoDB,,,Microsoft SQL Server,,,,,,Google Cloud BigQuery,Google Cloud SQL,Google Cloud Firestore,,,Microsoft Power BI,Amazon QuickSight,Google Data Studio,,Tableau,,,,,,,,SAP Analytics Cloud,,,"Automated data augmentation (e.g. imgaug, albu...",,,,Automated hyperparameter tuning (e.g. hyperopt...,Automation of full ML pipelines (e.g. Google C...,,,Google Cloud AutoML,,Databricks AutoML,,,Auto-Keras,Auto-Sklearn,,,,,,,,,,TensorBoard,,,,,,
1,289287,30-34,Man,United States of America,Master’s degree,Data Engineer,5-10 years,Python,R,SQL,,,,,,,,,,,Python,,,Visual Studio,,PyCharm,,,Sublime Text,,,,,,Colab Notebooks,,,,,,,,,,,,,A personal computer or laptop,GPUs,,,,2-5 times,Matplotlib,Seaborn,,Ggplot / ggplot2,Shiny,,,,,,,,1-2 years,Scikit-learn,TensorFlow,Keras,PyTorch,,,,,,,,,,,,,Linear or Logistic Regression,,,,,,Convolutional Neural Networks,,,"Transformer Networks (BERT, gpt-3, etc)",,,,"Image segmentation methods (U-Net, Mask R-CNN,...",,Image classification and other general purpose...,,,,,,"Contextualized embeddings (ELMo, CoVe)","Transformer language models (GPT-3, BERT, XLne...",,,"10,000 or more employees",20+,"We have well established ML methods (i.e., mod...",Analyze and understand data to influence produ...,,,,,Do research that advances the state of the art...,,,"100,000-124,999","$100,000 or more ($USD)",Amazon Web Services (AWS),Microsoft Azure,Google Cloud Platform (GCP),,,,,,,,,,Amazon EC2,AWS Lambda,,,,Azure Functions,Google Cloud Compute Engine,,,,,,Amazon SageMaker,,,,,,,,,,,,PostgresSQL,,,,,,,,,Amazon Redshift,Amazon Athena,,,,,,,PostgresSQL,Amazon QuickSight,Microsoft Power BI,,,Tableau,,,,,,,,,,,Microsoft Power BI,,,,,,,No / None,,,,,,,,,,,,,,,,,,,,,,,No / None,,,,,GitHub,,,,,,,Coursera,,,DataCamp,,,Udemy,,,,,,"Business intelligence software (Salesforce, Ta...",Twitter (data science influencers),,"Reddit (r/machinelearning, etc)","Kaggle (notebooks, forums, etc)","Course Forums (forums.fast.ai, Coursera forums...","YouTube (Kaggle YouTube, Cloud AI Adventures, ...",,"Blogs (Towards Data Science, Analytics Vidhya,...",,"Slack Communities (ods.ai, kagglenoobs, etc)",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


## A3 Modify Kaggle data format to assist exploration (column names, answers)

In [18]:
# Column 1: "Time from Start to Finish (seconds)" contains integers. Let's cast it and rename it to something more convenient
df.rename(columns={'Time from Start to Finish (seconds)': 'duration'}, inplace=True)
df['duration'] = df['duration'].apply(int)

In [19]:
# orig.Q24.unique() # Q24 salary
# Remove symbols and "," from salary ranges.
df.Q24 = df.Q24.replace({
    '$0-999': '0-999',
    '> $500,000': '500,000-999,999',
    '300,000-500,000': '300,000-499,999', 
}).str.replace(',', '')

#### Columns about experience have different ranges and different format. Modify format to be similar and DNRY
#### This way, we minimize errors that may be caused by human typing,
#### e.g. executive summary p. 10, machine learning experience class from 10-20 years is reference as 10-15 years

In [20]:
orig.Q6.unique(), orig.Q15.unique()

(array(['For how many years have you been writing code and/or programming?', '5-10 years', '10-20 years', '3-5 years', '< 1 years', '1-2 years', '20+ years', 'I have never written code', nan],
       dtype=object),
 array(['For how many years have you used machine learning methods?', '1-2 years', 'I do not use machine learning methods', '3-4 years', nan, 'Under 1 year', '2-3 years', '4-5 years', '5-10 years',
        '20 or more years', '10-20 years'], dtype=object))

In [21]:
df.Q6 = df.Q6.replace({
    '< 1 year': '< 1',
    'I have never written code': '0'
}).str.replace(' years', '')

In [22]:
df.Q15 = df.Q15.replace({
    'Under 1 year': '< 1',
    '20 or more years': '20+',
    'I do not use machine learning methods': '0'
}).str.replace(' years', '')

In [23]:
# Refine Company employment size values
df.Q20 = df.Q20.replace({
    '10,000 or more employees': '> 10000',
}).str.replace(' employees', '').replace(',', '')

In [24]:
# Select useful columns for data validity exploration and rename them
validation_cols_names = {
    'Q1': 'age',
    'Q2': 'gender',
    'Q3': 'country',
    'Q4': 'education',
    'Q5': 'role',
    'Q6': 'code_exp',
    'Q15': 'ml_exp',
    'Q20': 'employees',
    'Q21': 'team_ds',
    'Q22': 'company_ml_use',
    'Q24': 'salary',
    'Q25': 'spend_ds'
}

df.rename(columns=validation_cols_names, inplace=True)

validation_cols = [*validation_cols_names.values()]

# Part B: Examine the data for invalid values and clean it

## B1. Investigate duration -> Examine everything!

In [25]:
# Some people were too quick in completing the survey and their answers should be ignored.
# Nevertheless, coming up with a reasonable "cut-off" threshold is not that easy though.
df.duration.nsmallest(200, keep='all')

3598     20
11558    20
18875    21
5743     22
17664    23
         ..
15908    48
16104    48
17090    48
17864    48
18600    48
Name: duration, Length: 203, dtype: int64

## B2. Participants who did not answer any non-demographic questions.
### Let's identify them.

In [26]:
questions[:7]

0                                Duration (in seconds)
1                          What is your age (# years)?
2               What is your gender? - Selected Choice
3            In which country do you currently reside?
4    What is the highest level of formal education ...
5    Select the title most similar to your current ...
6    For how many years have you been writing code ...
Name: 0, dtype: object

In [27]:
# Note: The non-demographic questions start from Q7 and afterwards (years of coding (or not) is included in basic demographic questions here),
# that's why we use:
#     df.iloc[:, 6:]
temp_df = df.iloc[0:, 7:]
only_answer_demographic = ((temp_df == 'None')
                           | temp_df.isnull()).all(axis=1)

len(df[only_answer_demographic])

1082

In [None]:
# Construct logical conditions to filter out invalid data:
# participants with All Nan or Nonw after Q6:
only_answer_demographic = ((temp_df == 'None')
                           | temp_df.isnull()).all(axis=1)


In [28]:
# # So there is a large number of 1082, more than 5% of participants who did not answer anything related to data science.
# # Let's drop these participants. 
# if len(only_answer_demographic) != len(df):
#     print("rows already dropped!")
# else:
#     df = df[~only_answer_demographic].reset_index(drop=True)

In [29]:
df = df[~only_answer_demographic].reset_index(drop=True)

In [30]:
len(df)

18954

In [31]:
# After the drops, the duration seems to be much more reasonable. 
# Using a threshold might still make some sense, but probably we may drop some real typing heros.
# Any suggestions on how to further investigate this?
df.duration.nsmallest(100, keep='all')
# df[df.duration < 120]

16893    48
17955    49
1392     51
17946    52
709      54
         ..
17538    95
1214     96
3625     96
8995     96
16697    96
Name: duration, Length: 103, dtype: int64

In [32]:
len(df[df.duration < 61])

7

In [33]:
# We could also set a more strict quality limit
# e.g. keep participants for minimum 4 answers besides demographics = drop 600 more.
# Anyway, we shall keep those and find other methods to filter out invalid data.
len(df.dropna(thresh=11))

18395

## B3 Discard invalid submissions, using logical conditions.
### Not outliers, we just drop data entries who are invalid.

### First easy candidate for invalid submissions: too young for experience or salary

In [92]:
extreme_young = (
    (df.age <= '24')
    & ((df.code_exp == '20+')
       | (df.ml_exp == '20+')
       | (df.salary.isin(['300000-499999', '500000-999999'])
         )
      )
)
extreme_young.sum()
# len(df[extreme_young])

0

In [35]:
df = df[~extreme_young].reset_index(drop=True)

In [36]:
len(df)

18927

### Second easy candidate for invalid submissions:
Don't you think that something does not seem right with the salary distribution?
Logically, as we mentioned above beginners are expected to receive a lower salary for many reasons. But don't you think that for someone living in the USA, earing a yearly salary less than "2.8$", even for a part time job of, lets say 1 hour per day, is questionable for anything relating to coding?
Let us explote this a bit more; what could the chareteristics of the sample be that explain with such a low salary, especially for USA residents?
Obvious, candidates: level of education and experience.

In [37]:
work_in_USA_for_3_per_day_code_exper = df[(df.salary.isin(['0-999'])
                                            # & (df.role != 'Currently not employed')  # this is not necessary
                                            & (df.country == 'United States of America'))
].groupby('code_exp', as_index=False).size()

work_in_USA_for_3_per_day_code_exper

Unnamed: 0,code_exp,size
0,0,9
1,1-2,12
2,10-20,15
3,20+,34
4,3-5,14
5,5-10,9
6,< 1,9


Obviously, the 34 submissions with 20+ years of coding experience do not belong in the 0-999 salary range in the USA.
In general, someone with minimum 5, to 10 years of coding experience (a highly skilled individual in a demanding role!) that should be employed part time, e.g. only 5 hours per week, 20 hours per month, and only for 10 months, let's say for 5$ per hour, to be in this group. NOPE! No matter how you cut in it does not fit in.

In [38]:
#if you are not convinced, take a look at their submitted current occupation
# if you really like EDA, check the company employment size.
work_in_USA_for_3_per_day_code_exper = df[(df.salary.isin(['0-999'])
                                            & (df.role != 'Currently not employed')
                                            & (df.country == 'United States of America'))
                                            & (df.code_exp.isin(['3-5', '5-10', '10-20', '20+']))
].groupby(['code_exp', 'role'], as_index=False).size().sort_values('size', ascending=False)

work_in_USA_for_3_per_day_code_exper

Unnamed: 0,code_exp,role,size
11,20+,Other,9
14,20+,Software Engineer,8
9,20+,Data Scientist,8
3,10-20,Other,4
13,20+,Research Scientist,3
5,10-20,Research Scientist,3
19,3-5,Other,3
18,3-5,Machine Learning Engineer,3
17,3-5,Data Scientist,3
16,3-5,Data Analyst,3


In [39]:
# We definetely should drop these observations, if we intend to make meaningful EDA about this dataset.
# First, let's construct a similar condition for Ml experience.

In [40]:
work_in_USA_for_3_per_day_ml_exper = df[(df.salary.isin(['0-999'])
                                          & (df.role != 'Currently not employed')
                                          & (df.country == 'United States of America'))
].groupby('ml_exp', as_index=False).size()

work_in_USA_for_3_per_day_ml_exper

Unnamed: 0,ml_exp,size
0,0,10
1,1-2,18
2,10-20,3
3,2-3,9
4,20+,4
5,3-4,8
6,4-5,5
7,5-10,7
8,< 1,29


In [41]:
work_in_India_for_3_per_day_code_exper = df[(df.salary.isin(['0-999'])
                                                       & (df.role != 'Currently not employed')
                                                       & (df.country == 'India'))
                                                     ].groupby('code_exp', as_index=False).size()

work_in_India_for_3_per_day_code_exper

Unnamed: 0,code_exp,size
0,0,51
1,1-2,216
2,10-20,29
3,20+,7
4,3-5,173
5,5-10,49
6,< 1,155


In [42]:
work_in_India_for_3_per_day_ml_exper = df[(df.salary.isin(['0-999'])
                                                     & (df.role != 'Currently not employed')
                                                     & (df.country == 'India'))
                                                   ].groupby('ml_exp', as_index=False).size()
# work_in_India_for_3_per_day_ml_exper

In [43]:
# df.salary.unique()

In [44]:
# len(df.salary.unique())

## ALSO SOS: We lose a lot of info with 25 bins, who are so unevenly distributed! A distribution chart may be misleading

### Since there are huge cross-country wage differences, we should set a reasonable and unamibguous threshold that would hold for all countries.
We could state that it is impossible for someone with 3 or more years of coding experience, or let's say 2 years of Ml experience to earn less than 80$ per month in such a skill demanding occupation, in any country (taking into account that this market offers a lot of opportunities for remote work on a global scale).

Since this threshold is ad hoc, we could be even more strict and set a the threshold to 2-3 years. This would not be a bad choice either.
Chosing a lower coding experience threshold would result in dropping more invalid submissions (True Positive, if we define Positive as identifying invalid submissions). But, we could ge also a few False Positives (dropping observations who ).
As a rule, in this part of the analysis, we will set a "loose" threshold, so as not to drop the maximum number of invalid submission, but in order to keep as many valid submissions as possible (minimize False Positives).

This analysis can be modified easily for stricter quality rules, as we will show in the end.
It would be equally reasonable to rule out any submissions in this salary range with more than 2 years of experience in either coding or machine learning, or even everybody who is currently employed in the USA, dropping perhpaps out of the data some fellows how less than part-time.

Also, note carefully that it would not be an optimal approach to use any statistical measures to exclude outliers for the data in this case, since all such measures are very biased by this kind of invalid submissions.

### To conclude, let us drop all submissions that stated that they are currently employed, with salary below $1000 and coding experience of 5 or more years or machine learning experience of more than 2 years.

In [45]:
too_exper_for_min_salary = (
    (df.salary.isin(['0-999']))
    & (df.code_exp.isin(['3-5', '5-10', '10-20', '20+'])
       | (df.ml_exp.isin(['2-3', '3-4', '4-5', '5-10', '10-20', '20+']))
      )
)

len(df[too_exper_for_min_salary])

953

In [46]:
# Example of invalid nonsense data
df.loc[16115][validation_cols]

age                                                           35-39
gender                                                          Man
country           United Kingdom of Great Britain and Northern I...
education                                           Master’s degree
role                                              Software Engineer
code_exp                                                        20+
ml_exp                                                          2-3
employees                                                   250-999
team_ds                                                         20+
company_ml_use    We have well established ML methods (i.e., mod...
salary                                                        0-999
spend_ds                                                  $0 ($USD)
Name: 16115, dtype: object

In [47]:
# question[107]

In [48]:
df = df[~too_exper_for_min_salary].reset_index(drop=True)

In [49]:
len(df), len(df[df.role == 'Data Scientist'])

(17974, 2460)

In [50]:
# Similarly, setting 'loose' thresholds that allow for
# some invalid submission to remain but
# minimize False Positives of invalid submissions:
too_exper_for_subsistence_salary = (
    (df.salary.isin(['1000-1999', '2000-2999', '3000-3999', '4000-4999']))
                             #  & (df.role != 'Currently not employed')  # not necessary condition
                               & ((df.code_exp.isin(['5-10', '10-20', '20+'])
                                  | (df.ml_exp.isin(['5-10', '10-20', '20+'])))
                            ))

len(df[too_exper_for_subsistence_salary])

400

In [51]:
df = df[~too_exper_for_subsistence_salary].reset_index(drop=True)

In [52]:
len(df), len(df[df.role == 'Data Scientist'])

(17574, 2405)

# ???

In [53]:
# Similarly, setting 'loose' thresholds that allow for
# some invalid submission to remain but
# minimize False Positives of invalid submissions:
too_exper_for_avg_salary = (
    #(df.salary.isin(['5000-7499', '7500-9999']))
        (df.salary.isin(['5000-7499']))
                               #& (df.role != 'Currently not employed')  # not necessary condition
                               & ((df.code_exp.isin(['10-20', '20+'])
                                  | (df.ml_exp.isin(['10-20', '20+'])))
                            ))

len(df[too_exper_for_avg_salary])

44

In [54]:
df = df[~too_exper_for_avg_salary].reset_index(drop=True)

In [55]:
# Similarly, setting 'loose' thresholds that allow for
# some invalid submission to remain but
# minimize False Positives of invalid submissions:
too_exper_for_low_salary = (
    (df.salary.isin(['10000-14999']))
                               #& (df.role != 'Currently not employed')  # not necessary condition
                               & ((df.code_exp.isin(['20+'])
                                  | (df.ml_exp.isin(['20+'])))  # perhaps add '10-20' in ml exp
                            ))

len(df[too_exper_for_low_salary])

37

In [56]:
df = df[~too_exper_for_low_salary].reset_index(drop=True)

In [57]:
len(df), len(df[df.role == 'Data Scientist'])

(17493, 2393)

### Now let's work the other way around, no exper and young age and top salary

In [58]:
# some invalid submission to remain but
# minimize False Positives of invalid submissions:
unexper_for_top_salary = (
    (df.salary.isin(['300000-499000', '500000-999999']))  # maybe add '300000-499000',
                               & (df.age <= '34')  
                               & (df.code_exp.isin(['1-2', '< 1', '0'])
                               & (df.ml_exp.isin(['1-2', '< 1', '0', np.nan]))  # perhaps or (instead of and) term
                            ))

len(df[unexper_for_top_salary])

5

In [59]:
df_validation = df[validation_cols]

In [60]:
demogr_unexper_for_top_salary = df_validation[unexper_for_top_salary]

In [61]:
demogr_unexper_for_top_salary

Unnamed: 0,age,gender,country,education,role,code_exp,ml_exp,employees,team_ds,company_ml_use,salary,spend_ds
3016,25-29,Man,Pakistan,Bachelor’s degree,Data Scientist,1-2,< 1,0-49,0,No (we do not use ML methods),500000-999999,"$1000-$9,999"
5429,30-34,Woman,Nigeria,Master’s degree,Other,< 1,0,250-999,10-14,I do not know,500000-999999,"$10,000-$99,999"
5486,25-29,Man,Thailand,Master’s degree,Data Scientist,0,,250-999,1-2,We are exploring ML methods (and may one day p...,500000-999999,"$1000-$9,999"
8126,30-34,Man,China,Master’s degree,Software Engineer,< 1,< 1,250-999,0,No (we do not use ML methods),500000-999999,$0 ($USD)
11067,25-29,Man,India,Master’s degree,Product/Project Manager,1-2,< 1,250-999,1-2,We use ML methods for generating insights (but...,500000-999999,"$100,000 or more ($USD)"


In [62]:
demogr_unexper_for_top_salary.index

Int64Index([3016, 5429, 5486, 8126, 11067], dtype='int64')

In [63]:
task_index = filter_df(df, 23)
# task_index

In [64]:
task = task_index[task_index]
len(task)

17493

In [65]:
task.iloc[1]

Analyze and understand data to influence product or business decisions                                              1.0
Build and/or run the data infrastructure that my business uses for storing, analyzing, and operationalizing data    NaN
Build prototypes to explore applying machine learning to new areas                                                  NaN
Build and/or run a machine learning service that operationally improves my product or workflows                     NaN
Experimentation and iteration to improve existing ML models                                                         NaN
Do research that advances the state of the art of machine learning                                                  1.0
None of these activities are an important part of my role at work                                                   NaN
Other                                                                                                               NaN
Name: 1, dtype: float64

In [66]:
task.iloc[1][0]

1.0

In [67]:
task.iloc[demogr_unexper_for_top_salary.index]  # task.iloc[['3088', '5612', '6573', '8319', '7820', '11348']]   # '7820'

Unnamed: 0,Analyze and understand data to influence product or business decisions,"Build and/or run the data infrastructure that my business uses for storing, analyzing, and operationalizing data",Build prototypes to explore applying machine learning to new areas,Build and/or run a machine learning service that operationally improves my product or workflows,Experimentation and iteration to improve existing ML models,Do research that advances the state of the art of machine learning,None of these activities are an important part of my role at work,Other
3016,,,,,,,1.0,
5429,1.0,,,,,,,
5486,,,,,,1.0,,
8126,,,,,,,1.0,
11067,,,1.0,,,,,


# OBVIOUS INVALID CONDITIONS

#### Q5: Machine Learning Engineer VS Q15 "I do not use ML methods" : 'ml_exp' == 0
#### Q5: Machine Learning Engineer VS Q15 
#### Q5: Software Engineer VS Q6: 'code_exp' == 0
#### Q5: Data Engineer, Data Scientist, Data Analyst VS Q21: ds_team == 0 and Q23 Activities
#### Q15 "I do not use ML methods" : 'ml_exp' == 0 VS 
    A) Q16 anythin but 'None'
    B)Q17 using special methods such as   (this leads to Q18 and Q19)
    Gradient Boosting Machines (xgboost, lightgbm, etc)
    Dense Neural Networks (MLPs, etc)
    Convolutional Neural Networks
    Generative Adversarial Networks
    Recurrent Neural Networks
    Transformer Networks (BERT, gpt-3, etc)
    

In [68]:
check_q23 = join_dfs(df_validation, task.iloc[df_validation.index])
check_q23.head(2)

Unnamed: 0,age,gender,country,education,role,code_exp,ml_exp,employees,team_ds,company_ml_use,salary,spend_ds,Analyze and understand data to influence product or business decisions,"Build and/or run the data infrastructure that my business uses for storing, analyzing, and operationalizing data",Build prototypes to explore applying machine learning to new areas,Build and/or run a machine learning service that operationally improves my product or workflows,Experimentation and iteration to improve existing ML models,Do research that advances the state of the art of machine learning,None of these activities are an important part of my role at work,Other
0,35-39,Man,Colombia,Doctoral degree,Student,5-10,1-2,,,,,,,,,,,,,
1,30-34,Man,United States of America,Master’s degree,Data Engineer,5-10,1-2,> 10000,20+,"We have well established ML methods (i.e., mod...",100000-124999,"$100,000 or more ($USD)",1.0,,,,,1.0,,


In [69]:
check_q23.columns[0:3]

Index(['age', 'gender', 'country'], dtype='object')

In [70]:
check_q23.iloc[0:3]

Unnamed: 0,age,gender,country,education,role,code_exp,ml_exp,employees,team_ds,company_ml_use,salary,spend_ds,Analyze and understand data to influence product or business decisions,"Build and/or run the data infrastructure that my business uses for storing, analyzing, and operationalizing data",Build prototypes to explore applying machine learning to new areas,Build and/or run a machine learning service that operationally improves my product or workflows,Experimentation and iteration to improve existing ML models,Do research that advances the state of the art of machine learning,None of these activities are an important part of my role at work,Other
0,35-39,Man,Colombia,Doctoral degree,Student,5-10,1-2,,,,,,,,,,,,,
1,30-34,Man,United States of America,Master’s degree,Data Engineer,5-10,1-2,> 10000,20+,"We have well established ML methods (i.e., mod...",100000-124999,"$100,000 or more ($USD)",1.0,,,,,1.0,,
2,35-39,Man,Argentina,Bachelor’s degree,Software Engineer,10-20,0,"1000-9,999",0,No (we do not use ML methods),15000-19999,$0 ($USD),,,,,,,1.0,


In [71]:
 check_q23.iloc[:3, [1, 5]] 

Unnamed: 0,gender,code_exp
0,Man,5-10
1,Man,5-10
2,Man,10-20


In [72]:
 check_q23.iloc[:3, [13]]

Unnamed: 0,"Build and/or run the data infrastructure that my business uses for storing, analyzing, and operationalizing data"
0,
1,
2,


In [73]:
check_q23[check_q23.columns[13]]

0        NaN
1        NaN
2        NaN
3        NaN
4        NaN
        ... 
17488    NaN
17489    1.0
17490    NaN
17491    NaN
17492    1.0
Name: Build and/or run the data infrastructure that my business uses for storing, analyzing, and operationalizing data, Length: 17493, dtype: float64

In [74]:
check_q23.columns[13:16]

Index(['Build and/or run the data infrastructure that my business uses for storing, analyzing, and operationalizing data',
       'Build prototypes to explore applying machine learning to new areas',
       'Build and/or run a machine learning service that operationally improves my product or workflows'],
      dtype='object')

In [75]:
check_q23.columns[[14]].values

array(['Build prototypes to explore applying machine learning to new areas'], dtype=object)

In [76]:
ml_eng_without_exp = ((check_q23.role == 'Machine Learning Engineer')
                        & ((check_q23.ml_exp == 0) 
                           | (check_q23.ml_exp == np.nan)
                          )
                      & (
                          #(check_q23['Build prototypes to explore applying machine learning to new areas'] == 1.0)
                      | (check_q23[check_q23.columns[13]] == 1.0)
                          | (check_q23[check_q23.columns[14]] == 1.0)
                      | (check_q23[check_q23.columns[15]] == 1.0))
                        #& (check_q23.iloc[:, [13]]  == '1.0')
                     )

check_q23[ml_eng_without_exp]

SyntaxError: invalid syntax (<ipython-input-76-6b55df04dcf2>, line 7)

In [None]:
data_role_no_team = df[
    ((df.role == 'Data Scientist')
     | (df.role == 'Data Engineer')
     | (df.role == 'Data Analyst'))
    & (df.team_ds == '0')
]

len(data_role_no_team)

In [None]:
# SUGGESTIONS FOR QUESTIONNAIRE
# Q4: Should be 2 distinct questions
# Q5: about role could be 'Employed full time', 'Employed part time', 'Currently not employed', 'Student'
# and a Q6 for employment role with current question B choices.

In [None]:
check_q23_activity_conditions = join_dfs(df_validation, task.iloc[df_validation.index])
check_q23_activity_conditions

In [None]:
# demogr_task = join_dfs(demogr_unexper_for_top_salary, task)
# demogr_task

In [None]:
# len(df.dropna(thresh=10))

In [None]:
# len(df[df.duration < 61])

# IT IS DRAFT OR OLD CODE FROM HERE. IGNORE IT
Explore strange US salary above 100k to 200k and global modes of 10-15 and 40 to 50

In [None]:
# salary_abmormal_India = ((df.salary.isin(['10000-14999'])  # 100000-124999', '125000-149999', '150000-199999', '10000-14999'
#                                & (df.country == 'India')
# #                                & (df.code_exp.isin(['10-20', '20+'])
#                                   #| (df.ml_exp.isin(['10-20', '20+'])))
#                        ))

# len(df[salary_abmormal_India])

In [None]:
# df[salary_abmormal_India].groupby(['code_exp', 'age'], as_index=False).size()

In [None]:
# too_exper_for_min_salary.sample(4)

In [None]:
# extreme_young = ((df.salary.isin('') <= '24')
#                        & ((df.code_exp == '20+')
#                           | (df.ml_exp == '20+')
#                           | (df.salary.isin(['300000-499999', '500000-999999'])
#                          )
#                 ))

In [None]:
# # create the user demogr_compgraphics dataframe
# demogr = df[['age', 'gender', 'country', 'education', 'role', 'code_exp', 'ml_exp', 'salary']]

In [None]:
# demogr[
#     (demogr.salary.isin(['0-999'])
#     & (demogr.country == 'India'))
# ].groupby(['code_exp']).size().reset_index()

In [None]:
# demogr[
#     (demogr.salary.isin(['0-999'])
#     & (demogr.country == 'India'))
# ].groupby(['code_exp']).size().reset_index()

In [None]:
# demogr[
#     (demogr.salary.isin(['0-999'])
#     & (demogr.country == 'United States of America'))
# ].groupby(['code_exp', 'ml_exp']).size()#.reset_index()

In [None]:
# us_in_group_code_exp = ds_Us_In.groupby(['country', 'code_exp', 'salary']).size().reset_index()
# us_in_group_code_exp.head(40)

In [None]:
#Prepare Q23: Work activities, part of work role for analysis
# task = filter_df(df, 23)
# task = task[task]
# # task

In [None]:
# demogr_task = join_dfs(demogr, task)
# demogr_task

In [None]:
# # create the employment demogr_compgraphics dataframe
# comp = df[["Q20", "Q21", "Q22"]]
# comp.columns = ["empl_size", "ds_team", "comp_ml"]  # plus Q23 who we prepared
# # comp

In [None]:
# demogr_comp = join_dfs(demogr_task, comp)

## EXAMINE Salary 'outliers"

Ideas: Clean the data from submission errors, not from Outliers.

A) Basic Variables:

    0) Absoluterly across all the data: Age and experience, experience and salary, age and salary (with country.) check if more than one 'extreme' answers.

    1) Within country: age + coding exper + ml exper + comp size + median or mean of salary of other bins (next and previous?, depending on No of Obs)
    Additive flow:
    Start with age and coding exper and ml exper and salary. Remove extremely obvious observations.
    2) Compare with other countries
    3) Compare with previous years
    4) salary size in comparison to bin and number of observations in the bin (eg. 500000 is 500 obs in first age bin and 1 in 10 in salaries bin)

B) Secondary Variables:

    1) Completion time:
    2) number of Nan:

In [None]:
# demogr.salary.unique()

In [None]:
# # too_much_exper_young = demo[((demo.age <= '22') & (demo.code_exp.isin(["10-20 years", "20+"])))]# too_much_exper_young = demo[((demo.age <= '22') & (demo.code_exp.isin(["5-10 years", "10-20 years", "20+"])))]
# extreme_young = demogr[((demogr.age <= "24") 
#                                  & ((demogr.code_exp > 20)  # > 10 = 10-20 years
#                                  |  (demogr.ml_exp > 20)  # > 10 = 10-20 years
#                                  |  (demogr.salary.isin(["300000-499999", "500000-999999"]))  # > 500
#                                    )
#                                 )]
# extreme_young

In [None]:
# 27 observations should be excluded
# len(extreme_young)

In [None]:
# less_exteme_younger = demogr[((demogr.age <= "21") 
#                                  & ((demogr.code_exp > 10) & (demogr.code_exp  < 30) # > 10 = 10-20 years
#                                  |  (demogr.ml_exp > 10) & (demogr.ml_exp  < 30) # > 10 = 10-20 years
#                                  |  (demogr.salary.isin(["300000-499999", "500000-999999"]))  # > 500
#                                    )
#                                 )]
# less_exteme_younger

In [None]:
# len(exteme_young[exteme_young.salary.isin(["300000-499999", "500000-999999"])])

In [None]:
# len(exteme_young[(exteme_young.salary == "500000-999999")
#                  | (exteme_young.salary == "300000-499999")
#                  & (exteme_young.country == "India")])

# len(exteme_young[(exteme_young.salary == "500000-999999")
#                  | (exteme_young.salary == "300000-499999")
#                  & (exteme_young.country == "India")
#                  & (exteme_young.role == "Data Scientist")])

# len(demogr[(demogr.salary == "500000-999999")
#            | (demogr.salary == "300000-499999")
#            & (demogr.country == "India")
#            & (demogr.role == "Data Scientist")])

# len(demogr[(demogr.Q24 == "500000-999999") & (demogr.Q3 == "India")])

# len(orig[(orig.Q24 == "> $500,000")])# & (orig.Q3 == "India")])

# len(orig[(orig.Q24 == "> $500,000") & (orig.Q3 == "India")])

# orig.Q4

# data_scientists_India = demogr_comp[(
#      # (demogr_comp.role =='Data Scientist')
#      (demogr_comp.country.isin(['India']))
#     & (demogr_comp.salary.notna())
#     & ((demogr_comp.salary == "500000-999999")
#     |  (demogr_comp.salary == "300000-499999"))
#     #& (demogr_comp.code_exp <= 10)
# )].reset_index(drop=True)
# data_scientists_India.sort_values(by='salary').head(3)

# # demogr_comp['combined_exp'] = demogr_comp['code_exp'] + demogr_comp['ml_exp']

# # demogr_comp.role.value_counts()

# # demogr_comp[demogr_comp.role == 'Currently not employed']#.value_counts()

In [None]:
# too_much_exper_young = demogr_comp[((demogr_comp.age <= '24') & 
#                              (demogr_comp.role == 'data_scientist') & 
#                              (demogr_comp.code_exp.isin(["20+ years"]) |
#                               demogr_comp.ml_exp.isin(["20 or more years"]) |
#                               demogr_comp.salary > 100000)
#                             )]
# too_much_exper_young

In [None]:
# demogr_comp.country.unique()

In [None]:
# ds = demogr_comp[demogr_comp['role'] == 'Data Scientist']
# ds

In [None]:
# ds.country.value_counts()

In [None]:
# demogr_comp.ml_exp.value_counts()

# data_scientists_USA = demogr_comp[(
#       (demogr_comp.role =='Data Scientist')
#     & (demogr_comp.country.isin(['United States of America']))
#     & (demogr_comp.salary.notna())
#     & (demogr_comp.salary > 199000 )
#     & (demogr_comp.salary <= 500000)
#     & (demogr_comp.code_exp <= 10)
# )].reset_index(drop=True)
# data_scientists_USA.sort_values(by='salary').head(3)

# code_exp_salary = data_scientists_USA.groupby('code_exp').size()
# code_exp_salary

# code_exp_salary = data_scientists_USA.groupby('code_exp').mean()  # .size()#
# code_exp_salary

# plt.plot(code_exp_salary.salary)

# ml_exp_salary = data_scientists_USA.groupby('ml_exp').mean()  #.size()
# ml_exp_salary

# plt.plot(ml_exp_salary.salary)

# data_scientists_USA.salary.value_counts()

# data_scientists_USA.salary.dtypes

In [None]:
# data_scientists_USA['salary'] = pd.to_numeric(data_scientists_USA['salary'], errors='coerce')

In [None]:
# data_scientists_USA['salary'] = data_scientists_USA['salary'].astype(float)

In [None]:
# five_year_exper_data_scientists_USA = demogr_comp[(
#       (demogr_comp.role =='Data Scientist')
#     & (demogr_comp.country.isin(['United States of America']))
#     & (demogr_comp.code_exp == 30)
#     & (demogr_comp.ml_exp == 20)
#     #| (demogr_comp.salary > 125000)
# )]
# five_year_exper_data_scientists_USA.salary.mean()

# five_year_exper_data_scientists_USA = demogr_comp[(
#       (demogr_comp.role =='Data Scientist')
#     & (demogr_comp.country.isin(['United States of America']))
#     & (demogr_comp.code_exp == 30)
#     & (demogr_comp.ml_exp == 10)
#     #| (demogr_comp.salary > 125000)
# )]
# five_year_exper_data_scientists_USA.salary.mean()

# five_year_exper_data_scientists_USA = demogr_comp[(
#       (demogr_comp.role =='Data Scientist')
#     & (demogr_comp.country.isin(['United States of America']))
#     & (demogr_comp.code_exp == 20)
#     & (demogr_comp.ml_exp == 5)
#     #| (demogr_comp.salary > 125000)
# )]
# five_year_exper_data_scientists_USA.salary.mean()

# five_year_exper_data_scientists_USA = demogr_comp[(
#       (demogr_comp.role =='Data Scientist')
#     & (demogr_comp.country.isin(['United States of America']))
#     & (demogr_comp.code_exp == 10)
#     & (demogr_comp.ml_exp == 5)
#     #| (demogr_comp.salary > 125000)
# )]
# five_year_exper_data_scientists_USA.salary.mean()

# five_year_exper_data_scientists_USA = demogr_comp[(
#       (demogr_comp.role =='Data Scientist')
#     & (demogr_comp.country.isin(['United States of America']))
#     & (demogr_comp.code_exp == 5)
#     & (demogr_comp.ml_exp == 5)
#     #| (demogr_comp.salary > 125000)
# )]
# five_year_exper_data_scientists_USA.salary.mean()

# five_year_exper_data_scientists_USA = demogr_comp[(
#       (demogr_comp.role =='Data Scientist')
#     & (demogr_comp.country.isin(['United States of America']))
#     & (demogr_comp.code_exp == 5)
#     & (demogr_comp.ml_exp == 4)
#     #| (demogr_comp.salary > 125000)
# )]
# five_year_exper_data_scientists_USA.salary.mean()

# five_year_exper_data_scientists_USA = demogr_comp[(
#       (demogr_comp.role =='Data Scientist')
#     & (demogr_comp.country.isin(['United States of America']))
#     & (demogr_comp.code_exp == 5)
#     & (demogr_comp.ml_exp == 3)
#     #| (demogr_comp.salary > 125000)
# )]
# five_year_exper_data_scientists_USA.salary.mean()

In [None]:
# ten_year_exper_data_scientists_USA = demogr_comp[(
#       (demogr_comp.role =='Data Scientist')
#     & (demogr_comp.country.isin(['United States of America']))
#     & (demogr_comp.code_exp == 5)
#     & (demogr_comp.ml_exp == 2)
#     #| (demogr_comp.salary > 125000)
# )]
# ten_year_exper_data_scientists_USA.salary.mean()

# twenty_year_exper_data_scientists_USA = demogr_comp[(
#       (demogr_comp.role =='Data Scientist')
#     & (demogr_comp.country.isin(['United States of America']))
#     & (demogr_comp.code_exp == 20)
#     & (demogr_comp.ml_exp == 20)
#     #| (demogr_comp.salary > 125000)
# )]
# twenty_year_exper_data_scientists_USA.salary.mean()

# demogr_25_29 = demogr_comp[(demogr_comp['age'] == '25-29')]
# ds_Us_In = ds[
#       (ds['country'] == 'United States of America')
#     | (ds['country'] == 'India')].reset_index(drop=True)
# ds_Us_In

# # len(five_year_exper_data_scientists_USA_India)

# # five_year_exper_data_scientists_USA_India.groupby("country").mean()

# ds.info()

# ds.ml_exp.unique()

# ds.code_exp.value_counts().sort_index()

# ds.ml_exp.value_counts().sort_index()

# demogr[demogr.Q5 == "Data Scientist"].Q24.describe()

# ds.country.value_counts().sort_index()

# ds.age.value_counts(True).sort_index()

# ds.gender.value_counts(True).sort_index()

# # ds.country.value_counts(sort=True)#.sort_index()



# len(ds_Us_In.salary)

# ds_Us_In.salary.value_counts().sort_index()

# ds_Us_In.code_exp.value_counts().sort_index()

# ds_Us_In.ml_exp.value_counts().sort_index()

# ds_Us_In

# us_in_group_code_exp = ds_Us_In.groupby(['country', 'code_exp']).mean()
# us_in_group_code_exp

# us_in_group_code_exp = ds_Us_In.groupby(['country', 'education','code_exp', 'ml_exp', 'salary']).size().reset_index()
# us_in_group_code_exp.head(40)

# us_in_group_code_exp = ds_Us_In.groupby(['country', 'code_exp', 'salary']).size().reset_index()
# us_in_group_code_exp.head(40)

# us_in_group_code_exp[us_in_group_code_exp['country'] == 'India']

# ds_Us_In[ds_Us_In['salary'] > 999000]

# ds_Us_In[ds_Us_In['salary'] == 500000]

# too_much_exper_young = demogr_comp[((demogr_comp.age <= '24') & 
#                              (demogr_comp.code_exp.isin(["20+ years"]) |
#                               demogr_comp.ml_exp.isin(["20 or more years"]) |
#                               demogr_comp.salary.isin(["199.999", "249.999", "299.999", "500.000"]))
#                             )]
# too_much_exper_young

