# Exploratory Data Analysis in Python
## Data Cleaning and Imputation

In [2]:
import pandas as pd

In [3]:
# Load the data-science salaries dataset and preview its first five rows.
salaries = pd.read_csv(filepath_or_buffer='../datasets/ds_salaries.csv')
salaries.head(n=5)

Unnamed: 0.1,Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,0,2020,MI,FT,Data Scientist,70000,EUR,79833,DE,0,DE,L
1,1,2020,SE,FT,Machine Learning Scientist,260000,USD,260000,JP,0,JP,S
2,2,2020,SE,FT,Big Data Engineer,85000,GBP,109024,GB,50,GB,M
3,3,2020,MI,FT,Product Data Analyst,20000,USD,20000,HN,0,HN,S
4,4,2020,SE,FT,Machine Learning Engineer,150000,USD,150000,US,50,US,L


In [4]:
# Count the missing (NaN) entries in each column.
salaries.isna().sum(axis=0)

Unnamed: 0            0
work_year             0
experience_level      0
employment_type       0
job_title             0
salary                0
salary_currency       0
salary_in_usd         0
employee_residence    0
remote_ratio          0
company_location      0
company_size          0
dtype: int64

In [5]:
# Compute how many rows correspond to 5% of the dataset.
# Columns whose missing-value count is at or below this number are the ones
# whose incomplete rows get dropped in the following cells.
threshold = 0.05 * len(salaries)
threshold

30.35

In [6]:
# Select every column whose number of missing values is at most 5% of the rows
# (this also includes columns with no missing values at all).
cols_to_drop = salaries.columns[threshold >= salaries.isna().sum()]
print(cols_to_drop)

Index(['Unnamed: 0', 'work_year', 'experience_level', 'employment_type',
       'job_title', 'salary', 'salary_currency', 'salary_in_usd',
       'employee_residence', 'remote_ratio', 'company_location',
       'company_size'],
      dtype='object')


In [7]:
# Drop every row that has a missing value in any of the columns listed in
# `cols_to_drop`. Reassign instead of using `inplace=True`: the result is the
# same, but the step becomes an explicit, chainable transformation rather than
# a hidden mutation of the frame displayed by earlier cells.
salaries = salaries.dropna(subset=cols_to_drop)

In [8]:
# Check which columns, if any, still contain missing values.
cols_with_missing_values = salaries.columns[salaries.isna().any()]
print(cols_with_missing_values)

Index([], dtype='object')


In [9]:
# Replace the remaining missing values with each column's mode (its most
# frequent value).
# NOTE(review): `[:-1]` skips the last column in `cols_with_missing_values` --
# presumably because that column is imputed separately; confirm.
for col in cols_with_missing_values[:-1]:
    # `Series.fillna` returns a new Series, so the result must be assigned
    # back to the column; otherwise the imputation is silently discarded.
    salaries[col] = salaries[col].fillna(salaries[col].mode()[0])

In [12]:
# Build a lookup of the median USD salary for each experience level.
salaries_dict = (
    salaries
    .groupby("experience_level")["salary_in_usd"]
    .median()
    .to_dict()
)
print(salaries_dict)

{'EN': 56500.0, 'EX': 171437.5, 'MI': 76940.0, 'SE': 135500.0}


In [13]:
# Fill any missing USD salary with the median salary of the row's experience
# level, looked up in `salaries_dict`.
salaries["salary_in_usd"] = salaries["salary_in_usd"].fillna(
    value=salaries["experience_level"].map(salaries_dict)
)