# Aula 1 - Análise de Dados com Pandas


In [7]:
import pandas as pd


In [8]:
# Remote CSV with the salary records analysed in this notebook.
URL_SALARIOS = "https://raw.githubusercontent.com/guilhermeonrails/data-jobs/refs/heads/main/salaries.csv"

# Load the whole file into a DataFrame; every later cell works on `df`.
df = pd.read_csv(URL_SALARIOS)


In [9]:
# Preview the first rows to confirm the CSV was parsed into the expected columns.
df.head()


Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,2025.0,SE,FT,Solutions Engineer,214000,USD,214000,US,100,US,M
1,2025.0,SE,FT,Solutions Engineer,136000,USD,136000,US,100,US,M
2,2025.0,MI,FT,Data Engineer,158800,USD,158800,AU,0,AU,M
3,2025.0,MI,FT,Data Engineer,139200,USD,139200,AU,0,AU,M
4,2025.0,EN,FT,Data Engineer,90000,USD,90000,US,0,US,M


In [10]:
# Print each column's dtype and non-null count, plus the frame's memory usage.
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 133349 entries, 0 to 133348
Data columns (total 11 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   work_year           133339 non-null  float64
 1   experience_level    133349 non-null  object 
 2   employment_type     133349 non-null  object 
 3   job_title           133349 non-null  object 
 4   salary              133349 non-null  int64  
 5   salary_currency     133349 non-null  object 
 6   salary_in_usd       133349 non-null  int64  
 7   employee_residence  133349 non-null  object 
 8   remote_ratio        133349 non-null  int64  
 9   company_location    133349 non-null  object 
 10  company_size        133349 non-null  object 
dtypes: float64(1), int64(3), object(7)
memory usage: 11.2+ MB


In [11]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()


Unnamed: 0,work_year,salary,salary_in_usd,remote_ratio
count,133339.0,133349.0,133349.0,133349.0
mean,2024.35877,163283.3,157617.272098,20.905669
std,0.680627,217386.0,74288.363097,40.590044
min,2020.0,14000.0,15000.0,0.0
25%,2024.0,106020.0,106000.0,0.0
50%,2024.0,147000.0,146206.0,0.0
75%,2025.0,199000.0,198000.0,0.0
max,2025.0,30400000.0,800000.0,100.0


In [12]:
# Dimensions of the DataFrame as a (rows, columns) tuple.
df.shape


(133349, 11)

In [13]:
# List the current column labels (still the original English names at this point).
df.columns


Index(['work_year', 'experience_level', 'employment_type', 'job_title',
       'salary', 'salary_currency', 'salary_in_usd', 'employee_residence',
       'remote_ratio', 'company_location', 'company_size'],
      dtype='object')

## Renomeando as colunas do DataFrame


In [14]:
# Rename mapping: original English column name -> Portuguese name used from here on.
# NOTE(review): 'empresa' receives the values of 'company_location', i.e. it holds
# the company's location, not a company name.
novos_nomes = {
  'work_year': 'ano',
  'experience_level': 'senioridade',
  'employment_type': 'contrato',
  'job_title': 'cargo',
  'salary': 'salario',
  'salary_currency': 'moeda',
  'salary_in_usd': 'usd',
  'employee_residence': 'residencia',
  'remote_ratio': 'remoto',
  'company_location': 'empresa',
  'company_size': 'tamanho_empresa'
}

# Apply the renaming. rename() returns a new DataFrame, so rebind `df` instead of
# using inplace=True; labels missing from the frame are ignored by default, which
# keeps this cell safe to re-run.
df = df.rename(columns=novos_nomes)

# Check the result.
df.head()

Unnamed: 0,ano,senioridade,contrato,cargo,salario,moeda,usd,residencia,remoto,empresa,tamanho_empresa
0,2025.0,SE,FT,Solutions Engineer,214000,USD,214000,US,100,US,M
1,2025.0,SE,FT,Solutions Engineer,136000,USD,136000,US,100,US,M
2,2025.0,MI,FT,Data Engineer,158800,USD,158800,AU,0,AU,M
3,2025.0,MI,FT,Data Engineer,139200,USD,139200,AU,0,AU,M
4,2025.0,EN,FT,Data Engineer,90000,USD,90000,US,0,US,M


### Analisando quais são as categorias das colunas categóricas


#### Nível de senioridade


In [15]:
# .value_counts() counts how many times each unique value appears in a column.
df['senioridade'].value_counts()
     

senioridade
SE    77241
MI    40465
EN    12443
EX     3200
Name: count, dtype: int64

#### Modificando o nome das categorias:


In [17]:
# Seniority code -> readable Portuguese label.
senioridade = {
    'SE': 'senior',
    'MI': 'pleno',
    'EN': 'junior',
    'EX': 'executivo',
}

# Swap the codes for their labels; values that are not keys of the mapping are kept as they are.
df['senioridade'] = df['senioridade'].replace(to_replace=senioridade)

# Show the category counts after relabeling.
df['senioridade'].value_counts()
     

senioridade
senior       77241
pleno        40465
junior       12443
executivo     3200
Name: count, dtype: int64

In [18]:
# Contract-type code -> readable Portuguese label.
contrato = {
    'FT': 'integral',
    'PT': 'parcial',
    'CT': 'contrato',
    'FL': 'freelancer',
}

# Swap the codes for their labels; values that are not keys of the mapping are kept as they are.
df['contrato'] = df['contrato'].replace(to_replace=contrato)

# Show the category counts after relabeling.
df['contrato'].value_counts()

contrato
integral      132563
contrato         394
parcial          376
freelancer        16
Name: count, dtype: int64

In [19]:

# Company-size code -> readable Portuguese label.
tamanho_empresa = {
    'L': 'grande',
    'S': 'pequena',
    'M': 'media',
}

# Swap the codes for their labels; values that are not keys of the mapping are kept as they are.
df['tamanho_empresa'] = df['tamanho_empresa'].replace(to_replace=tamanho_empresa)

# Show the category counts after relabeling.
df['tamanho_empresa'].value_counts()

tamanho_empresa
media      129561
grande       3574
pequena       214
Name: count, dtype: int64

In [20]:
# Preview the frame after relabeling senioridade, contrato and tamanho_empresa.
df.head()

Unnamed: 0,ano,senioridade,contrato,cargo,salario,moeda,usd,residencia,remoto,empresa,tamanho_empresa
0,2025.0,senior,integral,Solutions Engineer,214000,USD,214000,US,100,US,media
1,2025.0,senior,integral,Solutions Engineer,136000,USD,136000,US,100,US,media
2,2025.0,pleno,integral,Data Engineer,158800,USD,158800,AU,0,AU,media
3,2025.0,pleno,integral,Data Engineer,139200,USD,139200,AU,0,AU,media
4,2025.0,junior,integral,Data Engineer,90000,USD,90000,US,0,US,media


In [21]:
# Summarize the categorical (object) columns with describe(): number of unique
# categories, the most frequent category, and that category's frequency.

df.describe(include='object')


Unnamed: 0,senioridade,contrato,cargo,moeda,residencia,empresa,tamanho_empresa
count,133349,133349,133349,133349,133349,133349,133349
unique,4,4,390,26,102,95,3
top,senior,integral,Data Scientist,USD,US,US,media
freq,77241,132563,17314,126140,119579,119641,129561


In [22]:
# Numeric summary under the renamed columns; this repeats the earlier describe()
# call, since renaming columns and relabeling categories do not touch numeric values.
df.describe()

Unnamed: 0,ano,salario,usd,remoto
count,133339.0,133349.0,133349.0,133349.0
mean,2024.35877,163283.3,157617.272098,20.905669
std,0.680627,217386.0,74288.363097,40.590044
min,2020.0,14000.0,15000.0,0.0
25%,2024.0,106020.0,106000.0,0.0
50%,2024.0,147000.0,146206.0,0.0
75%,2025.0,199000.0,198000.0,0.0
max,2025.0,30400000.0,800000.0,100.0
