# Pandas
Panel Data - dados tabulares heterogêneos

In [156]:
import pandas as pd
import numpy as np

In [157]:
# Build a Series from a plain list; with no index given, pandas labels the
# values with a default integer range starting at 0.
s1 = pd.Series(data=[1, 2, -5, 0])
s1

0    1
1    2
2   -5
3    0
dtype: int64

In [158]:
# Underlying data of the Series as a NumPy array. `.to_numpy()` is the
# accessor the pandas documentation recommends over the older `.values`.
s1.to_numpy()

array([ 1,  2, -5,  0])

In [159]:
# Index (row labels) of s1; since none was supplied this is a RangeIndex.
s1.index

RangeIndex(start=0, stop=4, step=1)

In [160]:
# Same values as s1, but labelled with explicit string keys instead of the
# default integer positions.
s2 = pd.Series(data=[1, 2, -5, 0], index=list('abcd'))
s2

a    1
b    2
c   -5
d    0
dtype: int64

In [161]:
# Index of s2: the string labels passed at construction.
s2.index

Index(['a', 'b', 'c', 'd'], dtype='object')

In [162]:
# Overwrite the value stored under label 'a' (label-based assignment).
s2.loc['a'] = 1000
s2

a    1000
b       2
c      -5
d       0
dtype: int64

In [163]:
# Boolean-mask selection: keep only the strictly positive entries.
s2.loc[s2.gt(0)]

a    1000
b       2
dtype: int64

In [164]:
# Element-wise arithmetic: every value is doubled, labels are preserved.
s2.mul(2)

a    2000
b       4
c     -10
d       0
dtype: int64

In [165]:
# Per-element missing-value check (`isnull` is an alias of `isna`).
s2.isna()

a    False
b    False
c    False
d    False
dtype: bool

In [166]:
# Column-oriented raw data: each key becomes a column, and the lists are
# aligned by position (one entry per state/year observation).
dados = {
    'estado': ['SP', 'MG', 'PR', 'SP', 'MG', 'PR'],
    'ano': [2019, 2019, 2019, 2020, 2020, 2020],
    'pop': [45.9, 21.2, 16.9, 46.6, 21.4, 17.3],
}
dados

{'estado': ['SP', 'MG', 'PR', 'SP', 'MG', 'PR'],
 'ano': [2019, 2019, 2019, 2020, 2020, 2020],
 'pop': [45.9, 21.2, 16.9, 46.6, 21.4, 17.3]}

In [167]:
# Turn the dict of lists into a DataFrame; columns follow the dict's key order.
df1 = pd.DataFrame(data=dados)
df1

Unnamed: 0,estado,ano,pop
0,SP,2019,45.9
1,MG,2019,21.2
2,PR,2019,16.9
3,SP,2020,46.6
4,MG,2020,21.4
5,PR,2020,17.3


In [168]:
# Show the first two rows.
df1.head(n=2)

Unnamed: 0,estado,ano,pop
0,SP,2019,45.9
1,MG,2019,21.2


In [169]:
# Show the last two rows.
df1.tail(n=2)

Unnamed: 0,estado,ano,pop
4,MG,2020,21.4
5,PR,2020,17.3


In [170]:
# Show two rows picked at random. No random_state is given, so the rows
# displayed change from run to run.
df1.sample(n=2)

Unnamed: 0,estado,ano,pop
2,PR,2019,16.9
0,SP,2019,45.9


In [171]:
# `columns=` fixes the order in which the dict's keys appear as columns.
df2 = pd.DataFrame(data=dados, columns=['ano', 'estado', 'pop'])
df2

Unnamed: 0,ano,estado,pop
0,2019,SP,45.9
1,2019,MG,21.2
2,2019,PR,16.9
3,2020,SP,46.6
4,2020,MG,21.4
5,2020,PR,17.3


In [172]:
# Bracket access with a column name returns that column as a Series.
df2['estado']

0    SP
1    MG
2    PR
3    SP
4    MG
5    PR
Name: estado, dtype: object

In [173]:
# Attribute-style column access. It only works when the column name is a valid
# identifier that does not clash with a DataFrame attribute: `df2.pop`, for
# instance, is the DataFrame.pop method, not the 'pop' column.
df2.ano

0    2019
1    2019
2    2019
3    2020
4    2020
5    2020
Name: ano, dtype: int64

In [174]:
# Data type of each column.
df2.dtypes

ano         int64
estado     object
pop       float64
dtype: object

In [175]:
# Assigning a scalar to a new column name creates the column and broadcasts
# the value to every row.
df2['estimativa'] = 50
df2

Unnamed: 0,ano,estado,pop,estimativa
0,2019,SP,45.9,50
1,2019,MG,21.2,50
2,2019,PR,16.9,50
3,2020,SP,46.6,50
4,2020,MG,21.4,50
5,2020,PR,17.3,50


In [176]:
# Replace the constant column with one value per row (0, 1, 2, ...).
# The length is taken from the frame itself rather than a hard-coded 6, so the
# assignment keeps working if the number of rows in the data changes.
df2['estimativa'] = np.arange(len(df2))
df2

Unnamed: 0,ano,estado,pop,estimativa
0,2019,SP,45.9,0
1,2019,MG,21.2,1
2,2019,PR,16.9,2
3,2020,SP,46.6,3
4,2020,MG,21.4,4
5,2020,PR,17.3,5


In [177]:
# Keep a reference to the 'ano' column. Despite the `df` prefix in its name,
# df3 is a Series (a single column), not a DataFrame.
df3 = df2['ano']
df3

0    2019
1    2019
2    2019
3    2020
4    2020
5    2020
Name: ano, dtype: int64

In [178]:
# Derived boolean column: True for every row whose state is not 'PR'.
df2['não Paraná'] = df2['estado'].ne('PR')
df2

Unnamed: 0,ano,estado,pop,estimativa,não Paraná
0,2019,SP,45.9,0,True
1,2019,MG,21.2,1,True
2,2019,PR,16.9,2,False
3,2020,SP,46.6,3,True
4,2020,MG,21.4,4,True
5,2020,PR,17.3,5,False


In [179]:
# Remove the helper column from df2 in place.
del df2['não Paraná']
df2

Unnamed: 0,ano,estado,pop,estimativa
0,2019,SP,45.9,0
1,2019,MG,21.2,1
2,2019,PR,16.9,2
3,2020,SP,46.6,3
4,2020,MG,21.4,4
5,2020,PR,17.3,5


In [180]:
# (number of rows, number of columns)
df2.shape

(6, 4)

In [181]:
# First element of the shape tuple: the number of rows.
df2.shape[0]

6

In [182]:
# Row labels of df2 (still the default RangeIndex).
df2.index

RangeIndex(start=0, stop=6, step=1)

In [183]:
df2.count()  # counts the non-null values in each column

ano           6
estado        6
pop           6
estimativa    6
dtype: int64

In [184]:
# Rename all columns at once; the list is matched to the existing columns by
# position, so it must have one entry per column.
df2.columns = ['Ano', 'Estado', 'População', 'Estimativa']  # rename columns
df2

Unnamed: 0,Ano,Estado,População,Estimativa
0,2019,SP,45.9,0
1,2019,MG,21.2,1
2,2019,PR,16.9,2
3,2020,SP,46.6,3
4,2020,MG,21.4,4
5,2020,PR,17.3,5


In [185]:
# include='all' also summarises the non-numeric column (unique/top/freq),
# leaving NaN where a statistic does not apply to a column's type.
df2.describe(include = 'all')  # descriptive statistics

Unnamed: 0,Ano,Estado,População,Estimativa
count,6.0,6,6.0,6.0
unique,,3,,
top,,SP,,
freq,,2,,
mean,2019.5,,28.216667,2.5
std,0.547723,,14.096725,1.870829
min,2019.0,,16.9,0.0
25%,2019.0,,18.275,1.25
50%,2019.5,,21.3,2.5
75%,2020.0,,39.775,3.75


In [186]:
# Shift every year forward by two. This overwrites the column, so the cell is
# not idempotent: each re-run adds another 2.
df2['Ano'] = df2['Ano'].add(2)
df2

Unnamed: 0,Ano,Estado,População,Estimativa
0,2021,SP,45.9,0
1,2021,MG,21.2,1
2,2021,PR,16.9,2
3,2022,SP,46.6,3
4,2022,MG,21.4,4
5,2022,PR,17.3,5


In [187]:
# Row filter: keep only the observations whose year is after 2021.
df2.loc[df2['Ano'].gt(2021)]

Unnamed: 0,Ano,Estado,População,Estimativa
3,2022,SP,46.6,3
4,2022,MG,21.4,4
5,2022,PR,17.3,5


In [188]:
# Store the filtered rows under their own name.
df4 = df2.loc[df2['Ano'] > 2021]
# Display only: drop() without inplace returns a new frame, so df4 itself
# still contains the 'Ano' column afterwards.
df4.drop(columns='Ano')

Unnamed: 0,Estado,População,Estimativa
3,SP,46.6,3
4,MG,21.4,4
5,PR,17.3,5


In [190]:
# df2 is unchanged by the filtering and the non-in-place drop performed on df4.
df2

Unnamed: 0,Ano,Estado,População,Estimativa
0,2021,SP,45.9,0
1,2021,MG,21.2,1
2,2021,PR,16.9,2
3,2022,SP,46.6,3
4,2022,MG,21.4,4
5,2022,PR,17.3,5


In [191]:
# inplace=True removes the column from df2 itself instead of returning a
# modified copy (the call returns None).
df2.drop(columns='Ano', inplace=True)
df2

Unnamed: 0,Estado,População,Estimativa
0,SP,45.9,0
1,MG,21.2,1
2,PR,16.9,2
3,SP,46.6,3
4,MG,21.4,4
5,PR,17.3,5


In [192]:
# df4 still shows 'Ano': the in-place drop on df2 did not affect it.
df4

Unnamed: 0,Ano,Estado,População,Estimativa
3,2022,SP,46.6,3
4,2022,MG,21.4,4
5,2022,PR,17.3,5


In [193]:
# Position-based selection: the first row, returned as a Series whose index
# is the column names.
df2.iloc[0]

Estado          SP
População     45.9
Estimativa       0
Name: 0, dtype: object

In [194]:
# Position-based selection on both axes: rows at positions 1 and 2 (the slice
# end is exclusive) and the columns at positions 1 and 2.
df2.iloc[1:3, [1, 2]]

Unnamed: 0,População,Estimativa
1,21.2,1
2,16.9,2
