# Manipulando Series

In [1]:
import pandas as pd

In [2]:
# Raw values for the first Series: the integers 1 through 5.
dado = list(range(1, 6))

In [3]:
# Wrap the list in a Series; no index is given, so pandas assigns the default 0..n-1 labels.
series = pd.Series(dado)

In [4]:
series

0    1
1    2
2    3
3    4
4    5
dtype: int64

In [5]:
# One label per element: "number0" ... "number4".
rename = [f"number{i}" for i in range(5)]

In [6]:
rename

['number0', 'number1', 'number2', 'number3', 'number4']

In [7]:
# Rebuild the Series from the same values, this time using the custom labels as its index.
series = pd.Series(dado, index=rename)

In [8]:
series

number0    1
number1    2
number2    3
number3    4
number4    5
dtype: int64

In [9]:
# Build a label -> value mapping; when passed to pd.Series the keys become the index.
dado = {f"number{i}": i + 1 for i in range(5)}

In [10]:
dado

{'number0': 1, 'number1': 2, 'number2': 3, 'number3': 4, 'number4': 5}

In [11]:
# A dict passed to pd.Series supplies both parts: keys become the index, values the data.
pd.Series(dado)

number0    1
number1    2
number2    3
number3    4
number4    5
dtype: int64

### DataFrame

In [12]:
# 3x3 nested list: each inner list is one row of the future DataFrame.
dado = [
    [1, 2, 3],
    [4, 5, 6],
    [7, 8, 9],
]

In [13]:
dado

[[1, 2, 3], [4, 5, 6], [7, 8, 9]]

In [14]:
# Build a DataFrame from the nested list; rows and columns get default integer labels.
df = pd.DataFrame(dado)

In [15]:
df

Unnamed: 0,0,1,2
0,1,2,3
1,4,5,6
2,7,8,9


In [16]:
# Row labels for the 3-row DataFrame: "number0" ... "number2".
index = [f"number{i}" for i in range(3)]

In [17]:
index

['number0', 'number1', 'number2']

In [18]:
# Recreate the DataFrame with custom row labels; columns keep the default 0, 1, 2.
df = pd.DataFrame(dado, index=index)

In [19]:
df

Unnamed: 0,0,1,2
number0,1,2,3
number1,4,5,6
number2,7,8,9


In [20]:
# Column labels for the 3-column DataFrame: "column0" ... "column2".
columns = [f"column{i}" for i in range(3)]

In [21]:
columns

['column0', 'column1', 'column2']

In [22]:
# Recreate the DataFrame with both custom row labels and custom column labels.
df = pd.DataFrame(dado, index=index, columns=columns)

In [23]:
df

Unnamed: 0,column0,column1,column2
number0,1,2,3
number1,4,5,6
number2,7,8,9


In [24]:
# Same 3x3 data, but each row is a tuple instead of a list.
dado = [
    (1, 2, 3),
    (4, 5, 6),
    (7, 8, 9),
]

In [25]:
dado

[(1, 2, 3), (4, 5, 6), (7, 8, 9)]

In [26]:
# A list of tuples is accepted the same way as a list of lists: one tuple per row.
dfTupla = pd.DataFrame(dado, index=index, columns=columns)

In [27]:
dfTupla

Unnamed: 0,column0,column1,column2
number0,1,2,3
number1,4,5,6
number2,7,8,9


In [28]:
# Boolean-mask assignment: every cell greater than 0 (here, all of them) becomes "A".
df[df.gt(0)] = "A"
df

Unnamed: 0,column0,column1,column2
number0,A,A,A
number1,A,A,A
number2,A,A,A


In [29]:
# Boolean-mask assignment: every cell greater than 0 (here, all of them) becomes "B".
dfTupla[dfTupla.gt(0)] = "B"
dfTupla

Unnamed: 0,column0,column1,column2
number0,B,B,B
number1,B,B,B
number2,B,B,B


In [30]:
# Default concatenation (axis=0) stacks the frames vertically, matching them on the
# shared column names column0, column1, column2. Row labels are kept, so they repeat.
dfUniao = pd.concat(objs=[df, dfTupla], axis=0)
dfUniao

Unnamed: 0,column0,column1,column2
number0,A,A,A
number1,A,A,A
number2,A,A,A
number0,B,B,B
number1,B,B,B
number2,B,B,B


In [31]:
# Side-by-side concatenation: the frames are aligned on their row labels and
# the columns of the second frame are appended to the right of the first.
dfUniao = pd.concat(objs=[df, dfTupla], axis="columns")
dfUniao

Unnamed: 0,column0,column1,column2,column0.1,column1.1,column2.1
number0,A,A,A,B,B,B
number1,A,A,A,B,B,B
number2,A,A,A,B,B,B


### Organizando DataFrames

In [32]:
# 3x3 sample values used to demonstrate sorting.
numbers = [
    [1, 2, 3],
    [4, 5, 6],
    [7, 8, 9],
]

In [33]:
# list() over a string splits it into its individual characters.
list("ABC")

['A', 'B', 'C']

In [34]:
# Deliberately unsorted labels (rows C, B, A / columns Z, Y, X) so sorting has a visible effect.
df = pd.DataFrame(data=numbers, index=list("CBA"), columns=list("ZYX"))

In [35]:
df

Unnamed: 0,Z,Y,X
C,1,2,3
B,4,5,6
A,7,8,9


In [36]:
# Sort the rows by their index labels (A, B, C); inplace=True modifies df itself.
df.sort_index(axis=0, inplace=True)

In [37]:
df

Unnamed: 0,Z,Y,X
A,7,8,9
B,4,5,6
C,1,2,3


In [38]:
# Sort the columns by their labels (X, Y, Z); inplace=True modifies df itself.
df.sort_index(axis="columns", inplace=True)

In [39]:
df

Unnamed: 0,X,Y,Z
A,9,8,7
B,6,5,4
C,3,2,1


In [40]:
# With axis=1, `by` names row labels: the columns are reordered by the values
# found in row "C", with row "A" used to break ties. df is modified in place.
df.sort_values(by=["C", "A"], axis="columns", inplace=True)

In [41]:
df

Unnamed: 0,Z,Y,X
A,7,8,9
B,4,5,6
C,1,2,3


### Formas de seleção

In [42]:
# 4x4 sample table for the selection examples: one tuple per row.
data = [
    (1, 2, 3, 4),
    (5, 6, 7, 8),
    (9, 10, 11, 12),
    (13, 14, 15, 16),
]
# str.split() turns each whitespace-separated string into the list of row / column labels.
df = pd.DataFrame(data, index="l1 l2 l3 l4".split(), columns="c1 c2 c3 c4".split())

In [43]:
df

Unnamed: 0,c1,c2,c3,c4
l1,1,2,3,4
l2,5,6,7,8
l3,9,10,11,12
l4,13,14,15,16


In [44]:
# Selecting a single column name with [] returns that column as a Series.
df["c1"]

l1     1
l2     5
l3     9
l4    13
Name: c1, dtype: int64

In [45]:
# Confirm the type returned by a single-column selection.
type(df["c1"])

pandas.core.series.Series

In [46]:
# A list of names selects several columns, in the order listed.
df[["c3", "c1"]]

Unnamed: 0,c3,c1
l1,3,1
l2,7,5
l3,11,9
l4,15,13


In [47]:
# Confirm the type returned by a list-of-columns selection.
type(df[["c3", "c1"]])

pandas.core.frame.DataFrame

In [48]:
# A slice inside [] selects rows by position: rows 1 and 2 (the stop position 3 is excluded).
df[1:3]

Unnamed: 0,c1,c2,c3,c4
l2,5,6,7,8
l3,9,10,11,12


In [49]:
# Chain the two selections: slice rows 1-2 first, then pick columns c3 and c1 from the result.
df[1:3][["c3", "c1"]]

Unnamed: 0,c3,c1
l2,7,5
l3,11,9


In [50]:
# Row selection by label: .loc with a single row label returns that row as a Series.
df.loc["l3"]

c1     9
c2    10
c3    11
c4    12
Name: l3, dtype: int64

In [51]:
df

Unnamed: 0,c1,c2,c3,c4
l1,1,2,3,4
l2,5,6,7,8
l3,9,10,11,12
l4,13,14,15,16


In [52]:
# A list of row labels returns those rows as a DataFrame, in the requested order.
df.loc[["l3", "l2"]]

Unnamed: 0,c1,c2,c3,c4
l3,9,10,11,12
l2,5,6,7,8


In [53]:
# Returns the single value at row label "l1", column label "c2".
df.loc["l1", "c2"]

2

In [54]:
# .iloc addresses cells by integer position: row 0, column 1
# (the same cell as .loc["l1", "c2"]).
df.iloc[0, 1]

2

In [55]:
# Sub-DataFrame selected by label with .loc: rows l3 and l2, columns c1 and c2.
df.loc[["l3", "l2"], ["c1", "c2"]]

Unnamed: 0,c1,c2
l3,9,10
l2,5,6


In [56]:
# The same sub-DataFrame selected by position with .iloc: rows 2 and 1, columns 0 and 1.
df.iloc[[2, 1], [0, 1]]

Unnamed: 0,c1,c2
l3,9,10
l2,5,6


### Métodos de interpolação

In [57]:
# Sample series with gaps: each None is stored as NaN, giving a float64 Series.
data = [
    0.5, None, None, 0.52, 0.54,
    None, None, 0.59, 0.6, None, 0.7,
]
s = pd.Series(data=data)

In [58]:
s

0     0.50
1      NaN
2      NaN
3     0.52
4     0.54
5      NaN
6      NaN
7     0.59
8     0.60
9      NaN
10    0.70
dtype: float64

In [59]:
# Fill every missing value with the constant 0.
# The result is a new Series; s itself is not modified.
s.fillna(0)

0     0.50
1     0.00
2     0.00
3     0.52
4     0.54
5     0.00
6     0.00
7     0.59
8     0.60
9     0.00
10    0.70
dtype: float64

In [60]:
# Forward fill: each missing value takes the last valid value that precedes it.
# Series.ffill() replaces fillna(method="ffill"), which is deprecated in recent
# pandas releases and emits a FutureWarning.
s.ffill()  # pass limit=N to cap how many consecutive gaps are filled

0     0.50
1     0.50
2     0.50
3     0.52
4     0.54
5     0.54
6     0.54
7     0.59
8     0.60
9     0.60
10    0.70
dtype: float64

In [61]:
# Fill missing values with the mean of the series (Series.mean skips NaN by default).
s.fillna(s.mean())

0     0.500
1     0.575
2     0.575
3     0.520
4     0.540
5     0.575
6     0.575
7     0.590
8     0.600
9     0.575
10    0.700
dtype: float64

In [62]:
# Forward fill at most one consecutive missing value per gap, leaving the rest as NaN.
# Series.ffill(limit=...) replaces the deprecated fillna(method="ffill", limit=...).
s1 = s.ffill(limit=1)

In [63]:
# Backward fill: each remaining missing value takes the next valid value below it
# in the series, again limited to one consecutive value per gap.
# Series.bfill(limit=...) replaces the deprecated fillna(method="bfill", limit=...).
s1.bfill(limit=1)

0     0.50
1     0.50
2     0.52
3     0.52
4     0.54
5     0.54
6     0.59
7     0.59
8     0.60
9     0.60
10    0.70
dtype: float64

### Contadores

In [64]:
# Split the digit string into single characters; the elements are strings,
# so the Series has dtype object rather than a numeric dtype.
lista = pd.Series(data=list("111232435657898793"))

In [65]:
lista

0     1
1     1
2     1
3     2
4     3
5     2
6     4
7     3
8     5
9     6
10    5
11    7
12    8
13    9
14    8
15    7
16    9
17    3
dtype: object

In [66]:
# Distinct values of the series, in order of first appearance.
lista.unique()

array(['1', '2', '3', '4', '5', '6', '7', '8', '9'], dtype=object)

In [67]:
# value_counts() reports how many times each distinct value occurs, most frequent first.
lista.value_counts()

1    3
3    3
2    2
5    2
7    2
8    2
9    2
4    1
6    1
Name: count, dtype: int64

### Faixas de valor

In [68]:
# Load the rental dataset; the file uses ";" as the field separator.
# The path is relative, so the notebook must run from the folder containing aluguel.csv.
dados = pd.read_csv(filepath_or_buffer="aluguel.csv", sep=";")

In [69]:
# Bin edges for the number of rooms: (0, 2], (2, 4], (4, 6], (6, 100].
classes = [0, 2, 4, 6, 100]

In [70]:
# Assign each "Quartos" value to one of the intervals defined by the bin edges.
# Intervals are closed on the right by default; values outside every interval become NaN.
quartos = pd.cut(x=dados["Quartos"], bins=classes)

In [71]:
# Preview the first 10 binned values. NaN marks rows that fell outside every interval;
# the left edge 0 is excluded here, so presumably those rows have Quartos == 0 -- confirm against the data.
quartos.head(10)

0    (0.0, 2.0]
1    (0.0, 2.0]
2           NaN
3    (0.0, 2.0]
4    (0.0, 2.0]
5    (2.0, 4.0]
6    (0.0, 2.0]
7    (4.0, 6.0]
8    (0.0, 2.0]
9           NaN
Name: Quartos, dtype: category
Categories (4, interval[int64, right]): [(0, 2] < (2, 4] < (4, 6] < (6, 100]]

In [72]:
# Frequency of each interval, most frequent first (NaN rows are not counted).
# The Series method replaces the top-level pd.value_counts(), which is deprecated
# in recent pandas releases.
quartos.value_counts()

Quartos
(0, 2]      12419
(2, 4]       9894
(4, 6]        709
(6, 100]       87
Name: count, dtype: int64

In [73]:
# Human-readable names for the four intervals, in the same order as the bins.
labels = [
    "1 e 2 quartos",
    "3 e 4 quartos",
    "5 e 6 quartos",
    "7 ou mais quartos",
]

In [74]:
# pd.cut() lets the caller specify the limits of each class, which makes it a
# convenient building block for frequency distributions.
# NOTE(review): include_lowest=True closes the first interval on the left, so rows
# with Quartos == 0 are counted under the "1 e 2 quartos" label (its count rises
# from 12419 to 22270) -- confirm this labelling is intended.
quartos = pd.cut(
    x=dados["Quartos"],
    bins=classes,
    labels=labels,
    include_lowest=True,
)

In [75]:
# Frequency of each labelled class, most frequent first.
# The Series method replaces the top-level pd.value_counts(), which is deprecated
# in recent pandas releases.
quartos.value_counts()

Quartos
1 e 2 quartos        22270
3 e 4 quartos         9894
5 e 6 quartos          709
7 ou mais quartos       87
Name: count, dtype: int64

In [76]:
# Small sample table: sex ("H" or "M") and age for ten people.
dataset = pd.DataFrame(
    data={
        "Sexo": list("HMMMMHHHMM"),
        "Idade": [53, 72, 54, 27, 30, 40, 58, 32, 44, 51],
    }
)

In [96]:
# Mean age for each value of "Sexo" (H and M).
dataset.groupby(by="Sexo")["Idade"].mean()

Sexo
H    45.750000
M    46.333333
Name: Idade, dtype: float64

In [98]:
# Keep only the "H" group: average every column per group, then pick the "H" row by label.
# The result is a Series with one entry per aggregated column (here only "Idade").
dataset.groupby(by="Sexo").mean().loc["H"]

Idade    45.75
Name: H, dtype: float64