## <font color=green> Series

In [1]:
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

# Comando para exibir todas as colunas do arquivo
# pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)

- Series são objetos unidimensionais contendo dados e labels (ou index)
- Formas de criação de Series

In [2]:
s = pd.Series(list('abcdef'))
s

0    a
1    b
2    c
3    d
4    e
5    f
dtype: object

In [3]:
s = pd.Series([2, 4, 6, 8])
s

0    2
1    4
2    6
3    8
dtype: int64

- O index pode ser especificado

In [4]:
s = pd.Series([2, 4, 6, 8], index = ['f', 'a', 'c', 'e'])
s

f    2
a    4
c    6
e    8
dtype: int64

- O valor pode ser selecionado pelo seu index

In [5]:
s['a']

np.int64(4)

- Assim como múltiplos valores também podem ser selecionados

In [6]:
s[['a','c']]

a    4
c    6
dtype: int64

- Os indices não precisam ser valores únicos

In [7]:
s2 = pd.Series(range(4), index = list('abab'))
s2

a    0
b    1
a    2
b    3
dtype: int64

In [8]:
s2['a']

a    0
a    2
dtype: int64

In [9]:
s2['a'][1]

np.int64(2)

- Series suportam operações de filtragem

In [10]:
s

f    2
a    4
c    6
e    8
dtype: int64

In [11]:
s[s>4]

c    6
e    8
dtype: int64

In [12]:
s>4

f    False
a    False
c     True
e     True
dtype: bool

- Podemos realizar também operações aritméticas


In [13]:
s+4

f     6
a     8
c    10
e    12
dtype: int64

In [14]:
s*4

f     8
a    16
c    24
e    32
dtype: int64

- Series suportam variáveis nulas

In [15]:
sdata = {'b': 100, 'c': 150, 'd': 200}
s = pd.Series(sdata)
s

b    100
c    150
d    200
dtype: int64

In [16]:
s = pd.Series(sdata, list('abcd'))
s

a      NaN
b    100.0
c    150.0
d    200.0
dtype: float64

- É possível realizar também operações aritméticas entre duas séries

In [17]:
s2 = pd.Series([1,2,3], index = ['c','b','a'])
s2

c    1
b    2
a    3
dtype: int64

In [18]:
s*s2

a      NaN
b    200.0
c    150.0
d      NaN
dtype: float64

## <font color=green> DataFrame

- DataFrames são como planilhas, uma estrutura de dados contendo uma coleção de colunas.
- Possui linhas e colunas

In [19]:
data = {'pontuação':[81,80,72,80,90,71],
        'time':['Corinthians','Palmeiras','Corinthians','Palmeiras','Flamengo','Flamengo'],
        'ano':[2015,2016,2017,2018,2019,2020]}
frame = pd.DataFrame(data)
frame

Unnamed: 0,pontuação,time,ano
0,81,Corinthians,2015
1,80,Palmeiras,2016
2,72,Corinthians,2017
3,80,Palmeiras,2018
4,90,Flamengo,2019
5,71,Flamengo,2020


In [20]:
pop_data = {'Corinthians': {2015:81, 2017:72},
        'Palmeiras': {2016:80, 2018:80},
        'Flamengo': {2019:90,2020:71}        
        }
        
pop = pd.DataFrame(pop_data)
pop

Unnamed: 0,Corinthians,Palmeiras,Flamengo
2015,81.0,,
2017,72.0,,
2016,,80.0,
2018,,80.0,
2019,,,90.0
2020,,,71.0


- As colunas podem ser retornadas como uma série

In [21]:
frame['time']

0    Corinthians
1      Palmeiras
2    Corinthians
3      Palmeiras
4       Flamengo
5       Flamengo
Name: time, dtype: object

- O atributo values retorna os dados contidos no DataFrame como um array bidimensional

In [22]:
pop.values

array([[81., nan, nan],
       [72., nan, nan],
       [nan, 80., nan],
       [nan, 80., nan],
       [nan, nan, 90.],
       [nan, nan, 71.]])

In [23]:
pop['Flamengo'].values

array([nan, nan, nan, nan, 90., 71.])

- Novas linhas podem ser adicionadas

In [24]:
frame

Unnamed: 0,pontuação,time,ano
0,81,Corinthians,2015
1,80,Palmeiras,2016
2,72,Corinthians,2017
3,80,Palmeiras,2018
4,90,Flamengo,2019
5,71,Flamengo,2020


In [25]:
new_line = {'pontuação': 84, 'time': 'Atlético Mineiro', 'ano': 2021}
frame = pd.concat([frame, pd.DataFrame([new_line])], ignore_index=True)
frame

Unnamed: 0,pontuação,time,ano
0,81,Corinthians,2015
1,80,Palmeiras,2016
2,72,Corinthians,2017
3,80,Palmeiras,2018
4,90,Flamengo,2019
5,71,Flamengo,2020
6,84,Atlético Mineiro,2021


- Novas colunas podem ser adicionadas (por cálculo aritmético ou atribuição direta)

In [26]:
import numpy as np

In [27]:
frame['nova coluna'] = np.nan
frame

Unnamed: 0,pontuação,time,ano,nova coluna
0,81,Corinthians,2015,
1,80,Palmeiras,2016,
2,72,Corinthians,2017,
3,80,Palmeiras,2018,
4,90,Flamengo,2019,
5,71,Flamengo,2020,
6,84,Atlético Mineiro,2021,


In [28]:
frame['processada'] = frame['pontuação'] * 2
frame

Unnamed: 0,pontuação,time,ano,nova coluna,processada
0,81,Corinthians,2015,,162
1,80,Palmeiras,2016,,160
2,72,Corinthians,2017,,144
3,80,Palmeiras,2018,,160
4,90,Flamengo,2019,,180
5,71,Flamengo,2020,,142
6,84,Atlético Mineiro,2021,,168


- DataFrames permitem trocar linhas por colunas

In [29]:
pop.T

Unnamed: 0,2015,2017,2016,2018,2019,2020
Corinthians,81.0,72.0,,,,
Palmeiras,,,80.0,80.0,,
Flamengo,,,,,90.0,71.0


- DataFrame tem suporte a funções descritivas e estatísticas

In [30]:
pop

Unnamed: 0,Corinthians,Palmeiras,Flamengo
2015,81.0,,
2017,72.0,,
2016,,80.0,
2018,,80.0,
2019,,,90.0
2020,,,71.0


- Describe gera vários dados estatísticos de resumo de uma só vez

In [31]:
pop.describe()

Unnamed: 0,Corinthians,Palmeiras,Flamengo
count,2.0,2.0,2.0
mean,76.5,80.0,80.5
std,6.363961,0.0,13.435029
min,72.0,80.0,71.0
25%,74.25,80.0,75.75
50%,76.5,80.0,80.5
75%,78.75,80.0,85.25
max,81.0,80.0,90.0


- Info retorna o número de valores não nulos e o tipo por coluna

In [32]:
pop.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6 entries, 2015 to 2020
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Corinthians  2 non-null      float64
 1   Palmeiras    2 non-null      float64
 2   Flamengo     2 non-null      float64
dtypes: float64(3)
memory usage: 364.0 bytes


- Chamar o método sum de DataFrame devolve uma Series contendo as soma das colunas

In [33]:
pop.sum()

Corinthians    153.0
Palmeiras      160.0
Flamengo       161.0
dtype: float64

- Passar axis='columns' faz a soma pelas colunas

In [34]:
pop.sum(axis='columns')

2015    81.0
2017    72.0
2016    80.0
2018    80.0
2019    90.0
2020    71.0
dtype: float64

- Mean retorna a média dos valores

In [35]:
pop.mean()

Corinthians    76.5
Palmeiras      80.0
Flamengo       80.5
dtype: float64

- Median retorna a mediana dos valores

In [36]:
pop.median()

Corinthians    76.5
Palmeiras      80.0
Flamengo       80.5
dtype: float64

- min e max calcula os valores mínimo e máximo

In [37]:
pop.min()

Corinthians    72.0
Palmeiras      80.0
Flamengo       71.0
dtype: float64

In [38]:
pop.max()

Corinthians    81.0
Palmeiras      80.0
Flamengo       90.0
dtype: float64

- cumsum calcula a soma cumulativa

In [39]:
sales = pd.DataFrame({'Date':['2022-08-01','2022-08-02','2022-08-03','2022-08-04','2022-08-05','2022-08-06'],
                   'Total Sales':[2000,3000,2500,4000,3000,2000]
                   })
sales

Unnamed: 0,Date,Total Sales
0,2022-08-01,2000
1,2022-08-02,3000
2,2022-08-03,2500
3,2022-08-04,4000
4,2022-08-05,3000
5,2022-08-06,2000


In [40]:
sales['Cummulative Sales'] = sales['Total Sales'].cumsum()
sales

Unnamed: 0,Date,Total Sales,Cummulative Sales
0,2022-08-01,2000,2000
1,2022-08-02,3000,5000
2,2022-08-03,2500,7500
3,2022-08-04,4000,11500
4,2022-08-05,3000,14500
5,2022-08-06,2000,16500


- melt transforma um DataFrame de várias colunas para várias linhas

In [41]:
df_fruits = pd.DataFrame({
    "fruit": ["apple", "banana", "orange"],
    "Aldi": [4, 5, 6],
    "Costco": [1, 2, 3],
    "Target": [3, 4, 5],
    "Walmart": [6, 7, 8]
})

df_fruits

Unnamed: 0,fruit,Aldi,Costco,Target,Walmart
0,apple,4,1,3,6
1,banana,5,2,4,7
2,orange,6,3,5,8


In [42]:
df_fruits.melt(id_vars=["fruit"],
        value_vars=["Aldi", "Costco", "Target", "Walmart"],
        var_name='store')

Unnamed: 0,fruit,store,value
0,apple,Aldi,4
1,banana,Aldi,5
2,orange,Aldi,6
3,apple,Costco,1
4,banana,Costco,2
5,orange,Costco,3
6,apple,Target,3
7,banana,Target,4
8,orange,Target,5
9,apple,Walmart,6


- Pandas carrega vários tipos de dados como csv, json, xml, html, excel

In [43]:
walmart = pd.read_csv('datasets/Walmart_Store_sales.csv')
walmart

Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment
0,1,05-02-2010,1643690.90,0,42.31,2.572,211.096358,8.106
1,1,12-02-2010,1641957.44,1,38.51,2.548,211.242170,8.106
2,1,19-02-2010,1611968.17,0,39.93,2.514,211.289143,8.106
3,1,26-02-2010,1409727.59,0,46.63,2.561,211.319643,8.106
4,1,05-03-2010,1554806.68,0,46.50,2.625,211.350143,8.106
...,...,...,...,...,...,...,...,...
6430,45,28-09-2012,713173.95,0,64.88,3.997,192.013558,8.684
6431,45,05-10-2012,733455.07,0,64.89,3.985,192.170412,8.667
6432,45,12-10-2012,734464.36,0,54.47,4.000,192.327265,8.667
6433,45,19-10-2012,718125.53,0,56.47,3.969,192.330854,8.667


- É possível filtrar as colunas ao carregar o dataset

In [44]:
pd.read_csv("datasets/Walmart_Store_sales.csv", usecols=["Store", "Date", "Weekly_Sales"])

Unnamed: 0,Store,Date,Weekly_Sales
0,1,05-02-2010,1643690.90
1,1,12-02-2010,1641957.44
2,1,19-02-2010,1611968.17
3,1,26-02-2010,1409727.59
4,1,05-03-2010,1554806.68
...,...,...,...
6430,45,28-09-2012,713173.95
6431,45,05-10-2012,733455.07
6432,45,12-10-2012,734464.36
6433,45,19-10-2012,718125.53


- É possível especificar o tipo da coluna ao carregar o dataset

In [45]:
pd.read_csv("datasets/Walmart_Store_sales.csv", dtype={"Store": "category"})

Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment
0,1,05-02-2010,1643690.90,0,42.31,2.572,211.096358,8.106
1,1,12-02-2010,1641957.44,1,38.51,2.548,211.242170,8.106
2,1,19-02-2010,1611968.17,0,39.93,2.514,211.289143,8.106
3,1,26-02-2010,1409727.59,0,46.63,2.561,211.319643,8.106
4,1,05-03-2010,1554806.68,0,46.50,2.625,211.350143,8.106
...,...,...,...,...,...,...,...,...
6430,45,28-09-2012,713173.95,0,64.88,3.997,192.013558,8.684
6431,45,05-10-2012,733455.07,0,64.89,3.985,192.170412,8.667
6432,45,12-10-2012,734464.36,0,54.47,4.000,192.327265,8.667
6433,45,19-10-2012,718125.53,0,56.47,3.969,192.330854,8.667


- É possível especificar uma coluna como data ao carregar o dataset

In [46]:
pd.read_csv("datasets/Walmart_Store_sales.csv", parse_dates=["Date"])

Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment
0,1,05-02-2010,1643690.90,0,42.31,2.572,211.096358,8.106
1,1,12-02-2010,1641957.44,1,38.51,2.548,211.242170,8.106
2,1,19-02-2010,1611968.17,0,39.93,2.514,211.289143,8.106
3,1,26-02-2010,1409727.59,0,46.63,2.561,211.319643,8.106
4,1,05-03-2010,1554806.68,0,46.50,2.625,211.350143,8.106
...,...,...,...,...,...,...,...,...
6430,45,28-09-2012,713173.95,0,64.88,3.997,192.013558,8.684
6431,45,05-10-2012,733455.07,0,64.89,3.985,192.170412,8.667
6432,45,12-10-2012,734464.36,0,54.47,4.000,192.327265,8.667
6433,45,19-10-2012,718125.53,0,56.47,3.969,192.330854,8.667


- Especificar uma coluna como index ao carregar o dataset

In [47]:
pd.read_csv("datasets/Walmart_Store_sales.csv", index_col=["Date"])

Unnamed: 0_level_0,Store,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
05-02-2010,1,1643690.90,0,42.31,2.572,211.096358,8.106
12-02-2010,1,1641957.44,1,38.51,2.548,211.242170,8.106
19-02-2010,1,1611968.17,0,39.93,2.514,211.289143,8.106
26-02-2010,1,1409727.59,0,46.63,2.561,211.319643,8.106
05-03-2010,1,1554806.68,0,46.50,2.625,211.350143,8.106
...,...,...,...,...,...,...,...
28-09-2012,45,713173.95,0,64.88,3.997,192.013558,8.684
05-10-2012,45,733455.07,0,64.89,3.985,192.170412,8.667
12-10-2012,45,734464.36,0,54.47,4.000,192.327265,8.667
19-10-2012,45,718125.53,0,56.47,3.969,192.330854,8.667


- Especificar o número de linhas a serem carregadas

In [48]:
pd.read_csv("datasets/Walmart_Store_sales.csv", nrows=5)

Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment
0,1,05-02-2010,1643690.9,0,42.31,2.572,211.096358,8.106
1,1,12-02-2010,1641957.44,1,38.51,2.548,211.24217,8.106
2,1,19-02-2010,1611968.17,0,39.93,2.514,211.289143,8.106
3,1,26-02-2010,1409727.59,0,46.63,2.561,211.319643,8.106
4,1,05-03-2010,1554806.68,0,46.5,2.625,211.350143,8.106


- Especificar linhas a não serem carregadas

In [49]:
pd.read_csv("datasets/Walmart_Store_sales.csv", skiprows=100)  # skips the first 100 lines

Unnamed: 0,1,30-12-2011,1497462.72,1.1,44.55,3.129,219.5359898,7.866
0,1,06-01-2012,1550369.92,0,49.01,3.157,219.714258,7.348
1,1,13-01-2012,1459601.17,0,48.53,3.261,219.892526,7.348
2,1,20-01-2012,1394393.84,0,54.11,3.268,219.985689,7.348
3,1,27-01-2012,1319325.59,0,54.26,3.290,220.078852,7.348
4,1,03-02-2012,1636339.65,0,56.55,3.360,220.172015,7.348
...,...,...,...,...,...,...,...,...
6330,45,28-09-2012,713173.95,0,64.88,3.997,192.013558,8.684
6331,45,05-10-2012,733455.07,0,64.89,3.985,192.170412,8.667
6332,45,12-10-2012,734464.36,0,54.47,4.000,192.327265,8.667
6333,45,19-10-2012,718125.53,0,56.47,3.969,192.330854,8.667


In [50]:
pd.read_csv("datasets/Walmart_Store_sales.csv", skiprows=[1, 5])  # skips line 1 and 5

Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment
0,1,12-02-2010,1641957.44,1,38.51,2.548,211.242170,8.106
1,1,19-02-2010,1611968.17,0,39.93,2.514,211.289143,8.106
2,1,26-02-2010,1409727.59,0,46.63,2.561,211.319643,8.106
3,1,12-03-2010,1439541.59,0,57.79,2.667,211.380643,8.106
4,1,19-03-2010,1472515.79,0,54.58,2.720,211.215635,8.106
...,...,...,...,...,...,...,...,...
6428,45,28-09-2012,713173.95,0,64.88,3.997,192.013558,8.684
6429,45,05-10-2012,733455.07,0,64.89,3.985,192.170412,8.667
6430,45,12-10-2012,734464.36,0,54.47,4.000,192.327265,8.667
6431,45,19-10-2012,718125.53,0,56.47,3.969,192.330854,8.667


- É possível alterar os tipos das colunas

In [51]:
walmart.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6435 entries, 0 to 6434
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Store         6435 non-null   int64  
 1   Date          6435 non-null   object 
 2   Weekly_Sales  6435 non-null   float64
 3   Holiday_Flag  6435 non-null   int64  
 4   Temperature   6435 non-null   float64
 5   Fuel_Price    6435 non-null   float64
 6   CPI           6435 non-null   float64
 7   Unemployment  6435 non-null   float64
dtypes: float64(5), int64(2), object(1)
memory usage: 402.3+ KB


In [52]:
walmart = walmart.astype(
    {
        "Date": "datetime64[ns]",
        "Holiday_Flag": "bool",
        "Store": "category",
    }
)
walmart.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6435 entries, 0 to 6434
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   Store         6435 non-null   category      
 1   Date          6435 non-null   datetime64[ns]
 2   Weekly_Sales  6435 non-null   float64       
 3   Holiday_Flag  6435 non-null   bool          
 4   Temperature   6435 non-null   float64       
 5   Fuel_Price    6435 non-null   float64       
 6   CPI           6435 non-null   float64       
 7   Unemployment  6435 non-null   float64       
dtypes: bool(1), category(1), datetime64[ns](1), float64(5)
memory usage: 315.7 KB


In [53]:
walmart

Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment
0,1,2010-05-02,1643690.90,False,42.31,2.572,211.096358,8.106
1,1,2010-12-02,1641957.44,True,38.51,2.548,211.242170,8.106
2,1,2010-02-19,1611968.17,False,39.93,2.514,211.289143,8.106
3,1,2010-02-26,1409727.59,False,46.63,2.561,211.319643,8.106
4,1,2010-05-03,1554806.68,False,46.50,2.625,211.350143,8.106
...,...,...,...,...,...,...,...,...
6430,45,2012-09-28,713173.95,False,64.88,3.997,192.013558,8.684
6431,45,2012-05-10,733455.07,False,64.89,3.985,192.170412,8.667
6432,45,2012-12-10,734464.36,False,54.47,4.000,192.327265,8.667
6433,45,2012-10-19,718125.53,False,56.47,3.969,192.330854,8.667


- É possível realizar consultas por coluna e por index

In [54]:
walmart.query('Holiday_Flag == 1')

Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment
1,1,2010-12-02,1641957.44,True,38.51,2.548,211.242170,8.106
31,1,2010-10-09,1507460.69,True,78.69,2.565,211.495190,7.787
42,1,2010-11-26,1955624.11,True,64.52,2.735,211.748433,7.838
47,1,2010-12-31,1367320.01,True,48.43,2.943,211.404932,7.838
53,1,2011-11-02,1649614.93,True,36.39,3.022,212.936705,7.742
...,...,...,...,...,...,...,...,...
6375,45,2011-09-09,746129.56,True,71.48,3.738,186.673738,8.625
6386,45,2011-11-25,1170672.94,True,48.71,3.492,188.350400,8.523
6391,45,2011-12-30,869403.63,True,37.79,3.389,189.062016,8.523
6397,45,2012-10-02,803657.12,True,37.00,3.640,189.707605,8.424


In [55]:
val = 3049614.93
walmart.query('Weekly_Sales > @val')

Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment
189,2,2010-12-24,3436007.68,False,49.97,2.886,211.06466,8.163
241,2,2011-12-23,3224369.8,False,46.66,3.112,218.99955,7.441
475,4,2010-12-24,3526713.39,False,43.21,2.887,126.983581,7.127
527,4,2011-12-23,3676388.98,False,35.92,3.103,129.984548,5.143
1333,10,2010-12-24,3749057.69,False,57.06,3.236,126.983581,9.003
1385,10,2011-12-23,3487986.89,False,48.36,3.541,129.984548,7.874
1762,13,2010-12-24,3595903.2,False,34.9,2.846,126.983581,7.795
1814,13,2011-12-23,3556766.03,False,24.76,3.186,129.984548,6.392
1905,14,2010-12-24,3818686.45,False,30.59,3.141,182.54459,8.724
1957,14,2011-12-23,3369068.99,False,42.27,3.389,188.929975,8.523


In [56]:
walmart.query('1 <= index < 7')

Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment
1,1,2010-12-02,1641957.44,True,38.51,2.548,211.24217,8.106
2,1,2010-02-19,1611968.17,False,39.93,2.514,211.289143,8.106
3,1,2010-02-26,1409727.59,False,46.63,2.561,211.319643,8.106
4,1,2010-05-03,1554806.68,False,46.5,2.625,211.350143,8.106
5,1,2010-12-03,1439541.59,False,57.79,2.667,211.380643,8.106
6,1,2010-03-19,1472515.79,False,54.58,2.72,211.215635,8.106


In [57]:
walmart.query("Store in [1,2]")

Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment
0,1,2010-05-02,1643690.90,False,42.31,2.572,211.096358,8.106
1,1,2010-12-02,1641957.44,True,38.51,2.548,211.242170,8.106
2,1,2010-02-19,1611968.17,False,39.93,2.514,211.289143,8.106
3,1,2010-02-26,1409727.59,False,46.63,2.561,211.319643,8.106
4,1,2010-05-03,1554806.68,False,46.50,2.625,211.350143,8.106
...,...,...,...,...,...,...,...,...
281,2,2012-09-28,1746470.56,False,79.45,3.666,222.616433,6.565
282,2,2012-05-10,1998321.04,False,70.27,3.617,222.815930,6.170
283,2,2012-12-10,1900745.13,False,60.97,3.601,223.015426,6.170
284,2,2012-10-19,1847990.41,False,68.08,3.594,223.059808,6.170


In [58]:
walmart.query("Date == '2010-12-02' and Temperature < 40 and Unemployment < 8")

Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment
573,5,2010-12-02,311825.7,True,39.81,2.548,211.80047,6.566
1002,8,2010-12-02,994801.4,True,33.34,2.548,214.621419,6.299
1145,9,2010-12-02,552677.48,True,37.08,2.548,214.805653,6.415
2146,16,2010-12-02,472044.28,True,20.87,2.572,189.464272,7.039
2289,17,2010-12-02,841951.91,True,18.36,2.671,126.496258,6.548
3147,23,2010-12-02,1380892.08,True,18.75,2.771,131.586613,5.892
5578,40,2010-12-02,955338.29,True,20.84,2.771,131.586613,5.892
5721,41,2010-12-02,1075656.34,True,23.04,2.572,189.464272,7.541


In [59]:
walmart.query("Date.dt.month == 2 and Date.dt.year == 2010")

Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment
2,1,2010-02-19,1611968.17,False,39.93,2.514,211.289143,8.106
3,1,2010-02-26,1409727.59,False,46.63,2.561,211.319643,8.106
8,1,2010-02-04,1594968.28,False,62.27,2.719,210.820450,7.808
21,1,2010-02-07,1492418.14,False,80.91,2.669,211.223533,7.787
145,2,2010-02-19,2124451.54,False,39.69,2.514,210.945160,8.324
...,...,...,...,...,...,...,...,...
6170,44,2010-02-07,300628.19,False,78.82,2.814,126.139200,7.804
6294,45,2010-02-19,841264.04,False,31.27,2.745,182.034782,8.992
6295,45,2010-02-26,741891.65,False,34.89,2.754,182.077469,8.992
6300,45,2010-02-04,877235.96,False,47.74,2.850,181.865754,8.899


In [60]:
walmart.query("Date >= '2010-02-12' and Date <= '2010-02-20'")

Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment
2,1,2010-02-19,1611968.17,False,39.93,2.514,211.289143,8.106
145,2,2010-02-19,2124451.54,False,39.69,2.514,210.94516,8.324
288,3,2010-02-19,421642.19,False,47.07,2.514,214.619887,7.368
431,4,2010-02-19,2049860.26,False,36.45,2.54,126.526286,8.623
574,5,2010-02-19,303447.57,False,41.14,2.514,211.847128,6.566
717,6,2010-02-19,1567138.07,False,43.58,2.514,212.816155,7.259
860,7,2010-02-19,506760.54,False,27.28,2.55,189.5341,9.014
1003,8,2010-02-19,963960.37,False,39.1,2.514,214.666488,6.299
1146,9,2010-02-19,511327.9,False,43.06,2.514,214.850618,6.415
1289,10,2010-02-19,2113432.58,False,58.22,2.915,126.526286,9.765


- .iloc acessa por posição (linha,coluna)
- .loc permite acessar as variáveis por index 

In [61]:
walmart.iloc[0,1]

Timestamp('2010-05-02 00:00:00')

In [62]:
walmart.loc[[0,1,2]]

Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment
0,1,2010-05-02,1643690.9,False,42.31,2.572,211.096358,8.106
1,1,2010-12-02,1641957.44,True,38.51,2.548,211.24217,8.106
2,1,2010-02-19,1611968.17,False,39.93,2.514,211.289143,8.106


In [63]:
walmart.loc[0]

Store                             1
Date            2010-05-02 00:00:00
Weekly_Sales              1643690.9
Holiday_Flag                  False
Temperature                   42.31
Fuel_Price                    2.572
CPI                      211.096358
Unemployment                  8.106
Name: 0, dtype: object

- .loc também permite criar condições para retornar 

In [64]:
walmart

Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment
0,1,2010-05-02,1643690.90,False,42.31,2.572,211.096358,8.106
1,1,2010-12-02,1641957.44,True,38.51,2.548,211.242170,8.106
2,1,2010-02-19,1611968.17,False,39.93,2.514,211.289143,8.106
3,1,2010-02-26,1409727.59,False,46.63,2.561,211.319643,8.106
4,1,2010-05-03,1554806.68,False,46.50,2.625,211.350143,8.106
...,...,...,...,...,...,...,...,...
6430,45,2012-09-28,713173.95,False,64.88,3.997,192.013558,8.684
6431,45,2012-05-10,733455.07,False,64.89,3.985,192.170412,8.667
6432,45,2012-12-10,734464.36,False,54.47,4.000,192.327265,8.667
6433,45,2012-10-19,718125.53,False,56.47,3.969,192.330854,8.667


In [65]:
walmart.loc[walmart['Holiday_Flag'] == 1]

Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment
1,1,2010-12-02,1641957.44,True,38.51,2.548,211.242170,8.106
31,1,2010-10-09,1507460.69,True,78.69,2.565,211.495190,7.787
42,1,2010-11-26,1955624.11,True,64.52,2.735,211.748433,7.838
47,1,2010-12-31,1367320.01,True,48.43,2.943,211.404932,7.838
53,1,2011-11-02,1649614.93,True,36.39,3.022,212.936705,7.742
...,...,...,...,...,...,...,...,...
6375,45,2011-09-09,746129.56,True,71.48,3.738,186.673738,8.625
6386,45,2011-11-25,1170672.94,True,48.71,3.492,188.350400,8.523
6391,45,2011-12-30,869403.63,True,37.79,3.389,189.062016,8.523
6397,45,2012-10-02,803657.12,True,37.00,3.640,189.707605,8.424


In [66]:
walmart.loc[(walmart['Temperature'] > 90) & (walmart['Store'] == 1)]

Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment
78,1,2011-05-08,1624383.75,False,91.65,3.684,215.544618,7.962
79,1,2011-12-08,1525147.09,False,90.76,3.638,215.605788,7.962


In [67]:
walmart.loc[walmart.index[0:10],["Store", "Date"]]

Unnamed: 0,Store,Date
0,1,2010-05-02
1,1,2010-12-02
2,1,2010-02-19
3,1,2010-02-26
4,1,2010-05-03
5,1,2010-12-03
6,1,2010-03-19
7,1,2010-03-26
8,1,2010-02-04
9,1,2010-09-04


- Vamos selecionar as temperaturas máxima e mínima da loja 1.
- Usando pipe para auxiliar na filtragem:

In [68]:
walmart.loc[walmart['Store'] == 1].pipe(lambda x: [x['Temperature'].max(), x['Temperature'].min()])

[np.float64(91.65), np.float64(35.4)]

In [69]:
walmart.query("Store==1").pipe(lambda x: [x['Temperature'].max(), x['Temperature'].min()])

[np.float64(91.65), np.float64(35.4)]

- Usando eval

In [70]:
walmart.loc[walmart['Store'] == 1].eval("Temperature.max(), Temperature.min()")

[np.float64(91.65), np.float64(35.4)]

In [71]:
walmart.query("Store==1").eval("Temperature.max(), Temperature.min()")

[np.float64(91.65), np.float64(35.4)]

- DataFrames permitem fazer copias

In [72]:
df_aux = walmart.copy()
df_aux

Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment
0,1,2010-05-02,1643690.90,False,42.31,2.572,211.096358,8.106
1,1,2010-12-02,1641957.44,True,38.51,2.548,211.242170,8.106
2,1,2010-02-19,1611968.17,False,39.93,2.514,211.289143,8.106
3,1,2010-02-26,1409727.59,False,46.63,2.561,211.319643,8.106
4,1,2010-05-03,1554806.68,False,46.50,2.625,211.350143,8.106
...,...,...,...,...,...,...,...,...
6430,45,2012-09-28,713173.95,False,64.88,3.997,192.013558,8.684
6431,45,2012-05-10,733455.07,False,64.89,3.985,192.170412,8.667
6432,45,2012-12-10,734464.36,False,54.47,4.000,192.327265,8.667
6433,45,2012-10-19,718125.53,False,56.47,3.969,192.330854,8.667


- É possível filtrar colunas usando o método filter

In [73]:
df_aux.filter(items=['Date','Weekly_Sales'])

Unnamed: 0,Date,Weekly_Sales
0,2010-05-02,1643690.90
1,2010-12-02,1641957.44
2,2010-02-19,1611968.17
3,2010-02-26,1409727.59
4,2010-05-03,1554806.68
...,...,...
6430,2012-09-28,713173.95
6431,2012-05-10,733455.07
6432,2012-12-10,734464.36
6433,2012-10-19,718125.53


- Selecionar colunas e index com expressão regular

In [74]:
# Selecionando as colunas que terminam com e
df_aux.filter(regex='e$', axis=1)

Unnamed: 0,Store,Date,Temperature,Fuel_Price
0,1,2010-05-02,42.31,2.572
1,1,2010-12-02,38.51,2.548
2,1,2010-02-19,39.93,2.514
3,1,2010-02-26,46.63,2.561
4,1,2010-05-03,46.50,2.625
...,...,...,...,...
6430,45,2012-09-28,64.88,3.997
6431,45,2012-05-10,64.89,3.985
6432,45,2012-12-10,54.47,4.000
6433,45,2012-10-19,56.47,3.969


- Filtrar por caracteres colunas e index utilizando like

In [75]:
#Filtrando os indexes que possuem '10'
df_aux.filter(like='10', axis=0)

Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment
10,1,2010-04-16,1466058.28,False,66.32,2.808,210.488700,7.808
100,1,2012-06-01,1550369.92,False,49.01,3.157,219.714258,7.348
101,1,2012-01-13,1459601.17,False,48.53,3.261,219.892526,7.348
102,1,2012-01-20,1394393.84,False,54.11,3.268,219.985689,7.348
103,1,2012-01-27,1319325.59,False,54.26,3.290,220.078852,7.348
...,...,...,...,...,...,...,...,...
6109,43,2012-01-27,587685.38,False,52.10,3.290,211.587991,9.653
6110,43,2012-03-02,629176.71,False,51.92,3.360,211.676200,9.653
6210,44,2011-08-04,292498.61,False,42.75,3.547,128.823806,6.906
6310,45,2010-11-06,794698.77,False,69.71,2.809,182.431557,8.899


In [76]:
#Filtrando as colunas que possuem 're'
df_aux.filter(like='re', axis=1)

Unnamed: 0,Store,Temperature
0,1,42.31
1,1,38.51
2,1,39.93
3,1,46.63
4,1,46.50
...,...,...
6430,45,64.88
6431,45,64.89
6432,45,54.47
6433,45,56.47


- Podemos utilizar del para excluir colunas

In [77]:
del df_aux['CPI']
df_aux

Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,Unemployment
0,1,2010-05-02,1643690.90,False,42.31,2.572,8.106
1,1,2010-12-02,1641957.44,True,38.51,2.548,8.106
2,1,2010-02-19,1611968.17,False,39.93,2.514,8.106
3,1,2010-02-26,1409727.59,False,46.63,2.561,8.106
4,1,2010-05-03,1554806.68,False,46.50,2.625,8.106
...,...,...,...,...,...,...,...
6430,45,2012-09-28,713173.95,False,64.88,3.997,8.684
6431,45,2012-05-10,733455.07,False,64.89,3.985,8.667
6432,45,2012-12-10,734464.36,False,54.47,4.000,8.667
6433,45,2012-10-19,718125.53,False,56.47,3.969,8.667


- Já para excluir linhas devemos utilizar o drop

In [78]:
df_aux.drop(0)

Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,Unemployment
1,1,2010-12-02,1641957.44,True,38.51,2.548,8.106
2,1,2010-02-19,1611968.17,False,39.93,2.514,8.106
3,1,2010-02-26,1409727.59,False,46.63,2.561,8.106
4,1,2010-05-03,1554806.68,False,46.50,2.625,8.106
5,1,2010-12-03,1439541.59,False,57.79,2.667,8.106
...,...,...,...,...,...,...,...
6430,45,2012-09-28,713173.95,False,64.88,3.997,8.684
6431,45,2012-05-10,733455.07,False,64.89,3.985,8.667
6432,45,2012-12-10,734464.36,False,54.47,4.000,8.667
6433,45,2012-10-19,718125.53,False,56.47,3.969,8.667


In [79]:
df_aux.drop([6431,6433])

Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,Unemployment
0,1,2010-05-02,1643690.90,False,42.31,2.572,8.106
1,1,2010-12-02,1641957.44,True,38.51,2.548,8.106
2,1,2010-02-19,1611968.17,False,39.93,2.514,8.106
3,1,2010-02-26,1409727.59,False,46.63,2.561,8.106
4,1,2010-05-03,1554806.68,False,46.50,2.625,8.106
...,...,...,...,...,...,...,...
6428,45,2012-09-14,702238.27,False,67.87,3.948,8.684
6429,45,2012-09-21,723086.20,False,65.32,4.038,8.684
6430,45,2012-09-28,713173.95,False,64.88,3.997,8.684
6432,45,2012-12-10,734464.36,False,54.47,4.000,8.667


- Perceba que drop retorna um novo objeto removendo o que foi passado por paramentro. Mas o objeto original não é modificado

In [80]:
df_aux

Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,Unemployment
0,1,2010-05-02,1643690.90,False,42.31,2.572,8.106
1,1,2010-12-02,1641957.44,True,38.51,2.548,8.106
2,1,2010-02-19,1611968.17,False,39.93,2.514,8.106
3,1,2010-02-26,1409727.59,False,46.63,2.561,8.106
4,1,2010-05-03,1554806.68,False,46.50,2.625,8.106
...,...,...,...,...,...,...,...
6430,45,2012-09-28,713173.95,False,64.88,3.997,8.684
6431,45,2012-05-10,733455.07,False,64.89,3.985,8.667
6432,45,2012-12-10,734464.36,False,54.47,4.000,8.667
6433,45,2012-10-19,718125.53,False,56.47,3.969,8.667


- Para realizar a operação no mesmo objeto é necessário passar True no parametro inplace

In [81]:
df_aux.drop([6431,6433], inplace=True)
df_aux

Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,Unemployment
0,1,2010-05-02,1643690.90,False,42.31,2.572,8.106
1,1,2010-12-02,1641957.44,True,38.51,2.548,8.106
2,1,2010-02-19,1611968.17,False,39.93,2.514,8.106
3,1,2010-02-26,1409727.59,False,46.63,2.561,8.106
4,1,2010-05-03,1554806.68,False,46.50,2.625,8.106
...,...,...,...,...,...,...,...
6428,45,2012-09-14,702238.27,False,67.87,3.948,8.684
6429,45,2012-09-21,723086.20,False,65.32,4.038,8.684
6430,45,2012-09-28,713173.95,False,64.88,3.997,8.684
6432,45,2012-12-10,734464.36,False,54.47,4.000,8.667


- Drop também permite a remoção de colunas

In [82]:
df_aux.drop(['Fuel_Price','Unemployment'], inplace=True, axis='columns')
df_aux

Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature
0,1,2010-05-02,1643690.90,False,42.31
1,1,2010-12-02,1641957.44,True,38.51
2,1,2010-02-19,1611968.17,False,39.93
3,1,2010-02-26,1409727.59,False,46.63
4,1,2010-05-03,1554806.68,False,46.50
...,...,...,...,...,...
6428,45,2012-09-14,702238.27,False,67.87
6429,45,2012-09-21,723086.20,False,65.32
6430,45,2012-09-28,713173.95,False,64.88
6432,45,2012-12-10,734464.36,False,54.47


- É possível também adicionar um nome para o index e para as coluna

In [83]:
df_aux.index.name = 'indice'
df_aux.columns.name = 'dados'

In [84]:
df_aux

dados,Store,Date,Weekly_Sales,Holiday_Flag,Temperature
indice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,1,2010-05-02,1643690.90,False,42.31
1,1,2010-12-02,1641957.44,True,38.51
2,1,2010-02-19,1611968.17,False,39.93
3,1,2010-02-26,1409727.59,False,46.63
4,1,2010-05-03,1554806.68,False,46.50
...,...,...,...,...,...
6428,45,2012-09-14,702238.27,False,67.87
6429,45,2012-09-21,723086.20,False,65.32
6430,45,2012-09-28,713173.95,False,64.88
6432,45,2012-12-10,734464.36,False,54.47


- Analisando Store 1

In [85]:
df_aux = df_aux.query("Store==1")
df_aux

dados,Store,Date,Weekly_Sales,Holiday_Flag,Temperature
indice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,1,2010-05-02,1643690.90,False,42.31
1,1,2010-12-02,1641957.44,True,38.51
2,1,2010-02-19,1611968.17,False,39.93
3,1,2010-02-26,1409727.59,False,46.63
4,1,2010-05-03,1554806.68,False,46.50
...,...,...,...,...,...
138,1,2012-09-28,1437059.26,False,76.08
139,1,2012-05-10,1670785.97,False,68.55
140,1,2012-12-10,1573072.81,False,62.99
141,1,2012-10-19,1508068.77,False,67.97


- O método assign atribui novas colunas ao DataFrame, podendo até sobrescrevê-la caso a coluna já exista

In [86]:
time_sentences = ["Saturday: Weekend (Not working day)",
                  "Sunday: Weekend (Not working day)",
                  "Monday: Doctor appointment at 2:45pm.",
                  "Tuesday: Dentist appointment at 11:30 am.",
                  "Wednesday: basketball game At 7:00pm",
                  "Thursday: Back home by 11:15 pm.",
                  "Friday: Take the train at 08:10 am."]

df_sentences = pd.DataFrame(time_sentences, columns=['text'])
df_sentences

Unnamed: 0,text
0,Saturday: Weekend (Not working day)
1,Sunday: Weekend (Not working day)
2,Monday: Doctor appointment at 2:45pm.
3,Tuesday: Dentist appointment at 11:30 am.
4,Wednesday: basketball game At 7:00pm
5,Thursday: Back home by 11:15 pm.
6,Friday: Take the train at 08:10 am.


In [87]:
df_sentences = df_sentences.assign(text=df_sentences.text.str.lower(),
                                    text_len=df_sentences.text.str.len(),
                                    word_count=df_sentences.text.str.count(" ") + 1,
                                    weekend=df_sentences.text.str.contains("saturday|sunday", case=False)
                                )
df_sentences

Unnamed: 0,text,text_len,word_count,weekend
0,saturday: weekend (not working day),35,5,True
1,sunday: weekend (not working day),33,5,True
2,monday: doctor appointment at 2:45pm.,37,5,False
3,tuesday: dentist appointment at 11:30 am.,41,6,False
4,wednesday: basketball game at 7:00pm,36,5,False
5,thursday: back home by 11:15 pm.,32,6,False
6,friday: take the train at 08:10 am.,35,7,False


- Renomeando as colunas

In [88]:
df_aux = df_aux.rename({"Store": "loja", 
                        "Date": "data", 
                        "Weekly_Sales": "vendas_semanais",
                        "Holiday_Flag": "feriado",
                        "Temperature": "temperatura"
                       }, axis=1)
df_aux

dados,loja,data,vendas_semanais,feriado,temperatura
indice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,1,2010-05-02,1643690.90,False,42.31
1,1,2010-12-02,1641957.44,True,38.51
2,1,2010-02-19,1611968.17,False,39.93
3,1,2010-02-26,1409727.59,False,46.63
4,1,2010-05-03,1554806.68,False,46.50
...,...,...,...,...,...
138,1,2012-09-28,1437059.26,False,76.08
139,1,2012-05-10,1670785.97,False,68.55
140,1,2012-12-10,1573072.81,False,62.99
141,1,2012-10-19,1508068.77,False,67.97


- Alterando formatação dos valores

In [89]:
format_dict = {
    "data": "{:%d/%m/%y}",
    "vendas_semanais": "$ {:.2f}",
    "temperatura": "{:.2f}º"
}
df_aux.style.format(format_dict)

dados,loja,data,vendas_semanais,feriado,temperatura
indice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,1,02/05/10,$ 1643690.90,False,42.31º
1,1,02/12/10,$ 1641957.44,True,38.51º
2,1,19/02/10,$ 1611968.17,False,39.93º
3,1,26/02/10,$ 1409727.59,False,46.63º
4,1,03/05/10,$ 1554806.68,False,46.50º
5,1,03/12/10,$ 1439541.59,False,57.79º
6,1,19/03/10,$ 1472515.79,False,54.58º
7,1,26/03/10,$ 1404429.92,False,51.45º
8,1,04/02/10,$ 1594968.28,False,62.27º
9,1,04/09/10,$ 1545418.53,False,65.86º


- Adicionando cores e barras para destacar valores

In [90]:
(
    df_aux.style.format(format_dict)
    .hide(axis="index")
    .highlight_max(["feriado"], color="red")
    .background_gradient(subset="vendas_semanais", cmap="Greens")
    .bar('temperatura', color='lightblue', align='zero')
    .set_caption('Vendas semanais das lojas do Walmart')
)

loja,data,vendas_semanais,feriado,temperatura
1,02/05/10,$ 1643690.90,False,42.31º
1,02/12/10,$ 1641957.44,True,38.51º
1,19/02/10,$ 1611968.17,False,39.93º
1,26/02/10,$ 1409727.59,False,46.63º
1,03/05/10,$ 1554806.68,False,46.50º
1,03/12/10,$ 1439541.59,False,57.79º
1,19/03/10,$ 1472515.79,False,54.58º
1,26/03/10,$ 1404429.92,False,51.45º
1,04/02/10,$ 1594968.28,False,62.27º
1,04/09/10,$ 1545418.53,False,65.86º


## <font color=green> Reindexação

In [91]:
obj = pd.Series([4.5, 7.2, -5.3, 3.6], index=['d', 'b', 'a', 'c'])
obj

d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64

- Reindex nessa Series reorganiza os dados de acordo com o novo índice, introduzindo valores indicativos de ausência se algum valor de índice não estava presente antes

In [92]:
obj2 = obj.reindex(['a', 'b', 'c', 'd', 'e'])
obj2

a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64

- Para dados ordenados, como séries temporais, talvez seja desejável fazer alguma interpolação ou preenchimento de valores de reindexação. A opção method nos permite fazer isso, usando um método como ffill, que faz um preenchimento para a frente (foward-fill) dos valores, enquanto bfill preenche pra trás

In [93]:
obj3 = pd.Series(['blue', 'purple', 'yellow'], index=[0, 2, 4])
obj3

0      blue
2    purple
4    yellow
dtype: object

In [94]:
obj3.reindex(range(6), method='ffill')

0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object

In [95]:
obj3.reindex(range(6), method='bfill')

0      blue
1    purple
2    purple
3    yellow
4    yellow
5       NaN
dtype: object

- Com DataFrame, reindex pode alterar o índice (linha), as colunas, ou ambas. Se apenas uma sequência for passada, as linhas serão reindexadas no resultado

In [96]:
frame = pd.DataFrame(np.arange(9).reshape((3,3)), 
                        index=['a','c','d'], 
                        columns=['Ohio', 'Texas', 'California'])
frame

Unnamed: 0,Ohio,Texas,California
a,0,1,2
c,3,4,5
d,6,7,8


In [97]:
frame2 = frame.reindex(['a','b','c','d'])
frame2

Unnamed: 0,Ohio,Texas,California
a,0.0,1.0,2.0
b,,,
c,3.0,4.0,5.0
d,6.0,7.0,8.0


In [98]:
frame3 = frame.reindex(columns=['Texas', 'Utah', 'California'])
frame3

Unnamed: 0,Texas,Utah,California
a,1,,2
c,4,,5
d,7,,8


- Convertendo coluna multivalores para multiplas linhas

In [99]:
df = pd.DataFrame({
    "name": ["Kevin", "Jack", "Mary"],
    "sex": ["M", "M", "F"],
    "hobbies": ["Writing,Chess", "Drawing,Coding", "Music,Reading"]
}, index=["K", "J", "M"])

df

Unnamed: 0,name,sex,hobbies
K,Kevin,M,"Writing,Chess"
J,Jack,M,"Drawing,Coding"
M,Mary,F,"Music,Reading"


In [100]:
df["hobbies"] = df["hobbies"].str.split(",") 
df

Unnamed: 0,name,sex,hobbies
K,Kevin,M,"[Writing, Chess]"
J,Jack,M,"[Drawing, Coding]"
M,Mary,F,"[Music, Reading]"


In [101]:
df = df.explode("hobbies") 
df

Unnamed: 0,name,sex,hobbies
K,Kevin,M,Writing
K,Kevin,M,Chess
J,Jack,M,Drawing
J,Jack,M,Coding
M,Mary,F,Music
M,Mary,F,Reading
