# Os 25 melhores truques no Pandas.

## Importação dos Módulos

In [1]:
import pandas as pd
import numpy as np
from glob import glob

## Leituras

In [2]:
drinks =  pd.read_csv('http://bit.ly/drinksbycountry')
movies =  pd.read_csv('http://bit.ly/imdbratings')
orders =  pd.read_csv('http://bit.ly/chiporders', sep='\t')
orders['item_price'] =  orders['item_price'].str.replace('$', '').astype('float')
stocks =  pd.read_csv('http://bit.ly/smallstocks', parse_dates=['Date'])
titanic = pd.read_csv('http://bit.ly/kaggletrain')
ufo =     pd.read_csv('http://bit.ly/uforeports', parse_dates=['Time'])

  orders['item_price'] =  orders['item_price'].str.replace('$', '').astype('float')


## 1. Versões instaladas

In [3]:
# Versão da depedência pandas
pd.__version__

'1.3.2'

In [4]:
# Versões dos módulo/depedências
pd.show_versions()


INSTALLED VERSIONS
------------------
commit           : 5f648bf1706dd75a9ca0d29f26eadfbb595fe52b
python           : 3.9.6.final.0
python-bits      : 64
OS               : Windows
OS-release       : 10
Version          : 10.0.19043
machine          : AMD64
processor        : Intel64 Family 6 Model 140 Stepping 1, GenuineIntel
byteorder        : little
LC_ALL           : None
LANG             : None
LOCALE           : pt_BR.cp1252

pandas           : 1.3.2
numpy            : 1.19.5
pytz             : 2021.1
dateutil         : 2.8.2
pip              : 21.2.4
setuptools       : 56.0.0
Cython           : None
pytest           : None
hypothesis       : None
sphinx           : None
blosc            : None
feather          : None
xlsxwriter       : None
lxml.etree       : 4.6.3
html5lib         : None
pymysql          : None
psycopg2         : None
jinja2           : 3.0.1
IPython          : 7.26.0
pandas_datareader: None
bs4              : 4.9.3
bottleneck       : None
fsspec           : No

## 2. Criação de DataFrame

In [5]:
df = pd.DataFrame({'col one':[100,200], 'col two':[300,400]})
df

Unnamed: 0,col one,col two
0,100,300
1,200,400


In [6]:
pd.DataFrame(np.random.rand(4,8))

Unnamed: 0,0,1,2,3,4,5,6,7
0,0.370061,0.765446,0.415796,0.239025,0.314363,0.621996,0.513454,0.407561
1,0.896636,0.534492,0.368594,0.040571,0.045841,0.499936,0.096697,0.308721
2,0.447688,0.812923,0.879864,0.202579,0.144042,0.552272,0.859138,0.863564
3,0.279718,0.28509,0.905092,0.171057,0.740518,0.974531,0.274656,0.918098


In [7]:
pd.DataFrame(np.random.rand(4,8), columns=list('abcdefgh'))

Unnamed: 0,a,b,c,d,e,f,g,h
0,0.147195,0.502499,0.158848,0.848558,0.295841,0.466156,0.802716,0.670539
1,0.038534,0.141244,0.090426,0.11027,0.753155,0.913637,0.528655,0.783902
2,0.335127,0.167095,0.353208,0.127882,0.166123,0.169914,0.779472,0.249793
3,0.081576,0.599258,0.817793,0.660264,0.634039,0.084171,0.776128,0.959354


## 3. Renomenado colunas

In [8]:
df

Unnamed: 0,col one,col two
0,100,300
1,200,400


In [9]:
df = df.rename({'col one':'col_one', 'col two':'col_two'}, axis='columns')
df = df.rename(columns={'col one':'col_one1', 'col two':'col_two1'})
df

Unnamed: 0,col_one,col_two
0,100,300
1,200,400


In [10]:
df.columns = ['col__one','col__two']
df

Unnamed: 0,col__one,col__two
0,100,300
1,200,400


In [11]:
df.columns = df.columns.str.replace('__', '_')
df

Unnamed: 0,col_one,col_two
0,100,300
1,200,400


In [12]:
df.add_prefix('X_')

Unnamed: 0,X_col_one,X_col_two
0,100,300
1,200,400


In [13]:
df.add_suffix('_Y')

Unnamed: 0,col_one_Y,col_two_Y
0,100,300
1,200,400


## 4. Ordem reversa das linhas

In [14]:
drinks.head(3)

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
0,Afghanistan,0,0,0,0.0,Asia
1,Albania,89,132,54,4.9,Europe
2,Algeria,25,0,14,0.7,Africa


In [15]:
drinks.tail(3)

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
190,Yemen,6,0,0,0.1,Asia
191,Zambia,32,19,4,2.5,Africa
192,Zimbabwe,64,18,4,4.7,Africa


In [16]:
drinks.loc[::-1].head(3)

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
192,Zimbabwe,64,18,4,4.7,Africa
191,Zambia,32,19,4,2.5,Africa
190,Yemen,6,0,0,0.1,Asia


In [17]:
drinks.loc[::-1].reset_index(drop=True).head(3)

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
0,Zimbabwe,64,18,4,4.7,Africa
1,Zambia,32,19,4,2.5,Africa
2,Yemen,6,0,0,0.1,Asia


## 5. Ordem reversa das colunas

In [18]:
drinks.head(3)

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
0,Afghanistan,0,0,0,0.0,Asia
1,Albania,89,132,54,4.9,Europe
2,Algeria,25,0,14,0.7,Africa


In [19]:
drinks.loc[:, ::-1].head(3)

Unnamed: 0,continent,total_litres_of_pure_alcohol,wine_servings,spirit_servings,beer_servings,country
0,Asia,0.0,0,0,0,Afghanistan
1,Europe,4.9,54,132,89,Albania
2,Africa,0.7,14,0,25,Algeria


## 6. Selecionando colunas pelo data type

In [20]:
drinks.dtypes

country                          object
beer_servings                     int64
spirit_servings                   int64
wine_servings                     int64
total_litres_of_pure_alcohol    float64
continent                        object
dtype: object

In [21]:
drinks.select_dtypes(include='number').head()

Unnamed: 0,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol
0,0,0,0,0.0
1,89,132,54,4.9
2,25,0,14,0.7
3,245,138,312,12.4
4,217,57,45,5.9


In [22]:
drinks.select_dtypes(include='object').head()

Unnamed: 0,country,continent
0,Afghanistan,Asia
1,Albania,Europe
2,Algeria,Africa
3,Andorra,Europe
4,Angola,Africa


In [23]:
drinks.select_dtypes(include=['number','object','category','datetime']).head()

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
0,Afghanistan,0,0,0,0.0,Asia
1,Albania,89,132,54,4.9,Europe
2,Algeria,25,0,14,0.7,Africa
3,Andorra,245,138,312,12.4,Europe
4,Angola,217,57,45,5.9,Africa


In [24]:
drinks.select_dtypes(exclude='number').head()

Unnamed: 0,country,continent
0,Afghanistan,Asia
1,Albania,Europe
2,Algeria,Africa
3,Andorra,Europe
4,Angola,Africa


## 7. Convertendo strings para numeros

In [25]:
df = pd.DataFrame({
    'col_one':['1.1','2.2','3.3'],
    'col_two':['4.4','5.5','6.6'],
    'col_three':['7.7','8.8','-']
})
df

Unnamed: 0,col_one,col_two,col_three
0,1.1,4.4,7.7
1,2.2,5.5,8.8
2,3.3,6.6,-


In [26]:
df.dtypes

col_one      object
col_two      object
col_three    object
dtype: object

In [27]:
df.astype({'col_one':'float', 'col_two':'float'}).dtypes

col_one      float64
col_two      float64
col_three     object
dtype: object

In [28]:
# Coerce força o parse/cast de string para float
pd.to_numeric(df['col_three'], errors='coerce')

0    7.7
1    8.8
2    NaN
Name: col_three, dtype: float64

In [29]:
pd.to_numeric(df['col_three'], errors='coerce').fillna(0)

0    7.7
1    8.8
2    0.0
Name: col_three, dtype: float64

In [30]:
df = df.apply(pd.to_numeric, errors='coerce').fillna(0)
df

Unnamed: 0,col_one,col_two,col_three
0,1.1,4.4,7.7
1,2.2,5.5,8.8
2,3.3,6.6,0.0


In [31]:
df.dtypes


col_one      float64
col_two      float64
col_three    float64
dtype: object

## 8. Reduzindo o tamanho do DataFrame com Reduce

In [32]:
drinks.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 193 entries, 0 to 192
Data columns (total 6 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   country                       193 non-null    object 
 1   beer_servings                 193 non-null    int64  
 2   spirit_servings               193 non-null    int64  
 3   wine_servings                 193 non-null    int64  
 4   total_litres_of_pure_alcohol  193 non-null    float64
 5   continent                     193 non-null    object 
dtypes: float64(1), int64(3), object(2)
memory usage: 30.5 KB


In [33]:
cols = ['beer_servings', 'continent']
small_drinks = pd.read_csv('http://bit.ly/drinksbycountry', usecols=cols)
small_drinks.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 193 entries, 0 to 192
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   beer_servings  193 non-null    int64 
 1   continent      193 non-null    object
dtypes: int64(1), object(1)
memory usage: 13.7 KB


In [34]:
dtypes = {'continent':'category'}
small_drinks = pd.read_csv('http://bit.ly/drinksbycountry', usecols=cols, dtype=dtypes)
small_drinks.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 193 entries, 0 to 192
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype   
---  ------         --------------  -----   
 0   beer_servings  193 non-null    int64   
 1   continent      193 non-null    category
dtypes: category(1), int64(1)
memory usage: 2.4 KB


## 9. Construindo um dataframe a partir de um arquivo com multiplas linhas

In [35]:
pd.read_csv('data/stocks1.csv')
pd.read_csv('data/stocks2.csv')
pd.read_csv('data/stocks1.csv')

FileNotFoundError: [Errno 2] No such file or directory: 'data/stocks1.csv'

## 10. Construindo um dataframe a partir de um arquivo com multiplas colunas

## 11. Crie um DataFrame de um clipboard

## 12. Fazendo um split dentro de dois subsets aleatórios

## 13. Filtrando o DataFrame por multiplas categorias

## 14. Filtrando o DataFrame por grandes categorias

## 15. Lidando com valores faltantes

## 16. Fazendo o split de uma string dentro de multiplas colunas

## 17. Espandindo uma lista de Series dentro de um DataFrame

## 18. Agregando (agregate) por multiplas funções

## 19. Combinando a saída com a agregação do DataFrame

## 20. Selecionando slice(pedaço/fatiamento) das linhas e colunas

## 21. Remodelar (Reshape) um índice multiplo (MultiIndexed) Series

## 22. Criando uma tabela dinâmica (pivot table ou análise dinâmica)

## 23. Conversão continua de data dentro dados categoricos (categorical data)

## 24. Mudança do display de opções

## 25. Estilo(Style) do DataFrame