# Os 25 melhores truques no Pandas.

## Importação dos Módulos

In [1]:
import pandas as pd
import numpy as np
from glob import glob

## Leituras

In [2]:
drinks =  pd.read_csv('http://bit.ly/drinksbycountry')
movies =  pd.read_csv('http://bit.ly/imdbratings')
orders =  pd.read_csv('http://bit.ly/chiporders', sep='\t')
orders['item_price'] =  orders['item_price'].str.replace('$', '').astype('float')
stocks =  pd.read_csv('http://bit.ly/smallstocks', parse_dates=['Date'])
titanic = pd.read_csv('http://bit.ly/kaggletrain')
ufo =     pd.read_csv('http://bit.ly/uforeports', parse_dates=['Time'])

  orders['item_price'] =  orders['item_price'].str.replace('$', '').astype('float')


## 1. Versões instaladas

In [3]:
# Versão da depedência pandas
pd.__version__

'1.3.2'

In [4]:
# Versões dos módulo/depedências
pd.show_versions()


INSTALLED VERSIONS
------------------
commit           : 5f648bf1706dd75a9ca0d29f26eadfbb595fe52b
python           : 3.9.6.final.0
python-bits      : 64
OS               : Windows
OS-release       : 10
Version          : 10.0.19043
machine          : AMD64
processor        : Intel64 Family 6 Model 140 Stepping 1, GenuineIntel
byteorder        : little
LC_ALL           : None
LANG             : None
LOCALE           : pt_BR.cp1252

pandas           : 1.3.2
numpy            : 1.19.5
pytz             : 2021.1
dateutil         : 2.8.2
pip              : 21.2.4
setuptools       : 56.0.0
Cython           : None
pytest           : None
hypothesis       : None
sphinx           : None
blosc            : None
feather          : None
xlsxwriter       : None
lxml.etree       : 4.6.3
html5lib         : None
pymysql          : None
psycopg2         : None
jinja2           : 3.0.1
IPython          : 7.26.0
pandas_datareader: None
bs4              : 4.9.3
bottleneck       : None
fsspec           : No

## 2. Criação de DataFrame

In [5]:
df = pd.DataFrame({'col one':[100,200], 'col two':[300,400]})
df

Unnamed: 0,col one,col two
0,100,300
1,200,400


In [6]:
pd.DataFrame(np.random.rand(4,8))

Unnamed: 0,0,1,2,3,4,5,6,7
0,0.615037,0.384852,0.208849,0.051444,0.902645,0.721576,0.759954,0.673416
1,0.254406,0.491201,0.39732,0.75918,0.297049,0.464926,0.706354,0.340211
2,0.059051,0.566215,0.994747,0.276591,0.883364,0.778544,0.903072,0.076609
3,0.130793,0.495369,0.76014,0.296455,0.456956,0.158129,0.323027,0.385949


In [7]:
pd.DataFrame(np.random.rand(4,8), columns=list('abcdefgh'))

Unnamed: 0,a,b,c,d,e,f,g,h
0,0.963783,0.015128,0.093806,0.849786,0.709443,0.683428,0.6836,0.264219
1,0.552689,0.47989,0.296532,0.95387,0.509205,0.446058,0.265027,0.326673
2,0.45316,0.655879,0.457646,0.367088,0.271207,0.84778,0.582536,0.603055
3,0.553959,0.646782,0.848419,0.907154,0.048889,0.02999,0.959354,0.595347


## 3. Renomenado colunas

In [8]:
df

Unnamed: 0,col one,col two
0,100,300
1,200,400


In [9]:
df = df.rename({'col one':'col_one', 'col two':'col_two'}, axis='columns')
df = df.rename(columns={'col one':'col_one1', 'col two':'col_two1'})
df

Unnamed: 0,col_one,col_two
0,100,300
1,200,400


In [10]:
df.columns = ['col__one','col__two']
df

Unnamed: 0,col__one,col__two
0,100,300
1,200,400


In [11]:
df.columns = df.columns.str.replace('__', '_')
df

Unnamed: 0,col_one,col_two
0,100,300
1,200,400


In [12]:
df.add_prefix('X_')

Unnamed: 0,X_col_one,X_col_two
0,100,300
1,200,400


In [13]:
df.add_suffix('_Y')

Unnamed: 0,col_one_Y,col_two_Y
0,100,300
1,200,400


## 4. Ordem reversa das linhas

In [14]:
drinks.head(3)

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
0,Afghanistan,0,0,0,0.0,Asia
1,Albania,89,132,54,4.9,Europe
2,Algeria,25,0,14,0.7,Africa


In [15]:
drinks.tail(3)

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
190,Yemen,6,0,0,0.1,Asia
191,Zambia,32,19,4,2.5,Africa
192,Zimbabwe,64,18,4,4.7,Africa


In [16]:
drinks.loc[::-1].head(3)

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
192,Zimbabwe,64,18,4,4.7,Africa
191,Zambia,32,19,4,2.5,Africa
190,Yemen,6,0,0,0.1,Asia


In [17]:
drinks.loc[::-1].reset_index(drop=True).head(3)

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
0,Zimbabwe,64,18,4,4.7,Africa
1,Zambia,32,19,4,2.5,Africa
2,Yemen,6,0,0,0.1,Asia


## 5. Ordem reversa das colunas

In [18]:
drinks.head(3)

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
0,Afghanistan,0,0,0,0.0,Asia
1,Albania,89,132,54,4.9,Europe
2,Algeria,25,0,14,0.7,Africa


In [19]:
drinks.loc[:, ::-1].head(3)

Unnamed: 0,continent,total_litres_of_pure_alcohol,wine_servings,spirit_servings,beer_servings,country
0,Asia,0.0,0,0,0,Afghanistan
1,Europe,4.9,54,132,89,Albania
2,Africa,0.7,14,0,25,Algeria


## 6. Selecionando colunas pelo data type

In [20]:
drinks.dtypes

country                          object
beer_servings                     int64
spirit_servings                   int64
wine_servings                     int64
total_litres_of_pure_alcohol    float64
continent                        object
dtype: object

In [21]:
drinks.select_dtypes(include='number').head()

Unnamed: 0,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol
0,0,0,0,0.0
1,89,132,54,4.9
2,25,0,14,0.7
3,245,138,312,12.4
4,217,57,45,5.9


In [22]:
drinks.select_dtypes(include='object').head()

Unnamed: 0,country,continent
0,Afghanistan,Asia
1,Albania,Europe
2,Algeria,Africa
3,Andorra,Europe
4,Angola,Africa


In [23]:
drinks.select_dtypes(include=['number','object','category','datetime']).head()

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
0,Afghanistan,0,0,0,0.0,Asia
1,Albania,89,132,54,4.9,Europe
2,Algeria,25,0,14,0.7,Africa
3,Andorra,245,138,312,12.4,Europe
4,Angola,217,57,45,5.9,Africa


In [24]:
drinks.select_dtypes(exclude='number').head()

Unnamed: 0,country,continent
0,Afghanistan,Asia
1,Albania,Europe
2,Algeria,Africa
3,Andorra,Europe
4,Angola,Africa


## 7. Convertendo strings para numeros

In [25]:
df = pd.DataFrame({
    'col_one':['1.1','2.2','3.3'],
    'col_two':['4.4','5.5','6.6'],
    'col_three':['7.7','8.8','-']
})
df

Unnamed: 0,col_one,col_two,col_three
0,1.1,4.4,7.7
1,2.2,5.5,8.8
2,3.3,6.6,-


In [26]:
df.dtypes

col_one      object
col_two      object
col_three    object
dtype: object

In [27]:
df.astype({'col_one':'float', 'col_two':'float'}).dtypes

col_one      float64
col_two      float64
col_three     object
dtype: object

In [28]:
# Coerce força o parse/cast de string para float
pd.to_numeric(df['col_three'], errors='coerce')

0    7.7
1    8.8
2    NaN
Name: col_three, dtype: float64

In [29]:
pd.to_numeric(df['col_three'], errors='coerce').fillna(0)

0    7.7
1    8.8
2    0.0
Name: col_three, dtype: float64

In [30]:
df = df.apply(pd.to_numeric, errors='coerce').fillna(0)
df

Unnamed: 0,col_one,col_two,col_three
0,1.1,4.4,7.7
1,2.2,5.5,8.8
2,3.3,6.6,0.0


In [31]:
df.dtypes


col_one      float64
col_two      float64
col_three    float64
dtype: object

## 8. Reduzindo o tamanho do DataFrame com Reduce

In [32]:
drinks.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 193 entries, 0 to 192
Data columns (total 6 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   country                       193 non-null    object 
 1   beer_servings                 193 non-null    int64  
 2   spirit_servings               193 non-null    int64  
 3   wine_servings                 193 non-null    int64  
 4   total_litres_of_pure_alcohol  193 non-null    float64
 5   continent                     193 non-null    object 
dtypes: float64(1), int64(3), object(2)
memory usage: 30.5 KB


In [33]:
cols = ['beer_servings', 'continent']
small_drinks = pd.read_csv('http://bit.ly/drinksbycountry', usecols=cols)
small_drinks.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 193 entries, 0 to 192
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   beer_servings  193 non-null    int64 
 1   continent      193 non-null    object
dtypes: int64(1), object(1)
memory usage: 13.7 KB


In [34]:
dtypes = {'continent':'category'}
small_drinks = pd.read_csv('http://bit.ly/drinksbycountry', usecols=cols, dtype=dtypes)
small_drinks.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 193 entries, 0 to 192
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype   
---  ------         --------------  -----   
 0   beer_servings  193 non-null    int64   
 1   continent      193 non-null    category
dtypes: category(1), int64(1)
memory usage: 2.4 KB


## 9. Construindo um dataframe a partir de um arquivo com multiplas linhas

In [35]:
pd.read_csv('data/stocks1.csv')



Unnamed: 0,Date,Close,Volume,Symbol
0,2016-10-03,31.5,14070500,CSCO
1,2016-10-03,112.52,21701800,AAPL
2,2016-10-03,57.42,19189500,MSFT


In [36]:
pd.read_csv('data/stocks2.csv')

Unnamed: 0,Date,Close,Volume,Symbol
0,2016-10-04,113.0,29736800,AAPL
1,2016-10-04,57.24,20085900,MSFT
2,2016-10-04,31.35,18460400,CSCO


In [37]:
pd.read_csv('data/stocks1.csv')

Unnamed: 0,Date,Close,Volume,Symbol
0,2016-10-03,31.5,14070500,CSCO
1,2016-10-03,112.52,21701800,AAPL
2,2016-10-03,57.42,19189500,MSFT


In [38]:
stock_files = sorted(glob('data/stocks*.csv'))
stock_files.pop(0)
stock_files

['data\\stocks1.csv', 'data\\stocks2.csv', 'data\\stocks3.csv']

In [39]:
pd.concat((pd.read_csv(file) for file in stock_files))

Unnamed: 0,Date,Close,Volume,Symbol
0,2016-10-03,31.5,14070500,CSCO
1,2016-10-03,112.52,21701800,AAPL
2,2016-10-03,57.42,19189500,MSFT
0,2016-10-04,113.0,29736800,AAPL
1,2016-10-04,57.24,20085900,MSFT
2,2016-10-04,31.35,18460400,CSCO
0,2016-10-05,57.64,16726400,MSFT
1,2016-10-05,31.59,11808600,CSCO
2,2016-10-05,113.05,21453100,AAPL


In [40]:
pd.concat((pd.read_csv(file) for file in stock_files), ignore_index=True)

Unnamed: 0,Date,Close,Volume,Symbol
0,2016-10-03,31.5,14070500,CSCO
1,2016-10-03,112.52,21701800,AAPL
2,2016-10-03,57.42,19189500,MSFT
3,2016-10-04,113.0,29736800,AAPL
4,2016-10-04,57.24,20085900,MSFT
5,2016-10-04,31.35,18460400,CSCO
6,2016-10-05,57.64,16726400,MSFT
7,2016-10-05,31.59,11808600,CSCO
8,2016-10-05,113.05,21453100,AAPL


## 10. Construindo um dataframe a partir de um arquivo com multiplas colunas

In [41]:
pd.read_csv('data/drinks1.csv').head(3)

Unnamed: 0,country,beer_servings,spirit_servings
0,Afghanistan,0,0
1,Albania,89,132
2,Algeria,25,0


In [42]:
pd.read_csv('data/drinks2.csv').head(3)

Unnamed: 0,wine_servings,total_litres_of_pure_alcohol,continent
0,0,0.0,Asia
1,54,4.9,Europe
2,14,0.7,Africa


In [43]:
drinks_files = sorted(glob('data/drinks*.csv'))
drinks_files.pop(0)
drinks_files

['data\\drinks1.csv', 'data\\drinks2.csv']

In [44]:
pd.concat((pd.read_csv(file) for file in drinks_files), axis='columns' ).head()

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
0,Afghanistan,0,0,0,0.0,Asia
1,Albania,89,132,54,4.9,Europe
2,Algeria,25,0,14,0.7,Africa
3,Andorra,245,138,312,12.4,Europe
4,Angola,217,57,45,5.9,Africa


## 11. Crie um DataFrame de um clipboard(prancheta ou área de transferência)

In [45]:
df = pd.read_clipboard()
df

Unnamed: 0,#,frac,=,"0,75",*,979,=.1,"734,25",areedondando,734
0,print(len(movies_1)),,,,,,,,,


In [46]:
df.index

RangeIndex(start=0, stop=1, step=1)

In [47]:
df.dtypes 

#                object
frac            float64
=               float64
0,75            float64
*               float64
979             float64
=.1             float64
734,25          float64
areedondando    float64
734             float64
dtype: object

In [48]:
df = pd.read_clipboard()
df

Unnamed: 0,#,frac,=,"0,75",*,979,=.1,"734,25",areedondando,734
0,print(len(movies_1)),,,,,,,,,


In [49]:
df.index

RangeIndex(start=0, stop=1, step=1)

## 12. Fazendo um split dentro de dois subsets aleatórios

In [50]:
# Tamanho da amostra
len(movies)

979

In [51]:
# Pegando uma amostra do dataframe movie
movies_1 = movies.sample(frac=0.75, random_state=1234)
movies_1.head(2)

Unnamed: 0,star_rating,title,content_rating,genre,duration,actors_list
387,8.0,Midnight Cowboy,X,Drama,113,"[u'Dustin Hoffman', u'Jon Voight', u'Sylvia Mi..."
653,7.7,Fearless,PG-13,Action,104,"[u'Jet Li', u'Li Sun', u'Yong Dong']"


In [52]:
# frac = 0,75 * 979 = 734,25 areedondando 734
print(len(movies_1))


734


In [53]:
movies_2 = movies.drop(movies_1.index)
movies_2.head(2)

Unnamed: 0,star_rating,title,content_rating,genre,duration,actors_list
1,9.2,The Godfather,R,Crime,175,"[u'Marlon Brando', u'Al Pacino', u'James Caan']"
3,9.0,The Dark Knight,PG-13,Action,152,"[u'Christian Bale', u'Heath Ledger', u'Aaron E..."


In [54]:
# deletou a linhas em comum do índice de movies1 e sobrou somente 245
print(len(movies_2))

245


In [55]:
len(movies_1) + len(movies_2) 

979

In [56]:
movies_1.index.sort_values()

Int64Index([  0,   2,   5,   6,   7,   8,   9,  11,  13,  16,
            ...
            966, 967, 969, 971, 972, 974, 975, 976, 977, 978],
           dtype='int64', length=734)

In [57]:
movies_2.index.sort_values()

Int64Index([  1,   3,   4,  10,  12,  14,  15,  18,  26,  30,
            ...
            931, 934, 937, 941, 950, 954, 960, 968, 970, 973],
           dtype='int64', length=245)

## 13. Filtrando o DataFrame por multiplas categorias

In [58]:
movies.head()

Unnamed: 0,star_rating,title,content_rating,genre,duration,actors_list
0,9.3,The Shawshank Redemption,R,Crime,142,"[u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt..."
1,9.2,The Godfather,R,Crime,175,"[u'Marlon Brando', u'Al Pacino', u'James Caan']"
2,9.1,The Godfather: Part II,R,Crime,200,"[u'Al Pacino', u'Robert De Niro', u'Robert Duv..."
3,9.0,The Dark Knight,PG-13,Action,152,"[u'Christian Bale', u'Heath Ledger', u'Aaron E..."
4,8.9,Pulp Fiction,R,Crime,154,"[u'John Travolta', u'Uma Thurman', u'Samuel L...."


In [59]:
movies['genre'].unique()


array(['Crime', 'Action', 'Drama', 'Western', 'Adventure', 'Biography',
       'Comedy', 'Animation', 'Mystery', 'Horror', 'Film-Noir', 'Sci-Fi',
       'History', 'Thriller', 'Family', 'Fantasy'], dtype=object)

In [60]:
movies[(movies['genre'] == 'Action')|
       (movies['genre'] == 'Drama')|
       (movies['genre'] == 'Western')].head()


Unnamed: 0,star_rating,title,content_rating,genre,duration,actors_list
3,9.0,The Dark Knight,PG-13,Action,152,"[u'Christian Bale', u'Heath Ledger', u'Aaron E..."
5,8.9,12 Angry Men,NOT RATED,Drama,96,"[u'Henry Fonda', u'Lee J. Cobb', u'Martin Bals..."
6,8.9,"The Good, the Bad and the Ugly",NOT RATED,Western,161,"[u'Clint Eastwood', u'Eli Wallach', u'Lee Van ..."
9,8.9,Fight Club,R,Drama,139,"[u'Brad Pitt', u'Edward Norton', u'Helena Bonh..."
11,8.8,Inception,PG-13,Action,148,"[u'Leonardo DiCaprio', u'Joseph Gordon-Levitt'..."


## 14. Filtrando o DataFrame por grandes categorias

## 15. Lidando com valores faltantes

## 16. Fazendo o split de uma string dentro de multiplas colunas

## 17. Espandindo uma lista de Series dentro de um DataFrame

## 18. Agregando (agregate) por multiplas funções

## 19. Combinando a saída com a agregação do DataFrame

## 20. Selecionando slice(pedaço/fatiamento) das linhas e colunas

## 21. Remodelar (Reshape) um índice multiplo (MultiIndexed) Series

## 22. Criando uma tabela dinâmica (pivot table ou análise dinâmica)

## 23. Conversão continua de data dentro dados categoricos (categorical data)

## 24. Mudança do display de opções

## 25. Estilo(Style) do DataFrame