# Os 25 melhores truques no Pandas.

## Importação dos Módulos

In [1]:
import pandas as pd
import numpy as np
from glob import glob

## Leituras

In [2]:
drinks =  pd.read_csv('http://bit.ly/drinksbycountry')
movies =  pd.read_csv('http://bit.ly/imdbratings')
orders =  pd.read_csv('http://bit.ly/chiporders', sep='\t')
orders['item_price'] =  orders['item_price'].str.replace('$', '').astype('float')
stocks =  pd.read_csv('http://bit.ly/smallstocks', parse_dates=['Date'])
titanic = pd.read_csv('http://bit.ly/kaggletrain')
ufo =     pd.read_csv('http://bit.ly/uforeports', parse_dates=['Time'])

  orders['item_price'] =  orders['item_price'].str.replace('$', '').astype('float')


## 1. Versões instaladas

In [3]:
# Versão da depedência pandas
pd.__version__

'1.3.2'

In [4]:
# Versões dos módulo/depedências
pd.show_versions()


INSTALLED VERSIONS
------------------
commit           : 5f648bf1706dd75a9ca0d29f26eadfbb595fe52b
python           : 3.9.6.final.0
python-bits      : 64
OS               : Windows
OS-release       : 10
Version          : 10.0.19043
machine          : AMD64
processor        : Intel64 Family 6 Model 140 Stepping 1, GenuineIntel
byteorder        : little
LC_ALL           : None
LANG             : None
LOCALE           : pt_BR.cp1252

pandas           : 1.3.2
numpy            : 1.19.5
pytz             : 2021.1
dateutil         : 2.8.2
pip              : 21.2.4
setuptools       : 56.0.0
Cython           : None
pytest           : None
hypothesis       : None
sphinx           : None
blosc            : None
feather          : None
xlsxwriter       : None
lxml.etree       : 4.6.3
html5lib         : None
pymysql          : None
psycopg2         : None
jinja2           : 3.0.1
IPython          : 7.26.0
pandas_datareader: None
bs4              : 4.9.3
bottleneck       : None
fsspec           : No

## 2. Criação de DataFrame

In [5]:
df = pd.DataFrame({'col one':[100,200], 'col two':[300,400]})
df

Unnamed: 0,col one,col two
0,100,300
1,200,400


In [6]:
pd.DataFrame(np.random.rand(4,8))

Unnamed: 0,0,1,2,3,4,5,6,7
0,0.162603,0.391556,0.884613,0.439639,0.232001,0.732367,0.07458,0.744443
1,0.83142,0.850088,0.888889,0.701255,0.264703,0.445417,0.913952,0.705596
2,0.270435,0.977408,0.707752,0.092634,0.355886,0.804171,0.947503,0.330454
3,0.489354,0.823011,0.983196,0.431554,0.043537,0.513119,0.65729,0.733349


In [7]:
pd.DataFrame(np.random.rand(4,8), columns=list('abcdefgh'))

Unnamed: 0,a,b,c,d,e,f,g,h
0,0.023229,0.759228,0.252219,0.923301,0.151511,0.999653,0.502277,0.183224
1,0.588849,0.908312,0.728777,0.158002,0.40003,0.124694,0.836649,0.449131
2,0.200326,0.768707,0.458429,0.51666,0.248354,0.259245,0.552973,0.530248
3,0.138348,0.361642,0.132261,0.248892,0.369138,0.325353,0.959836,0.25362


## 3. Renomenado colunas

In [8]:
df

Unnamed: 0,col one,col two
0,100,300
1,200,400


In [9]:
df = df.rename({'col one':'col_one', 'col two':'col_two'}, axis='columns')
df = df.rename(columns={'col one':'col_one1', 'col two':'col_two1'})
df

Unnamed: 0,col_one,col_two
0,100,300
1,200,400


In [10]:
df.columns = ['col__one','col__two']
df

Unnamed: 0,col__one,col__two
0,100,300
1,200,400


In [11]:
df.columns = df.columns.str.replace('__', '_')
df

Unnamed: 0,col_one,col_two
0,100,300
1,200,400


In [12]:
df.add_prefix('X_')

Unnamed: 0,X_col_one,X_col_two
0,100,300
1,200,400


In [13]:
df.add_suffix('_Y')

Unnamed: 0,col_one_Y,col_two_Y
0,100,300
1,200,400


## 4. Ordem reversa das linhas

In [14]:
drinks.head(3)

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
0,Afghanistan,0,0,0,0.0,Asia
1,Albania,89,132,54,4.9,Europe
2,Algeria,25,0,14,0.7,Africa


In [15]:
drinks.tail(3)

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
190,Yemen,6,0,0,0.1,Asia
191,Zambia,32,19,4,2.5,Africa
192,Zimbabwe,64,18,4,4.7,Africa


In [16]:
drinks.loc[::-1].head(3)

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
192,Zimbabwe,64,18,4,4.7,Africa
191,Zambia,32,19,4,2.5,Africa
190,Yemen,6,0,0,0.1,Asia


In [17]:
drinks.loc[::-1].reset_index(drop=True).head(3)

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
0,Zimbabwe,64,18,4,4.7,Africa
1,Zambia,32,19,4,2.5,Africa
2,Yemen,6,0,0,0.1,Asia


## 5. Ordem reversa das colunas

In [18]:
drinks.head(3)

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
0,Afghanistan,0,0,0,0.0,Asia
1,Albania,89,132,54,4.9,Europe
2,Algeria,25,0,14,0.7,Africa


In [19]:
drinks.loc[:, ::-1].head(3)

Unnamed: 0,continent,total_litres_of_pure_alcohol,wine_servings,spirit_servings,beer_servings,country
0,Asia,0.0,0,0,0,Afghanistan
1,Europe,4.9,54,132,89,Albania
2,Africa,0.7,14,0,25,Algeria


## 6. Selecionando colunas pelo data type

In [20]:
drinks.dtypes

country                          object
beer_servings                     int64
spirit_servings                   int64
wine_servings                     int64
total_litres_of_pure_alcohol    float64
continent                        object
dtype: object

In [21]:
drinks.select_dtypes(include='number').head()

Unnamed: 0,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol
0,0,0,0,0.0
1,89,132,54,4.9
2,25,0,14,0.7
3,245,138,312,12.4
4,217,57,45,5.9


In [22]:
drinks.select_dtypes(include='object').head()

Unnamed: 0,country,continent
0,Afghanistan,Asia
1,Albania,Europe
2,Algeria,Africa
3,Andorra,Europe
4,Angola,Africa


In [23]:
drinks.select_dtypes(include=['number','object','category','datetime']).head()

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
0,Afghanistan,0,0,0,0.0,Asia
1,Albania,89,132,54,4.9,Europe
2,Algeria,25,0,14,0.7,Africa
3,Andorra,245,138,312,12.4,Europe
4,Angola,217,57,45,5.9,Africa


In [24]:
drinks.select_dtypes(exclude='number').head()

Unnamed: 0,country,continent
0,Afghanistan,Asia
1,Albania,Europe
2,Algeria,Africa
3,Andorra,Europe
4,Angola,Africa


## 7. Convertendo strings para numeros

In [25]:
df = pd.DataFrame({
    'col_one':['1.1','2.2','3.3'],
    'col_two':['4.4','5.5','6.6'],
    'col_three':['7.7','8.8','-']
})
df

Unnamed: 0,col_one,col_two,col_three
0,1.1,4.4,7.7
1,2.2,5.5,8.8
2,3.3,6.6,-


In [26]:
df.dtypes

col_one      object
col_two      object
col_three    object
dtype: object

In [27]:
df.astype({'col_one':'float', 'col_two':'float'}).dtypes

col_one      float64
col_two      float64
col_three     object
dtype: object

In [28]:
# Coerce força o parse/cast de string para float
pd.to_numeric(df['col_three'], errors='coerce')

0    7.7
1    8.8
2    NaN
Name: col_three, dtype: float64

In [29]:
pd.to_numeric(df['col_three'], errors='coerce').fillna(0)

0    7.7
1    8.8
2    0.0
Name: col_three, dtype: float64

In [30]:
df = df.apply(pd.to_numeric, errors='coerce').fillna(0)
df

Unnamed: 0,col_one,col_two,col_three
0,1.1,4.4,7.7
1,2.2,5.5,8.8
2,3.3,6.6,0.0


In [31]:
df.dtypes


col_one      float64
col_two      float64
col_three    float64
dtype: object

## 8. Reduzindo o tamanho do DataFrame com Reduce

In [32]:
drinks.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 193 entries, 0 to 192
Data columns (total 6 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   country                       193 non-null    object 
 1   beer_servings                 193 non-null    int64  
 2   spirit_servings               193 non-null    int64  
 3   wine_servings                 193 non-null    int64  
 4   total_litres_of_pure_alcohol  193 non-null    float64
 5   continent                     193 non-null    object 
dtypes: float64(1), int64(3), object(2)
memory usage: 30.5 KB


In [33]:
cols = ['beer_servings', 'continent']
small_drinks = pd.read_csv('http://bit.ly/drinksbycountry', usecols=cols)
small_drinks.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 193 entries, 0 to 192
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   beer_servings  193 non-null    int64 
 1   continent      193 non-null    object
dtypes: int64(1), object(1)
memory usage: 13.7 KB


In [34]:
dtypes = {'continent':'category'}
small_drinks = pd.read_csv('http://bit.ly/drinksbycountry', usecols=cols, dtype=dtypes)
small_drinks.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 193 entries, 0 to 192
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype   
---  ------         --------------  -----   
 0   beer_servings  193 non-null    int64   
 1   continent      193 non-null    category
dtypes: category(1), int64(1)
memory usage: 2.4 KB


## 9. Construindo um dataframe a partir de um arquivo com multiplas linhas

In [35]:
pd.read_csv('data/stocks1.csv')



Unnamed: 0,Date,Close,Volume,Symbol
0,2016-10-03,31.5,14070500,CSCO
1,2016-10-03,112.52,21701800,AAPL
2,2016-10-03,57.42,19189500,MSFT


In [36]:
pd.read_csv('data/stocks2.csv')

Unnamed: 0,Date,Close,Volume,Symbol
0,2016-10-04,113.0,29736800,AAPL
1,2016-10-04,57.24,20085900,MSFT
2,2016-10-04,31.35,18460400,CSCO


In [37]:
pd.read_csv('data/stocks1.csv')

Unnamed: 0,Date,Close,Volume,Symbol
0,2016-10-03,31.5,14070500,CSCO
1,2016-10-03,112.52,21701800,AAPL
2,2016-10-03,57.42,19189500,MSFT


In [38]:
stock_files = sorted(glob('data/stocks*.csv'))
stock_files.pop(0)
stock_files

['data\\stocks1.csv', 'data\\stocks2.csv', 'data\\stocks3.csv']

In [39]:
pd.concat((pd.read_csv(file) for file in stock_files))

Unnamed: 0,Date,Close,Volume,Symbol
0,2016-10-03,31.5,14070500,CSCO
1,2016-10-03,112.52,21701800,AAPL
2,2016-10-03,57.42,19189500,MSFT
0,2016-10-04,113.0,29736800,AAPL
1,2016-10-04,57.24,20085900,MSFT
2,2016-10-04,31.35,18460400,CSCO
0,2016-10-05,57.64,16726400,MSFT
1,2016-10-05,31.59,11808600,CSCO
2,2016-10-05,113.05,21453100,AAPL


In [40]:
pd.concat((pd.read_csv(file) for file in stock_files), ignore_index=True)

Unnamed: 0,Date,Close,Volume,Symbol
0,2016-10-03,31.5,14070500,CSCO
1,2016-10-03,112.52,21701800,AAPL
2,2016-10-03,57.42,19189500,MSFT
3,2016-10-04,113.0,29736800,AAPL
4,2016-10-04,57.24,20085900,MSFT
5,2016-10-04,31.35,18460400,CSCO
6,2016-10-05,57.64,16726400,MSFT
7,2016-10-05,31.59,11808600,CSCO
8,2016-10-05,113.05,21453100,AAPL


## 10. Construindo um dataframe a partir de um arquivo com multiplas colunas

In [41]:
pd.read_csv('data/drinks1.csv').head(3)

Unnamed: 0,country,beer_servings,spirit_servings
0,Afghanistan,0,0
1,Albania,89,132
2,Algeria,25,0


In [42]:
pd.read_csv('data/drinks2.csv').head(3)

Unnamed: 0,wine_servings,total_litres_of_pure_alcohol,continent
0,0,0.0,Asia
1,54,4.9,Europe
2,14,0.7,Africa


In [43]:
drinks_files = sorted(glob('data/drinks*.csv'))
drinks_files.pop(0)
drinks_files

['data\\drinks1.csv', 'data\\drinks2.csv']

In [44]:
pd.concat((pd.read_csv(file) for file in drinks_files), axis='columns' ).head()

Unnamed: 0,country,beer_servings,spirit_servings,wine_servings,total_litres_of_pure_alcohol,continent
0,Afghanistan,0,0,0,0.0,Asia
1,Albania,89,132,54,4.9,Europe
2,Algeria,25,0,14,0.7,Africa
3,Andorra,245,138,312,12.4,Europe
4,Angola,217,57,45,5.9,Africa


## 11. Crie um DataFrame de um clipboard(prancheta ou área de transferência)

In [45]:
df = pd.read_clipboard()
df

Unnamed: 0,for,i,in,range(len(new_columns)


In [46]:
df.index

Index([], dtype='object')

In [47]:
df.dtypes 

for                       object
i                         object
in                        object
range(len(new_columns)    object
dtype: object

In [48]:
df = pd.read_clipboard()
df

Unnamed: 0,for,i,in,range(len(new_columns)


In [49]:
df.index

Index([], dtype='object')

## 12. Fazendo um split dentro de dois subsets aleatórios

In [50]:
# Tamanho da amostra
len(movies)

979

In [51]:
# Pegando uma amostra do dataframe movie
movies_1 = movies.sample(frac=0.75, random_state=1234)
movies_1.head(2)

Unnamed: 0,star_rating,title,content_rating,genre,duration,actors_list
387,8.0,Midnight Cowboy,X,Drama,113,"[u'Dustin Hoffman', u'Jon Voight', u'Sylvia Mi..."
653,7.7,Fearless,PG-13,Action,104,"[u'Jet Li', u'Li Sun', u'Yong Dong']"


In [52]:
# frac = 0,75 * 979 = 734,25 areedondando 734
print(len(movies_1))


734


In [53]:
movies_2 = movies.drop(movies_1.index)
movies_2.head(2)

Unnamed: 0,star_rating,title,content_rating,genre,duration,actors_list
1,9.2,The Godfather,R,Crime,175,"[u'Marlon Brando', u'Al Pacino', u'James Caan']"
3,9.0,The Dark Knight,PG-13,Action,152,"[u'Christian Bale', u'Heath Ledger', u'Aaron E..."


In [54]:
# deletou a linhas em comum do índice de movies1 e sobrou somente 245
print(len(movies_2))

245


In [55]:
len(movies_1) + len(movies_2) 

979

In [56]:
movies_1.index.sort_values()

Int64Index([  0,   2,   5,   6,   7,   8,   9,  11,  13,  16,
            ...
            966, 967, 969, 971, 972, 974, 975, 976, 977, 978],
           dtype='int64', length=734)

In [57]:
movies_2.index.sort_values()

Int64Index([  1,   3,   4,  10,  12,  14,  15,  18,  26,  30,
            ...
            931, 934, 937, 941, 950, 954, 960, 968, 970, 973],
           dtype='int64', length=245)

## 13. Filtrando o DataFrame por multiplas categorias

In [58]:
movies.head()

Unnamed: 0,star_rating,title,content_rating,genre,duration,actors_list
0,9.3,The Shawshank Redemption,R,Crime,142,"[u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt..."
1,9.2,The Godfather,R,Crime,175,"[u'Marlon Brando', u'Al Pacino', u'James Caan']"
2,9.1,The Godfather: Part II,R,Crime,200,"[u'Al Pacino', u'Robert De Niro', u'Robert Duv..."
3,9.0,The Dark Knight,PG-13,Action,152,"[u'Christian Bale', u'Heath Ledger', u'Aaron E..."
4,8.9,Pulp Fiction,R,Crime,154,"[u'John Travolta', u'Uma Thurman', u'Samuel L...."


In [59]:
movies['genre'].unique()


array(['Crime', 'Action', 'Drama', 'Western', 'Adventure', 'Biography',
       'Comedy', 'Animation', 'Mystery', 'Horror', 'Film-Noir', 'Sci-Fi',
       'History', 'Thriller', 'Family', 'Fantasy'], dtype=object)

In [60]:
movies[(movies['genre'] == 'Action')|
       (movies['genre'] == 'Drama')|
       (movies['genre'] == 'Western')].head()


Unnamed: 0,star_rating,title,content_rating,genre,duration,actors_list
3,9.0,The Dark Knight,PG-13,Action,152,"[u'Christian Bale', u'Heath Ledger', u'Aaron E..."
5,8.9,12 Angry Men,NOT RATED,Drama,96,"[u'Henry Fonda', u'Lee J. Cobb', u'Martin Bals..."
6,8.9,"The Good, the Bad and the Ugly",NOT RATED,Western,161,"[u'Clint Eastwood', u'Eli Wallach', u'Lee Van ..."
9,8.9,Fight Club,R,Drama,139,"[u'Brad Pitt', u'Edward Norton', u'Helena Bonh..."
11,8.8,Inception,PG-13,Action,148,"[u'Leonardo DiCaprio', u'Joseph Gordon-Levitt'..."


In [61]:
# filtra somente os valores True no método isin.
movies[(movies['genre'].isin(['Action', 'Drama', 'Western']))].head()

Unnamed: 0,star_rating,title,content_rating,genre,duration,actors_list
3,9.0,The Dark Knight,PG-13,Action,152,"[u'Christian Bale', u'Heath Ledger', u'Aaron E..."
5,8.9,12 Angry Men,NOT RATED,Drama,96,"[u'Henry Fonda', u'Lee J. Cobb', u'Martin Bals..."
6,8.9,"The Good, the Bad and the Ugly",NOT RATED,Western,161,"[u'Clint Eastwood', u'Eli Wallach', u'Lee Van ..."
9,8.9,Fight Club,R,Drama,139,"[u'Brad Pitt', u'Edward Norton', u'Helena Bonh..."
11,8.8,Inception,PG-13,Action,148,"[u'Leonardo DiCaprio', u'Joseph Gordon-Levitt'..."


In [62]:
# filtra somente os valores Falsos no método isin.
movies[~(movies['genre'].isin(['Action', 'Drama', 'Western']))].head()

Unnamed: 0,star_rating,title,content_rating,genre,duration,actors_list
0,9.3,The Shawshank Redemption,R,Crime,142,"[u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt..."
1,9.2,The Godfather,R,Crime,175,"[u'Marlon Brando', u'Al Pacino', u'James Caan']"
2,9.1,The Godfather: Part II,R,Crime,200,"[u'Al Pacino', u'Robert De Niro', u'Robert Duv..."
4,8.9,Pulp Fiction,R,Crime,154,"[u'John Travolta', u'Uma Thurman', u'Samuel L...."
7,8.9,The Lord of the Rings: The Return of the King,PG-13,Adventure,201,"[u'Elijah Wood', u'Viggo Mortensen', u'Ian McK..."


## 14. Filtrando o DataFrame por grandes categorias

In [63]:
counts = movies['genre'].value_counts()
counts

Drama        278
Comedy       156
Action       136
Crime        124
Biography     77
Adventure     75
Animation     62
Horror        29
Mystery       16
Western        9
Sci-Fi         5
Thriller       5
Film-Noir      3
Family         2
History        1
Fantasy        1
Name: genre, dtype: int64

In [64]:
counts.nlargest(3)

Drama     278
Comedy    156
Action    136
Name: genre, dtype: int64

In [65]:
counts.nlargest(3).index

Index(['Drama', 'Comedy', 'Action'], dtype='object')

In [66]:
counts.nsmallest(3)

History    1
Fantasy    1
Family     2
Name: genre, dtype: int64

In [67]:
counts.nsmallest(3).index

Index(['History', 'Fantasy', 'Family'], dtype='object')

In [68]:
movies[movies['genre'].isin(counts.nlargest(3).index)].head(2)

Unnamed: 0,star_rating,title,content_rating,genre,duration,actors_list
3,9.0,The Dark Knight,PG-13,Action,152,"[u'Christian Bale', u'Heath Ledger', u'Aaron E..."
5,8.9,12 Angry Men,NOT RATED,Drama,96,"[u'Henry Fonda', u'Lee J. Cobb', u'Martin Bals..."


In [69]:
movies[movies['genre'].isin(counts.nsmallest(3).index)].head(2)

Unnamed: 0,star_rating,title,content_rating,genre,duration,actors_list
338,8.0,Battleship Potemkin,UNRATED,History,66,"[u'Aleksandr Antonov', u'Vladimir Barsky', u'G..."
468,7.9,E.T. the Extra-Terrestrial,PG,Family,115,"[u'Henry Thomas', u'Drew Barrymore', u'Peter C..."


In [70]:
movies.groupby('genre')['genre'].count()

genre
Action       136
Adventure     75
Animation     62
Biography     77
Comedy       156
Crime        124
Drama        278
Family         2
Fantasy        1
Film-Noir      3
History        1
Horror        29
Mystery       16
Sci-Fi         5
Thriller       5
Western        9
Name: genre, dtype: int64

## 15. Lidando com valores faltantes

In [71]:
ufo.head()


Unnamed: 0,City,Colors Reported,Shape Reported,State,Time
0,Ithaca,,TRIANGLE,NY,1930-06-01 22:00:00
1,Willingboro,,OTHER,NJ,1930-06-30 20:00:00
2,Holyoke,,OVAL,CO,1931-02-15 14:00:00
3,Abilene,,DISK,KS,1931-06-01 13:00:00
4,New York Worlds Fair,,LIGHT,NY,1933-04-18 19:00:00


In [72]:
ufo.isna().sum()

City                  25
Colors Reported    15359
Shape Reported      2644
State                  0
Time                   0
dtype: int64

In [73]:
ufo.isna()

Unnamed: 0,City,Colors Reported,Shape Reported,State,Time
0,False,True,False,False,False
1,False,True,False,False,False
2,False,True,False,False,False
3,False,True,False,False,False
4,False,True,False,False,False
...,...,...,...,...,...
18236,False,True,False,False,False
18237,False,True,False,False,False
18238,False,True,True,False,False
18239,False,False,False,False,False


In [74]:
ufo.isna().mean()

City               0.001371
Colors Reported    0.842004
Shape Reported     0.144948
State              0.000000
Time               0.000000
dtype: float64

In [75]:
ufo.dropna(axis='columns').head()

Unnamed: 0,State,Time
0,NY,1930-06-01 22:00:00
1,NJ,1930-06-30 20:00:00
2,CO,1931-02-15 14:00:00
3,KS,1931-06-01 13:00:00
4,NY,1933-04-18 19:00:00


In [76]:
ufo.dropna(thresh=len(ufo)*0.9, axis='columns').head()

Unnamed: 0,City,State,Time
0,Ithaca,NY,1930-06-01 22:00:00
1,Willingboro,NJ,1930-06-30 20:00:00
2,Holyoke,CO,1931-02-15 14:00:00
3,Abilene,KS,1931-06-01 13:00:00
4,New York Worlds Fair,NY,1933-04-18 19:00:00


## 16. Fazendo o split de uma string dentro de multiplas colunas

In [77]:
df = pd.DataFrame({'name':['John Arthur Doe','Jane Ann Smith'],'location':['Los Angeles, CA','Washington, DC']})
df 


Unnamed: 0,name,location
0,John Arthur Doe,"Los Angeles, CA"
1,Jane Ann Smith,"Washington, DC"


In [78]:
df['name'].str.split(' ', expand=True)

Unnamed: 0,0,1,2
0,John,Arthur,Doe
1,Jane,Ann,Smith


In [79]:
df[['first', 'middle', 'last']] = df['name'].str.split(' ', expand=True)
df

Unnamed: 0,name,location,first,middle,last
0,John Arthur Doe,"Los Angeles, CA",John,Arthur,Doe
1,Jane Ann Smith,"Washington, DC",Jane,Ann,Smith


In [80]:
df['location'].str.split(', ', expand=True)

Unnamed: 0,0,1
0,Los Angeles,CA
1,Washington,DC


In [81]:
df['location'].str.split(', ', expand=True)[0] 

0    Los Angeles
1     Washington
Name: 0, dtype: object

In [82]:
df['location'].str.split(', ', expand=True)[1] 

0    CA
1    DC
Name: 1, dtype: object

In [86]:
df['city'] = df['location'].str.split(', ', expand=True)[0] 
df['postal_code'] = df['location'].str.split(', ', expand=True)[1] 
df

Unnamed: 0,name,location,first,middle,last,city,postal_code
0,John Arthur Doe,"Los Angeles, CA",John,Arthur,Doe,Los Angeles,CA
1,Jane Ann Smith,"Washington, DC",Jane,Ann,Smith,Washington,DC


In [87]:
new_columns = ['city', 'postal_code']
for i in range(len(new_columns)):
    df[new_columns[i]] = df['location'].str.split(', ', expand=True)[i] 
df

Unnamed: 0,name,location,first,middle,last,city,postal_code
0,John Arthur Doe,"Los Angeles, CA",John,Arthur,Doe,Los Angeles,CA
1,Jane Ann Smith,"Washington, DC",Jane,Ann,Smith,Washington,DC


## 17. Espandindo uma lista de Series dentro de um DataFrame

In [88]:
df = pd.DataFrame({'col_one':['a', 'b', 'c'], 'col_two':[[10, 40], [20, 50], [30, 60]] })
df


Unnamed: 0,col_one,col_two
0,a,"[10, 40]"
1,b,"[20, 50]"
2,c,"[30, 60]"


In [97]:
df_new = df['col_two'].apply(pd.Series)
df_new.rename({0:'value_one', 1:'value_two'} , inplace=True, axis=1)
df_new



Unnamed: 0,value_one,value_two
0,10,40
1,20,50
2,30,60


In [100]:
pd.concat([df, df_new], axis='columns')

Unnamed: 0,col_one,col_two,value_one,value_two
0,a,"[10, 40]",10,40
1,b,"[20, 50]",20,50
2,c,"[30, 60]",30,60


## 18. Agregando (agregate) por multiplas funções

In [101]:
orders.head(10)

Unnamed: 0,order_id,quantity,item_name,choice_description,item_price
0,1,1,Chips and Fresh Tomato Salsa,,2.39
1,1,1,Izze,[Clementine],3.39
2,1,1,Nantucket Nectar,[Apple],3.39
3,1,1,Chips and Tomatillo-Green Chili Salsa,,2.39
4,2,2,Chicken Bowl,"[Tomatillo-Red Chili Salsa (Hot), [Black Beans...",16.98
5,3,1,Chicken Bowl,"[Fresh Tomato Salsa (Mild), [Rice, Cheese, Sou...",10.98
6,3,1,Side of Chips,,1.69
7,4,1,Steak Burrito,"[Tomatillo Red Chili Salsa, [Fajita Vegetables...",11.75
8,4,1,Steak Soft Tacos,"[Tomatillo Green Chili Salsa, [Pinto Beans, Ch...",9.25
9,5,1,Steak Burrito,"[Fresh Tomato Salsa, [Rice, Black Beans, Pinto...",9.25


In [105]:
orders[orders['order_id'] == 1]

Unnamed: 0,order_id,quantity,item_name,choice_description,item_price
0,1,1,Chips and Fresh Tomato Salsa,,2.39
1,1,1,Izze,[Clementine],3.39
2,1,1,Nantucket Nectar,[Apple],3.39
3,1,1,Chips and Tomatillo-Green Chili Salsa,,2.39


In [115]:
orders[orders['order_id'] == 1]['item_price'].sum()

11.56

In [114]:
orders.groupby('order_id')['item_price'].sum().head()

order_id
1    11.56
2    16.98
3    12.67
4    21.00
5    13.70
Name: item_price, dtype: float64

In [116]:
orders.groupby('order_id')['item_price'].agg(['sum', 'count']).head()

Unnamed: 0_level_0,sum,count
order_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,11.56,4
2,16.98,1
3,12.67,2
4,21.0,2
5,13.7,2


## 19. Combinando a saída com a agregação do DataFrame

In [117]:
orders.head(3)

Unnamed: 0,order_id,quantity,item_name,choice_description,item_price
0,1,1,Chips and Fresh Tomato Salsa,,2.39
1,1,1,Izze,[Clementine],3.39
2,1,1,Nantucket Nectar,[Apple],3.39


In [122]:
orders.groupby('order_id')['item_price'].sum().head()



order_id
1    11.56
2    16.98
3    12.67
4    21.00
5    13.70
Name: item_price, dtype: float64

In [124]:
len(orders.groupby('order_id')['item_price'].sum())

1834

In [125]:
len(orders['item_price'])

4622

In [127]:
total_price = orders.groupby('order_id')['item_price'].transform('sum')
total_price

0       11.56
1       11.56
2       11.56
3       11.56
4       16.98
        ...  
4617    23.50
4618    23.50
4619    28.75
4620    28.75
4621    28.75
Name: item_price, Length: 4622, dtype: float64

In [128]:
len(total_price)

4622

In [130]:
orders['total_price'] = total_price
orders.head(10)

Unnamed: 0,order_id,quantity,item_name,choice_description,item_price,total_price
0,1,1,Chips and Fresh Tomato Salsa,,2.39,11.56
1,1,1,Izze,[Clementine],3.39,11.56
2,1,1,Nantucket Nectar,[Apple],3.39,11.56
3,1,1,Chips and Tomatillo-Green Chili Salsa,,2.39,11.56
4,2,2,Chicken Bowl,"[Tomatillo-Red Chili Salsa (Hot), [Black Beans...",16.98,16.98
5,3,1,Chicken Bowl,"[Fresh Tomato Salsa (Mild), [Rice, Cheese, Sou...",10.98,12.67
6,3,1,Side of Chips,,1.69,12.67
7,4,1,Steak Burrito,"[Tomatillo Red Chili Salsa, [Fajita Vegetables...",11.75,21.0
8,4,1,Steak Soft Tacos,"[Tomatillo Green Chili Salsa, [Pinto Beans, Ch...",9.25,21.0
9,5,1,Steak Burrito,"[Fresh Tomato Salsa, [Rice, Black Beans, Pinto...",9.25,13.7


In [132]:
orders['percent_of_total'] = orders['item_price'] / orders['total_price']
orders.head(10)

Unnamed: 0,order_id,quantity,item_name,choice_description,item_price,total_price,percent_of_total
0,1,1,Chips and Fresh Tomato Salsa,,2.39,11.56,0.206747
1,1,1,Izze,[Clementine],3.39,11.56,0.293253
2,1,1,Nantucket Nectar,[Apple],3.39,11.56,0.293253
3,1,1,Chips and Tomatillo-Green Chili Salsa,,2.39,11.56,0.206747
4,2,2,Chicken Bowl,"[Tomatillo-Red Chili Salsa (Hot), [Black Beans...",16.98,16.98,1.0
5,3,1,Chicken Bowl,"[Fresh Tomato Salsa (Mild), [Rice, Cheese, Sou...",10.98,12.67,0.866614
6,3,1,Side of Chips,,1.69,12.67,0.133386
7,4,1,Steak Burrito,"[Tomatillo Red Chili Salsa, [Fajita Vegetables...",11.75,21.0,0.559524
8,4,1,Steak Soft Tacos,"[Tomatillo Green Chili Salsa, [Pinto Beans, Ch...",9.25,21.0,0.440476
9,5,1,Steak Burrito,"[Fresh Tomato Salsa, [Rice, Black Beans, Pinto...",9.25,13.7,0.675182


## 20. Selecionando slice(pedaço/fatiamento) das linhas e colunas

In [134]:
titanic.head(2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C


In [135]:
titanic.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [136]:
titanic.describe().loc['min':'max']

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [137]:
titanic.describe().loc['min':'max', 'Pclass':'Parch']

Unnamed: 0,Pclass,Age,SibSp,Parch
min,1.0,0.42,0.0,0.0
25%,2.0,20.125,0.0,0.0
50%,3.0,28.0,0.0,0.0
75%,3.0,38.0,1.0,0.0
max,3.0,80.0,8.0,6.0


## 21. Remodelar (Reshape) um índice multiplo (MultiIndexed) Series

In [None]:
titanic['Survived'].mean()

In [139]:
titanic.groupby('Sex')['Survived'].mean()

Sex
female    0.742038
male      0.188908
Name: Survived, dtype: float64

In [140]:
titanic.groupby(['Sex', 'Pclass'])['Survived'].mean()

Sex     Pclass
female  1         0.968085
        2         0.921053
        3         0.500000
male    1         0.368852
        2         0.157407
        3         0.135447
Name: Survived, dtype: float64

In [141]:
titanic.groupby(['Sex', 'Pclass'])['Survived'].mean().unstack()

Pclass,1,2,3
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,0.968085,0.921053,0.5
male,0.368852,0.157407,0.135447


## 22. Criando uma tabela dinâmica (pivot table ou análise dinâmica)

## 23. Conversão continua de data dentro dados categoricos (categorical data)

## 24. Mudança do display de opções

## 25. Estilo(Style) do DataFrame