In [85]:

import pandas as pd

#create DataFrame in long format
df = pd.DataFrame({'team': ['A', 'A', 'A', 'A', 'B', 'B', 'B', 'B'],
                   'player': [1, 2, 3, 4, 1, 2, 3, 4],
                   'points': [11, 8, 10, 6, 12, 5, 9, 4]})

#view DataFrame
df


Unnamed: 0,team,player,points
0,A,1,11
1,A,2,8
2,A,3,10
3,A,4,6
4,B,1,12
5,B,2,5
6,B,3,9
7,B,4,4


In [86]:

#reshape DataFrame from long format to wide format
df_pvt = pd.pivot(df, index='team', columns='player', values='points')

#view updated DataFrame
df_pvt


player,1,2,3,4
team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
A,11,8,10,6
B,12,5,9,4


In [107]:

# method chaining
df_pvt = df.unstack().unstack()
df_pvt
                    

Unnamed: 0,0,1,2,3,4,5,6,7
team,A,A,A,A,B,B,B,B
player,1,2,3,4,1,2,3,4
points,11,8,10,6,12,5,9,4


In [87]:

# now we have a multi-index pivot object
# this can be a bit challenging to work with, so let's handle this issue
df_pvt.columns = df_pvt.columns.get_level_values(0)

print(df_pvt.columns)

df_pvt


Int64Index([1, 2, 3, 4], dtype='int64', name='player')


player,1,2,3,4
team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
A,11,8,10,6
B,12,5,9,4


In [88]:

# here is a slightly different way to do the same thing 
df_pvt.columns.to_flat_index()

df_pvt


player,1,2,3,4
team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
A,11,8,10,6
B,12,5,9,4


In [94]:

df_final = df_pvt.reset_index()
df_final


player,team,1,2,3,4
0,A,11,8,10,6
1,B,12,5,9,4


In [None]:

# You can find many more great ideas for flattening multi-index dataframes from the resuorce below.

# https://towardsdatascience.com/how-to-flatten-multiindex-columns-and-rows-in-pandas-f5406c50e569


In [61]:

# let's test some ideas for reshaping or transforming, dataframes
import seaborn as sns
flights = sns.load_dataset('flights')
print(flights)

# we can easily perform multiple types of aggregations in a single line of code
df_summary = flights.groupby('year').agg({'passengers': ['count', 'sum', 'mean', 'min', 'max']}).reset_index()
df_summary


     year month  passengers
0    1949   Jan         112
1    1949   Feb         118
2    1949   Mar         132
3    1949   Apr         129
4    1949   May         121
..    ...   ...         ...
139  1960   Aug         606
140  1960   Sep         508
141  1960   Oct         461
142  1960   Nov         390
143  1960   Dec         432

[144 rows x 3 columns]


Unnamed: 0_level_0,year,passengers,passengers,passengers,passengers,passengers
Unnamed: 0_level_1,Unnamed: 1_level_1,count,sum,mean,min,max
0,1949,12,1520,126.666667,104,148
1,1950,12,1676,139.666667,114,170
2,1951,12,2042,170.166667,145,199
3,1952,12,2364,197.0,171,242
4,1953,12,2700,225.0,180,272
5,1954,12,2867,238.916667,188,302
6,1955,12,3408,284.0,233,364
7,1956,12,3939,328.25,271,413
8,1957,12,4421,368.416667,301,467
9,1958,12,4572,381.0,310,505


In [80]:

df_stacked = df_summary.stack().reset_index(level = 0)
df_stacked.columns=['the_month','passengers_agg','the_year']

# 'ffill' is forward-fill from above down and 'bfill' is back-fill from below up
df_stacked = df_stacked.fillna(method='ffill', axis=0)
df_stacked


Unnamed: 0,the_month,passengers_agg,the_year
,0,,1949.0
count,0,12.000000,1949.0
max,0,148.000000,1949.0
mean,0,126.666667,1949.0
min,0,104.000000,1949.0
...,...,...,...
count,11,12.000000,1960.0
max,11,622.000000,1960.0
mean,11,476.166667,1960.0
min,11,390.000000,1960.0


In [82]:

df_stacked = pd.melt(flights)
df_stacked


Unnamed: 0,variable,value
0,year,1949
1,year,1949
2,year,1949
3,year,1949
4,year,1949
...,...,...
427,passengers,606
428,passengers,508
429,passengers,461
430,passengers,390


In [84]:

df_stacked = flights.melt(id_vars=['month','passengers'])
df_stacked


Unnamed: 0,month,passengers,variable,value
0,Jan,112,year,1949
1,Feb,118,year,1949
2,Mar,132,year,1949
3,Apr,129,year,1949
4,May,121,year,1949
...,...,...,...,...
139,Aug,606,year,1960
140,Sep,508,year,1960
141,Oct,461,year,1960
142,Nov,390,year,1960


In [None]:

# So, unstacking and pivoting are similar. This is also known as long to wide. 

# So, melting and stacking and unpivoting are all similar. This is also known as wide to long. 

# See the resuorce below for more insight into these transformation options.

# https://www.roelpeters.be/four-ways-to-cast-a-pandas-dataframe-from-long-to-wide-format/
# https://towardsdatascience.com/reshaping-a-dataframe-with-pandas-stack-and-unstack-925dc9ce1289
# https://towardsdatascience.com/wide-to-long-data-how-and-when-to-use-pandas-melt-stack-and-wide-to-long-7c1e0f462a98


In [111]:

import pandas as pd
import numpy as np

df = pd.DataFrame({'animals': [['koala', 'kangaroo', 'echidna'], 
                               ['sloth', 'alpaca'], 
                               ['zebra', 'lion', 'baboon']],
                   'diet': [['herbivorous', 'herbivorous', 'carnivorous'], 
                            ['omnivorous', 'herbivorous'], 
                            ['herbivorous', 'carnivorous', 'omnivorous']],
                   'country': ['Australia', 'Peru', 'Kenya']})
df


Unnamed: 0,animals,diet,country
0,"[koala, kangaroo, echidna]","[herbivorous, herbivorous, carnivorous]",Australia
1,"[sloth, alpaca]","[omnivorous, herbivorous]",Peru
2,"[zebra, lion, baboon]","[herbivorous, carnivorous, omnivorous]",Kenya


In [110]:

# We can easily 'explode' multiple items that are nested in a column of a dataframe, and thereby normalize the data set.
df_exp = df.explode('animals').reset_index(drop=True)
df_exp = df_exp.explode('diet').reset_index(drop=True)
df_exp


Unnamed: 0,animals,diet,country
0,koala,herbivorous,Australia
1,koala,herbivorous,Australia
2,koala,carnivorous,Australia
3,kangaroo,herbivorous,Australia
4,kangaroo,herbivorous,Australia
5,kangaroo,carnivorous,Australia
6,echidna,herbivorous,Australia
7,echidna,herbivorous,Australia
8,echidna,carnivorous,Australia
9,sloth,omnivorous,Peru
