# Advanced Pandas
More advanced: https://towardsdatascience.com/learn-advanced-features-for-pythons-main-data-analysis-library-in-20-minutes-d0eedd90d086

In [1]:
import pandas as pd

In [2]:
df = pd.DataFrame({ 'genus_overall': ['avian', 'canine', 'cephalothorax', 'pisces'],
                    'rating_overall': [1.2, 3.4, 5.2, 7.8 ],
                    'num_legs_1178': [2, 4, 8, 0],
                    'num_wings': [2, 0, 0, 0],
                    'num_specimen': [10, 2, 1, 8],
                   
                  },
                  index=['falcon', 'dog', 'spider', 'fish'])
df

Unnamed: 0,genus_overall,rating_overall,num_legs_1178,num_wings,num_specimen
falcon,avian,1.2,2,2,10
dog,canine,3.4,4,0,2
spider,cephalothorax,5.2,8,0,1
fish,pisces,7.8,0,0,8


## Select columns

In [4]:
df.dtypes.value_counts()
# col by type
df.select_dtypes(include=['int'])
# col by name - has "num" in name
df.filter(like='num')
# has digit(s) in col name
df.filter(regex='\d')

Unnamed: 0,num_legs_1178
falcon,2
dog,4
spider,8
fish,0


Equ. to: `df[ df['num_legs_1178'] > 2]`:

In [14]:
# same things
df[df['num_legs_1178'] > 2]
df.query('num_legs_1178 > 2')

Unnamed: 0,genus_overall,rating_overall,num_legs_1178,num_wings,num_specimen,rank
dog,canine,3.4,4,0,2,2
spider,cephalothorax,5.2,8,0,1,1


## Rank
__Rank col values in descend or ascend order__  
Method - How rank group of records  
that have the same value (ties):
* average: average rank of group
* min: lowest rank in group
* max: highest rank in group
* first: in order ranks appear
* dense: = ‘min’, but rank increases by 1

Axending=False => max value has rank 1

In [13]:
df['rank'] = df['num_legs_1178'].rank(
    method='first',
    ascending=False).astype("int")
df[['num_legs_1178', 'rank']]

Unnamed: 0,num_legs_1178,rank
falcon,2,3
dog,4,2
spider,8,1
fish,0,4


## Select rows and cols
__Rows & cols = scalars,lists,slice obj,bools__  
__Simult. rows & cols - rows left, cols right of ","__

In [31]:
# BY POSITION (2ND ROW)
df.iloc[1]
positions = [0, 2, 3]
df.iloc[positions]
# SLICING - EXCLUSIVE (as lists)
df.iloc[1:4, 1:4]

# BY INDEX
df.loc['dog']
idxs=['falcon','spider','fish']
df.loc[idxs]
# SLICING - INCLUSIVE
df.loc['dog':'fish','height':'weight']

# ALL ROWS+COLS 1&4
df.iloc[:,[1,4]]
rows = [0, 2, 3]
cols = [2, 3, 4]
df.iloc[rows, cols]

# SAME W/LOC
df.loc[:,['rating','spec']]
rows = ['falcon', 'dog',]
cols = ['legs', 'wings', 'spec']
df.loc[rows, cols]

genus_overall     canine
rating_overall       3.4
num_legs_1178          4
num_wings              0
num_specimen           2
Name: dog, dtype: object

## Select w/"get_loc" and "index"

In [6]:
start = df.columns.get_loc('num_legs_1178')
end   = df.columns.get_loc('num_specimen')
df.iloc[:4, start:end+1]

start = df.index.get_loc('dog')
end   = df.index.get_loc('fish')
df.iloc[start:end+1, 2:5]

Unnamed: 0,num_legs_1178,num_wings,num_specimen
falcon,2,2,10
dog,4,0,2
spider,8,0,1
fish,0,0,8


In [8]:
start = df.index[2]
end   = df.index[3]
df.loc[start:end, ['legs', 'wings']]

Unnamed: 0,num_legs_1178,num_wings
spider,8,0
fish,0,0


## Set value for a cell

In [None]:
df.at['C', 'a'] = 10 # BY VALUE
df.iat[5,10] = 2     # BY INDEX

## Other

In [9]:
# cross-tabulation of 2+ factors
pd.crosstab(
    df["genus_overall"], 
    df["rating_overall"],
    margins=True, 
    normalize=0,)

# SQLite
import sqlite3
query = 'SELECT * FROM dune_table'
conn  = sqlite3.connect('dune.db')
dune_df = pd.read_sql(query, conn)
dune_df.to_sql('dune_table', conn,
               index=False)

# reshape w/new cols & values + aggr.
pivot_df = pd.pivot_table(
    df, index='Name',
    columns=['Pets', 'Stores'],
    values='Weight'  # col for values
    aggfunc='mean')

rating_overall,1.2,3.4,5.2,7.8
genus_overall,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
avian,1.0,0.0,0.0,0.0
canine,0.0,1.0,0.0,0.0
cephalothorax,0.0,0.0,1.0,0.0
pisces,0.0,0.0,0.0,1.0
All,0.25,0.25,0.25,0.25


### 1a. Aggregations on entire dataframe
__WINDOW FUNCTIONS__ - find trends in data graphically by smoothing the curve
* __df.rolling()__ - rolling window calculations; __window__=window size,  
__min_periods__=min num observations in window required to have a value.
* __df.expanding()__ - same as rolling, but uses all the data up to that point in time.  
Equivalent statements: [df.rolling(window=len(df), min_periods=1).mean()] =  
[df.expanding(min_periods=1).mean()]
* __df.ewm()__ - exponentially weighted window similar to expanding window,  
but each prior point is exponentially weighted down relative to the current point

In [None]:
df['column'].rolling(2).sum()
df['column'].rolling(2, min_periods=1).sum()
df['column'].rolling(2).mean()

# Add these features

In [None]:
# Melt - unpivot df from wide to long format
melted_df = pd.melt(df, id_vars=['Name', 'Gender'], value_vars=['Pets', 'Weight'])

In [None]:
df.to_json(file_name)
df = pd.read_json(file_name)

In [None]:
df = pd.read_html()

In [None]:
df = pd.read_clipboard()