# Advanced Pandas
More advanced: https://towardsdatascience.com/learn-advanced-features-for-pythons-main-data-analysis-library-in-20-minutes-d0eedd90d086

In [2]:
import pandas as pd

In [3]:
# SAMPLE FRAME: ONE ROW PER ANIMAL (ROW LABELS = ANIMAL NAMES), ONE COL PER TRAIT
df = pd.DataFrame(
    data=dict(
        genus_overall=['avian', 'canine', 'cephalothorax', 'pisces'],
        rating_overall=[1.2, 3.4, 5.2, 7.8],
        num_legs_1178=[2, 4, 8, 0],
        num_wings=[2, 0, 0, 0],
        num_specimen=[10, 2, 1, 8],
    ),
    index=['falcon', 'dog', 'spider', 'fish'],
)
df

Unnamed: 0,genus_overall,rating_overall,num_legs_1178,num_wings,num_specimen
falcon,avian,1.2,2,2,10
dog,canine,3.4,4,0,2
spider,cephalothorax,5.2,8,0,1
fish,pisces,7.8,0,0,8


In [25]:
# COUNT HOW MANY COLS THERE ARE OF EACH DTYPE
df.dtypes.value_counts()

int64      3
object     1
float64    1
dtype: int64

In [26]:
# KEEP ONLY THE INTEGER-TYPED COLS
df.select_dtypes(include='int')

Unnamed: 0,num_legs_1178,num_wings,num_specimen
falcon,2,2,10
dog,4,0,2
spider,8,0,1
fish,0,0,8


In [27]:
# KEEP COLS WHOSE NAME CONTAINS THE SUBSTRING 'num'
df.filter(like='num', axis='columns')

Unnamed: 0,num_legs_1178,num_wings,num_specimen
falcon,2,2,10
dog,4,0,2
spider,8,0,1
fish,0,0,8


In [28]:
# KEEP COLS WHOSE NAME CONTAINS THE SUBSTRING 'overall'
df.filter(like='overall', axis='columns')

Unnamed: 0,genus_overall,rating_overall
falcon,avian,1.2
dog,canine,3.4
spider,cephalothorax,5.2
fish,pisces,7.8


In [29]:
# SELECT COLS WHOSE NAME MATCHES A REGEX (HERE: THE NAME CONTAINS A DIGIT)
# RAW STRING SO THAT \d REACHES THE REGEX ENGINE WITHOUT AN INVALID-ESCAPE WARNING
df.filter(regex=r'\d')

Unnamed: 0,num_legs_1178
falcon,2
dog,4
spider,8
fish,0


`df.query` filters rows using a string expression. The cell below is equivalent to `df[df['num_legs_1178'] > 2]`:

In [66]:
# KEEP THE ROWS FOR WHICH THE STRING EXPRESSION IS TRUE
df.query(expr='num_legs_1178 > 2')

Unnamed: 0,genus_overall,rating_overall,num_legs_1178,num_wings,num_specimen
dog,canine,3.4,4,0,2
spider,cephalothorax,5.2,8,0,1


In [68]:
# RANK ROWS BY LEG COUNT: 1 = MOST LEGS; method="first" BREAKS TIES BY ROW ORDER
# assign() RETURNS A NEW FRAME, SO df KEEPS ITS ORIGINAL COLS AND RE-RUNNING THIS CELL IS HARMLESS
leg_rank = df["num_legs_1178"].rank(method="first", ascending=False).astype("int")
df.assign(rank=leg_rank)

Unnamed: 0,genus_overall,rating_overall,num_legs_1178,num_wings,num_specimen,rank
falcon,avian,1.2,2,2,10,3
dog,canine,3.4,4,0,2,2
spider,cephalothorax,5.2,8,0,1,1
fish,pisces,7.8,0,0,8,4


# Select rows and columns

### Row and column selectors may be scalar values, lists, slice objects or boolean arrays
### Simultaneous selection of rows and cols - rows to left of comma, cols to right of comma

In [31]:
# ROW AT INTEGER POSITION 1 (I.E. THE SECOND ROW), ALL COLS -> RETURNED AS A SERIES
df.iloc[1, :]

genus_overall     canine
rating_overall       3.4
num_legs_1178          4
num_wings              0
num_specimen           2
Name: dog, dtype: object

In [33]:
# ROW WHOSE INDEX LABEL IS 'dog', ALL COLS -> RETURNED AS A SERIES
df.loc['dog', :]

genus_overall     canine
rating_overall       3.4
num_legs_1178          4
num_wings              0
num_specimen           2
Name: dog, dtype: object

In [44]:
# EVERY ROW, COLS AT INTEGER POSITIONS 1 AND 4
col_positions = [1, 4]
df.iloc[:, col_positions]

Unnamed: 0,rating_overall,num_specimen
falcon,1.2,10
dog,3.4,2
spider,5.2,1
fish,7.8,8


In [46]:
# EVERY ROW, COLS PICKED BY LABEL WITH loc
col_labels = ['rating_overall', 'num_specimen']
df.loc[:, col_labels]

Unnamed: 0,rating_overall,num_specimen
falcon,1.2,10
dog,3.4,2
spider,5.2,1
fish,7.8,8


In [4]:
# DISPLAY THE WHOLE FRAME
df

Unnamed: 0,genus_overall,rating_overall,num_legs_1178,num_wings,num_specimen
falcon,avian,1.2,2,2,10
dog,canine,3.4,4,0,2
spider,cephalothorax,5.2,8,0,1
fish,pisces,7.8,0,0,8


In [49]:
# POSITION SLICES: THE STOP IS EXCLUDED -> ROW POSITIONS 1..3 AND COL POSITIONS 1..3
df.iloc[slice(1, 4), slice(1, 4)]

Unnamed: 0,rating_overall,num_legs_1178,num_wings
dog,3.4,4,0
spider,5.2,8,0
fish,7.8,0,0


In [55]:
# NEGATIVE STEP: COL POSITIONS 3, 2, 1 IN REVERSE ORDER (THE STOP, 0, IS EXCLUDED)
df.iloc[slice(1, 4), slice(3, 0, -1)]

Unnamed: 0,num_wings,num_legs_1178,rating_overall
dog,0,4,3.4
spider,0,8,5.2
fish,0,0,7.8


In [47]:
# LABEL SLICES: BOTH END LABELS ARE INCLUDED
df.loc[slice('dog', 'fish'), slice('rating_overall', 'num_wings')]

Unnamed: 0,rating_overall,num_legs_1178,num_wings
dog,3.4,4,0
spider,5.2,8,0
fish,7.8,0,0


In [34]:
# PICK ROWS BY A LIST OF INTEGER POSITIONS (ALL COLS)
positions = [0, 2, 3]
df.iloc[positions, :]

Unnamed: 0,genus_overall,rating_overall,num_legs_1178,num_wings,num_specimen
falcon,avian,1.2,2,2,10
spider,cephalothorax,5.2,8,0,1
fish,pisces,7.8,0,0,8


In [35]:
# PICK ROWS BY A LIST OF INDEX LABELS (ALL COLS)
idxs = ['falcon', 'spider', 'fish']
df.loc[idxs, :]

Unnamed: 0,genus_overall,rating_overall,num_legs_1178,num_wings,num_specimen
falcon,avian,1.2,2,2,10
spider,cephalothorax,5.2,8,0,1
fish,pisces,7.8,0,0,8


In [37]:
# ROW POSITIONS 1..3 - THE STOP IS EXCLUDED, JUST LIKE LIST SLICING
df.iloc[1:4, :]

Unnamed: 0,genus_overall,rating_overall,num_legs_1178,num_wings,num_specimen
dog,canine,3.4,4,0,2
spider,cephalothorax,5.2,8,0,1
fish,pisces,7.8,0,0,8


In [43]:
# ROWS FROM LABEL 'dog' THROUGH LABEL 'fish' - THE END LABEL IS INCLUDED
df.loc['dog':'fish', :]

Unnamed: 0,genus_overall,rating_overall,num_legs_1178,num_wings,num_specimen
dog,canine,3.4,4,0,2
spider,cephalothorax,5.2,8,0,1
fish,pisces,7.8,0,0,8


In [50]:
# ROWS AND COLS BOTH GIVEN AS LISTS OF INTEGER POSITIONS
rows, cols = [0, 2, 3], [2, 3, 4]
df.iloc[rows, cols]

Unnamed: 0,num_legs_1178,num_wings,num_specimen
falcon,2,2,10
spider,8,0,1
fish,0,0,8


In [5]:
# ROWS AND COLS BOTH GIVEN AS LISTS OF LABELS
rows, cols = ['falcon', 'dog'], ['num_legs_1178', 'num_wings', 'num_specimen']
df.loc[rows, cols]

Unnamed: 0,num_legs_1178,num_wings,num_specimen
falcon,2,2,10
dog,4,0,2


# Select rows and columns using the "get_loc" method and the "index" attribute

In [6]:
# TRANSLATE COL LABELS INTO INTEGER POSITIONS, THEN SLICE BY POSITION
# (+1 BECAUSE THE iloc STOP IS EXCLUDED)
start, end = (df.columns.get_loc(label) for label in ('num_legs_1178', 'num_specimen'))
df.iloc[:4, start:end+1]

Unnamed: 0,num_legs_1178,num_wings,num_specimen
falcon,2,2,10
dog,4,0,2
spider,8,0,1
fish,0,0,8


In [7]:
# TRANSLATE ROW LABELS INTO INTEGER POSITIONS, THEN SLICE BY POSITION
# (+1 BECAUSE THE iloc STOP IS EXCLUDED)
start, end = (df.index.get_loc(label) for label in ('dog', 'fish'))
df.iloc[start:end+1, 2:5]

Unnamed: 0,num_legs_1178,num_wings,num_specimen
dog,4,0,2
spider,8,0,1
fish,0,0,8


In [8]:
# TRANSLATE ROW POSITIONS INTO LABELS, THEN SLICE BY LABEL (THE END LABEL IS INCLUDED)
start, end = df.index[2], df.index[3]
df.loc[start:end, ['num_legs_1178', 'num_wings']]

Unnamed: 0,num_legs_1178,num_wings
spider,8,0
fish,0,0


# Set value for particular cell

In [None]:
# https://stackoverflow.com/questions/13842088/set-value-for-particular-cell-in-pandas-dataframe-using-index
# WORK ON A COPY SO THAT df ITSELF (AND EVERY OUTPUT SHOWN FOR IT) STAYS UNCHANGED
cells_df = df.copy()

cells_df.at['dog', 'num_wings'] = 10          # BY LABEL:    (ROW LABEL, COL LABEL)
cells_df.iat[1, 3] = 2                        # BY POSITION: (ROW 1, COL 3) -> THE SAME CELL AS ABOVE

# DO NOT USE - KEPT AS COMMENTS BECAUSE THEY EITHER FAIL OR DO NOT DO WHAT THEY SEEM TO:
# df.set_value('dog', 'num_wings', 10)        # DEPRECATED, REMOVED IN PANDAS 1.0 -> AttributeError
# df.ix['dog', 'num_wings'] = 10              # DEPRECATED, REMOVED IN PANDAS 1.0 -> AttributeError
# df.xs('dog')['num_wings'] = 10              # MODIFIES NEW OBJECT RETURNED BY xs(), NOT THE EXISTING DF
# df['num_wings']['dog'] = 10                 # AVOID CHAINED INDEXING NOT TO OPERATE ON COPIES/VIEWS (UNPREDICTABLE)

cells_df.loc['dog']

In [9]:
# FREQUENCY TABLE OF genus (ROWS) x rating (COLS), NORMALISED PER ROW; margins=True ADDS THE "All" ROW
# GETS MORE INTERESTING ONCE THE DATA IS MORE DIVERSE
pd.crosstab(
    index=df["genus_overall"],
    columns=df["rating_overall"],
    margins=True,
    normalize="index",
)

rating_overall,1.2,3.4,5.2,7.8
genus_overall,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
avian,1.0,0.0,0.0,0.0
canine,0.0,1.0,0.0,0.0
cephalothorax,0.0,0.0,1.0,0.0
pisces,0.0,0.0,0.0,1.0
All,0.25,0.25,0.25,0.25


In [5]:
# MEAN RATING PER GENUS
df.groupby("genus_overall")["rating_overall"].agg("mean")

genus_overall
avian            1.2
canine           3.4
cephalothorax    5.2
pisces           7.8
Name: rating_overall, dtype: float64

In [6]:
# STD OF RATING PER GENUS - NaN HERE BECAUSE EVERY GENUS HAS A SINGLE ROW
df.groupby("genus_overall")["rating_overall"].agg("std")

genus_overall
avian           NaN
canine          NaN
cephalothorax   NaN
pisces          NaN
Name: rating_overall, dtype: float64

# Add these features (TODO: the cells below are unexecuted sketches that use placeholder names)

In [None]:
# JOIN df1 AND df2 ON THE ROWS WHERE BOTH 'Name' AND 'Job' MATCH
# NOTE(review): df1 / df2 ARE NOT DEFINED IN THE VISIBLE CELLS - PRESUMABLY PLACEHOLDER FRAMES
merged_df = df1.merge(df2, on=['Name', 'Job'])

In [None]:
# MEAN OF 'Pets' AND 'Weight' FOR EACH 'Name'
pivot_df = df.pivot_table(index='Name', values=['Pets', 'Weight'], aggfunc='mean')

In [None]:
# WIDE -> LONG: KEEP 'Name' AND 'Gender' AS IDENTIFIERS, UNPIVOT 'Pets' AND 'Weight' INTO variable/value ROWS
melted_df = df.melt(id_vars=['Name', 'Gender'], value_vars=['Pets', 'Weight'])

In [None]:
# ROLLING WINDOW OF 2 ROWS OVER ONE COL
df['col_name'].rolling(window=2).sum()                  # SUM PER WINDOW
df['col_name'].rolling(window=2, min_periods=1).sum()   # SAME, BUT ONE OBSERVATION IN THE WINDOW IS ENOUGH FOR A RESULT
df['col_name'].rolling(window=2).mean()                 # MEAN PER WINDOW

In [None]:
# SHIFT THE VALUES BY 2 ROWS, THEN SUM
# fill_value IS OPTIONAL: IT REPLACES THE MISSING VALUES INTRODUCED BY THE SHIFT
df['col_name'].shift(periods=2, fill_value=0).sum()

In [None]:
# ROUND TRIP THROUGH JSON: WRITE THE FRAME, THEN READ IT BACK
# NOTE(review): file_name IS NOT DEFINED IN THE VISIBLE CELLS - SET IT BEFORE RUNNING
df.to_json(file_name)
df = pd.read_json(file_name)  # REBINDS df TO THE FRAME THAT WAS READ BACK

In [None]:
# read_html NEEDS A SOURCE (URL, PATH OR FILE-LIKE OBJECT) AND RETURNS A LIST OF DATAFRAMES
tables = pd.read_html(html_source)            # TODO: DEFINE html_source - IT IS A PLACEHOLDER
df = tables[0]                                # PICK THE TABLE YOU NEED; REBINDS df

In [None]:
# BUILD A FRAME FROM THE CURRENT CLIPBOARD CONTENT
df = pd.read_clipboard()  # REBINDS df

In [None]:
# SQLite
import sqlite3
from contextlib import closing

query = 'SELECT * FROM dune_table'
# closing() RELEASES THE CONNECTION EVEN IF ONE OF THE STATEMENTS BELOW RAISES
with closing(sqlite3.connect('dune.db')) as conn:
    dune_df = pd.read_sql(query, conn)
    # dune_table ALREADY EXISTS (IT WAS JUST READ), SO THE DEFAULT if_exists='fail' WOULD RAISE ValueError;
    # 'replace' DROPS AND REWRITES THE TABLE, WHEREAS 'append' WOULD DUPLICATE EVERY ROW
    dune_df.to_sql('dune_table', conn, index=False, if_exists='replace')