In [1]:
import numpy as np
import pandas as pd

### `df.loc[]` to access a group of rows and columns by label(s) or a boolean array.

In [2]:
df = pd.DataFrame([[1, 2], [4, 5], [7, 8]], # The nested lists are the row values
                  index=['cobra', 'viper', 'sidewinder'],
                  columns=['max_speed', 'shield'])

df

Unnamed: 0,max_speed,shield
cobra,1,2
viper,4,5
sidewinder,7,8


In [6]:
# A single lable: return the row values accordingly. 
# - dtype: Series
# - name: row label
# - index: coloumn header

df.loc['cobra']

max_speed    1
shield       2
Name: cobra, dtype: int64

In [5]:
# A list or array of labels
# returned dtype: dfb

label = ['cobra','viper']
df.loc[label]

Unnamed: 0,max_speed,shield
cobra,1,2
viper,4,5


In [7]:
# A slice object with labels

df.loc['cobra':'sidewinder']

Unnamed: 0,max_speed,shield
cobra,1,2
viper,4,5
sidewinder,7,8


In [14]:
# Single label for row and column

df.loc['cobra','shield']

2

In [13]:
# Slice with labels for row and single label for column

df.loc[['cobra':'sidewinder'], 'max_speed']

SyntaxError: invalid syntax (<ipython-input-13-92481f18b91f>, line 3)

### `df.index[]` to return the index lables of the data frame

### `df.isnull()` to detect missing values

- Return a same-sized boolean DataFrame.
- Return a mask of bool values for each element in DataFrame

In [2]:
df = pd.DataFrame({'age': [5, 6, np.NaN],
                   'born': [pd.NaT, pd.Timestamp('1939-05-27'),
                            pd.Timestamp('1940-04-25')],
                   'name': ['Alfred', 'Batman', ''],
                   'toy': [None, 'Batmobile', 'Joker']})
df

Unnamed: 0,age,born,name,toy
0,5.0,NaT,Alfred,
1,6.0,1939-05-27,Batman,Batmobile
2,,1940-04-25,,Joker


In [3]:
df.isnull()

Unnamed: 0,age,born,name,toy
0,False,True,False,True
1,False,False,False,False
2,True,False,False,False


### `df.sum()` to return the sum of the values for the requested axis
Parameters:
1. axis: axis for the function to be applied on. 
2. skipna: exclude NA/null values when computing the result.
3. level: 
4. numeric_only
5. min_count
6. **kwargs

In [3]:
# By default, the sum of empty or all-NA series is 0

pd.Series([]).sum()

  This is separate from the ipykernel package so we can avoid doing imports until


0.0

In [10]:
# Construct multiindex series

idx = pd.MultiIndex.from_arrays([
    ['warm', 'warm', 'cold', 'cold'],
    ['dog', 'falcon', 'fish', 'spider']],
    names=['blooded', 'animal'])

s = pd.Series([4, 2, 0, 8], name='legs', index=idx)

print(s)
type(s)

blooded  animal
warm     dog       4
         falcon    2
cold     fish      0
         spider    8
Name: legs, dtype: int64


pandas.core.series.Series

In [17]:
# Sum using level names

s.sum(level='blooded')

blooded
warm    6
cold    8
Name: legs, dtype: int64

In [18]:
# Sum using indices
s.sum(level=0)

blooded
warm    6
cold    8
Name: legs, dtype: int64

In [19]:
# Sum using indices
s.sum(level=1)

animal
dog       4
falcon    2
fish      0
spider    8
Name: legs, dtype: int64

### `df.drop_duplicates()` to return DataFrame with duplicate rows removed / return unique rows a DataFrame has
Parameters:
1. subset: column label or sequence of labels used for identifying duplicates
    - only consider certain columns for identifying duplicates
2. keep: {'first', 'last', False}, default 'first'
    - determine which duplicates (if any) to keep.
3. inplace: bool, default False
    - whether to drop duplicates in place or to return a copy
4. ignore_index: bool, default False
    - if True, the resulting axis will be re-labeled 0,1,...,n-1

In [25]:
df = pd.DataFrame({
    'brand': ['Yum Yum', 'Yum Yum', 'Indomie', 'Indomie', 'Indomie'],
    'style': ['cup', 'cup', 'cup', 'pack', 'pack'],
    'rating': [4, 4, 3.5, 15, 5]
    })
df

Unnamed: 0,brand,style,rating
0,Yum Yum,cup,4.0
1,Yum Yum,cup,4.0
2,Indomie,cup,3.5
3,Indomie,pack,15.0
4,Indomie,pack,5.0


In [26]:
# By default, it removed duplicates rows based on all columns.
df.drop_duplicates()

Unnamed: 0,brand,style,rating
0,Yum Yum,cup,4.0
2,Indomie,cup,3.5
3,Indomie,pack,15.0
4,Indomie,pack,5.0


In [27]:
# To remove duplicates on specific column(s), use subset
df.drop_duplicates(subset=['brand'])

Unnamed: 0,brand,style,rating
0,Yum Yum,cup,4.0
2,Indomie,cup,3.5


In [28]:
# To remove duplicates and keep first occurences, use keep
df.drop_duplicates(subset = ['brand','style'], keep='first')

Unnamed: 0,brand,style,rating
0,Yum Yum,cup,4.0
2,Indomie,cup,3.5
3,Indomie,pack,15.0


### `df.count()` to count non-NA cells for each column or row. 
Parameters:
1. axis: {0 or 'index', 1 or 'columns'}, default 0
2. level: int or str, optional
3. numeric_only: bool, default False

In [22]:
>>> df = pd.DataFrame({"Person":
                       ["John", "Myla", "Lewis", "John", "Myla"],
                       "Age": [24., np.nan, 21., 33, 26],
                       "Single":[False, True, True, True, False]})
df

Unnamed: 0,Person,Age,Single
0,John,24.0,False
1,Myla,,True
2,Lewis,21.0,True
3,John,33.0,True
4,Myla,26.0,False


In [23]:
# NA values won't get counted

df.count()

Person    5
Age       4
Single    5
dtype: int64

In [24]:
# Counts for each row
df.count(axis=1)

0    3
1    2
2    3
3    3
4    3
dtype: int64

### `df.duplicated()` to return boolean Series denoting duplicate rows
Parameters:
1. subset:
2. keep

In [4]:
df = pd.DataFrame({
    'brand': ['Yum Yum', 'Yum Yum', 'Indomie', 'Indomie', 'Indomie'],
    'style': ['cup', 'cup', 'cup', 'pack', 'pack'],
    'rating': [4, 4, 3.5, 15, 5]
})
df

Unnamed: 0,brand,style,rating
0,Yum Yum,cup,4.0
1,Yum Yum,cup,4.0
2,Indomie,cup,3.5
3,Indomie,pack,15.0
4,Indomie,pack,5.0


In [5]:
# By default, for each set of duplicated values, 
# the first occurrence is set on False and all others on True

df.duplicated()

0    False
1     True
2    False
3    False
4    False
dtype: bool