# Working with a Single Pandas DataFrame

In [1]:
import pandas as pd
import numpy as np

### DataFrame Initialization

In [20]:
# Build a frame from a mapping of column label -> list of values.
# No index is given, so the rows are labelled 0, 1, 2.
df = pd.DataFrame(dict(col_0=[0, 10, 20], col_1=[1, 11, 21]))
print(df)

   col_0  col_1
0      0      1
1     10     11
2     20     21


In [22]:
# Same column data, now with explicit row labels passed through `index`.
df = pd.DataFrame(
    dict(col_0=[0, 10, 20], col_1=[1, 11, 21]),
    index=['row_0', 'row_1', 'row_2'],
)
print(df)

       col_0  col_1
row_0      0      1
row_1     10     11
row_2     20     21


In [23]:
# Nested mapping: outer keys are column labels, inner keys are row labels.
df = pd.DataFrame(dict(
    col_0=dict(row_0=0, row_1=10, row_2=20),
    col_1=dict(row_0=1, row_1=11, row_2=21),
))
print(df)

       col_0  col_1
row_0      0      1
row_1     10     11
row_2     20     21


### DataFrame Adding/Deleting a Column

In [24]:
# Add a column
df = pd.DataFrame(
    [[0, 1], [10, 11], [20, 21]],
    index=['row_0', 'row_1', 'row_2'],
    columns=['col_0', 'col_1'],
)
print(df)
# Assigning a list to a label that does not exist yet creates that column.
df['col_2'] = [2, 21, 22]
print(df)

       col_0  col_1
row_0      0      1
row_1     10     11
row_2     20     21
       col_0  col_1  col_2
row_0      0      1      2
row_1     10     11     21
row_2     20     21     22


In [25]:
# Delete a column
df = pd.DataFrame(
    [[0, 1], [10, 11], [20, 21]],
    index=['row_0', 'row_1', 'row_2'],
    columns=['col_0', 'col_1'],
)
print(df)
# `del` removes the column from `df` itself (in place).
del df['col_1']
print(df)

       col_0  col_1
row_0      0      1
row_1     10     11
row_2     20     21
       col_0
row_0      0
row_1     10
row_2     20


### DataFrame Index, Columns and Values

In [35]:
# DataFrame index
df = pd.DataFrame(
    dict(col_0=[0, 10, 20], col_1=[1, 11, 21]),
    index=['row_0', 'row_1', 'row_2'],
)
# Show the frame, its row Index object, and the labels as a NumPy array.
print(df, df.index, df.index.values, sep='\n')

       col_0  col_1
row_0      0      1
row_1     10     11
row_2     20     21
Index(['row_0', 'row_1', 'row_2'], dtype='object')
['row_0' 'row_1' 'row_2']


In [38]:
# DataFrame columns
df = pd.DataFrame(
    [[0, 1], [10, 11], [20, 21]],
    index=['row_0', 'row_1', 'row_2'],
    columns=['col_0', 'col_1'],
)
# Show the frame, its column Index object, and the labels as a NumPy array.
print(df, df.columns, df.columns.values, sep='\n')

       col_0  col_1
row_0      0      1
row_1     10     11
row_2     20     21
Index(['col_0', 'col_1'], dtype='object')
['col_0' 'col_1']


In [36]:
# DataFrame values
df = pd.DataFrame(
    dict(col_0=[0, 10, 20], col_1=[1, 11, 21]),
    index=['row_0', 'row_1', 'row_2'],
)
print(df)
# `.values` exposes the data as a 2-D NumPy array (labels are dropped).
data_array = df.values
print(data_array)

       col_0  col_1
row_0      0      1
row_1     10     11
row_2     20     21
[[ 0  1]
 [10 11]
 [20 21]]


### DataFrame Transpose

In [41]:
# Transposing swaps the row labels and the column labels.
df = pd.DataFrame(
    [[0, 1], [10, 11], [20, 21]],
    index=['row_0', 'row_1', 'row_2'],
    columns=['col_0', 'col_1'],
)
print(df, df.T, sep='\n')

       col_0  col_1
row_0      0      1
row_1     10     11
row_2     20     21
       row_0  row_1  row_2
col_0      0     10     20
col_1      1     11     21


### DataFrame Reindexing

In [45]:
# Reindex rows
df = pd.DataFrame(
    dict(col_0=[0, 10, 20], col_1=[1, 11, 21]),
    index=['row_0', 'row_1', 'row_2'],
)
print(df)
# reindex returns a new frame whose rows follow the requested label order.
reversed_rows = ['row_2', 'row_1', 'row_0']
df_reindexed = df.reindex(index=reversed_rows)
print(df_reindexed)

       col_0  col_1
row_0      0      1
row_1     10     11
row_2     20     21
       col_0  col_1
row_2     20     21
row_1     10     11
row_0      0      1


In [46]:
# Reindex columns
df = pd.DataFrame(
    [[0, 1], [10, 11], [20, 21]],
    index=['row_0', 'row_1', 'row_2'],
    columns=['col_0', 'col_1'],
)
print(df)
# reindex returns a new frame whose columns follow the requested label order.
swapped_columns = ['col_1', 'col_0']
df_reindexed = df.reindex(columns=swapped_columns)
print(df_reindexed)

       col_0  col_1
row_0      0      1
row_1     10     11
row_2     20     21
       col_1  col_0
row_0      1      0
row_1     11     10
row_2     21     20


### DataFrame Drop Entries

In [48]:
# Drop rows
df = pd.DataFrame(
    dict(col_0=[0, 10, 20], col_1=[1, 11, 21]),
    index=['row_0', 'row_1', 'row_2'],
)
print(df)
# drop returns a new frame without the listed row labels; `df` is unchanged.
rows_to_remove = ['row_0', 'row_1']
df_dropped = df.drop(rows_to_remove)
print(df_dropped)

       col_0  col_1
row_0      0      1
row_1     10     11
row_2     20     21
       col_0  col_1
row_2     20     21


In [49]:
# Drop columns
df = pd.DataFrame({
        'col_0': [0, 10, 20],
        'col_1': [1, 11, 21]
    }, index = ['row_0', 'row_1', 'row_2'])
print(df)
# axis=1 makes drop look the label up among the columns instead of the rows.
# drop returns a new frame; `df` itself keeps both columns.
df_dropped = df.drop('col_1', axis=1)
# Show the result: without this print the cell only displays the original frame.
print(df_dropped)

       col_0  col_1
row_0      0      1
row_1     10     11
row_2     20     21
       col_0
row_0      0
row_1     10
row_2     20


### DataFrame Indexing and Selecting

#### Label with .loc

In [60]:
# df.loc[:, col_label] selects columns
df = pd.DataFrame(
    [[0, 1], [10, 11], [20, 21]],
    index=['row_0', 'row_1', 'row_2'],
    columns=['col_0', 'col_1'],
)
print(df)
# `:` keeps every row; a single column label yields a Series.
first_column = df.loc[:, 'col_0']
print(first_column)

       col_0  col_1
row_0      0      1
row_1     10     11
row_2     20     21
row_0     0
row_1    10
row_2    20
Name: col_0, dtype: int64


In [62]:
# df.loc[row_label] selects rows
df = pd.DataFrame(
    dict(col_0=[0, 10, 20], col_1=[1, 11, 21]),
    index=['row_0', 'row_1', 'row_2'],
)
print(df)
# A single row label yields a Series indexed by the column labels.
first_row = df.loc['row_0']
print(first_row)

       col_0  col_1
row_0      0      1
row_1     10     11
row_2     20     21
col_0    0
col_1    1
Name: row_0, dtype: int64


#### Position with .iloc

In [63]:
# df.iloc[:, col_position] selects columns
df = pd.DataFrame(
    [[0, 1], [10, 11], [20, 21]],
    index=['row_0', 'row_1', 'row_2'],
    columns=['col_0', 'col_1'],
)
print(df)
# Position 0 is the first column; the result is a Series.
first_column = df.iloc[:, 0]
print(first_column)

       col_0  col_1
row_0      0      1
row_1     10     11
row_2     20     21
row_0     0
row_1    10
row_2    20
Name: col_0, dtype: int64


In [64]:
# df.iloc[row_position] selects rows
df = pd.DataFrame(
    dict(col_0=[0, 10, 20], col_1=[1, 11, 21]),
    index=['row_0', 'row_1', 'row_2'],
)
print(df)
# Position 0 is the first row; the result is a Series.
first_row = df.iloc[0]
print(first_row)

       col_0  col_1
row_0      0      1
row_1     10     11
row_2     20     21
col_0    0
col_1    1
Name: row_0, dtype: int64


#### Label and Position

In [76]:
# df.loc[df.index[row_position], col_label]
df = pd.DataFrame(
    [[0, 1], [10, 11], [20, 21]],
    index=['row_0', 'row_1', 'row_2'],
    columns=['col_0', 'col_1'],
)
print(df)
# Translate the row position into its label, then do a pure label lookup.
row_label = df.index[2]
print(df.loc[row_label, 'col_1'])

       col_0  col_1
row_0      0      1
row_1     10     11
row_2     20     21
21


In [77]:
# df.loc[row_label, df.columns[col_position]]
df = pd.DataFrame(
    dict(col_0=[0, 10, 20], col_1=[1, 11, 21]),
    index=['row_0', 'row_1', 'row_2'],
)
print(df)
# Translate the column position into its label, then do a pure label lookup.
col_label = df.columns[1]
print(df.loc['row_2', col_label])

       col_0  col_1
row_0      0      1
row_1     10     11
row_2     20     21
21


#### With [ ]

In [57]:
# df[col_label] selects columns
df = pd.DataFrame(
    [[0, 1], [10, 11], [20, 21]],
    index=['row_0', 'row_1', 'row_2'],
    columns=['col_0', 'col_1'],
)
print(df)
# Plain brackets with a column label return that column as a Series.
first_column = df['col_0']
print(first_column)

       col_0  col_1
row_0      0      1
row_1     10     11
row_2     20     21
row_0     0
row_1    10
row_2    20
Name: col_0, dtype: int64


In [73]:
# Convenience 1: [] with a slice to slice rows
# df[row_position_slice]
df = pd.DataFrame(
    dict(col_0=[0, 10, 20], col_1=[1, 11, 21]),
    index=['row_0', 'row_1', 'row_2'],
)
print(df)
# The slice is positional and excludes its end, so 1:2 keeps only row_1.
middle_rows = df[1:2]
print(middle_rows)

       col_0  col_1
row_0      0      1
row_1     10     11
row_2     20     21
       col_0  col_1
row_1     10     11


In [80]:
# Convenience 2: [] with a boolean array to filter rows
df = pd.DataFrame(
    [[0, 1], [10, 11], [20, 21]],
    index=['row_0', 'row_1', 'row_2'],
    columns=['col_0', 'col_1'],
)
print(df)
# Comparing a column with a scalar returns a boolean Series (shown for col_1).
print(df['col_1'] > 10)
# Note: the filter below uses a different mask (col_0 > 10) than the one just
# printed, so only the rows whose col_0 exceeds 10 are kept.
print(df[df['col_0'] > 10])

       col_0  col_1
row_0      0      1
row_1     10     11
row_2     20     21
row_0    False
row_1     True
row_2     True
Name: col_1, dtype: bool
       col_0  col_1
row_2     20     21


In [86]:
# Convenience 3: [] with a boolean DataFrame to filter elements
df = pd.DataFrame(
    dict(col_0=[0, 10, 20], col_1=[1, 11, 21]),
    index=['row_0', 'row_1', 'row_2'],
)
# Element-wise comparison gives a boolean frame with the same labels.
mask = df > 10
print(df)
print(mask)
# Reading through the mask keeps matching elements and shows NaN elsewhere.
print(df[mask])
# This is used to set values elementwise based on some criterion
# (the assignment modifies `df` in place).
df[mask] = 999
print(df)

       col_0  col_1
row_0      0      1
row_1     10     11
row_2     20     21
       col_0  col_1
row_0  False  False
row_1  False   True
row_2   True   True
       col_0  col_1
row_0    NaN    NaN
row_1    NaN   11.0
row_2   20.0   21.0
       col_0  col_1
row_0      0      1
row_1     10    999
row_2    999    999


### DataFrame Sorting

#### Sorting by Index

In [109]:
# Sort rows by index
df = pd.DataFrame(
    data=np.arange(9).reshape(3, 3),
    index=['row_2', 'row_1', 'row_0'],
    columns=['col_2', 'col_1', 'col_0'],
)
print(df)
# axis=0 (the default) orders the rows by their labels; a new frame is returned.
df_sorted = df.sort_index(axis=0)
print(df_sorted)

       col_2  col_1  col_0
row_2      0      1      2
row_1      3      4      5
row_0      6      7      8
       col_2  col_1  col_0
row_0      6      7      8
row_1      3      4      5
row_2      0      1      2


In [110]:
# Sort columns by index
df = pd.DataFrame(
    data=np.arange(9).reshape(3, 3),
    index=['row_2', 'row_1', 'row_0'],
    columns=['col_2', 'col_1', 'col_0'],
)
print(df)
# Sorting along the column axis orders the columns by their labels.
df_sorted = df.sort_index(axis='columns')
print(df_sorted)

       col_2  col_1  col_0
row_2      0      1      2
row_1      3      4      5
row_0      6      7      8
       col_0  col_1  col_2
row_2      2      1      0
row_1      5      4      3
row_0      8      7      6


#### Sorting by Values

In [112]:
# Sort rows by values
df = pd.DataFrame(
    data=np.arange(8, -1, -1).reshape(3, 3),   # 8, 7, ..., 0 laid out row by row
    index=['row_0', 'row_1', 'row_2'],
    columns=['col_0', 'col_1', 'col_2'],
)
print(df)
# Rows are reordered so that the values in col_0 are ascending.
df_sorted = df.sort_values(by='col_0')
print(df_sorted)

       col_0  col_1  col_2
row_0      8      7      6
row_1      5      4      3
row_2      2      1      0
       col_0  col_1  col_2
row_2      2      1      0
row_1      5      4      3
row_0      8      7      6


In [114]:
# Sort columns by values
df = pd.DataFrame(
    data=np.arange(8, -1, -1).reshape(3, 3),   # 8, 7, ..., 0 laid out row by row
    index=['row_0', 'row_1', 'row_2'],
    columns=['col_0', 'col_1', 'col_2'],
)
print(df)
# Columns are reordered so that the values in row_0 are ascending.
df_sorted = df.sort_values(by='row_0', axis='columns')
print(df_sorted)

       col_0  col_1  col_2
row_0      8      7      6
row_1      5      4      3
row_2      2      1      0
       col_2  col_1  col_0
row_0      6      7      8
row_1      3      4      5
row_2      0      1      2


### DataFrame Ranking

In [116]:
# Rank the values within each column.
# Tie-breaking methods with rank:
# average: average rank of group
# min: lowest rank in group
# max: highest rank in group
# first: ranks assigned in order they appear in the array
# dense: like 'min', but rank always increases by 1 between groups
df = pd.DataFrame({
    0: [3, 2, 1, 0],
    1: [4, 5, 4, 5],
})
print(df)
# With method='min', tied values all receive the lowest rank of their group.
ranks = df.rank(method='min')
print(ranks)

   0  1
0  3  4
1  2  5
2  1  4
3  0  5
     0    1
0  4.0  1.0
1  3.0  3.0
2  2.0  1.0
3  1.0  3.0


### DataFrame Computations

In [117]:
# DataFrame max value
df = pd.DataFrame(
    [[0, 1], [10, 11], [20, 21]],
    index=['row_0', 'row_1', 'row_2'],
    columns=['col_0', 'col_1'],
)
print(df)
# axis=0 (the default) reduces down the rows: one maximum per column.
print(df.max(axis=0))

       col_0  col_1
row_0      0      1
row_1     10     11
row_2     20     21
col_0    20
col_1    21
dtype: int64


In [119]:
# DataFrame min value
df = pd.DataFrame(
    dict(col_0=[0, 10, 20], col_1=[1, 11, 21]),
    index=['row_0', 'row_1', 'row_2'],
)
print(df)
# Reducing across the columns gives one minimum per row.
print(df.min(axis='columns'))

       col_0  col_1
row_0      0      1
row_1     10     11
row_2     20     21
row_0     0
row_1    10
row_2    20
dtype: int64


In [121]:
# DataFrame max label
df = pd.DataFrame(
    [[0, 1], [10, 11], [20, 21]],
    index=['row_0', 'row_1', 'row_2'],
    columns=['col_0', 'col_1'],
)
print(df)
# For each column, report the row label at which the maximum occurs.
print(df.idxmax(axis=0))

       col_0  col_1
row_0      0      1
row_1     10     11
row_2     20     21
col_0    row_2
col_1    row_2
dtype: object


In [122]:
# DataFrame min label
df = pd.DataFrame(
    dict(col_0=[0, 10, 20], col_1=[1, 11, 21]),
    index=['row_0', 'row_1', 'row_2'],
)
print(df)
# For each row, report the column label at which the minimum occurs.
print(df.idxmin(axis='columns'))

       col_0  col_1
row_0      0      1
row_1     10     11
row_2     20     21
row_0    col_0
row_1    col_0
row_2    col_0
dtype: object


In [125]:
# DataFrame mean
df = pd.DataFrame(
    [[0, 1], [10, 11], [20, 21]],
    index=['row_0', 'row_1', 'row_2'],
    columns=['col_0', 'col_1'],
)
print(df)
# Averaging across the columns gives one mean per row.
print(df.mean(axis='columns'))

       col_0  col_1
row_0      0      1
row_1     10     11
row_2     20     21
row_0     0.5
row_1    10.5
row_2    20.5
dtype: float64


In [126]:
# DataFrame median
df = pd.DataFrame(
    dict(col_0=[0, 10, 20], col_1=[1, 11, 21]),
    index=['row_0', 'row_1', 'row_2'],
)
print(df)
# axis=0 (the default) gives one median per column.
print(df.median(axis=0))

       col_0  col_1
row_0      0      1
row_1     10     11
row_2     20     21
col_0    10.0
col_1    11.0
dtype: float64


In [127]:
# DataFrame sum
df = pd.DataFrame(
    [[0, 1], [10, 11], [20, 21]],
    index=['row_0', 'row_1', 'row_2'],
    columns=['col_0', 'col_1'],
)
print(df)
# axis=0 (the default) gives one total per column.
print(df.sum(axis=0))

       col_0  col_1
row_0      0      1
row_1     10     11
row_2     20     21
col_0    30
col_1    33
dtype: int64


### DataFrame Applying Function

In [89]:
# DataFrame element-wise function application
df = pd.DataFrame({
        'col_0': [0, 10, 20],
        'col_1': [1, 11, 21]
    }, index = ['row_0', 'row_1', 'row_2'])
print(df)
# DataFrame.applymap is deprecated since pandas 2.1 in favour of the
# equivalent DataFrame.map. Pick whichever this pandas version provides so the
# cell runs without a FutureWarning on recent releases and still works on
# older ones. The check is done on the class so a column named 'map' cannot
# interfere.
apply_elementwise = df.map if hasattr(pd.DataFrame, 'map') else df.applymap
# The function is called once per element; np.square squares each value.
df_squared = apply_elementwise(np.square)
print(df_squared)

       col_0  col_1
row_0      0      1
row_1     10     11
row_2     20     21
       col_0  col_1
row_0      0      1
row_1    100    121
row_2    400    441


In [90]:
# DataFrame column-wise function application
df = pd.DataFrame(
    dict(col_0=[0, 10, 20], col_1=[1, 11, 21]),
    index=['row_0', 'row_1', 'row_2'],
)
print(df)
# With the default axis the function receives one column at a time;
# here it returns that column's spread (max - min).
column_spread = df.apply(lambda col: col.max() - col.min())
print(column_spread)

       col_0  col_1
row_0      0      1
row_1     10     11
row_2     20     21
col_0    20
col_1    20
dtype: int64


In [96]:
# DataFrame row-wise function application
df = pd.DataFrame(
    [[0, 1], [10, 11], [20, 21]],
    index=['row_0', 'row_1', 'row_2'],
    columns=['col_0', 'col_1'],
)
print(df)
# With axis=1 the function receives one row at a time;
# here it returns that row's spread (max - min).
row_spread = df.apply(lambda row: row.max() - row.min(), axis=1)
print(row_spread)

       col_0  col_1
row_0      0      1
row_1     10     11
row_2     20     21
row_0    1
row_1    1
row_2    1
dtype: int64


### DataFrame Missing Values

#### Counting

In [137]:
# Count the number of non-null values in each column
df = pd.DataFrame(
    dict(col_0=[0, 10, np.nan], col_1=[1, np.nan, np.nan]),
    index=['row_0', 'row_1', 'row_2'],
)
print(df)
# count() skips NaN, so col_0 reports 2 and col_1 reports 1.
non_null_counts = df.count()
print(non_null_counts)

       col_0  col_1
row_0    0.0    1.0
row_1   10.0    NaN
row_2    NaN    NaN
col_0    2
col_1    1
dtype: int64


#### Filtering

In [139]:
# Whether the values are null
df = pd.DataFrame(
    [[0, 1], [10, np.nan], [np.nan, np.nan]],
    index=['row_0', 'row_1', 'row_2'],
    columns=['col_0', 'col_1'],
)
print(df)
# Boolean frame of the same shape: True wherever the value is NaN.
null_mask = df.isnull()
print(null_mask)

       col_0  col_1
row_0    0.0    1.0
row_1   10.0    NaN
row_2    NaN    NaN
       col_0  col_1
row_0  False  False
row_1  False   True
row_2   True   True


In [145]:
# Whether the values are not null
df = pd.DataFrame(
    dict(col_0=[0, 10, np.nan], col_1=[1, np.nan, np.nan]),
    index=['row_0', 'row_1', 'row_2'],
)
print(df)
# Boolean frame of the same shape: True wherever a value is present.
present_mask = df.notnull()
print(present_mask)

       col_0  col_1
row_0    0.0    1.0
row_1   10.0    NaN
row_2    NaN    NaN
       col_0  col_1
row_0   True   True
row_1   True  False
row_2  False  False


#### Dropping

In [146]:
# Drop rows with ANY null values
df = pd.DataFrame(
    [[0, 1], [10, np.nan], [np.nan, np.nan]],
    index=['row_0', 'row_1', 'row_2'],
    columns=['col_0', 'col_1'],
)
print(df)
# how='any' (the default) removes a row as soon as it contains one NaN.
df_dropped = df.dropna(how='any')
print(df_dropped)

       col_0  col_1
row_0    0.0    1.0
row_1   10.0    NaN
row_2    NaN    NaN
       col_0  col_1
row_0    0.0    1.0


In [147]:
# Drop rows with ALL null values
df = pd.DataFrame(
    dict(col_0=[0, 10, np.nan], col_1=[1, np.nan, np.nan]),
    index=['row_0', 'row_1', 'row_2'],
)
print(df)
# how='all' removes a row only when every value in it is NaN (row_2 here).
df_dropped = df.dropna(axis=0, how='all')
print(df_dropped)

       col_0  col_1
row_0    0.0    1.0
row_1   10.0    NaN
row_2    NaN    NaN
       col_0  col_1
row_0    0.0    1.0
row_1   10.0    NaN


#### Filling

In [148]:
# Fill the null values
df = pd.DataFrame(
    [[0, 1], [10, np.nan], [np.nan, np.nan]],
    index=['row_0', 'row_1', 'row_2'],
    columns=['col_0', 'col_1'],
)
print(df)
# A scalar replaces every NaN; a new frame is returned and `df` keeps its NaNs.
df_filled = df.fillna(value=999)
print(df_filled)

       col_0  col_1
row_0    0.0    1.0
row_1   10.0    NaN
row_2    NaN    NaN
       col_0  col_1
row_0    0.0    1.0
row_1   10.0  999.0
row_2  999.0  999.0


In [149]:
# Fill the null values, column specific
df = pd.DataFrame(
    dict(col_0=[0, 10, np.nan], col_1=[1, np.nan, np.nan]),
    index=['row_0', 'row_1', 'row_2'],
)
print(df)
# A mapping of column label -> replacement fills each column with its own value.
fill_values = {'col_0': 777, 'col_1': 888}
df_filled = df.fillna(value=fill_values)
print(df_filled)

       col_0  col_1
row_0    0.0    1.0
row_1   10.0    NaN
row_2    NaN    NaN
       col_0  col_1
row_0    0.0    1.0
row_1   10.0  888.0
row_2  777.0  888.0
