# Pandas 102 - Advanced Pandas Operations

This notebook covers essential pandas operations and techniques for data manipulation and analysis.

# Setup and Configuration

Importing pandas and configuring display options to show all columns, rows, and cell contents.

# Creating a DataFrame

Creating a sample DataFrame with different data types including float, timestamp, integer, categorical, and object.

# Basic DataFrame Operations

Displaying the first few rows of the DataFrame using head().

# Viewing Last Rows

Displaying the last few rows of the DataFrame using tail().

# Random Sampling

Selecting random samples from the DataFrame using sample().

# Index Information

Accessing and displaying the DataFrame's index.

# Column Information

Accessing and displaying the DataFrame's columns.

# Data Types

Checking the data types of each column in the DataFrame using dtypes.

# Statistical Summary

Generating a statistical summary of the DataFrame using describe().

# Transposing Data

Transposing the DataFrame to switch rows and columns using T.

# Viewing Data

Displaying the entire DataFrame.

# Sorting Data

Sorting the DataFrame by specific columns using sort_values().

In [3]:
import pandas as pd

## display all columns and rows, no trimming
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

## display all data inside the cells
pd.set_option('display.max_colwidth', None)

In [4]:
## build a small demo frame that mixes several dtypes; the scalar entries
## ('A', 'B', 'F') are broadcast to the 4 rows given by the index of 'C'
## (no backslashes needed: lines inside brackets continue implicitly)
df = pd.DataFrame({
    'A': 1.,
    'B': pd.Timestamp('20130102'),
    'C': pd.Series(1, index=list(range(4)), dtype='float32'),
    'D': [3] * 4,
    'E': pd.Categorical(["test", "train", "test", "train"]),
    'F': 'foo',
})
df

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [5]:
## top 5 rows
df.head()

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [6]:
## last 5 rows
df.tail()

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [7]:
## random sample
df.sample(2)

Unnamed: 0,A,B,C,D,E,F
2,1.0,2013-01-02,1.0,3,test,foo
0,1.0,2013-01-02,1.0,3,test,foo


In [8]:
## display the index
df.index

Index([0, 1, 2, 3], dtype='int64')

In [9]:
## display the columns
df.columns

Index(['A', 'B', 'C', 'D', 'E', 'F'], dtype='object')

In [10]:
## display the data type for each column
df.dtypes

A          float64
B    datetime64[s]
C          float32
D            int64
E         category
F           object
dtype: object

In [11]:
## describe the dataframe
df.describe()

Unnamed: 0,A,B,C,D
count,4.0,4,4.0,4.0
mean,1.0,2013-01-02 00:00:00,1.0,3.0
min,1.0,2013-01-02 00:00:00,1.0,3.0
25%,1.0,2013-01-02 00:00:00,1.0,3.0
50%,1.0,2013-01-02 00:00:00,1.0,3.0
75%,1.0,2013-01-02 00:00:00,1.0,3.0
max,1.0,2013-01-02 00:00:00,1.0,3.0
std,0.0,,0.0,0.0


In [12]:
## transpose dataframe
df.T

Unnamed: 0,0,1,2,3
A,1.0,1.0,1.0,1.0
B,2013-01-02 00:00:00,2013-01-02 00:00:00,2013-01-02 00:00:00,2013-01-02 00:00:00
C,1.0,1.0,1.0,1.0
D,3,3,3,3
E,test,train,test,train
F,foo,foo,foo,foo


In [13]:
df

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [14]:
## sort dataframe by certain columns

df.sort_values(by=['A'], axis=0, ascending=False)

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [15]:
df.sort_values(by='C')

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [16]:
# sort by A and B, in descending and ascending order
df.sort_values(by=['A', 'B'], axis=0, ascending=[False, True])

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [17]:
## sort along an axis by its labels; axis=1 orders the columns by name
## (descending here), whereas axis=0 would order the rows by index label,
## which is useful when the index holds something like dates
df.sort_index(axis=1, ascending=False)

Unnamed: 0,F,E,D,C,B,A
0,foo,test,3,1.0,2013-01-02,1.0
1,foo,train,3,1.0,2013-01-02,1.0
2,foo,test,3,1.0,2013-01-02,1.0
3,foo,train,3,1.0,2013-01-02,1.0


In [18]:
## display column A
df.A

0    1.0
1    1.0
2    1.0
3    1.0
Name: A, dtype: float64

In [19]:
# same as above
df['A']

0    1.0
1    1.0
2    1.0
3    1.0
Name: A, dtype: float64

In [20]:
## slice rows by integer position (end-exclusive), not by index label;
## the labels 1 and 2 in the result match the positions only because this
## frame uses the default 0..n-1 index
df[1:3]

Unnamed: 0,A,B,C,D,E,F
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo


In [21]:
## select a row
df.loc[0]

A                    1.0
B    2013-01-02 00:00:00
C                    1.0
D                      3
E                   test
F                    foo
Name: 0, dtype: object

In [22]:
## select a row and some columns
df.loc[0, ['A', 'B']]

A                    1.0
B    2013-01-02 00:00:00
Name: 0, dtype: object

In [23]:
## select all rows and some columns
df.loc[:, ['A', 'B']]

Unnamed: 0,A,B
0,1.0,2013-01-02
1,1.0,2013-01-02
2,1.0,2013-01-02
3,1.0,2013-01-02


In [24]:
## get a value/ Scalar
df.loc[0, 'A']

np.float64(1.0)

In [25]:
## ILOC vs LOC
"""loc is label-based, which means that you have to specify rows and columns based
on their row and column labels. iloc is integer index based, so you have to specify 
rows and columns by their integer index like you did in the previous exercise."""


'loc is label-based, which means that you have to specify rows and columns based\non their row and column labels. iloc is integer index based, so you have to specify \nrows and columns by their integer index like you did in the previous exercise.'

In [26]:
## using ILOC
df.iloc[3]

A                    1.0
B    2013-01-02 00:00:00
C                    1.0
D                      3
E                  train
F                    foo
Name: 3, dtype: object

In [27]:
## specific rows/ columns using iloc
## the row slice 3:5 runs past the last row (position 3); out-of-range
## slices are clipped rather than raising, so only one row comes back
df.iloc[3:5, 0:2]

Unnamed: 0,A,B
3,1.0,2013-01-02


In [28]:
## specific cell
df.iloc[1, 1]

Timestamp('2013-01-02 00:00:00')

In [29]:
## all rows, specific columns
df.iloc[:, 1:3]

Unnamed: 0,B,C
0,2013-01-02,1.0
1,2013-01-02,1.0
2,2013-01-02,1.0
3,2013-01-02,1.0


In [30]:
## all columns, specific rows
df.iloc[1:3, :]

Unnamed: 0,A,B,C,D,E,F
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo


In [31]:
## faster access using iat
## Access a single value for a row/column pair by integer position.
## Similar to iloc, in that both provide integer-based lookups. Use iat if you 
## only need to get or set a single value in a DataFrame or Series.

df.iat[1, 1]

Timestamp('2013-01-02 00:00:00')

In [33]:
## .ix was a label-location based indexer with integer position fallback.
## It supported mixed integer and label based access: primarily label based,
## it fell back to integer positional access unless the corresponding axis was
## of integer type. It accepted any of the inputs of .loc and .iloc, as well as
## floating point label schemes, and was mainly useful with mixed positional
## and label based hierarchical indexes.
## NOTE: .ix was deprecated in pandas 0.20 and removed in pandas 1.0, so it is
## no longer available; the explicit alternatives below replace it.
import pandas as pd
import numpy as np

# Create a sample DataFrame
df = pd.DataFrame({
    'A': [1, 2, 3, 4, 5],
    'B': [10, 20, 30, 40, 50],
    'C': [100, 200, 300, 400, 500]
})
print("Sample DataFrame:")
print(df)

# Instead of df.ix[1,1], use one of these alternatives:

# 1. Using iloc (integer-based indexing)
print("\n1. Using iloc (integer-based indexing):")
print(f"df.iloc[1, 1] = {df.iloc[1, 1]}")  # Row position 1, column position 1

# 2. Using loc (label-based indexing)
print("\n2. Using loc (label-based indexing):")
print(f"df.loc[1, 'B'] = {df.loc[1, 'B']}")  # Row with label 1, Column 'B'

# 3. Using iat (faster integer-based indexing for scalar access)
print("\n3. Using iat (faster integer-based indexing):")
print(f"df.iat[1, 1] = {df.iat[1, 1]}")  # Row position 1, column position 1

# 4. Using at (faster label-based indexing for scalar access)
print("\n4. Using at (faster label-based indexing):")
print(f"df.at[1, 'B'] = {df.at[1, 'B']}")  # Row with label 1, Column 'B'

Sample DataFrame:
   A   B    C
0  1  10  100
1  2  20  200
2  3  30  300
3  4  40  400
4  5  50  500

1. Using iloc (integer-based indexing):
df.iloc[1, 1] = 20

2. Using loc (label-based indexing):
df.loc[1, 'B'] = 20

3. Using iat (faster integer-based indexing):
df.iat[1, 1] = 20

4. Using at (faster label-based indexing):
df.at[1, 'B'] = 20


In [34]:
## boolean indexing on one column value
df[df['A'] > 0]

Unnamed: 0,A,B,C
0,1,10,100
1,2,20,200
2,3,30,300
3,4,40,400
4,5,50,500


In [35]:
## boolean indexing on multiple column values; each condition goes in its own
## parentheses and the masks are combined element-wise with &
## NOTE: column 'C' of the current frame holds 100..500, so nothing equals 1.0
## and the result is empty
df[(df['A'] > 0) & (df['C']==1.0)]

Unnamed: 0,A,B,C


In [36]:
## Selecting values from a DataFrame where a boolean condition is met
## the mask is built from columns 'A' and 'C' only, so column 'B', which has
## no entry in the mask, comes back entirely as NaN
df[df[['A','C']] > 0]

Unnamed: 0,A,B,C
0,1,,100
1,2,,200
2,3,,300
3,4,,400
4,5,,500


In [37]:
## can also use the following if all columns are dtype int/float
df[df > 0]

Unnamed: 0,A,B,C
0,1,10,100
1,2,20,200
2,3,30,300
3,4,40,400
4,5,50,500


In [40]:
import pandas as pd
import numpy as np

# Demo frame: two integer columns, one string column and one boolean column
records = {
    'A': [1, 2, 3, 4, 5],
    'B': [10, 20, 30, 40, 50],
    'C': ['foo', 'bar', 'baz', 'foo', 'bar'],
    'D': [True, False, True, False, True],
}
df = pd.DataFrame(records)
print("Sample DataFrame:")
print(df)

# isin() returns a boolean mask that is True where the cell equals one of the
# listed values; the mask is then used to pick rows
is_foo = df['C'].isin(['foo'])
print("\nFiltering rows where column 'C' contains 'foo':")
filtered_df = df.loc[is_foo]
print(filtered_df)

# Several accepted values at once
is_foo_or_bar = df['C'].isin(['foo', 'bar'])
print("\nFiltering rows where column 'C' contains 'foo' or 'bar':")
filtered_multiple = df.loc[is_foo_or_bar]
print(filtered_multiple)

# The same membership test on a numeric column
is_odd_pick = df['A'].isin([1, 3, 5])
print("\nFiltering rows where column 'A' is in [1, 3, 5]:")
filtered_numeric = df.loc[is_odd_pick]
print(filtered_numeric)

# ... and on a boolean column
is_flagged = df['D'].isin([True])
print("\nFiltering rows where column 'D' is True:")
filtered_boolean = df.loc[is_flagged]
print(filtered_boolean)

# Negating the mask with ~ keeps the rows that did NOT match
print("\nFiltering rows where column 'C' is NOT 'foo':")
filtered_inverse = df.loc[~is_foo]
print(filtered_inverse)

Sample DataFrame:
   A   B    C      D
0  1  10  foo   True
1  2  20  bar  False
2  3  30  baz   True
3  4  40  foo  False
4  5  50  bar   True

Filtering rows where column 'C' contains 'foo':
   A   B    C      D
0  1  10  foo   True
3  4  40  foo  False

Filtering rows where column 'C' contains 'foo' or 'bar':
   A   B    C      D
0  1  10  foo   True
1  2  20  bar  False
3  4  40  foo  False
4  5  50  bar   True

Filtering rows where column 'A' is in [1, 3, 5]:
   A   B    C     D
0  1  10  foo  True
2  3  30  baz  True
4  5  50  bar  True

Filtering rows where column 'D' is True:
   A   B    C     D
0  1  10  foo  True
2  3  30  baz  True
4  5  50  bar  True

Filtering rows where column 'C' is NOT 'foo':
   A   B    C      D
1  2  20  bar  False
2  3  30  baz   True
4  5  50  bar   True


In [41]:
## adding a new column from a Series; the values are matched to the rows by
## index label, not by position, so s1 (labels 0-3) has nothing for row 4:
## that cell becomes NaN and the column is displayed as float
s1 = pd.Series([1, 2, 3, 4])
df['G'] = s1
df

Unnamed: 0,A,B,C,D,G
0,1,10,foo,True,1.0
1,2,20,bar,False,2.0
2,3,30,baz,True,3.0
3,4,40,foo,False,4.0
4,5,50,bar,True,


In [43]:
import pandas as pd
import numpy as np

# Five-row demo frame used by every column-assignment example in this cell
base_columns = {
    'A': [1, 2, 3, 4, 5],
    'B': [10, 20, 30, 40, 50],
    'C': ['foo', 'bar', 'baz', 'qux', 'quux'],
}
df = pd.DataFrame(base_columns)
print("Sample DataFrame (5 rows):")
print(df)

# A plain list is assigned by position, so it needs one entry per row
print("\nAdding column 'F' with matching length (5 values):")
df['F'] = [f'foo{i}' for i in range(1, 6)]
print(df)

# A scalar is repeated for every row
print("\nAdding column 'G' with a scalar value:")
df['G'] = 'same_value'
print(df)

# A Series is matched to the frame by index label; here the labels are the
# frame's own index, so every row receives a value
print("\nAdding column 'H' using a Series with matching index:")
s = pd.Series([f'bar{i}' for i in range(1, 6)], index=df.index)
df['H'] = s
print(df)

# A Series that only carries labels 0-3: the fifth row (label 4) has no
# matching entry and is filled with NaN
print("\nHandling mismatched lengths with explicit indexing:")
values = [f'val{i}' for i in range(1, 5)]
s_short = pd.Series(values, index=list(range(len(values))))

df['I'] = s_short
print(df)

Sample DataFrame (5 rows):
   A   B     C
0  1  10   foo
1  2  20   bar
2  3  30   baz
3  4  40   qux
4  5  50  quux

Adding column 'F' with matching length (5 values):
   A   B     C     F
0  1  10   foo  foo1
1  2  20   bar  foo2
2  3  30   baz  foo3
3  4  40   qux  foo4
4  5  50  quux  foo5

Adding column 'G' with a scalar value:
   A   B     C     F           G
0  1  10   foo  foo1  same_value
1  2  20   bar  foo2  same_value
2  3  30   baz  foo3  same_value
3  4  40   qux  foo4  same_value
4  5  50  quux  foo5  same_value

Adding column 'H' using a Series with matching index:
   A   B     C     F           G     H
0  1  10   foo  foo1  same_value  bar1
1  2  20   bar  foo2  same_value  bar2
2  3  30   baz  foo3  same_value  bar3
3  4  40   qux  foo4  same_value  bar4
4  5  50  quux  foo5  same_value  bar5

Handling mismatched lengths with explicit indexing:
   A   B     C     F           G     H     I
0  1  10   foo  foo1  same_value  bar1  val1
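1  2  20   bar  foo2  same_value  bar2  val2
2  3  30   baz  foo3  same_value  bar3  val3
3  4  40   qux  foo4  same_value  bar4  val4
4  5  50  quux  foo5  same_value  bar5   NaN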

In [44]:
## what happens when passed more values
## a plain list must supply exactly one value per row; pandas rejects a
## longer list with a ValueError and df keeps its previous 'F' column
try:
    df['F'] = ['foo1', 'foo2', 'foo3', 'foo4', 'foo5', 'foo6']
except ValueError:
    # only the length-mismatch error is expected here; anything else should
    # surface instead of being hidden behind a generic handler
    print('Error')
df

Error


Unnamed: 0,A,B,C,F,G,H,I
0,1,10,foo,foo1,same_value,bar1,val1
1,2,20,bar,foo2,same_value,bar2,val2
2,3,30,baz,foo3,same_value,bar3,val3
3,4,40,qux,foo4,same_value,bar4,val4
4,5,50,quux,foo5,same_value,bar5,


In [45]:
## what happens when passed fewer values
## a plain list must supply exactly one value per row; pandas rejects a
## shorter list with a ValueError and df keeps its previous 'F' column
try:
    df['F'] = ['foo1', 'foo2']
except ValueError:
    # only the length-mismatch error is expected here; anything else should
    # surface instead of being hidden behind a generic handler
    print('Error')
df

Error


Unnamed: 0,A,B,C,F,G,H,I
0,1,10,foo,foo1,same_value,bar1,val1
1,2,20,bar,foo2,same_value,bar2,val2
2,3,30,baz,foo3,same_value,bar3,val3
3,4,40,qux,foo4,same_value,bar4,val4
4,5,50,quux,foo5,same_value,bar5,


In [46]:
## setting a value of a cell at a certain row/ column
df.at[0, 'A'] = 10.0
df

Unnamed: 0,A,B,C,F,G,H,I
0,10,10,foo,foo1,same_value,bar1,val1
1,2,20,bar,foo2,same_value,bar2,val2
2,3,30,baz,foo3,same_value,bar3,val3
3,4,40,qux,foo4,same_value,bar4,val4
4,5,50,quux,foo5,same_value,bar5,


In [47]:
## setting the value of a cell by row/ column labels
## column 'C' holds strings, so storing the float 100.0 leaves the column
## with mixed value types
df.at[0, 'C'] = 100.0
df

Unnamed: 0,A,B,C,F,G,H,I
0,10,10,100.0,foo1,same_value,bar1,val1
1,2,20,bar,foo2,same_value,bar2,val2
2,3,30,baz,foo3,same_value,bar3,val3
3,4,40,qux,foo4,same_value,bar4,val4
4,5,50,quux,foo5,same_value,bar5,


In [48]:
## setting value by position: row position 2, column position 2, which is
## column 'C' in the current frame
df.iat[2, 2] = 5.0
df

Unnamed: 0,A,B,C,F,G,H,I
0,10,10,100.0,foo1,same_value,bar1,val1
1,2,20,bar,foo2,same_value,bar2,val2
2,3,30,5.0,foo3,same_value,bar3,val3
3,4,40,qux,foo4,same_value,bar4,val4
4,5,50,quux,foo5,same_value,bar5,


In [49]:
## assigning values via loc
## the current frame has no 'D' column, so this assignment creates one; it
## shows up as the last column
df.loc[:, 'D'] = [5] * len(df)
df

Unnamed: 0,A,B,C,F,G,H,I,D
0,10,10,100.0,foo1,same_value,bar1,val1,5
1,2,20,bar,foo2,same_value,bar2,val2,5
2,3,30,5.0,foo3,same_value,bar3,val3,5
3,4,40,qux,foo4,same_value,bar4,val4,5
4,5,50,quux,foo5,same_value,bar5,,5


In [50]:
## assigning values via iloc
## column position 3 is 'F' in the current frame; its strings are replaced
## with the integers 0..len(df)-1
df.iloc[:, 3] = range(len(df))
df

Unnamed: 0,A,B,C,F,G,H,I,D
0,10,10,100.0,0,same_value,bar1,val1,5
1,2,20,bar,1,same_value,bar2,val2,5
2,3,30,5.0,2,same_value,bar3,val3,5
3,4,40,qux,3,same_value,bar4,val4,5
4,5,50,quux,4,same_value,bar5,,5


In [51]:
## delete a column; axis=1 targets columns, and inplace=True modifies df
## itself and returns None, which is why this cell shows no output
df.drop('B', axis=1, inplace=True)

In [52]:
df

Unnamed: 0,A,C,F,G,H,I,D
0,10,100.0,0,same_value,bar1,val1,5
1,2,bar,1,same_value,bar2,val2,5
2,3,5.0,2,same_value,bar3,val3,5
3,4,qux,3,same_value,bar4,val4,5
4,5,quux,4,same_value,bar5,,5
