In [33]:
# make sure you have pandas installed
!pip install pandas




[notice] A new release of pip is available: 24.2 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [20]:
import pandas as pd

import warnings
warnings.filterwarnings(action='ignore')

# print(pd.__version__)
data_file = 'data.csv' # NOTE: make sure you have this file

## Slicing
- accessing subset of dataframe
- same style as slicing in list

In [21]:
# slicing:
# df[start_idx: stop_idx: step_size]
# default value of start_idx = 0
# default value of stop_idx  = end of df
# default value of step_size = 1

df = pd.read_csv(data_file)
print(df)
print("#################")

# Slice rows from index 2 to 6
print(df[2:7])  # Rows 2,3,4,5,6

       Name  Age         City  Exp  Exp2   Salary passport  bonus  hours
0     Alice   25      Chicago    2     2  70000.0      a43   1000    100
1       Bob   30  Los Angeles    5     5  80000.0      a44   3000    150
2   Charlie   35      Chicago    7     7      NaN      a45      0    200
3     David   40      Houston   10    10  90000.0      a46   7000    100
4       Eva   22      Houston    1     1  48000.0      a47   2555    120
5     Frank   28          NaN    3     3  72000.0      a48      0    100
6     Grace   32  San Antonio    6     6  85000.0      a49      0    200
7     Helen   26    San Diego    2     2  62000.0      a50   9000    100
8     Helen   26    San Diego    2     2  62000.0      a51      0     80
9     Helen   26    San Diego    2     2  62000.0      a52   3000    100
10    Jerry   23      Phoenix    6     6  78000.0      a53      0     80
#################
      Name  Age         City  Exp  Exp2   Salary passport  bonus  hours
2  Charlie   35      Chicago    7 

In [22]:
# Slice rows from index 2 to 8 with step size = 3

print(df)
print("#################")

print(df[2:9:3])  # Rows 2, 5, 8

       Name  Age         City  Exp  Exp2   Salary passport  bonus  hours
0     Alice   25      Chicago    2     2  70000.0      a43   1000    100
1       Bob   30  Los Angeles    5     5  80000.0      a44   3000    150
2   Charlie   35      Chicago    7     7      NaN      a45      0    200
3     David   40      Houston   10    10  90000.0      a46   7000    100
4       Eva   22      Houston    1     1  48000.0      a47   2555    120
5     Frank   28          NaN    3     3  72000.0      a48      0    100
6     Grace   32  San Antonio    6     6  85000.0      a49      0    200
7     Helen   26    San Diego    2     2  62000.0      a50   9000    100
8     Helen   26    San Diego    2     2  62000.0      a51      0     80
9     Helen   26    San Diego    2     2  62000.0      a52   3000    100
10    Jerry   23      Phoenix    6     6  78000.0      a53      0     80
#################
      Name  Age       City  Exp  Exp2   Salary passport  bonus  hours
2  Charlie   35    Chicago    7     

In [23]:
# # (SKIP) Example: out-of-range slicing doesn't error
# print("\nOut-of-bounds slicing [1:3:9] just returns matching row(s):\n", df[1:3:20])  # Row 1 only (step 9 has no effect since only one row in range)

## iloc, loc, iat, at
- .iloc[] – Integer position-based selection
- .loc[]  - Label-based selection: Accesses rows and columns by labels (i.e., names). **Not much used**
- .iat[]  - **Fast** integer-location based scalar accessor
- .at[]   - **Fast** label-based scalar accessor. Much faster when you only need a single value (read or write).


### when to use iloc, loc, iat, at
- **Multiple rows/columns: use .loc[] (label) or .iloc[] (position).**
- **Single cell (scalar): use .at[] (label) or .iat[] (position).**
- Performance tip: .at / .iat are much faster than .loc / .iloc when you’re only working with one value.

In [24]:
# .iloc[] – Integer position-based selection

df = pd.read_csv('data.csv')
print(df)
print("###########################################")

# 1. Access a single row by index:
print(df.iloc[1])  # row with idx=1
print("###########################################")

# 2. Access a specific cell (row 1, column 2):
print(df.iloc[1, 2]) # Los Angeles
print("###########################################")

# 3. Access multiple rows and columns:
print(df.iloc[0:3, 0:2]) # First 3 rows, first 2 columns
print("###########################################")

# 4. Access multiple rows and columns:
print(df.iloc[3:7, 2:5]) # rows with idx=3 to 6, columns with idx=2 to 4
print("###########################################")

# 5. Modify a value:
df.iloc[0, 1] = 26  # Change Alice's age from 25 to 26
print(df)


       Name  Age         City  Exp  Exp2   Salary passport  bonus  hours
0     Alice   25      Chicago    2     2  70000.0      a43   1000    100
1       Bob   30  Los Angeles    5     5  80000.0      a44   3000    150
2   Charlie   35      Chicago    7     7      NaN      a45      0    200
3     David   40      Houston   10    10  90000.0      a46   7000    100
4       Eva   22      Houston    1     1  48000.0      a47   2555    120
5     Frank   28          NaN    3     3  72000.0      a48      0    100
6     Grace   32  San Antonio    6     6  85000.0      a49      0    200
7     Helen   26    San Diego    2     2  62000.0      a50   9000    100
8     Helen   26    San Diego    2     2  62000.0      a51      0     80
9     Helen   26    San Diego    2     2  62000.0      a52   3000    100
10    Jerry   23      Phoenix    6     6  78000.0      a53      0     80
Name                Bob
Age                  30
City        Los Angeles
Exp                   5
Exp2                  5
Sala

In [25]:
# .loc[] 
# Below I show 4 use cases of .loc[]
    
df = pd.read_csv('data.csv')
print(df)

# 1. df.loc[index_label]  -> direct access by row label.
# e.g.: Get the data of row index 3
print(df.loc[3])
print("###########################################")

# 2. df.loc[condition]  -> filter rows.
# e.g.: Get all employees from Chicago
condition = (df["City"] == "Chicago")
print(df.loc[condition])
print("###########################################")

# 3. df.loc[condition, columns]  -> filter rows + pick columns.
# e.g.: Get the Name and Salary of employees who work in Houston:
condition = (df["City"] == "Houston")
columns   = ["Name", "Salary"]
print(df.loc[condition, columns])
print("###########################################")

# 4. df.loc[condition, "col"] = value -> update values conditionally.
# e.g.: If Salary is missing (NaN), fill it with 50000:
condition = df["Salary"].isna()
df.loc[condition, "Salary"] = 50000
print(df)


       Name  Age         City  Exp  Exp2   Salary passport  bonus  hours
0     Alice   25      Chicago    2     2  70000.0      a43   1000    100
1       Bob   30  Los Angeles    5     5  80000.0      a44   3000    150
2   Charlie   35      Chicago    7     7      NaN      a45      0    200
3     David   40      Houston   10    10  90000.0      a46   7000    100
4       Eva   22      Houston    1     1  48000.0      a47   2555    120
5     Frank   28          NaN    3     3  72000.0      a48      0    100
6     Grace   32  San Antonio    6     6  85000.0      a49      0    200
7     Helen   26    San Diego    2     2  62000.0      a50   9000    100
8     Helen   26    San Diego    2     2  62000.0      a51      0     80
9     Helen   26    San Diego    2     2  62000.0      a52   3000    100
10    Jerry   23      Phoenix    6     6  78000.0      a53      0     80
Name          David
Age              40
City        Houston
Exp              10
Exp2             10
Salary      90000.0
pass

In [26]:
# SKIP: More use cases of .loc[]
#lets create our own custom index and use it to access data

df = pd.read_csv('data.csv')
print(df)

# lets add labels to the index
l_indx = []
for i in range(1, len(df)+1):
    l_indx.append(f"Emp{i:03d}")
# print(l_indx)

# NOTE: or I can use list comprehension
# l_indx = [f"Emp{i:03d}" for i in range(1, len(df)+1)]

df.index = l_indx
df.index.name = "EmployeeID"

print(df)
print("###########################################")

# 1. Access a single row by label:
print(df.loc['Emp003'])
print("###########################################")

# 2. Access multiple rows by labels:
print(df.loc[['Emp003', 'Emp005', 'Emp009']])
print("###########################################")

# 3. Access specific rows and columns:
print(df.loc[['Emp003', 'Emp005'], ['Name', 'City']])

       Name  Age         City  Exp  Exp2   Salary passport  bonus  hours
0     Alice   25      Chicago    2     2  70000.0      a43   1000    100
1       Bob   30  Los Angeles    5     5  80000.0      a44   3000    150
2   Charlie   35      Chicago    7     7      NaN      a45      0    200
3     David   40      Houston   10    10  90000.0      a46   7000    100
4       Eva   22      Houston    1     1  48000.0      a47   2555    120
5     Frank   28          NaN    3     3  72000.0      a48      0    100
6     Grace   32  San Antonio    6     6  85000.0      a49      0    200
7     Helen   26    San Diego    2     2  62000.0      a50   9000    100
8     Helen   26    San Diego    2     2  62000.0      a51      0     80
9     Helen   26    San Diego    2     2  62000.0      a52   3000    100
10    Jerry   23      Phoenix    6     6  78000.0      a53      0     80
               Name  Age         City  Exp  Exp2   Salary passport  bonus  \
EmployeeID                                     

In [27]:
# .iat[]

df = pd.read_csv('data.csv')
print(df)
print("###########################################")

# 1. Get single value e.g. get col with idx=1 at row with idx=2
print(df.iat[2, 1])       
print("###########################################")

# 2. Change Eva’s bonus (row 4, col 7) to 3000:
df.iat[4, 7] = 3000
print(df)

       Name  Age         City  Exp  Exp2   Salary passport  bonus  hours
0     Alice   25      Chicago    2     2  70000.0      a43   1000    100
1       Bob   30  Los Angeles    5     5  80000.0      a44   3000    150
2   Charlie   35      Chicago    7     7      NaN      a45      0    200
3     David   40      Houston   10    10  90000.0      a46   7000    100
4       Eva   22      Houston    1     1  48000.0      a47   2555    120
5     Frank   28          NaN    3     3  72000.0      a48      0    100
6     Grace   32  San Antonio    6     6  85000.0      a49      0    200
7     Helen   26    San Diego    2     2  62000.0      a50   9000    100
8     Helen   26    San Diego    2     2  62000.0      a51      0     80
9     Helen   26    San Diego    2     2  62000.0      a52   3000    100
10    Jerry   23      Phoenix    6     6  78000.0      a53      0     80
35
       Name  Age         City  Exp  Exp2   Salary passport  bonus  hours
0     Alice   25      Chicago    2     2  70000.

In [28]:
# .at[]

df = pd.read_csv('data.csv')
print(df)
print("###########################################")

# 1. Get Salary for row with idx=0:
print(df.at[0, "Salary"])
print("###########################################")

# 2. Update bonus to 3000 of a certain row with index 4
df.at[4, "bonus"] = 3000
print(df)

       Name  Age         City  Exp  Exp2   Salary passport  bonus  hours
0     Alice   25      Chicago    2     2  70000.0      a43   1000    100
1       Bob   30  Los Angeles    5     5  80000.0      a44   3000    150
2   Charlie   35      Chicago    7     7      NaN      a45      0    200
3     David   40      Houston   10    10  90000.0      a46   7000    100
4       Eva   22      Houston    1     1  48000.0      a47   2555    120
5     Frank   28          NaN    3     3  72000.0      a48      0    100
6     Grace   32  San Antonio    6     6  85000.0      a49      0    200
7     Helen   26    San Diego    2     2  62000.0      a50   9000    100
8     Helen   26    San Diego    2     2  62000.0      a51      0     80
9     Helen   26    San Diego    2     2  62000.0      a52   3000    100
10    Jerry   23      Phoenix    6     6  78000.0      a53      0     80
70000.0
       Name  Age         City  Exp  Exp2   Salary passport  bonus  hours
0     Alice   25      Chicago    2     2  7

## iterate over a dataframe
- **itertuples() is faster and more memory-efficient.**
- **iterrows() is slower** and also converts each row into a Series (which is costly). **NOT USED HERE**

In [29]:
# iteration using itertuples(): 

# lets iterate over a dataframe and send email-message to people over the age > 30
# Show me all rows where age > 30
df = pd.read_csv('data.csv')
print(df)
print("###########################################")

# STEP1: print all rows
for row in df.itertuples():
    print(row)
print("###########################################")

# # STEP2: send email to people Age > 30:  
for row in df.itertuples():
    if row[2] > 30: # Age column is at idx=2
        print(f"EMAIL: You {row[1]} are qualified for discount. Call us")


       Name  Age         City  Exp  Exp2   Salary passport  bonus  hours
0     Alice   25      Chicago    2     2  70000.0      a43   1000    100
1       Bob   30  Los Angeles    5     5  80000.0      a44   3000    150
2   Charlie   35      Chicago    7     7      NaN      a45      0    200
3     David   40      Houston   10    10  90000.0      a46   7000    100
4       Eva   22      Houston    1     1  48000.0      a47   2555    120
5     Frank   28          NaN    3     3  72000.0      a48      0    100
6     Grace   32  San Antonio    6     6  85000.0      a49      0    200
7     Helen   26    San Diego    2     2  62000.0      a50   9000    100
8     Helen   26    San Diego    2     2  62000.0      a51      0     80
9     Helen   26    San Diego    2     2  62000.0      a52   3000    100
10    Jerry   23      Phoenix    6     6  78000.0      a53      0     80
Pandas(Index=0, Name='Alice', Age=25, City='Chicago', Exp=2, Exp2=2, Salary=70000.0, passport='a43', bonus=1000, hours=100)


## indexing

In [30]:
# Sometime we want to create our own index

df = pd.read_csv('data.csv')
print("Original DataFrame:")
print(df)
print("###########################################")

# 1. Set a column as the index: lets use column passport as index
df.set_index('passport', inplace=True)
print(f"\nSet 'Name' as index:\n{df}")
print("###########################################")

# 2. Reset index: lets change our index back to default
df_reset = df.reset_index()
print(f"\nReset index:\n{df_reset}")
print("###########################################")


# # 3. Lets add our own custom index

l_indx = []
for i in range(1, len(df)+1):
    l_indx.append(f"Emp{i:03d}")
# print(l_indx)

# or I can use list comprehension
# l_indx = [f"Emp{i:03d}" for i in range(1, len(df)+1)]

df.index = l_indx
print("\nManual index change:")
print(df)

Original DataFrame:
       Name  Age         City  Exp  Exp2   Salary passport  bonus  hours
0     Alice   25      Chicago    2     2  70000.0      a43   1000    100
1       Bob   30  Los Angeles    5     5  80000.0      a44   3000    150
2   Charlie   35      Chicago    7     7      NaN      a45      0    200
3     David   40      Houston   10    10  90000.0      a46   7000    100
4       Eva   22      Houston    1     1  48000.0      a47   2555    120
5     Frank   28          NaN    3     3  72000.0      a48      0    100
6     Grace   32  San Antonio    6     6  85000.0      a49      0    200
7     Helen   26    San Diego    2     2  62000.0      a50   9000    100
8     Helen   26    San Diego    2     2  62000.0      a51      0     80
9     Helen   26    San Diego    2     2  62000.0      a52   3000    100
10    Jerry   23      Phoenix    6     6  78000.0      a53      0     80
000000000000000000000

Set 'Name' as index:
             Name  Age         City  Exp  Exp2   Salary  bon

## sorting

In [31]:
# sorting: lets print df based on some sorted column: 

df = pd.read_csv('data.csv')
print(df)

# 1. Sort by a single column (Age) in ascending order
sorted_by_age = df.sort_values(by='Age', ascending=True)
print("Sorted by Age (ascending):\n", sorted_by_age)
print("###########################################")

# 2. Sort by multiple columns: first by City descending, then by Age ascending
sorted_multi = df.sort_values(by=['City', 'Age'], ascending=[False, True])
print("\nSorted by City (desc) and Age (asc):\n", sorted_multi)

       Name  Age         City  Exp  Exp2   Salary passport  bonus  hours
0     Alice   25      Chicago    2     2  70000.0      a43   1000    100
1       Bob   30  Los Angeles    5     5  80000.0      a44   3000    150
2   Charlie   35      Chicago    7     7      NaN      a45      0    200
3     David   40      Houston   10    10  90000.0      a46   7000    100
4       Eva   22      Houston    1     1  48000.0      a47   2555    120
5     Frank   28          NaN    3     3  72000.0      a48      0    100
6     Grace   32  San Antonio    6     6  85000.0      a49      0    200
7     Helen   26    San Diego    2     2  62000.0      a50   9000    100
8     Helen   26    San Diego    2     2  62000.0      a51      0     80
9     Helen   26    San Diego    2     2  62000.0      a52   3000    100
10    Jerry   23      Phoenix    6     6  78000.0      a53      0     80
Sorted by Age (ascending):
        Name  Age         City  Exp  Exp2   Salary passport  bonus  hours
4       Eva   22      H

# STOP

In [32]:
################# 6. (SKIP) Multi-indexing
df_reset = df.reset_index()
df_multi = df_reset.set_index(['City', 'Name'])
print("\nMulti-indexed DataFrame:")
print(df_multi)


Multi-indexed DataFrame:
                     index  Age  Exp  Exp2   Salary passport  bonus  hours
City        Name                                                          
Chicago     Alice        0   25    2     2  70000.0      a43   1000    100
Los Angeles Bob          1   30    5     5  80000.0      a44   3000    150
Chicago     Charlie      2   35    7     7      NaN      a45      0    200
Houston     David        3   40   10    10  90000.0      a46   7000    100
            Eva          4   22    1     1  48000.0      a47   2555    120
NaN         Frank        5   28    3     3  72000.0      a48      0    100
San Antonio Grace        6   32    6     6  85000.0      a49      0    200
San Diego   Helen        7   26    2     2  62000.0      a50   9000    100
            Helen        8   26    2     2  62000.0      a51      0     80
            Helen        9   26    2     2  62000.0      a52   3000    100
Phoenix     Jerry       10   23    6     6  78000.0      a53      0     80