In [6]:
import pandas as pd

import warnings
# Silence every warning for the rest of the session so the printed tables stay uncluttered.
# NOTE: this blanket filter hides all warning categories, not just noisy ones.
warnings.filterwarnings("ignore")

# print(pd.__version__)
# Relative path of the CSV used by the examples; the file must exist in the working directory.
data_file = "data.csv"

## Slicing
- Accessing a subset of a DataFrame's rows
- Same `start:stop:step` syntax as slicing a list

In [7]:
# Load the sample data and show the whole table for reference.
df = pd.read_csv(data_file)
print(df)

# Rows at positions 2 through 8 (the stop value 9 is excluded, exactly as with lists).
print(f"\nSlicing rows [2:9]:\n {df[2:9]}")

# Same range, but only every 3rd row: positions 2, 5 and 8.
print(f"\nSlicing rows [2:9:3]:\n {df[2:9:3]}")

# Every 3rd row of the whole frame: positions 0, 3, 6, 9 for the 11-row sample.
print(f"\nEvery 3rd row:\n {df[::3]}")

# A negative step walks backwards, so the rows come out from last to first.
print(f"\nReversed DataFrame:\n {df[::-1]}")

# # (SKIP) Example: a step larger than the sliced range does not raise an error
# print("\nOversized step [1:3:20] just returns the matching row(s):\n", df[1:3:20])  # Row 1 only (the step of 20 jumps past the end of the 1..2 range)

       Name  Age         City  Exp  Exp2   Salary passport  bonus  hours
0     Alice   25      Chicago    2     2  70000.0      a43   1000    100
1       Bob   30  Los Angeles    5     5  80000.0      a44   3000    150
2   Charlie   35      Chicago    7     7      NaN      a45      0    200
3     David   40      Houston   10    10  90000.0      a46   7000    100
4       Eva   22      Houston    1     1  48000.0      a47   2555    120
5     Frank   28          NaN    3     3  72000.0      a48      0    100
6     Grace   32  San Antonio    6     6  85000.0      a49      0    200
7     Helen   26    San Diego    2     2  62000.0      a50   9000    100
8     Helen   26    San Diego    2     2  62000.0      a51      0     80
9     Helen   26    San Diego    2     2  62000.0      a52   3000    100
10    Jerry   23      Phoenix    6     6  78000.0      a53      0     80

Slicing rows [2:9]:
       Name  Age         City  Exp  Exp2   Salary passport  bonus  hours
2  Charlie   35      Chicago  

## iloc, loc, iat, at
- .iloc[] – Integer position-based selection
- .loc[]  - Label-based selection: Accesses rows and columns by labels (i.e., names). **Not much used**
- .iat[]  - Fast integer-location based scalar accessor
- .at[]   - Fast label-based scalar accessor

In [8]:
# .iloc[] selects by integer position.
# .loc[]  selects by label (i.e., names). Not much used.
# .at[] / .iat[] are the single-value counterparts of .loc[] / .iloc[].

df = pd.read_csv('data.csv')
print(df)
# ###########################################
# 1. One whole row by position (position 1, i.e. the second row):
second_row = df.iloc[1]
print(second_row)

# 2. One cell: row position 1, column position 2
print(df.iloc[1, 2]) # Los Angeles

# 3. A block of rows and columns: first 3 rows, first 2 columns
print(df.iloc[:3, :2])

# 4. Another block: row positions 3 to 6, column positions 2 to 4
print(df.iloc[3:7, 2:5])


# 5. Overwrite a value: Alice's age goes from 25 to 26
df.iloc[0, 1] = 26
print(df)
##############################################

# 6. Single value by label (faster than df.loc[2, "Age"]): Age of the row labelled 2
print(df.at[2, "Age"])

# 7. Single value by position (faster than df.iloc[2, 1]): column position 1 at row position 2
print(df.iat[2, 1])


       Name  Age         City  Exp  Exp2   Salary passport  bonus  hours
0     Alice   25      Chicago    2     2  70000.0      a43   1000    100
1       Bob   30  Los Angeles    5     5  80000.0      a44   3000    150
2   Charlie   35      Chicago    7     7      NaN      a45      0    200
3     David   40      Houston   10    10  90000.0      a46   7000    100
4       Eva   22      Houston    1     1  48000.0      a47   2555    120
5     Frank   28          NaN    3     3  72000.0      a48      0    100
6     Grace   32  San Antonio    6     6  85000.0      a49      0    200
7     Helen   26    San Diego    2     2  62000.0      a50   9000    100
8     Helen   26    San Diego    2     2  62000.0      a51      0     80
9     Helen   26    San Diego    2     2  62000.0      a52   3000    100
10    Jerry   23      Phoenix    6     6  78000.0      a53      0     80
Name                Bob
Age                  30
City        Los Angeles
Exp                   5
Exp2                  5
Sala

In [9]:
# SKIP ############################################
df = pd.read_csv('data.csv')
print(df)

# Replace the default row numbers with string labels Emp001, Emp002, ...
# and give the index the name "EmployeeID".
employee_ids = [f"Emp{number:03d}" for number in range(1, len(df) + 1)]
df.index = pd.Index(employee_ids, name="EmployeeID")

print(df)

# 1. One row, selected by its label:
print(df.loc['Emp003'])

# 2. Several rows: pass a list of labels
wanted_rows = ['Emp003', 'Emp005', 'Emp009']
print(df.loc[wanted_rows])

# 3. Specific rows and specific columns together:
print(df.loc[['Emp003', 'Emp005'], ['Name', 'City']])

       Name  Age         City  Exp  Exp2   Salary passport  bonus  hours
0     Alice   25      Chicago    2     2  70000.0      a43   1000    100
1       Bob   30  Los Angeles    5     5  80000.0      a44   3000    150
2   Charlie   35      Chicago    7     7      NaN      a45      0    200
3     David   40      Houston   10    10  90000.0      a46   7000    100
4       Eva   22      Houston    1     1  48000.0      a47   2555    120
5     Frank   28          NaN    3     3  72000.0      a48      0    100
6     Grace   32  San Antonio    6     6  85000.0      a49      0    200
7     Helen   26    San Diego    2     2  62000.0      a50   9000    100
8     Helen   26    San Diego    2     2  62000.0      a51      0     80
9     Helen   26    San Diego    2     2  62000.0      a52   3000    100
10    Jerry   23      Phoenix    6     6  78000.0      a53      0     80
               Name  Age         City  Exp  Exp2   Salary passport  bonus  \
EmployeeID                                     

In [10]:
# Iterate over a DataFrame and "send" an email message to everyone whose Age is above 30
df = pd.read_csv('data.csv')
print(df)

# STEP1: walk the frame row by row; iterrows() yields (index label, row as a Series) pairs
for _, record in df.iterrows():
    print(record)

# STEP2: keep only the rows with Age > 30, then print the email text for each of them
over_30 = df[df['Age'] > 30]
for _, person in over_30.iterrows():
    print(f"EMAIL: You {person['Name']} are qualified for discount. Call us")


       Name  Age         City  Exp  Exp2   Salary passport  bonus  hours
0     Alice   25      Chicago    2     2  70000.0      a43   1000    100
1       Bob   30  Los Angeles    5     5  80000.0      a44   3000    150
2   Charlie   35      Chicago    7     7      NaN      a45      0    200
3     David   40      Houston   10    10  90000.0      a46   7000    100
4       Eva   22      Houston    1     1  48000.0      a47   2555    120
5     Frank   28          NaN    3     3  72000.0      a48      0    100
6     Grace   32  San Antonio    6     6  85000.0      a49      0    200
7     Helen   26    San Diego    2     2  62000.0      a50   9000    100
8     Helen   26    San Diego    2     2  62000.0      a51      0     80
9     Helen   26    San Diego    2     2  62000.0      a52   3000    100
10    Jerry   23      Phoenix    6     6  78000.0      a53      0     80
Name          Alice
Age              25
City        Chicago
Exp               2
Exp2              2
Salary      70000.0
pass

## Access columns

In [11]:
# Two equivalent ways to pull a single column out of a DataFrame
data_file = 'data.csv'
df = pd.read_csv(data_file)

# style1: bracket lookup by column name
age = df['Age']
print(age, type(age), sep='\n') # type line shows <class 'pandas.core.series.Series'>

# style2: attribute access using the column name
age = df.Age
print(age, type(age), sep='\n') # type line shows <class 'pandas.core.series.Series'>

0     25
1     30
2     35
3     40
4     22
5     28
6     32
7     26
8     26
9     26
10    23
Name: Age, dtype: int64
<class 'pandas.core.series.Series'>
0     25
1     30
2     35
3     40
4     22
5     28
6     32
7     26
8     26
9     26
10    23
Name: Age, dtype: int64
<class 'pandas.core.series.Series'>


In [12]:
# List the column names as a plain Python list
names_of_col = list(df.columns)
print(names_of_col)

############################

# Access multiple columns at once by passing a list of names; the result is a DataFrame
wanted_cols = ['Name', 'City']
df_temp = df[wanted_cols]
print(df_temp)


['Name', 'Age', 'City', 'Exp', 'Exp2', 'Salary', 'passport', 'bonus', 'hours']
       Name         City
0     Alice      Chicago
1       Bob  Los Angeles
2   Charlie      Chicago
3     David      Houston
4       Eva      Houston
5     Frank          NaN
6     Grace  San Antonio
7     Helen    San Diego
8     Helen    San Diego
9     Helen    San Diego
10    Jerry      Phoenix


## Add rows / columns; remove rows / columns

In [13]:
# Now let's add rows / columns and remove rows / columns
df = pd.read_csv('data.csv')
print(df)

print("####################")
# Add a row: the new record needs one value per column (9 columns in this file).
# INCORRECT, because it supplies only 7 values and the lengths do not match:
# df.loc[len(df)] = ['Kim', 29, 'Tokyo', 4, 4, 67000.0, 'm33']
new_row = ['Kim', 29, 'Tokyo', 4, 4, 67000.0, 'm33', 0, 90]
next_label = len(df)  # with the default 0..n-1 index this is the first unused label
df.loc[next_label] = new_row
print(df)

       Name  Age         City  Exp  Exp2   Salary passport  bonus  hours
0     Alice   25      Chicago    2     2  70000.0      a43   1000    100
1       Bob   30  Los Angeles    5     5  80000.0      a44   3000    150
2   Charlie   35      Chicago    7     7      NaN      a45      0    200
3     David   40      Houston   10    10  90000.0      a46   7000    100
4       Eva   22      Houston    1     1  48000.0      a47   2555    120
5     Frank   28          NaN    3     3  72000.0      a48      0    100
6     Grace   32  San Antonio    6     6  85000.0      a49      0    200
7     Helen   26    San Diego    2     2  62000.0      a50   9000    100
8     Helen   26    San Diego    2     2  62000.0      a51      0     80
9     Helen   26    San Diego    2     2  62000.0      a52   3000    100
10    Jerry   23      Phoenix    6     6  78000.0      a53      0     80
####################
       Name  Age         City  Exp  Exp2   Salary passport  bonus  hours
0     Alice   25      Chicago 

In [14]:
# Drop a row by its index label (here the row labelled 11); drop() returns a new DataFrame
df = df.drop(index=[11])
print(df)

       Name  Age         City  Exp  Exp2   Salary passport  bonus  hours
0     Alice   25      Chicago    2     2  70000.0      a43   1000    100
1       Bob   30  Los Angeles    5     5  80000.0      a44   3000    150
2   Charlie   35      Chicago    7     7      NaN      a45      0    200
3     David   40      Houston   10    10  90000.0      a46   7000    100
4       Eva   22      Houston    1     1  48000.0      a47   2555    120
5     Frank   28          NaN    3     3  72000.0      a48      0    100
6     Grace   32  San Antonio    6     6  85000.0      a49      0    200
7     Helen   26    San Diego    2     2  62000.0      a50   9000    100
8     Helen   26    San Diego    2     2  62000.0      a51      0     80
9     Helen   26    San Diego    2     2  62000.0      a52   3000    100
10    Jerry   23      Phoenix    6     6  78000.0      a53      0     80


In [15]:
# Create a new column 'company'; assigning a single value fills every row with it
company_name = "Lucent Shamlodhiya Technologies "
df['company'] = company_name
print(f"\nAfter :\n {df}")


After :
        Name  Age         City  Exp  Exp2   Salary passport  bonus  hours  \
0     Alice   25      Chicago    2     2  70000.0      a43   1000    100   
1       Bob   30  Los Angeles    5     5  80000.0      a44   3000    150   
2   Charlie   35      Chicago    7     7      NaN      a45      0    200   
3     David   40      Houston   10    10  90000.0      a46   7000    100   
4       Eva   22      Houston    1     1  48000.0      a47   2555    120   
5     Frank   28          NaN    3     3  72000.0      a48      0    100   
6     Grace   32  San Antonio    6     6  85000.0      a49      0    200   
7     Helen   26    San Diego    2     2  62000.0      a50   9000    100   
8     Helen   26    San Diego    2     2  62000.0      a51      0     80   
9     Helen   26    San Diego    2     2  62000.0      a52   3000    100   
10    Jerry   23      Phoenix    6     6  78000.0      a53      0     80   

                             company  
0   Lucent Shamlodhiya Technologies   

In [16]:
# Drop the 2 columns that we do not need.
# drop() returns a new DataFrame, so rebind df to the result instead of mutating
# the existing object with inplace=True.
df = df.drop(columns=['company', 'Exp2'])
print("\nAfter dropping a column:\n",df)


After dropping a column:
        Name  Age         City  Exp   Salary passport  bonus  hours
0     Alice   25      Chicago    2  70000.0      a43   1000    100
1       Bob   30  Los Angeles    5  80000.0      a44   3000    150
2   Charlie   35      Chicago    7      NaN      a45      0    200
3     David   40      Houston   10  90000.0      a46   7000    100
4       Eva   22      Houston    1  48000.0      a47   2555    120
5     Frank   28          NaN    3  72000.0      a48      0    100
6     Grace   32  San Antonio    6  85000.0      a49      0    200
7     Helen   26    San Diego    2  62000.0      a50   9000    100
8     Helen   26    San Diego    2  62000.0      a51      0     80
9     Helen   26    San Diego    2  62000.0      a52   3000    100
10    Jerry   23      Phoenix    6  78000.0      a53      0     80


In [17]:
# Replace selected values of the City column with short codes; other values are left as they are
city_codes = {'Los Angeles': 'LA', 'San Diego': 'SD'}
df['City'] = df['City'].replace(city_codes)
print(df)

       Name  Age         City  Exp   Salary passport  bonus  hours
0     Alice   25      Chicago    2  70000.0      a43   1000    100
1       Bob   30           LA    5  80000.0      a44   3000    150
2   Charlie   35      Chicago    7      NaN      a45      0    200
3     David   40      Houston   10  90000.0      a46   7000    100
4       Eva   22      Houston    1  48000.0      a47   2555    120
5     Frank   28          NaN    3  72000.0      a48      0    100
6     Grace   32  San Antonio    6  85000.0      a49      0    200
7     Helen   26           SD    2  62000.0      a50   9000    100
8     Helen   26           SD    2  62000.0      a51      0     80
9     Helen   26           SD    2  62000.0      a52   3000    100
10    Jerry   23      Phoenix    6  78000.0      a53      0     80


In [18]:
# New column built from two existing ones: element-wise Salary + bonus
# (a missing Salary gives a missing total, as the NaN row in the output shows)
df['total_salary'] = df['Salary'].add(df['bonus'])
print(df)

       Name  Age         City  Exp   Salary passport  bonus  hours  \
0     Alice   25      Chicago    2  70000.0      a43   1000    100   
1       Bob   30           LA    5  80000.0      a44   3000    150   
2   Charlie   35      Chicago    7      NaN      a45      0    200   
3     David   40      Houston   10  90000.0      a46   7000    100   
4       Eva   22      Houston    1  48000.0      a47   2555    120   
5     Frank   28          NaN    3  72000.0      a48      0    100   
6     Grace   32  San Antonio    6  85000.0      a49      0    200   
7     Helen   26           SD    2  62000.0      a50   9000    100   
8     Helen   26           SD    2  62000.0      a51      0     80   
9     Helen   26           SD    2  62000.0      a52   3000    100   
10    Jerry   23      Phoenix    6  78000.0      a53      0     80   

    total_salary  
0        71000.0  
1        83000.0  
2            NaN  
3        97000.0  
4        50555.0  
5        72000.0  
6        85000.0  
7      

In [19]:
# Column with a 10% tax on each person's total salary
df['tax'] = df['total_salary'].mul(0.1)
print(df)

########################
# Column with total_salary plus a flat festival bonus of 1000 for each person
df['total_salary_bonus'] = df['total_salary'].add(1000)
print(df)

#######################
# Column with salary per hour for each person: total_salary divided by hours, row by row
df['sal_per_hr'] = df['total_salary'].div(df['hours'])
print(df)

       Name  Age         City  Exp   Salary passport  bonus  hours  \
0     Alice   25      Chicago    2  70000.0      a43   1000    100   
1       Bob   30           LA    5  80000.0      a44   3000    150   
2   Charlie   35      Chicago    7      NaN      a45      0    200   
3     David   40      Houston   10  90000.0      a46   7000    100   
4       Eva   22      Houston    1  48000.0      a47   2555    120   
5     Frank   28          NaN    3  72000.0      a48      0    100   
6     Grace   32  San Antonio    6  85000.0      a49      0    200   
7     Helen   26           SD    2  62000.0      a50   9000    100   
8     Helen   26           SD    2  62000.0      a51      0     80   
9     Helen   26           SD    2  62000.0      a52   3000    100   
10    Jerry   23      Phoenix    6  78000.0      a53      0     80   

    total_salary     tax  
0        71000.0  7100.0  
1        83000.0  8300.0  
2            NaN     NaN  
3        97000.0  9700.0  
4        50555.0  5055.5

## sorting

In [20]:
# Print the DataFrame ordered by one or more columns

df = pd.read_csv('data.csv')
print(df)

# 1. Single sort key: Age, smallest first (ascending order is the default)
sorted_by_age = df.sort_values('Age')
print(f"Sorted by Age (ascending):\n {sorted_by_age}")

print("##############################")
# SKIP 2. Two sort keys: City in descending order first, then Age ascending within each City
sort_keys = ['City', 'Age']
sorted_multi = df.sort_values(by=sort_keys, ascending=[False, True])
print(f"\nSorted by City (desc) and Age (asc):\n {sorted_multi}")

       Name  Age         City  Exp  Exp2   Salary passport  bonus  hours
0     Alice   25      Chicago    2     2  70000.0      a43   1000    100
1       Bob   30  Los Angeles    5     5  80000.0      a44   3000    150
2   Charlie   35      Chicago    7     7      NaN      a45      0    200
3     David   40      Houston   10    10  90000.0      a46   7000    100
4       Eva   22      Houston    1     1  48000.0      a47   2555    120
5     Frank   28          NaN    3     3  72000.0      a48      0    100
6     Grace   32  San Antonio    6     6  85000.0      a49      0    200
7     Helen   26    San Diego    2     2  62000.0      a50   9000    100
8     Helen   26    San Diego    2     2  62000.0      a51      0     80
9     Helen   26    San Diego    2     2  62000.0      a52   3000    100
10    Jerry   23      Phoenix    6     6  78000.0      a53      0     80
Sorted by Age (ascending):
        Name  Age         City  Exp  Exp2   Salary passport  bonus  hours
4       Eva   22      H

## indexing

In [21]:
# Sometimes we want to use our own index instead of the default row numbers

df = pd.read_csv('data.csv')
print("Original DataFrame:")
print(df)
print("000000000000000000000")

# 1. Set a column as the index: use the 'passport' column as the row labels.
#    set_index() returns a new DataFrame, so rebind df rather than using inplace=True.
df = df.set_index('passport')
# The label below names the column that was actually used ('passport').
print(f"\nSet 'passport' as index:\n{df}")
print("11111111111111111111")

# 2. Reset index: go back to the default integer index.
#    reset_index() moves the current index ('passport') back into a regular column;
#    the result is stored separately, so df itself keeps the passport index.
df_reset = df.reset_index()
print(f"\nReset index:\n{df_reset}")
print("22222222222222222222222")


# # 3. Let's add our own custom index: Emp001, Emp002, ... (one label per row)

l_indx = []
for i in range(1, len(df)+1):
    l_indx.append(f"Emp{i:03d}")
# print(l_indx)

# or I can use list comprehension
# l_indx = [f"Emp{i:03d}" for i in range(1, len(df)+1)]

# Assigning to df.index replaces the passport labels with the custom ones
df.index = l_indx
print("\nManual index change:")
print(df)

Original DataFrame:
       Name  Age         City  Exp  Exp2   Salary passport  bonus  hours
0     Alice   25      Chicago    2     2  70000.0      a43   1000    100
1       Bob   30  Los Angeles    5     5  80000.0      a44   3000    150
2   Charlie   35      Chicago    7     7      NaN      a45      0    200
3     David   40      Houston   10    10  90000.0      a46   7000    100
4       Eva   22      Houston    1     1  48000.0      a47   2555    120
5     Frank   28          NaN    3     3  72000.0      a48      0    100
6     Grace   32  San Antonio    6     6  85000.0      a49      0    200
7     Helen   26    San Diego    2     2  62000.0      a50   9000    100
8     Helen   26    San Diego    2     2  62000.0      a51      0     80
9     Helen   26    San Diego    2     2  62000.0      a52   3000    100
10    Jerry   23      Phoenix    6     6  78000.0      a53      0     80
000000000000000000000

Set 'Name' as index:
             Name  Age         City  Exp  Exp2   Salary  bon

# STOP

In [22]:
################# 6. (SKIP) Multi-indexing
# Move the current index back into a regular column, then use City + Name as a two-level index
df_reset = df.reset_index()
index_levels = ['City', 'Name']
df_multi = df_reset.set_index(index_levels)
print("\nMulti-indexed DataFrame:", df_multi, sep="\n")


Multi-indexed DataFrame:
                      index  Age  Exp  Exp2   Salary  bonus  hours
City        Name                                                  
Chicago     Alice    Emp001   25    2     2  70000.0   1000    100
Los Angeles Bob      Emp002   30    5     5  80000.0   3000    150
Chicago     Charlie  Emp003   35    7     7      NaN      0    200
Houston     David    Emp004   40   10    10  90000.0   7000    100
            Eva      Emp005   22    1     1  48000.0   2555    120
NaN         Frank    Emp006   28    3     3  72000.0      0    100
San Antonio Grace    Emp007   32    6     6  85000.0      0    200
San Diego   Helen    Emp008   26    2     2  62000.0   9000    100
            Helen    Emp009   26    2     2  62000.0      0     80
            Helen    Emp010   26    2     2  62000.0   3000    100
Phoenix     Jerry    Emp011   23    6     6  78000.0      0     80
