## pandas
- Used for EDA
- used for data cleaning, etc
- importing/exporting data, creating/deleting columns,

#### Topics:
- Series: difference between pandas **Series** and pandas **Dataframe**, access series,
- Series: statistical operation, element-wise function, boolean function, mapping/transformation, missing value, arithmatics function,etc
- Dataframe: data collection->read from CSV XLSX, JSON,pickle, web
- Dataframe: access rows / columns ( loc, iloc ) , slicing, 
- Dataframe: add columns, drop column, add row, remove row, replacing values in a column
- Dataframe: indexng

In [69]:
import pandas as pd
import numpy as np

print(pd.__version__)
data_file = 'data.csv'

2.2.3


# Series
## Difference between pandas Series and pandas Dataframe
```
Feature         Series              DataFrame
Dimensions        1D                  2D
Shape             (n,)             (rows, columns)
Data Structure    Single column   Table with multiple columns
Index             Yes               Yes (rows & columns)
Usage         Single column or row   Full dataset
```

In [70]:
# Series declaration
l = [20, 30, 40] # age
s = pd.Series(l) # From a List
print(type(s)) # <class 'pandas.core.series.Series'>
print(s)

print("#####################")
s = pd.Series(l, index=['ram', 'zakir', 'morales']) # From a List with Custom Index
print(s)

print("$$$$$$$$$$$$$$$$$$$$$$$$$")
data = {'ram': 20, 'zakir': 30, 'morales': 40} #  From a Dictionary
s = pd.Series(data)
print(s)
print("%%%%%%%%%%%%%%%%%%%%%%%%%%%%%")
# s = pd.Series(np.random.randint(10,20,5))
# print(s)


<class 'pandas.core.series.Series'>
0    20
1    30
2    40
dtype: int64
#####################
ram        20
zakir      30
morales    40
dtype: int64
$$$$$$$$$$$$$$$$$$$$$$$$$
ram        20
zakir      30
morales    40
dtype: int64
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%


In [71]:
# access series
l = [25, 20, 35, 24, 50] # age

s = pd.Series(l)
print(s[0])
print(s[:3])
print(s[2:])
print(s[1:3])

25
0    25
1    20
2    35
dtype: int64
2    35
3    24
4    50
dtype: int64
1    20
2    35
dtype: int64


### (OPTIONAL) operation on series

In [72]:
# 1. Arithmetic Operations
s = pd.Series([10, 20, 30])

print(s + 5)       # Add 5 to each element
print(s * 2)       # Multiply each element by 2
print(s / 10)      # Divide each element by 10

# 2. Statistical Operations
s = pd.Series([10, 20, 30, 40])

print(s.mean())      # Average
print(s.median())    # Median
print(s.std())       # Standard deviation
print(s.max())       # Maximum
print(s.min())       # Minimum
print(s.sum())       # Sum of all elements
print(s.cumsum())
print(s.cumprod())
print(s.describe())
print(s.quantile(0.25))  # 25th percentile

print(s.sem())  # Standard error mean
print(s.nunique())  # Unique count

print(s.value_counts())  # Value counts (frequency of each unique value)

print(s.idxmin())  # Index of first min value
print(s.idxmax())  # Index of first max value



# 3. Element-wise Functions
s = pd.Series([1, 2, 3, 4])
print(np.sqrt(s))      # Square root
print(np.exp(s))       # Exponential
print(np.log(s))       # Logarithm

# 4. Boolean Filtering
s = pd.Series([10, 20, 30, 40])
print(s>25)
print(s[s > 25])       # Filter elements greater than 25

# 5. Value Counts & Uniqueness
s = pd.Series(['apple', 'banana', 'apple', 'orange'])
print(s.value_counts())   # Frequency of unique values
print(s.unique())         # Unique values

# 6. Mapping / Transformation
s = pd.Series([1, 2, 3])
print(s.map(lambda x: x * 10))  # Apply a function to each element

# 7. String Operations (for string Series)
s = pd.Series(['hello', 'my friend'])
print(s.str.upper())     # Convert to uppercase
print(s.str.len())       # Length of each string

# 8. Handling Missing Data
s = pd.Series([1, 2, None, 4])
print(s.isnull())        # Check for NaNs
print(s.fillna(0))       # Replace NaNs with 0
print(s.dropna())        # Drop NaNs

# 9. Sorting
s = pd.Series([10, 2, 30])
print(s.sort_values())   # Sort by value
print(s.sort_index())    # Sort by index

# 10. Combine / Arithmetic Between Series
s1 = pd.Series([1, 2, 3], index=['a', 'b', 'c'])
s2 = pd.Series([4, 5, 6], index=['a', 'b', 'c'])
print(s1 + s2)   # Aligns by index and adds

0    15
1    25
2    35
dtype: int64
0    20
1    40
2    60
dtype: int64
0    1.0
1    2.0
2    3.0
dtype: float64
25.0
25.0
12.909944487358056
40
10
100
0     10
1     30
2     60
3    100
dtype: int64
0        10
1       200
2      6000
3    240000
dtype: int64
count     4.000000
mean     25.000000
std      12.909944
min      10.000000
25%      17.500000
50%      25.000000
75%      32.500000
max      40.000000
dtype: float64
17.5
6.454972243679028
4
10    1
20    1
30    1
40    1
Name: count, dtype: int64
0
3
0    1.000000
1    1.414214
2    1.732051
3    2.000000
dtype: float64
0     2.718282
1     7.389056
2    20.085537
3    54.598150
dtype: float64
0    0.000000
1    0.693147
2    1.098612
3    1.386294
dtype: float64
0    False
1    False
2     True
3     True
dtype: bool
2    30
3    40
dtype: int64
apple     2
banana    1
orange    1
Name: count, dtype: int64
['apple' 'banana' 'orange']
0    10
1    20
2    30
dtype: int64
0        HELLO
1    MY FRIEND
dtype: object
0    5
1

## Dataframe

In [73]:
# Create a DataFrame: Example1: hardcode the data
data = {
    'name': ['ash', 'timmy', 'jimmy', 'Samantha'],
    'age': [30, 33, 61, 19],
    'is_working': [True, False, True, True],
    'income': [20.4, 20.1, 34.7, 55.9] 
}
df = pd.DataFrame(data)

print(df)

       name  age  is_working  income
0       ash   30        True    20.4
1     timmy   33       False    20.1
2     jimmy   61        True    34.7
3  Samantha   19        True    55.9


In [74]:
# (SKIP) Create a DataFrame: Example2 hardcode the data using series
data = {
    'A': pd.Series(pd.date_range("2023-01-01", periods=5, freq='D')),  # Time data
    'B': pd.Series([120.5, 123.0, 121.3, 125.6, 124.2]),
    'C': pd.Series(['Buy', 'Sell', 'Hold', 'Buy', 'Sell']),
    'D': pd.Series(np.random.randint(1, 11, size=5))
}
df = pd.DataFrame(data)

print(df)

           A      B     C   D
0 2023-01-01  120.5   Buy   4
1 2023-01-02  123.0  Sell   1
2 2023-01-03  121.3  Hold  10
3 2023-01-04  125.6   Buy   5
4 2023-01-05  124.2  Sell   4


In [75]:
# (SKIP) Create a DataFrame: Example 3 hardcode the data
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eva', 'Frank', 'Grace', 'Helen', 'Helen', 'Helen', 'Jerry'],
    'Age': [25, 30, 35, 40, 22, 28, 32, 26, 26, 26, 23],
    'City': ['Chicago', 'Los Angeles', 'Chicago', 'Houston', 'Houston', np.nan, 'San Antonio', 'San Diego', 'San Diego', 'San Diego', 'Phoenix'],
    'Experience':  [2, 5, 7, 10, 1, 3, 6, 2, 2, 2, 6],
    'Experience2': [2, 5, 7, 10, 1, 3, 6, 2, 2, 2, 6],
    'Salary': [70000.0, 80000.0, np.nan, 90000.0, 48000.0, 72000.0, 85000.0, 62000.0, 62000.0, 62000.0, 78000.0]
}

df = pd.DataFrame(data)
print(df)

print("####################")
# Lets look at each column series
series_age = df['Age']
print(type(series_age)) # <class 'pandas.core.series.Series'>
print(series_age)

       Name  Age         City  Experience  Experience2   Salary
0     Alice   25      Chicago           2            2  70000.0
1       Bob   30  Los Angeles           5            5  80000.0
2   Charlie   35      Chicago           7            7      NaN
3     David   40      Houston          10           10  90000.0
4       Eva   22      Houston           1            1  48000.0
5     Frank   28          NaN           3            3  72000.0
6     Grace   32  San Antonio           6            6  85000.0
7     Helen   26    San Diego           2            2  62000.0
8     Helen   26    San Diego           2            2  62000.0
9     Helen   26    San Diego           2            2  62000.0
10    Jerry   23      Phoenix           6            6  78000.0
####################
<class 'pandas.core.series.Series'>
0     25
1     30
2     35
3     40
4     22
5     28
6     32
7     26
8     26
9     26
10    23
Name: Age, dtype: int64


In [76]:
# (SKIP) You can also generate DF
row_labels = ['bob', 'sam', 'hari']
column_headings = ['age', 'income']
data = np.random.randint(10, 100, size=(len(row_labels), len(column_headings)))

df = pd.DataFrame(data, index=row_labels, columns=column_headings)
print(f"Generated DataFrame:\n{df}")

Generated DataFrame:
      age  income
bob    17      89
sam    70      36
hari   75      36


### export / import data from file 

In [77]:
# Create a DataFrame: Example1: hardcode the data
data = {
    'name': ['ash', 'timmy', 'jimmy', 'Samantha'],
    'age': [30, 33, 61, 19],
    'is_working': [True, False, True, True],
    'income': [20.4, 20.1, 34.7, 55.9] 
}
df = pd.DataFrame(data)
print(df)


# # different ways to export data to file
df.to_csv('junk.csv', index=False) # default sep=','
# df.to_csv('junk.csv', sep=';', index=False)
df.to_excel('junk.xlsx', index=False, sheet_name='Sheet1')
df.to_json('junk.json', orient='records', lines=True)
df.to_pickle('junk.pkl')

#Skip following
# df.to_sql('my_table', conn, if_exists='replace', index=False)  # Requires SQLAlchemy or sqlite3


       name  age  is_working  income
0       ash   30        True    20.4
1     timmy   33       False    20.1
2     jimmy   61        True    34.7
3  Samantha   19        True    55.9


In [78]:
# Or import from CSV with comma seperator
# from pathlib import Path
# data_file = Path.cwd() / 'data.csv'

df = pd.read_csv(data_file) # default sep=','

print("\nDataFrame1:\n", df)
# print(f"\nDataFrame1:\n{}")


DataFrame1:
        Name  Age         City  Experience  Experience2   Salary passport  \
0     Alice   25      Chicago           2            2  70000.0      a43   
1       Bob   30  Los Angeles           5            5  80000.0      a44   
2   Charlie   35      Chicago           7            7      NaN      a45   
3     David   40      Houston          10           10  90000.0      a46   
4       Eva   22      Houston           1            1  48000.0      a47   
5     Frank   28          NaN           3            3  72000.0      a48   
6     Grace   32  San Antonio           6            6  85000.0      a49   
7     Helen   26    San Diego           2            2  62000.0      a50   
8     Helen   26    San Diego           2            2  62000.0      a51   
9     Helen   26    San Diego           2            2  62000.0      a52   
10    Jerry   23      Phoenix           6            6  78000.0      a53   

    bonus  hours  
0    1000    100  
1    3000    150  
2       0    200

In [79]:
# different ways to import

# df = pd.read_csv('semi-colon.txt', sep=';') # file has , in the data
# df = pd.read_csv("semi-colon-california-house.csv", sep=';' 
# print(df)


df = pd.read_csv('https://raw.githubusercontent.com/ash322ash422/tut_pandas_numpy/refs/heads/master/titanic.csv', sep=',')
print(df.head(5))


# df = pd.read_excel('data.xlsx', sheet_name='Sheet1')  # Requires openpyxl or xlrd
# print(df.head(5))


# df = pd.read_excel('data.xlsx', names = ['a', 'b', 'c', 'd', 'e', 'f'], skiprows=[1], sheet_name='Sheet1')  # Requires openpyxl or xlrd
# print(df.head(5))



# df = pd.read_json('data.json',  lines=True)
# print(df.head(5))

# df = pd.read_pickle('data.pkl')
# print(df.head(5))

#NOTE: LEGALITY
# url = 'https://en.wikipedia.org/wiki/List_of_countries_and_dependencies_by_population'
# tables = pd.read_html(url)  # returns a list of DataFrames
# print(f"Total tables: {len(tables)}")
# df = tables[2]
# print(df.head(5))


# url = 'https://en.wikipedia.org/wiki/Minnesota'
# tables = pd.read_html(url)  # returns a list of DataFrames
# print(f"Total tables: {len(tables)}")
# df = tables[2]
# print(df.head(5))


### Following skip for now############
# import sqlite3
# conn = sqlite3.connect('my_database.db')
# df = pd.read_sql('SELECT * FROM my_table', conn)



URLError: <urlopen error [WinError 10065] A socket operation was attempted to an unreachable host>

## Access columns

In [80]:
df = pd.read_csv(data_file)

# style1
age = df['Age']
print(type(age)) # <class 'pandas.core.series.Series'>
print(age)

# style2
age = df.Age
print(type(age)) # <class 'pandas.core.series.Series'>
print(age)

<class 'pandas.core.series.Series'>
0     25
1     30
2     35
3     40
4     22
5     28
6     32
7     26
8     26
9     26
10    23
Name: Age, dtype: int64
<class 'pandas.core.series.Series'>
0     25
1     30
2     35
3     40
4     22
5     28
6     32
7     26
8     26
9     26
10    23
Name: Age, dtype: int64


In [81]:
# access mutiple columns
df_temp = df[['Name', 'City']]
print(df_temp)

       Name         City
0     Alice      Chicago
1       Bob  Los Angeles
2   Charlie      Chicago
3     David      Houston
4       Eva      Houston
5     Frank          NaN
6     Grace  San Antonio
7     Helen    San Diego
8     Helen    San Diego
9     Helen    San Diego
10    Jerry      Phoenix


In [82]:
names_of_col = df.columns.tolist()
print(names_of_col)

['Name', 'Age', 'City', 'Experience', 'Experience2', 'Salary', 'passport', 'bonus', 'hours']


### slicing
- accessing subset of dataframe
- same style as slicing in list

In [83]:
# Slice rows from index 2 to 8
print("\nSlicing rows [2:9]:\n", df[2:9])  # Rows 2,3,... 8

# Slice rows from index 2 to 8, step 2
print("\nSlicing rows [2:9:2]:\n", df[2:9:2])  # Rows 2, 4, 6, 8

# Slice all rows with a step of 3
print("\nEvery 3rd row:\n", df[::3])  # Rows 0, 3, 6, 9

# Reverse the DataFrame
print("\nReversed DataFrame:\n", df[::-1])  # From last to first

# Example: out-of-range slicing doesn't error
print("\nOut-of-bounds slicing [1:3:9] just returns matching row(s):\n", df[1:3:20])  # Row 1 only (step 9 has no effect since only one row in range)


Slicing rows [2:9]:
       Name  Age         City  Experience  Experience2   Salary passport  \
2  Charlie   35      Chicago           7            7      NaN      a45   
3    David   40      Houston          10           10  90000.0      a46   
4      Eva   22      Houston           1            1  48000.0      a47   
5    Frank   28          NaN           3            3  72000.0      a48   
6    Grace   32  San Antonio           6            6  85000.0      a49   
7    Helen   26    San Diego           2            2  62000.0      a50   
8    Helen   26    San Diego           2            2  62000.0      a51   

   bonus  hours  
2      0    200  
3   7000    100  
4   2555    120  
5      0    100  
6      0    200  
7   9000    100  
8      0     80  

Slicing rows [2:9:2]:
       Name  Age         City  Experience  Experience2   Salary passport  \
2  Charlie   35      Chicago           7            7      NaN      a45   
4      Eva   22      Houston           1            1  4800

## iloc, loc
- .iloc[] – Integer position-based selection
- loc - Label-based selection: Accesses rows and columns by labels (i.e., names). **Not much used**


In [84]:
df = pd.read_csv('data.csv')
print(df)
# ###########################################
# 1. Access a single row by index:
print(df.iloc[1])  # second row

# 2. Access a specific cell (row 1, column 2):
print(df.iloc[1, 2]) # Los Angeles

# 3. Access multiple rows and columns:
df.iloc[0:3, 0:2]  # First 3 rows, first 2 columns

# 4. Modify a value:
df.iloc[0, 1] = 26  # Change Alice's age from 25 to 26
print(df)
#############################################
# lets add labels to the index
df.index = [f"Emp{id:03d}" for id in range(1, len(df)+1)]
df.index.name = "EmployeeID"

print(df)

# 1. Access a single row by label:
print(df.loc['Emp003'])

# 2. Access multiple rows by labels:
print(df.loc[['Emp003', 'Emp005', 'Emp009']])

# 3. Access specific rows and columns:
print(df.loc[['Emp003', 'Emp005'], ['Name', 'City']])

       Name  Age         City  Experience  Experience2   Salary passport  \
0     Alice   25      Chicago           2            2  70000.0      a43   
1       Bob   30  Los Angeles           5            5  80000.0      a44   
2   Charlie   35      Chicago           7            7      NaN      a45   
3     David   40      Houston          10           10  90000.0      a46   
4       Eva   22      Houston           1            1  48000.0      a47   
5     Frank   28          NaN           3            3  72000.0      a48   
6     Grace   32  San Antonio           6            6  85000.0      a49   
7     Helen   26    San Diego           2            2  62000.0      a50   
8     Helen   26    San Diego           2            2  62000.0      a51   
9     Helen   26    San Diego           2            2  62000.0      a52   
10    Jerry   23      Phoenix           6            6  78000.0      a53   

    bonus  hours  
0    1000    100  
1    3000    150  
2       0    200  
3    7000  

## Add rows/ columns ; remove rows / columns 

In [85]:
df = pd.read_csv('data.csv')
print(df)

       Name  Age         City  Experience  Experience2   Salary passport  \
0     Alice   25      Chicago           2            2  70000.0      a43   
1       Bob   30  Los Angeles           5            5  80000.0      a44   
2   Charlie   35      Chicago           7            7      NaN      a45   
3     David   40      Houston          10           10  90000.0      a46   
4       Eva   22      Houston           1            1  48000.0      a47   
5     Frank   28          NaN           3            3  72000.0      a48   
6     Grace   32  San Antonio           6            6  85000.0      a49   
7     Helen   26    San Diego           2            2  62000.0      a50   
8     Helen   26    San Diego           2            2  62000.0      a51   
9     Helen   26    San Diego           2            2  62000.0      a52   
10    Jerry   23      Phoenix           6            6  78000.0      a53   

    bonus  hours  
0    1000    100  
1    3000    150  
2       0    200  
3    7000  

In [86]:
# add a recoord
 
# df.loc[len(df)] = ['Mihindou', 29, 'Tokyo', 4, 4, 67000.0, 'm33'] # INCOREECT
df.loc[len(df)] = ['Mihindou', 29, 'Tokyo', 4, 4, 67000.0, 'm33', 0, 90]
print(df)

        Name  Age         City  Experience  Experience2   Salary passport  \
0      Alice   25      Chicago           2            2  70000.0      a43   
1        Bob   30  Los Angeles           5            5  80000.0      a44   
2    Charlie   35      Chicago           7            7      NaN      a45   
3      David   40      Houston          10           10  90000.0      a46   
4        Eva   22      Houston           1            1  48000.0      a47   
5      Frank   28          NaN           3            3  72000.0      a48   
6      Grace   32  San Antonio           6            6  85000.0      a49   
7      Helen   26    San Diego           2            2  62000.0      a50   
8      Helen   26    San Diego           2            2  62000.0      a51   
9      Helen   26    San Diego           2            2  62000.0      a52   
10     Jerry   23      Phoenix           6            6  78000.0      a53   
11  Mihindou   29        Tokyo           4            4  67000.0      m33   

In [87]:
# drop row based on index
df = df.drop([11,])
print(df)

       Name  Age         City  Experience  Experience2   Salary passport  \
0     Alice   25      Chicago           2            2  70000.0      a43   
1       Bob   30  Los Angeles           5            5  80000.0      a44   
2   Charlie   35      Chicago           7            7      NaN      a45   
3     David   40      Houston          10           10  90000.0      a46   
4       Eva   22      Houston           1            1  48000.0      a47   
5     Frank   28          NaN           3            3  72000.0      a48   
6     Grace   32  San Antonio           6            6  85000.0      a49   
7     Helen   26    San Diego           2            2  62000.0      a50   
8     Helen   26    San Diego           2            2  62000.0      a51   
9     Helen   26    San Diego           2            2  62000.0      a52   
10    Jerry   23      Phoenix           6            6  78000.0      a53   

    bonus  hours  
0    1000    100  
1    3000    150  
2       0    200  
3    7000  

In [88]:
# lets create a new column 'company' with same values all across rows
df['company'] = "Lucent Technologies" 
print("\nAfter :\n",df)


After :
        Name  Age         City  Experience  Experience2   Salary passport  \
0     Alice   25      Chicago           2            2  70000.0      a43   
1       Bob   30  Los Angeles           5            5  80000.0      a44   
2   Charlie   35      Chicago           7            7      NaN      a45   
3     David   40      Houston          10           10  90000.0      a46   
4       Eva   22      Houston           1            1  48000.0      a47   
5     Frank   28          NaN           3            3  72000.0      a48   
6     Grace   32  San Antonio           6            6  85000.0      a49   
7     Helen   26    San Diego           2            2  62000.0      a50   
8     Helen   26    San Diego           2            2  62000.0      a51   
9     Helen   26    San Diego           2            2  62000.0      a52   
10    Jerry   23      Phoenix           6            6  78000.0      a53   

    bonus  hours              company  
0    1000    100  Lucent Technologies

In [89]:
# Lets drop columns
df.drop(columns=['company','Experience2'], inplace=True)
print("\nAfter dropping a column:\n",df)


After dropping a column:
        Name  Age         City  Experience   Salary passport  bonus  hours
0     Alice   25      Chicago           2  70000.0      a43   1000    100
1       Bob   30  Los Angeles           5  80000.0      a44   3000    150
2   Charlie   35      Chicago           7      NaN      a45      0    200
3     David   40      Houston          10  90000.0      a46   7000    100
4       Eva   22      Houston           1  48000.0      a47   2555    120
5     Frank   28          NaN           3  72000.0      a48      0    100
6     Grace   32  San Antonio           6  85000.0      a49      0    200
7     Helen   26    San Diego           2  62000.0      a50   9000    100
8     Helen   26    San Diego           2  62000.0      a51      0     80
9     Helen   26    San Diego           2  62000.0      a52   3000    100
10    Jerry   23      Phoenix           6  78000.0      a53      0     80


In [90]:
# replace values of columns
df['City'] = df['City'].replace({'Los Angeles': 'LA', 'San Diego': 'SD'})
print(df)

       Name  Age         City  Experience   Salary passport  bonus  hours
0     Alice   25      Chicago           2  70000.0      a43   1000    100
1       Bob   30           LA           5  80000.0      a44   3000    150
2   Charlie   35      Chicago           7      NaN      a45      0    200
3     David   40      Houston          10  90000.0      a46   7000    100
4       Eva   22      Houston           1  48000.0      a47   2555    120
5     Frank   28          NaN           3  72000.0      a48      0    100
6     Grace   32  San Antonio           6  85000.0      a49      0    200
7     Helen   26           SD           2  62000.0      a50   9000    100
8     Helen   26           SD           2  62000.0      a51      0     80
9     Helen   26           SD           2  62000.0      a52   3000    100
10    Jerry   23      Phoenix           6  78000.0      a53      0     80


In [91]:
# creating a column by add values from 2 columns
df['total_salary'] = df['Salary'] + df['bonus']
print(df)

       Name  Age         City  Experience   Salary passport  bonus  hours  \
0     Alice   25      Chicago           2  70000.0      a43   1000    100   
1       Bob   30           LA           5  80000.0      a44   3000    150   
2   Charlie   35      Chicago           7      NaN      a45      0    200   
3     David   40      Houston          10  90000.0      a46   7000    100   
4       Eva   22      Houston           1  48000.0      a47   2555    120   
5     Frank   28          NaN           3  72000.0      a48      0    100   
6     Grace   32  San Antonio           6  85000.0      a49      0    200   
7     Helen   26           SD           2  62000.0      a50   9000    100   
8     Helen   26           SD           2  62000.0      a51      0     80   
9     Helen   26           SD           2  62000.0      a52   3000    100   
10    Jerry   23      Phoenix           6  78000.0      a53      0     80   

    total_salary  
0        71000.0  
1        83000.0  
2            NaN  

In [92]:
# create a column that shows 10% tax for each person 
df['tax'] = df['total_salary'] * 0.1
print(df)

# create a column that shows festival bonus for each person 
df['fest_bonus'] = df['total_salary'] + 1000
print(df)

# create a column that shows salary per hours for each  person 
df['sal_per_hr'] = df['total_salary'] / df['hours']
print(df)

       Name  Age         City  Experience   Salary passport  bonus  hours  \
0     Alice   25      Chicago           2  70000.0      a43   1000    100   
1       Bob   30           LA           5  80000.0      a44   3000    150   
2   Charlie   35      Chicago           7      NaN      a45      0    200   
3     David   40      Houston          10  90000.0      a46   7000    100   
4       Eva   22      Houston           1  48000.0      a47   2555    120   
5     Frank   28          NaN           3  72000.0      a48      0    100   
6     Grace   32  San Antonio           6  85000.0      a49      0    200   
7     Helen   26           SD           2  62000.0      a50   9000    100   
8     Helen   26           SD           2  62000.0      a51      0     80   
9     Helen   26           SD           2  62000.0      a52   3000    100   
10    Jerry   23      Phoenix           6  78000.0      a53      0     80   

    total_salary     tax  
0        71000.0  7100.0  
1        83000.0  830

## indexing

In [93]:
# import pandas as pd

# # Sample DataFrame
# data = {
#     'Name': ['Rahim', 'Alice', 'Timmy', 'David'],
#     'Age': [25, 30, 35, 40],
#     'City': ['New York', 'LA', 'Chicago', 'Houston']
# }

# df = pd.DataFrame(data)

df = pd.read_csv('data.csv')
print("Original DataFrame:")
print(df)
print("000000000000000000000")

# 1. Set a column as the index
df.set_index('passport', inplace=True)
print(f"\nSet 'Name' as index:\n{df}")
print("11111111111111111111")

# 2. Reset index
df_reset = df.reset_index()
print(f"\nReset index:\n{df_reset}")
print("22222222222222222222222")


# # 3. Change the index manually
l_indx = [f"Emp{i:03d}" for i in range(1, len(df)+1)]
# print(l_indx)
df.index = l_indx
print("\nManual index change:")
print(df)


################# 6. (OPTIONAL) Multi-indexing
df_reset = df.reset_index()
df_multi = df_reset.set_index(['City', 'Name'])
print("\nMulti-indexed DataFrame:")
print(df_multi)

# 7. Sorting index
df_sorted = df.sort_index()
print("\nSorted by index:")
print(df_sorted)


Original DataFrame:
       Name  Age         City  Experience  Experience2   Salary passport  \
0     Alice   25      Chicago           2            2  70000.0      a43   
1       Bob   30  Los Angeles           5            5  80000.0      a44   
2   Charlie   35      Chicago           7            7      NaN      a45   
3     David   40      Houston          10           10  90000.0      a46   
4       Eva   22      Houston           1            1  48000.0      a47   
5     Frank   28          NaN           3            3  72000.0      a48   
6     Grace   32  San Antonio           6            6  85000.0      a49   
7     Helen   26    San Diego           2            2  62000.0      a50   
8     Helen   26    San Diego           2            2  62000.0      a51   
9     Helen   26    San Diego           2            2  62000.0      a52   
10    Jerry   23      Phoenix           6            6  78000.0      a53   

    bonus  hours  
0    1000    100  
1    3000    150  
2       0 