<a href="https://colab.research.google.com/github/ayan6943/python-lib-notebooks/blob/main/Unit_2_Pandas.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd

In [None]:
# Mixing ints, a float and a string in one Series yields the generic object dtype.
ser = pd.Series([1, 0.25, 's', 56, 100000])
# Only the last expression of a cell is echoed, so the Series itself has to be
# printed explicitly; a bare `ser` line here would be silently discarded.
print(ser)
# The underlying values as a NumPy array.
ser.values

array([1, 0.25, 's', 56, 100000], dtype=object)

In [None]:
# A Series built without an explicit index is labelled by a default RangeIndex.
ser.index

RangeIndex(start=0, stop=5, step=1)

In [None]:
# Same values as the default-indexed Series, but every entry now carries
# an explicit string label.
ser1 = pd.Series(
    data=[1, 0.25, 's', 56, 100000],
    index=list('abcde'),
)
ser1

Unnamed: 0,0
a,1
b,0.25
c,s
d,56
e,100000


In [None]:
# State -> population figure. A dict maps directly onto a Series:
# the keys become the index labels and the values become the data.
population_dict = {
    'California': 39538223,
    'Texas': 29145505,
    'Florida': 21538187,
    'New York': 20201249,
    'Pennsylvania': 13002700,
}
population = pd.Series(data=population_dict)
population

Unnamed: 0,0
California,39538223
Texas,29145505
Florida,21538187
New York,20201249
Pennsylvania,13002700


In [None]:
# State -> area figure, keyed by the same state names as the population data.
area_dict = {
    'California': 423967,
    'Texas': 695662,
    'Florida': 170312,
    'New York': 141297,
    'Pennsylvania': 119280,
}
area = pd.Series(data=area_dict)
area

Unnamed: 0,0
California,423967
Texas,695662
Florida,170312
New York,141297
Pennsylvania,119280


In [None]:
# Each dict key becomes a column; the two Series are lined up on their
# shared state labels.
states = pd.DataFrame(data={'population': population, 'area': area})
states

Unnamed: 0,population,area
California,39538223,423967
Texas,29145505,695662
Florida,21538187,170312
New York,20201249,141297
Pennsylvania,13002700,119280


In [None]:
# Row labels of the frame: the state names carried over from the input Series.
states.index

Index(['California', 'Texas', 'Florida', 'New York', 'Pennsylvania'], dtype='object')

In [None]:
# Column labels of the frame: the keys of the dict passed to pd.DataFrame.
states.columns

Index(['population', 'area'], dtype='object')

In [None]:
# A single Series can be promoted to a one-column DataFrame; `columns` names that column.
pd.DataFrame(population, columns=['population'])

Unnamed: 0,population
California,39538223
Texas,29145505
Florida,21538187
New York,20201249
Pennsylvania,13002700


# **Data Indexing and Selection**

In [None]:
import pandas as pd
# Four evenly spaced floats, labelled 'a' through 'd'.
data = pd.Series(
    data=[0.25, 0.5, 0.75, 1.0],
    index=list('abcd'),
)
data

Unnamed: 0,0
a,0.25
b,0.5
c,0.75
d,1.0


In [None]:
# Only the last expression of a cell is echoed, so the intermediate selections
# are printed explicitly instead of being computed and silently discarded.

# Slice by label: returns the entries from 'a' through 'c'.
print(data['a':'c'])

# Single label: returns the scalar stored under 'b'.
print(data['b'])

# Boolean mask: keep only the entries greater than 0.3.
mask = data > 0.3
print(data[mask])

# Fancy indexing: a list of labels selects exactly those entries.
fancy = ['b', 'd']
data[fancy]

Unnamed: 0,0
b,0.5
d,1.0


**Indexers: loc and iloc**

**Label-based indexing:** use `.loc` when you want to select elements by their index labels.

**Integer position-based indexing:** use `.iloc` to pick elements based solely on their position (starting at 0), regardless of their labels.

In [None]:
# .loc looks the value up by its index label.
data.loc['b']

0.5

In [None]:
# .iloc looks the value up by integer position, so 0 is the first element.
data.iloc[0]

0.25

In [None]:
# Slicing with .loc works on labels and is inclusive at both ends,
# so 'a':'b' returns the entries labelled 'a' and 'b'.
print(data.loc['a':'b'])

a    0.25
b    0.50
dtype: float64


In [None]:
# Slicing with .iloc works on integer positions and follows Python's
# half-open convention: 0:2 returns positions 0 and 1 and excludes 2.
print(data.iloc[0:2])

a    0.25
b    0.50
dtype: float64


In [None]:
import pandas as pd
import numpy as np

#################################################
# 1. Create a Sample DataFrame and Column Selection
#################################################

# Column-oriented records for the sample frame. Bound to its own name so the
# `data` Series used by the indexing cells earlier in the notebook is not
# overwritten (re-running those cells after this one would otherwise fail).
people_data = {
    'Name': ['Alice', 'Bob', 'Carol'],
    'Age': [25, 30, 27],
    'City': ['New York', 'Los Angeles', 'Chicago']
}
df = pd.DataFrame(people_data)

print("Original DataFrame:")
print(df)

# --- Column Selection ---

# A single column label returns a Series.
print("\nSingle Column 'Name':")
print(df['Name'])

# A list of column labels returns a DataFrame.
print("\nMultiple Columns 'Name' and 'City':")
print(df[['Name', 'City']])

#################################################
# 2. Row Selection Using Default Indexing and Boolean Filtering
#################################################

# The comparison yields one True/False per row; indexing with that mask
# keeps only the rows where it is True.
age_over_25 = df['Age'] > 25
filtered_df = df[age_over_25]
print("\nRows where Age > 25:", filtered_df, sep="\n")

#######################################
# 3. Label-Based Indexing with .loc
#######################################

# The frame still has its default index, so the row labels are integers.
row_one = df.loc[1]
print("\nRow with index label 1 using .loc:", row_one, sep="\n")

# A (row label, column label) pair picks out a single cell.
city_val = df.loc[1, 'City']
print("\nElement at row label 1 and column 'City':", city_val, sep="\n")

# Label slices include their endpoint, so 0:1 returns rows 0 AND 1;
# the list restricts the result to the named columns.
sliced_df = df.loc[0:1, ['Name', 'Age']]
print("\nSliced DataFrame using .loc (rows 0 to 1, columns 'Name' and 'Age'):", sliced_df, sep="\n")

#################################################
# 4. Integer Position-Based Indexing with .iloc
#################################################

# Position 0 is the first row, whatever its label happens to be.
first_row = df.iloc[0]
print("\nFirst row using .iloc:", first_row, sep="\n")

# (row position, column position): row 0, column 2 is the third column.
element = df.iloc[0, 2]
print("\nElement at row position 0 and column position 2 using .iloc:", element, sep="\n")

# Positional slices are half-open like ordinary Python slices, so :2 covers
# positions 0 and 1 on each axis.
sliced_iloc = df.iloc[:2, :2]
print("\nSliced DataFrame using .iloc (rows 0:2, columns 0:2):", sliced_iloc, sep="\n")

#################################################
# 5. Boolean Masking with Multiple Conditions
#################################################

# Build each condition separately, then combine them row by row with `&`
# so that only rows satisfying BOTH remain.
older_than_25 = df['Age'] > 25
lives_in_la = df['City'] == 'Los Angeles'
mask = older_than_25 & lives_in_la
filtered_multi = df[mask]
print("\nRows where Age > 25 and City is 'Los Angeles':", filtered_multi, sep="\n")

#################################################
# 6. Setting a Custom Index and Using .loc
#################################################

# Promote the 'Name' column to the row index; set_index returns a new frame
# and leaves `df` untouched.
df_indexed = df.set_index('Name')
print("\nDataFrame with 'Name' as the index:", df_indexed, sep="\n")

# With names as row labels, .loc can fetch a person's row directly.
alice_data = df_indexed.loc['Alice']
print("\nRow for 'Alice' using .loc on custom index:", alice_data, sep="\n")

#################################################
# 7. Using .at and .iat for Fast Scalar Access
#################################################

# .at: single-cell lookup by (row label, column label) on the name-indexed frame.
bob_city = df_indexed.at['Bob', 'City']
print("\nUsing .at to get the 'City' value for Bob (custom index):", bob_city, sep="\n")

# .iat: single-cell lookup by (row position, column position) on the original frame.
element_iat = df.iat[1, 2]
print("\nUsing .iat to get element at row position 1, column position 2:", element_iat, sep="\n")

#################################################
# 8. MultiIndex Example
#################################################

# Two parallel arrays: the outer level (region) and the inner level (city).
arrays = [
    ['North', 'North', 'South', 'South'],
    ['New York', 'Boston', 'Houston', 'Atlanta'],
]
multi_index = pd.MultiIndex.from_arrays(arrays, names=('Region', 'City'))

# One row of values per (Region, City) pair.
data_multi = {
    'Population': [8.5, 0.7, 2.3, 0.5],   # in millions
    'Area': [468.9, 89.6, 600.2, 134.0],  # in square miles
}
df_multi = pd.DataFrame(data=data_multi, index=multi_index)
print("\nDataFrame with MultiIndex:", df_multi, sep="\n")

# An outer-level label on its own returns every row filed under it.
north_data = df_multi.loc['North']
print("\nData for the 'North' region using .loc on MultiIndex:", north_data, sep="\n")

# A full (outer, inner) tuple addresses one specific row.
south_houston = df_multi.loc[('South', 'Houston')]
print("\nData for ('South', 'Houston') using .loc:", south_houston, sep="\n")


Original DataFrame:
    Name  Age         City
0  Alice   25     New York
1    Bob   30  Los Angeles
2  Carol   27      Chicago

Single Column 'Name':
0    Alice
1      Bob
2    Carol
Name: Name, dtype: object

Multiple Columns 'Name' and 'City':
    Name         City
0  Alice     New York
1    Bob  Los Angeles
2  Carol      Chicago

Rows where Age > 25:
    Name  Age         City
1    Bob   30  Los Angeles
2  Carol   27      Chicago

Row with index label 1 using .loc:
Name            Bob
Age              30
City    Los Angeles
Name: 1, dtype: object

Element at row label 1 and column 'City':
Los Angeles

Sliced DataFrame using .loc (rows 0 to 1, columns 'Name' and 'Age'):
    Name  Age
0  Alice   25
1    Bob   30

First row using .iloc:
Name       Alice
Age           25
City    New York
Name: 0, dtype: object

Element at row position 0 and column position 2 using .iloc:
New York

Sliced DataFrame using .iloc (rows 0:2, columns 0:2):
    Name  Age
0  Alice   25
1    Bob   30

Rows wher

In [None]:
# Named `people` rather than `dict`: binding a variable called `dict` would
# shadow the builtin type for every cell that runs afterwards.
people = {
    'names': ['ayan', 'adnan', 'eshan', 'aman'],
    'age': [19, 18, 15, 14]
}
# Row labels are supplied explicitly instead of the default 0..n-1.
new = pd.DataFrame(people, index=['a', 'b', 'c', 'd'])
print(new)

   names  age
a   ayan   19
b  adnan   18
c  eshan   15
d   aman   14


In [None]:
import pandas as pd

# Row-oriented input: each inner list is one record. Bound to `rows` so the
# `data` Series defined earlier in the notebook is not overwritten.
rows = [
    ['Alice', 25, 'New York'],
    ['Bob', 30, 'Los Angeles'],
    ['Carol', 22, 'Chicago']
]

# A list of rows carries no column names, so they are supplied explicitly.
df = pd.DataFrame(rows, columns=['Name', 'Age', 'City'])

print("DataFrame with custom columns:")
print(df)


DataFrame with custom columns:
    Name  Age         City
0  Alice   25     New York
1    Bob   30  Los Angeles
2  Carol   22      Chicago


In [None]:
import numpy as np
# A 3x4 grid holding 0..11 in row-major order, one letter per column.
new2 = pd.DataFrame(
    data=np.arange(12).reshape((3, 4)),
    columns=list('ABCD'),
)
print(new2)

   A  B   C   D
0  0  1   2   3
1  4  5   6   7
2  8  9  10  11


In [None]:
# Two named Series whose state labels only partly overlap:
# 'Alaska' appears only in `area`, 'Florida' only in `population`.
area = pd.Series(
    data={'Alaska': 1723337, 'Texas': 695662, 'California': 423967},
    name='area',
)
population = pd.Series(
    data={'California': 39538223, 'Texas': 29145505, 'Florida': 21538187},
    name='population',
)
print(area)

Alaska        1723337
Texas          695662
California     423967
Name: area, dtype: int64


In [None]:
import pandas as pd

# Frame and Series share the row labels 'x', 'y', 'z'.
df = pd.DataFrame(
    data={'A': [1, 2, 3], 'B': [4, 5, 6]},
    index=list('xyz'),
)
s = pd.Series(data=[10, 20, 30], index=list('xyz'))

# axis='index' (same as axis=0) matches the Series against the row labels,
# so each Series value is added across one whole row.
result = df.add(s, axis='index')
print(result)


    A   B
x  11  14
y  22  25
z  33  36


In [None]:
# None marks the missing entry in each column.
df = pd.DataFrame(
    data={
        'A': [1, 2, None],
        'B': [None, 5, 6],
    }
)
# isna() returns a same-shaped frame that is True wherever a value is missing.
print(df.isna())


       A      B
0  False   True
1  False  False
2   True  False


In [None]:
import pandas as pd
import numpy as np

# Column-oriented sample values; np.nan marks the missing entries. Bound to
# `raw_values` so the `data` Series defined earlier in the notebook is not
# overwritten.
raw_values = {
    'A': [1, 2, np.nan, 4],
    'B': [5, np.nan, np.nan, 8],
    'C': [10, 11, 12, np.nan]
}

df = pd.DataFrame(raw_values)
print("Original DataFrame with Missing Values:\n", df)

# Detecting missing values
print("\nMissing Values (True indicates NaN):\n", df.isna())

# Filling missing values with a specified value
df_filled = df.fillna(0)
print("\nDataFrame after filling NaN with 0:\n", df_filled)

# Filling missing values with column mean: df.mean() is a per-column Series,
# so each column's gaps are filled with that column's own mean.
df_mean_filled = df.fillna(df.mean())
print("\nDataFrame after filling NaN with column mean:\n", df_mean_filled)

# Forward fill: carry the last valid value down each column.
# DataFrame.ffill() replaces the deprecated fillna(method='ffill').
df_ffill = df.ffill()
print("\nDataFrame after forward fill:\n", df_ffill)

# Backward fill: pull the next valid value up each column. A trailing NaN has
# nothing below it and therefore stays NaN.
# DataFrame.bfill() replaces the deprecated fillna(method='bfill').
df_bfill = df.bfill()
print("\nDataFrame after backward fill:\n", df_bfill)

# Dropping rows with missing values
df_dropna_rows = df.dropna()
print("\nDataFrame after dropping rows with NaN:\n", df_dropna_rows)

# Dropping columns with missing values: every column here contains at least
# one NaN, so the result has no columns left.
df_dropna_cols = df.dropna(axis=1)
print("\nDataFrame after dropping columns with NaN:\n", df_dropna_cols)


Original DataFrame with Missing Values:
      A    B     C
0  1.0  5.0  10.0
1  2.0  NaN  11.0
2  NaN  NaN  12.0
3  4.0  8.0   NaN

Missing Values (True indicates NaN):
        A      B      C
0  False  False  False
1  False   True  False
2   True   True  False
3  False  False   True

DataFrame after filling NaN with 0:
      A    B     C
0  1.0  5.0  10.0
1  2.0  0.0  11.0
2  0.0  0.0  12.0
3  4.0  8.0   0.0

DataFrame after filling NaN with column mean:
           A    B     C
0  1.000000  5.0  10.0
1  2.000000  6.5  11.0
2  2.333333  6.5  12.0
3  4.000000  8.0  11.0

DataFrame after forward fill:
      A    B     C
0  1.0  5.0  10.0
1  2.0  5.0  11.0
2  2.0  5.0  12.0
3  4.0  8.0  12.0

DataFrame after backward fill:
      A    B     C
0  1.0  5.0  10.0
1  2.0  8.0  11.0
2  4.0  8.0  12.0
3  4.0  8.0   NaN

DataFrame after dropping rows with NaN:
      A    B     C
0  1.0  5.0  10.0

DataFrame after dropping columns with NaN:
 Empty DataFrame
Columns: []
Index: [0, 1, 2, 3]


  df_ffill = df.fillna(method='ffill')
  df_bfill = df.fillna(method='bfill')


In [None]:
import pandas as pd

# Frame and Series share the row labels 'x', 'y', 'z'.
df = pd.DataFrame(
    data={'A': [10, 20, 30], 'B': [40, 50, 60]},
    index=list('xyz'),
)
print("Original DataFrame:\n", df)

s = pd.Series(data=[1, 2, 3], index=list('xyz'))
print("\nOriginal Series:\n", s)

# axis='index' (same as axis=0): the Series is matched against the row labels,
# so each of its values is applied across one whole row.
df_add = df.add(s, axis='index')
print("\nDataFrame after adding Series row-wise:\n", df_add)

# axis='columns' (same as axis=1): this Series is keyed by column name instead,
# so each of its values is applied down one whole column.
s_col = pd.Series(data={'A': 5, 'B': 10})
df_subtract = df.subtract(s_col, axis='columns')
print("\nDataFrame after subtracting Series column-wise:\n", df_subtract)

# Row-wise multiplication by the 'x'/'y'/'z' Series.
df_mul = df.multiply(s, axis='index')
print("\nDataFrame after multiplying with Series row-wise:\n", df_mul)

# Row-wise division by the same Series; the result is floating point.
df_div = df.div(s, axis='index')
print("\nDataFrame after dividing by Series row-wise:\n", df_div)


Original DataFrame:
     A   B
x  10  40
y  20  50
z  30  60

Original Series:
 x    1
y    2
z    3
dtype: int64

DataFrame after adding Series row-wise:
     A   B
x  11  41
y  22  52
z  33  63

DataFrame after subtracting Series column-wise:
     A   B
x   5  30
y  15  40
z  25  50

DataFrame after multiplying with Series row-wise:
     A    B
x  10   40
y  40  100
z  90  180

DataFrame after dividing by Series row-wise:
       A     B
x  10.0  40.0
y  10.0  25.0
z  10.0  20.0
