In [None]:
Pandas is the most widely used Python library for data analysis and manipulation.
    Built on top of NumPy, offering fast, flexible data structures.
    Ideal for data cleaning, transformation, visualization, and analytics.

Key Data Structures
    Series
        1-dimensional labelled array.
        Similar to an Excel column.
    DataFrame
        2-dimensional, tabular data (rows & columns).
        Most commonly used for business and analytics operations.

Major Features
    Data Loading: Read/write CSV, Excel, SQL, JSON, HTML, Parquet.
    Data Cleaning: Handling missing values, duplicates, type conversion, renaming.
    Filtering & Selection: .loc[], .iloc[], boolean indexing.
    Aggregation & Statistics:groupby(), mean, sum, count, min, max.
    Merging & Joining:Combine multiple datasets using merge, join, concat.
    Time Series Support:Resampling, date indexing, rolling windows.

In [3]:
import pandas as pd
# Series
s = pd.Series([10, 20, 30], index=['a','b','c'])
print(s)


a    10
b    20
c    30
dtype: int64


In [4]:
#creating series with dictionary
d = {'A': 10, 'B': 20, 'C': 30}
s = pd.Series(d)
print(s)


A    10
B    20
C    30
dtype: int64


In [6]:
#Access by Label
s = pd.Series([100, 200, 300], index=["x", "y", "z"])
print(s["y"])
print(s[["x", "z"]])

200
x    100
z    300
dtype: int64


In [8]:
#boolan indixing
s = pd.Series([12, 25, 30, 45, 50])
print(s[s > 30])




3    45
4    50
dtype: int64


In [10]:
# vector operation
s = pd.Series([1, 2, 3, 4])
print(s + 5)
print(s * 2)
print(s ** 2)


0    6
1    7
2    8
3    9
dtype: int64
0    2
1    4
2    6
3    8
dtype: int64
0     1
1     4
2     9
3    16
dtype: int64


In [11]:
# checking null values
s = pd.Series([10, None, 30, None])
print(s.isnull())
print(s.notnull())
print(s.fillna(0))


0    False
1     True
2    False
3     True
dtype: bool
0     True
1    False
2     True
3    False
dtype: bool
0    10.0
1     0.0
2    30.0
3     0.0
dtype: float64


In [12]:
#Replaing values
s = pd.Series([1, 2, 3, 2, 1])
print(s.replace({1: 100, 2: 200}))


0    100
1    200
2      3
3    200
4    100
dtype: int64


In [10]:
#arithmetic operation
s1 = pd.Series([10, 20, 30], index=['a', 'b', 'c'])
s2 = pd.Series([1, 2, 3], index=['a', 'c', 'd'])

print(s1 + s2)    


a    11.0
b     NaN
c    32.0
d     NaN
dtype: float64


In [175]:
#statistical functions

s = pd.Series([5, 10, 15, 20])
print(s.sum())
print(s.mean())
print(s.median())
print(s.std())
print(s.max())
print(s.min())
print(len(s))

50
12.5
12.5
6.454972243679028
20
5
4


In [17]:
#unique & count
s = pd.Series(['A', 'B', 'A', 'C', 'A'])

# print(s.unique())
print(s.nunique())
print(s.value_counts())


3
A    3
B    1
C    1
Name: count, dtype: int64


In [20]:
#head & tail
s = pd.Series(range(1, 11))
# print(s)
# print(s.head(3))
print(s.tail(3))


7     8
8     9
9    10
dtype: int64


In [23]:
#converting type
s = pd.Series([1, 2, 3])
print(s.astype(str))


0    1
1    2
2    3
dtype: object


In [25]:
#sort
s = pd.Series([50, 10, 40, 20])
print(s.sort_values(ascending=False))


0    50
2    40
3    20
1    10
dtype: int64


In [28]:
#apply functions
s = pd.Series(["apple", "cat", "key board", "marker"])
print(s.apply(len))


0    5
1    3
2    9
3    6
dtype: int64


In [29]:
#appending series
s1 = pd.Series([1, 2])
s2 = pd.Series([3, 4])

print(pd.concat([s1, s2]))


0    1
1    2
0    3
1    4
dtype: int64


In [31]:
#creating data frame
data = {
    'Name': ['Arun', 'Bala', 'Chitra'],
    'Age': [25, 30, 28],
    'City': ['Chennai', 'Bengaluru', 'Hyderabad']
}

df = pd.DataFrame(data, index=['row1','row2','row3'])
print(df)


        Name  Age       City
row1    Arun   25    Chennai
row2    Bala   30  Bengaluru
row3  Chitra   28  Hyderabad


In [32]:
# from list of dictionary
data = [
    {'Name': 'Arun', 'Age': 25},
    {'Name': 'Bala', 'Age': 30},
    {'Name': 'Chitra', 'Age': 28}
]

df = pd.DataFrame(data)
print(df)


     Name  Age
0    Arun   25
1    Bala   30
2  Chitra   28


In [33]:
# from 2d list wiht colum names
rows = [
    [1, 'Arun', 50000],
    [2, 'Bala', 60000],
    [3, 'Chitra', 55000]
]

df = pd.DataFrame(rows, columns=['ID', 'Name', 'Salary'])
print(df)



   ID    Name  Salary
0   1    Arun   50000
1   2    Bala   60000
2   3  Chitra   55000


In [35]:
print(df.head(2))       # first 5 rows


   ID  Name  Salary
0   1  Arun   50000
1   2  Bala   60000


In [37]:
print(df.tail(2))      # last 2 rows


   ID    Name  Salary
1   2    Bala   60000
2   3  Chitra   55000


In [38]:
print(df.shape)        # (rows, columns)


(3, 3)


In [39]:
print(df.columns)      # column names


Index(['ID', 'Name', 'Salary'], dtype='object')


In [40]:
print(df.index)        # index labels


RangeIndex(start=0, stop=3, step=1)


In [41]:
print(df.dtypes)       # data types


ID         int64
Name      object
Salary     int64
dtype: object


In [42]:
print(df.info())       # full summary


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   ID      3 non-null      int64 
 1   Name    3 non-null      object
 2   Salary  3 non-null      int64 
dtypes: int64(2), object(1)
memory usage: 200.0+ bytes
None


In [48]:
print(df.Salary.describe())   # numeric summary statistics


count        3.0
mean     55000.0
std       5000.0
min      50000.0
25%      52500.0
50%      55000.0
75%      57500.0
max      60000.0
Name: Salary, dtype: float64


In [49]:
print(df['Name'])            # single column (Series)


0      Arun
1      Bala
2    Chitra
Name: Name, dtype: object


In [35]:
# Selecting rows by index position – iloc
print(df.iloc[0])       # first row


ID            1
Name       Arun
Salary    50000
Name: 0, dtype: object


In [50]:
print(df[['Name', 'Salary']]) # multiple columns (DataFrame)

     Name  Salary
0    Arun   50000
1    Bala   60000
2  Chitra   55000


In [36]:
print(df.iloc[0:2])     # first two rows


   ID  Name  Salary
0   1  Arun   50000
1   2  Bala   60000


In [37]:
print(df.iloc[[0, 2]])  # specific rows

   ID    Name  Salary
0   1    Arun   50000
2   3  Chitra   55000


In [51]:
# creating dataframe with custom index
data = [
    {'Name': 'Arun', 'Age': 25,'Salary':30000},
    {'Name': 'Bala', 'Age': 30,'Salary':40000},
    {'Name': 'Chitra', 'Age': 28,'Salary':50000}
]

df = pd.DataFrame(data, index=['emp1', 'emp2', 'emp3'])
print(df)


        Name  Age  Salary
emp1    Arun   25   30000
emp2    Bala   30   40000
emp3  Chitra   28   50000


In [56]:
df.iloc[0:2]

Unnamed: 0,Name,Age,Salary
emp1,Arun,25,30000
emp2,Bala,30,40000


In [41]:
# assuming df index = ['emp1', 'emp2', 'emp3']
print(df.loc['emp1'])


Name    Arun
Age       25
Name: emp1, dtype: object


In [42]:
print(df.loc['emp1':'emp2'])      # slice by labels


      Name  Age
emp1  Arun   25
emp2  Bala   30


In [43]:
print(df.loc[['emp1', 'emp3']])   # list of labels


        Name  Age
emp1    Arun   25
emp3  Chitra   28


In [69]:
#filtering data
high_salary = df[df['Age'] > 25]

high_salary

Unnamed: 0,Name,Age,Salary
emp2,Bala,30,40000
emp3,Chitra,28,50000


In [72]:
# Age between 26 and 30
age_range = df[(df['Age'] >= 26) |(df['Age'] <= 30)]

age_range

Unnamed: 0,Name,Age,Salary
emp1,Arun,25,30000
emp2,Bala,30,40000
emp3,Chitra,28,50000


In [73]:
# Employees not in Chennai
not_chitra = df[df['Name'] != 'Chitra']
not_chitra


Unnamed: 0,Name,Age,Salary
emp1,Arun,25,30000
emp2,Bala,30,40000


In [79]:
# Filter using .isin()
multi = df[df['Name'].isin(['Chitra', 'Bala'])]
multi


Unnamed: 0,Name,Age,Salary
emp2,Bala,30,40000
emp3,Chitra,28,50000


In [80]:
#adding new column
df['Bonus'] = df['Salary'] * 0.10
df

Unnamed: 0,Name,Age,Salary,Bonus
emp1,Arun,25,30000,3000.0
emp2,Bala,30,40000,4000.0
emp3,Chitra,28,50000,5000.0


In [81]:
#updating column
df['Bonus'] = df['Salary'] * 1.05  # 5% increment
df

Unnamed: 0,Name,Age,Salary,Bonus
emp1,Arun,25,30000,31500.0
emp2,Bala,30,40000,42000.0
emp3,Chitra,28,50000,52500.0


In [82]:
#conditional column
import numpy as np
df['Level'] = np.where(df['Salary'] > 40000, 'Senior', 'Junior')
df

Unnamed: 0,Name,Age,Salary,Bonus,Level
emp1,Arun,25,30000,31500.0,Junior
emp2,Bala,30,40000,42000.0,Junior
emp3,Chitra,28,50000,52500.0,Senior


In [88]:
#doping column
df.drop('Bonus', axis=1, inplace=True)


In [89]:
df

Unnamed: 0,Name,Age,Salary,Level
emp1,Arun,25,30000,Junior
emp2,Bala,30,40000,Junior
emp3,Chitra,28,50000,Senior


In [90]:
df.drop('emp1')

Unnamed: 0,Name,Age,Salary,Level
emp2,Bala,30,40000,Junior
emp3,Chitra,28,50000,Senior


In [91]:
df

Unnamed: 0,Name,Age,Salary,Level
emp1,Arun,25,30000,Junior
emp2,Bala,30,40000,Junior
emp3,Chitra,28,50000,Senior


In [92]:
#adding new row
new_row = {'Name': 'Divya', 'Salary': 62000,'Level':'Senior'}
df = pd.concat([df, pd.DataFrame([new_row], index=['emp4'])])


In [93]:
df

Unnamed: 0,Name,Age,Salary,Level
emp1,Arun,25.0,30000,Junior
emp2,Bala,30.0,40000,Junior
emp3,Chitra,28.0,50000,Senior
emp4,Divya,,62000,Senior


In [87]:
df = df.drop('emp1')        # drop row with index 0


In [88]:
df

Unnamed: 0,Name,Age,Salary,Level
emp2,Bala,30.0,40000,Junior
emp3,Chitra,28.0,50000,Senior
emp4,Divya,,62000,Senior


In [95]:
#Handling missing values
import numpy as np
data = {
    'Name': ['Arun', 'Bala', 'Chitra', 'Divya'],
    'Age': [25, np.nan, 28, np.nan],
    'Salary': [50000, 60000, np.nan, 65000]
}

df = pd.DataFrame(data)

df

Unnamed: 0,Name,Age,Salary
0,Arun,25.0,50000.0
1,Bala,,60000.0
2,Chitra,28.0,
3,Divya,,65000.0


In [98]:
print(df.isnull().sum())  

Name      0
Age       2
Salary    1
dtype: int64


In [104]:
# Fill missing values
df_filled = df.fillna(0) 

df_filled

Unnamed: 0,Name,Age,Salary
0,Arun,25.0,50000.0
1,Bala,0.0,60000.0
2,Chitra,28.0,0.0
3,Divya,0.0,65000.0


In [107]:
# Column-wise filling
data = {
    'Name': ['Arun', 'Bala', 'Chitra', 'Divya'],
    'Age': [25, np.nan, 28, np.nan],
    'Salary': [50000, 60000, np.nan, 65000]
}

df = pd.DataFrame(data)
df['Age'] = df['Age'].fillna(df['Age'].mean())
df

Unnamed: 0,Name,Age,Salary
0,Arun,25.0,50000.0
1,Bala,26.5,60000.0
2,Chitra,28.0,
3,Divya,26.5,65000.0


In [114]:


# Drop rows with any NaN

data = {
    'Name': ['Arun', np.nan, 'Chitra', 'Divya'],
    'Age': [25, np.nan, 28, np.nan],
    'Salary': [50000, np.nan, np.nan, 65000]
}

df = pd.DataFrame(data)
df.dropna(how='all')

Unnamed: 0,Name,Age,Salary
0,Arun,25.0,50000.0
2,Chitra,28.0,
3,Divya,,65000.0


In [112]:
# Drop rows where all values are df_drop_allNaN
df_drop_all = df.dropna(how='all')
df_drop_all

Unnamed: 0,Name,Age,Salary
0,Arun,25.0,50000.0
1,Bala,,60000.0
2,Chitra,28.0,
3,Divya,,65000.0


In [117]:
# Sort by one column
data = [
    {'Name': 'Arun', 'Age': 25,'Salary':30000},
    {'Name': 'Bala', 'Age': 30,'Salary':40000},
    {'Name': 'Chitra', 'Age': 30,'Salary':50000}
]

df = pd.DataFrame(data, index=['emp1', 'emp2', 'emp3'])
print(df)

df_sorted_age = df.sort_values('Age',ascending=False)

df_sorted_age


        Name  Age  Salary
emp1    Arun   25   30000
emp2    Bala   30   40000
emp3  Chitra   30   50000


Unnamed: 0,Name,Age,Salary
emp2,Bala,30,40000
emp3,Chitra,30,50000
emp1,Arun,25,30000


In [118]:
df_sorted = df.sort_values(['Age', 'Salary'], ascending=[True, False])
df_sorted

Unnamed: 0,Name,Age,Salary
emp1,Arun,25,30000
emp3,Chitra,30,50000
emp2,Bala,30,40000


In [119]:
#rename the colum
# Rename columns
df = df.rename(columns={'Name': 'EmployeeName', 'Salary': 'MonthlySalary'})
df


Unnamed: 0,EmployeeName,Age,MonthlySalary
emp1,Arun,25,30000
emp2,Bala,30,40000
emp3,Chitra,30,50000


In [125]:
# Change type
df['Age'] = df['Age'].astype('int32')  # if no NaNs


In [126]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3 entries, emp1 to emp3
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   EmployeeName   3 non-null      object
 1   Age            3 non-null      int32 
 2   MonthlySalary  3 non-null      int64 
dtypes: int32(1), int64(1), object(1)
memory usage: 84.0+ bytes


In [129]:
#Grouping and aggrigation
data = {
    'Department': ['Accounts', 'Accounts', 'HR', 'HR', 'Sales'],
    'Name': ['Arun', 'Bala', 'Chitra', 'Divya', 'Eshwar'],
    'Salary': [50000, 55000, 45000, 48000, 60000],
    'Age': [25, 28, 30, 27, 35]
}
df = pd.DataFrame(data)

# Average salary by department
avg_salary = df.groupby('Department')['Salary'].sum()

avg_salary

Department
Accounts    105000
HR           93000
Sales        60000
Name: Salary, dtype: int64

In [131]:
# Multiple aggregations
dept_summary = df.groupby('Department').agg({
    'Salary': ['mean', 'sum', 'max'],
    'Age': ['mean', 'min', 'max']
})
dept_summary

Unnamed: 0_level_0,Salary,Salary,Salary
Unnamed: 0_level_1,mean,sum,max
Department,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
Accounts,52500.0,105000,55000
HR,46500.0,93000,48000
Sales,60000.0,60000,60000


In [138]:
#merge
employees = pd.DataFrame({
    'EmpID': [1, 2, 3],
    'Name': ['Arun', 'Bala', 'Chitra'],
    'DeptID': [10, 20, 30]
})

departments = pd.DataFrame({
    'DeptID': [10, 20,40],
    'DeptName': ['Accounts', 'HR','IT']
})

# Inner join on DeptID
merged = pd.merge(employees, departments, on='DeptID', how='outer')

merged

Unnamed: 0,EmpID,Name,DeptID,DeptName
0,1.0,Arun,10,Accounts
1,2.0,Bala,20,HR
2,3.0,Chitra,30,
3,,,40,IT


In [114]:
# Left join (all employees, dept info if available)
merged_left = pd.merge(employees, departments, on='DeptID', how='left')
merged_left

Unnamed: 0,EmpID,Name,DeptID,DeptName
0,1,Arun,10,Accounts
1,2,Bala,20,HR
2,3,Chitra,10,Accounts


In [142]:
#concating dataframe
df1 = pd.DataFrame({'ID': [1, 2], 'Name': ['Arun', 'Bala']})
df2 = pd.DataFrame({'ID': [3, 4], 'Name': ['Chitra', 'Divya']})

df_row = pd.concat([df1, df2], axis=1)

df_row

Unnamed: 0,ID,Name,ID.1,Name.1
0,1,Arun,3,Chitra
1,2,Bala,4,Divya


In [117]:
# Column-wise (side by side)
df_col = pd.concat([df1, df2], axis=1)
df_col

Unnamed: 0,ID,Name,ID.1,Name.1
0,1,Arun,3,Chitra
1,2,Bala,4,Divya


In [145]:
#apply function
df = pd.DataFrame({
    'Name': ['Arun', 'Bala', 'Chitra'],
    'Maths': [80, 90, 85],
    'Science': [75, 88, 92]
})

# Column-wise: total marks
df['Total'] = df[['Maths', 'Science']].sum(axis=1)

df

Unnamed: 0,Name,Maths,Science,Total
0,Arun,80,75,155
1,Bala,90,88,178
2,Chitra,85,92,177


In [146]:
# Using apply with custom function
def grade(total):
    if total >= 170:
        return 'A'
    elif total >= 160:
        return 'B'
    else:
        return 'C'

df['Grade'] = df['Total'].apply(grade)

In [120]:
df

Unnamed: 0,Name,Maths,Science,Total,Grade
0,Arun,80,75,155,C
1,Bala,90,88,178,A
2,Chitra,85,92,177,A


In [147]:
# value count
city_df = pd.DataFrame({
    'City': ['Chennai', 'Chennai', 'Bengaluru', 'Hyderabad', 'Chennai']
})

print(city_df['City'].unique())        # unique cities


['Chennai' 'Bengaluru' 'Hyderabad']


In [148]:
print(city_df['City'].nunique())    


3


In [149]:
print(city_df['City'].value_counts())  # frequency of each city


City
Chennai      3
Bengaluru    1
Hyderabad    1
Name: count, dtype: int64


In [153]:
# Remove duplicate
df = pd.DataFrame({
    'Name': ['Arun', 'Bala', 'Arun', 'Chitra','Chitra'],
    'City': ['Chennai', 'Bengaluru', 'Chennai', 'Hyderabad','Mumbai']
})
df


Unnamed: 0,Name,City
0,Arun,Chennai
1,Bala,Bengaluru
2,Arun,Chennai
3,Chitra,Hyderabad
4,Chitra,Mumbai


In [128]:
duplicates = df.duplicated()              # full row duplicates
duplicates

0    False
1    False
2     True
3    False
dtype: bool

In [156]:
df_unique = df.drop_duplicates()          # remove fully duplicate rows
df_unique

Unnamed: 0,Name,City
0,Arun,Chennai
1,Bala,Bengaluru
3,Chitra,Hyderabad
4,Chitra,Mumbai


In [155]:
# Drop duplicates based on particular column(s)
df_unique_name = df.drop_duplicates(subset=['Name'])
df_unique_name

Unnamed: 0,Name,City
0,Arun,Chennai
1,Bala,Bengaluru
3,Chitra,Hyderabad


In [158]:
# To CSV
df.to_csv('employees_out.csv', index=False)



In [157]:
# Read from CSV
df2 = pd.read_csv('employees.csv')

df2


Unnamed: 0,Name,City
0,Arun,Chennai
1,Bala,Bengaluru
2,Arun,Chennai
3,Chitra,Hyderabad


In [159]:
# To Excel
df.to_excel('employees.xlsx', index=False, sheet_name='employees')





In [160]:
df.to_html('employees.html')


In [134]:
# From Excel
df3 = pd.read_excel('employees.xlsx', sheet_name='Sheet1')
df3

Unnamed: 0,Name,City
0,Arun,Chennai
1,Bala,Bengaluru
2,Arun,Chennai
3,Chitra,Hyderabad


In [None]:
%Pandas Performance Optimization 
1. Why Optimize Pandas?
    Because Pandas becomes slow when:
    Data is large (1–50 million rows)
    You use Python loops instead of vectorization
    Data types are not optimized
    Too many intermediate DataFrames are created


In [161]:
df = pd.read_csv('transaction.csv')
df.head()

Unnamed: 0,transaction_id,transaction_date,customer_id,item_id,item_name,category,quantity,price,store_location,payment_method
0,18076,2025-01-29,1374,503,Eggs 12pc,Grocery,3,60,Bangalore,UPI
1,11196,2025-07-20,242,501,Milk 1L,Grocery,4,50,Pune,UPI
2,12971,2025-05-31,381,507,Banana 1kg,Fruits,2,40,Bangalore,UPI
3,19156,2025-11-06,565,506,Apple 1kg,Fruits,1,150,Pune,Credit Card
4,2859,2025-04-22,121,505,Cheese Slice,Dairy,3,120,Delhi,UPI


In [162]:
for i in range(len(df)):
    df.loc[i, "Total"] = df.loc[i, "quantity"] * df.loc[i, "price"]
df.head()

Unnamed: 0,transaction_id,transaction_date,customer_id,item_id,item_name,category,quantity,price,store_location,payment_method,Total
0,18076,2025-01-29,1374,503,Eggs 12pc,Grocery,3,60,Bangalore,UPI,180.0
1,11196,2025-07-20,242,501,Milk 1L,Grocery,4,50,Pune,UPI,200.0
2,12971,2025-05-31,381,507,Banana 1kg,Fruits,2,40,Bangalore,UPI,80.0
3,19156,2025-11-06,565,506,Apple 1kg,Fruits,1,150,Pune,Credit Card,150.0
4,2859,2025-04-22,121,505,Cheese Slice,Dairy,3,120,Delhi,UPI,360.0


In [14]:
df.drop('Total', axis=1,inplace=True)

In [163]:
# Fast (vectorized)
# Pandas works best when you perform operations on entire columns at once —not one row at a time.
df["Total"] = df["quantity"] * df["price"]
df.head()


Unnamed: 0,transaction_id,transaction_date,customer_id,item_id,item_name,category,quantity,price,store_location,payment_method,Total
0,18076,2025-01-29,1374,503,Eggs 12pc,Grocery,3,60,Bangalore,UPI,180
1,11196,2025-07-20,242,501,Milk 1L,Grocery,4,50,Pune,UPI,200
2,12971,2025-05-31,381,507,Banana 1kg,Fruits,2,40,Bangalore,UPI,80
3,19156,2025-11-06,565,506,Apple 1kg,Fruits,1,150,Pune,Credit Card,150
4,2859,2025-04-22,121,505,Cheese Slice,Dairy,3,120,Delhi,UPI,360


In [164]:
# Slow – Python function applied row-by-row
df["Discount"] = df.apply(lambda r: r.price * 0.1, axis=1)



In [20]:
df

Unnamed: 0,transaction_id,transaction_date,customer_id,item_id,item_name,category,quantity,price,store_location,payment_method,Total,Discount
0,18076,2025-01-29,1374,503,Eggs 12pc,Grocery,3,60,Bangalore,UPI,180,6.0
1,11196,2025-07-20,242,501,Milk 1L,Grocery,4,50,Pune,UPI,200,5.0
2,12971,2025-05-31,381,507,Banana 1kg,Fruits,2,40,Bangalore,UPI,80,4.0
3,19156,2025-11-06,565,506,Apple 1kg,Fruits,1,150,Pune,Credit Card,150,15.0
4,2859,2025-04-22,121,505,Cheese Slice,Dairy,3,120,Delhi,UPI,360,12.0
...,...,...,...,...,...,...,...,...,...,...,...,...
9995,7955,2025-08-18,1423,504,Butter,Dairy,1,80,Pune,Cash,80,8.0
9996,1728,2025-04-08,15,505,Cheese Slice,Dairy,2,120,Hyderabad,Cash,240,12.0
9997,17274,2025-07-06,1685,505,Cheese Slice,Dairy,3,120,Delhi,Cash,360,12.0
9998,13651,2025-03-16,1240,508,Chocolate Bar,Snacks,4,20,Hyderabad,Cash,80,2.0


In [21]:
df.head()

Unnamed: 0,transaction_id,transaction_date,customer_id,item_id,item_name,category,quantity,price,store_location,payment_method,Total,Discount
0,18076,2025-01-29,1374,503,Eggs 12pc,Grocery,3,60,Bangalore,UPI,180,6.0
1,11196,2025-07-20,242,501,Milk 1L,Grocery,4,50,Pune,UPI,200,5.0
2,12971,2025-05-31,381,507,Banana 1kg,Fruits,2,40,Bangalore,UPI,80,4.0
3,19156,2025-11-06,565,506,Apple 1kg,Fruits,1,150,Pune,Credit Card,150,15.0
4,2859,2025-04-22,121,505,Cheese Slice,Dairy,3,120,Delhi,UPI,360,12.0


In [22]:
df.drop('Discount', axis=1, inplace=True)

In [166]:

df["Discount"] = df["Total"] * 0.10

In [167]:
df.dtypes

transaction_id        int64
transaction_date     object
customer_id           int64
item_id               int64
item_name            object
category             object
quantity              int64
price                 int64
store_location       object
payment_method       object
Total                 int64
Discount            float64
dtype: object

In [168]:
# Rule #3 – Optimize Data Types
# “Data type mistakes waste BOTH time and memory.”
#Saves 50–80% RAM, groupby/merge becomes faster.


df["customer_id"]= df["customer_id"].astype("int32")


In [26]:
df.dtypes

transaction_id       int64
transaction_date    object
customer_id          int32
item_id              int64
item_name           object
category            object
quantity             int64
price                int64
store_location      object
payment_method      object
Total                int64
dtype: object

In [169]:
#slow
df = pd.read_csv("transaction.csv")

In [170]:
#fast
# usecols → skip unnecessary columns
# dtype → avoids type guessing
# parse_dates → automatically converts dates

df = pd.read_csv(
    "transaction.csv",
    dtype={"item_id ": "int32", "item_name": "object", "category":"object", "quantity":"int32","price":"int32"},
    parse_dates=["transaction_date"],
    usecols=["item_id", "item_name", "category", "quantity","price","transaction_date"]
)

df.head()

Unnamed: 0,transaction_date,item_id,item_name,category,quantity,price
0,2025-01-29,503,Eggs 12pc,Grocery,3,60
1,2025-07-20,501,Milk 1L,Grocery,4,50
2,2025-05-31,507,Banana 1kg,Fruits,2,40
3,2025-11-06,506,Apple 1kg,Fruits,1,150
4,2025-04-22,505,Cheese Slice,Dairy,3,120


In [171]:
# Use Boolean Masks Instead of Complex Conditions
# ❌ Slow
df_filtered = df[df["price"] > 100]
df_filtered = df_filtered[df_filtered["quantity"] > 2]
df_filtered.head()

Unnamed: 0,transaction_date,item_id,item_name,category,quantity,price
4,2025-04-22,505,Cheese Slice,Dairy,3,120
13,2025-08-22,505,Cheese Slice,Dairy,4,120
14,2025-08-19,506,Apple 1kg,Fruits,4,150
22,2025-03-26,506,Apple 1kg,Fruits,4,150
27,2025-04-06,506,Apple 1kg,Fruits,3,150


In [33]:
# Fast
df_filtered = df[(df["price"] > 100) & (df["quantity"] > 2)]
df.head()

Unnamed: 0,transaction_date,item_id,item_name,category,quantity,price
0,2025-01-29,503,Eggs 12pc,Grocery,3,60
1,2025-07-20,501,Milk 1L,Grocery,4,50
2,2025-05-31,507,Banana 1kg,Fruits,2,40
3,2025-11-06,506,Apple 1kg,Fruits,1,150
4,2025-04-22,505,Cheese Slice,Dairy,3,120


In [35]:
# Avoid Too Many Temporary Copies
# Poor practice
df1 = df[df.price > 0]
df2 = df1[df1.quantity > 2]
df3 = df2[["price", "quantity"]]
df3.head()


Unnamed: 0,price,quantity
0,60,3
1,50,4
4,120,3
6,80,4
9,50,3


In [46]:
# Good practice
df1 = df[(df['price'] > 0) & (df['quantity'] > 2)][['price', 'quantity']]
df1.head()

Unnamed: 0,price,quantity
0,60,3
1,50,4
4,120,3
6,80,4
9,50,3


In [48]:
# Use Index for Faster Lookup
# Without index (slower):
df[df["item_id"] == 503]



Unnamed: 0,transaction_date,item_id,item_name,category,quantity,price
0,2025-01-29,503,Eggs 12pc,Grocery,3,60
25,2025-01-18,503,Eggs 12pc,Grocery,4,60
26,2025-01-31,503,Eggs 12pc,Grocery,2,60
30,2025-08-12,503,Eggs 12pc,Grocery,1,60
31,2025-01-31,503,Eggs 12pc,Grocery,4,60
...,...,...,...,...,...,...
9967,2025-09-03,503,Eggs 12pc,Grocery,2,60
9969,2025-04-29,503,Eggs 12pc,Grocery,1,60
9973,2025-06-28,503,Eggs 12pc,Grocery,1,60
9976,2025-10-19,503,Eggs 12pc,Grocery,3,60


In [53]:
#Faster method
# df = df.set_index("item_id")
df.loc[503]

Unnamed: 0_level_0,transaction_date,item_name,category,quantity,price
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
503,2025-01-29,Eggs 12pc,Grocery,3,60
503,2025-01-18,Eggs 12pc,Grocery,4,60
503,2025-01-31,Eggs 12pc,Grocery,2,60
503,2025-08-12,Eggs 12pc,Grocery,1,60
503,2025-01-31,Eggs 12pc,Grocery,4,60
...,...,...,...,...,...
503,2025-09-03,Eggs 12pc,Grocery,2,60
503,2025-04-29,Eggs 12pc,Grocery,1,60
503,2025-06-28,Eggs 12pc,Grocery,1,60
503,2025-10-19,Eggs 12pc,Grocery,3,60


In [172]:
# Use query() for complex filters
# More readable + often faster.

df_small = df.query("category == 'Grocery' and price > 10")
df_small.head()

Unnamed: 0,transaction_date,item_id,item_name,category,quantity,price
0,2025-01-29,503,Eggs 12pc,Grocery,3,60
1,2025-07-20,501,Milk 1L,Grocery,4,50
9,2025-04-26,501,Milk 1L,Grocery,3,50
17,2025-08-14,501,Milk 1L,Grocery,1,50
19,2024-12-01,501,Milk 1L,Grocery,3,50


In [173]:
# Use NumPy for heavy math

amount = df["price"].to_numpy()
qty = df["quantity"].to_numpy()

df["Total"] = amount * qty


In [174]:
df.head()

Unnamed: 0,transaction_date,item_id,item_name,category,quantity,price,Total
0,2025-01-29,503,Eggs 12pc,Grocery,3,60,180
1,2025-07-20,501,Milk 1L,Grocery,4,50,200
2,2025-05-31,507,Banana 1kg,Fruits,2,40,80
3,2025-11-06,506,Apple 1kg,Fruits,1,150,150
4,2025-04-22,505,Cheese Slice,Dairy,3,120,360


In [None]:
# Memory Management Techniques for Large Datasets in Python & Pandas
#     Handling millions of rows efficiently
#     Reducing RAM usage
#     Optimizing performance

# Why Memory Management Matters
    # Challenges with Large Datasets
    # Operations become slow
    # Machine runs out of memory
    # Unnecessary swapping / crashes
    # Data loading takes more time
    # GroupBy, Merge, Sort become heavy
# Typical problems:
    # Pandas loads everything into memory
    # Defaults use heavy data types (int64, float64)
    # Unoptimized files inflate RAM usage

# How Pandas Stores Data
#     Pandas DataFrame = Collection of Columns
#     Each column stored as a NumPy array
#     Uses contiguous memory
#     Supports vectorized operations


