### Data Maipulation and Analysis with Pandas

Data manipulation and analysis are key tasks in any data science or data analysis project. Pandas provides a wide range of functions for data manipulation and analysis, making it easier to clean, transform and extract insights from data.

In [3]:
import pandas as pd

In [4]:
df = pd.read_csv("coffee_shop_sales (10.3).csv")
df.head()

Unnamed: 0,Date,Transaction_ID,Customer_Name,Age,Gender,Item,Category,Quantity,Price,Payment_Method,Branch,Rating
0,2025-10-01,T001,Arjun Mehta,25.0,M,Latte,Drink,2.0,150.0,Cash,Connaught Place,4.0
1,2025-10-01,T002,Simran Kaur,31.0,F,Cappuccino,Drink,1.0,180.0,Card,Connaught Place,5.0
2,2025-10-02,T003,Rahul Sharma,,M,Blueberry Muffin,Food,1.0,120.0,UPI,Saket,3.0
3,2025-10-02,T004,Neha Singh,27.0,F,Espresso,Drink,1.0,,Cash,Hauz Khas,4.0
4,2025-10-03,T005,Manish Gupta,45.0,M,Americano,Drink,3.0,140.0,Card,Hauz Khas,


In [13]:
df.describe()

Unnamed: 0,Age,Quantity,Price,Rating
count,12.0,14.0,13.0,12.0
mean,31.0,1.357143,166.153846,4.166667
std,6.410219,0.633324,43.308671,0.834847
min,23.0,1.0,100.0,3.0
25%,26.75,1.0,140.0,3.75
50%,29.5,1.0,150.0,4.0
75%,33.5,1.75,190.0,5.0
max,45.0,3.0,250.0,5.0


In [14]:
df.dtypes

Date               object
Transaction_ID     object
Customer_Name      object
Age               float64
Gender             object
Item               object
Category           object
Quantity          float64
Price             float64
Payment_Method     object
Branch             object
Rating            float64
dtype: object

In [None]:
## Handling missing values
## isnull() --> helps to find null values and gives a boolean value
## any() --> gives which particular column has got null values
df.isnull().any()

Date              False
Transaction_ID    False
Customer_Name     False
Age                True
Gender            False
Item              False
Category          False
Quantity           True
Price              True
Payment_Method     True
Branch             True
Rating             True
dtype: bool

In [16]:
## sum() --> helps find the number of null values in particular column
df.isnull().sum()

Date              0
Transaction_ID    0
Customer_Name     0
Age               3
Gender            0
Item              0
Category          0
Quantity          1
Price             2
Payment_Method    1
Branch            1
Rating            3
dtype: int64

In [None]:
## filling missing values with the mean of the column
## fillna() -> is a function that is used to fill the null values

df['Age_fillNA'] = df['Age'].fillna(df['Age'].mean())
df

Unnamed: 0,Date,Transaction_ID,Customer_Name,Age,Gender,Item,Category,Quantity,Price,Payment_Method,Branch,Rating,Age_fillNA
0,2025-10-01,T001,Arjun Mehta,25.0,M,Latte,Drink,2.0,150.0,Cash,Connaught Place,4.0,25.0
1,2025-10-01,T002,Simran Kaur,31.0,F,Cappuccino,Drink,1.0,180.0,Card,Connaught Place,5.0,31.0
2,2025-10-02,T003,Rahul Sharma,,M,Blueberry Muffin,Food,1.0,120.0,UPI,Saket,3.0,31.0
3,2025-10-02,T004,Neha Singh,27.0,F,Espresso,Drink,1.0,,Cash,Hauz Khas,4.0,27.0
4,2025-10-03,T005,Manish Gupta,45.0,M,Americano,Drink,3.0,140.0,Card,Hauz Khas,,45.0
5,2025-10-03,T006,Aarti Patel,,F,Chocolate Croissant,Food,2.0,200.0,,Saket,5.0,31.0
6,2025-10-04,T007,Kunal Verma,30.0,M,Mocha,Drink,1.0,190.0,UPI,Connaught Place,4.0,30.0
7,2025-10-04,T008,Priya Bansal,23.0,F,Latte,Drink,1.0,150.0,Card,Saket,5.0,23.0
8,2025-10-05,T009,Ritika Joshi,26.0,F,Cheese Sandwich,Food,,220.0,Cash,Hauz Khas,3.0,26.0
9,2025-10-05,T010,Sameer Khan,35.0,M,Cold Brew,Drink,1.0,250.0,Card,Connaught Place,4.0,35.0


In [18]:
## rating
df['Rating_fillna'] = df['Rating'].fillna(df['Rating'].mean())
df

Unnamed: 0,Date,Transaction_ID,Customer_Name,Age,Gender,Item,Category,Quantity,Price,Payment_Method,Branch,Rating,Age_fillNA,Rating_fillna
0,2025-10-01,T001,Arjun Mehta,25.0,M,Latte,Drink,2.0,150.0,Cash,Connaught Place,4.0,25.0,4.0
1,2025-10-01,T002,Simran Kaur,31.0,F,Cappuccino,Drink,1.0,180.0,Card,Connaught Place,5.0,31.0,5.0
2,2025-10-02,T003,Rahul Sharma,,M,Blueberry Muffin,Food,1.0,120.0,UPI,Saket,3.0,31.0,3.0
3,2025-10-02,T004,Neha Singh,27.0,F,Espresso,Drink,1.0,,Cash,Hauz Khas,4.0,27.0,4.0
4,2025-10-03,T005,Manish Gupta,45.0,M,Americano,Drink,3.0,140.0,Card,Hauz Khas,,45.0,4.166667
5,2025-10-03,T006,Aarti Patel,,F,Chocolate Croissant,Food,2.0,200.0,,Saket,5.0,31.0,5.0
6,2025-10-04,T007,Kunal Verma,30.0,M,Mocha,Drink,1.0,190.0,UPI,Connaught Place,4.0,30.0,4.0
7,2025-10-04,T008,Priya Bansal,23.0,F,Latte,Drink,1.0,150.0,Card,Saket,5.0,23.0,5.0
8,2025-10-05,T009,Ritika Joshi,26.0,F,Cheese Sandwich,Food,,220.0,Cash,Hauz Khas,3.0,26.0,3.0
9,2025-10-05,T010,Sameer Khan,35.0,M,Cold Brew,Drink,1.0,250.0,Card,Connaught Place,4.0,35.0,4.0


In [6]:
## Rename column
df = df.rename(columns = {'Price':'Amount'})
df.head()


Unnamed: 0,Date,Transaction_ID,Customer_Name,Age,Gender,Item,Category,Quantity,Amount,Payment_Method,Branch,Rating
0,2025-10-01,T001,Arjun Mehta,25.0,M,Latte,Drink,2.0,150.0,Cash,Connaught Place,4.0
1,2025-10-01,T002,Simran Kaur,31.0,F,Cappuccino,Drink,1.0,180.0,Card,Connaught Place,5.0
2,2025-10-02,T003,Rahul Sharma,,M,Blueberry Muffin,Food,1.0,120.0,UPI,Saket,3.0
3,2025-10-02,T004,Neha Singh,27.0,F,Espresso,Drink,1.0,,Cash,Hauz Khas,4.0
4,2025-10-03,T005,Manish Gupta,45.0,M,Americano,Drink,3.0,140.0,Card,Hauz Khas,


In [7]:
## change the data types
df['Amount_new']=df['Amount'].fillna(df['Amount'].mean())
df.head()

Unnamed: 0,Date,Transaction_ID,Customer_Name,Age,Gender,Item,Category,Quantity,Amount,Payment_Method,Branch,Rating,Amount_new
0,2025-10-01,T001,Arjun Mehta,25.0,M,Latte,Drink,2.0,150.0,Cash,Connaught Place,4.0,150.0
1,2025-10-01,T002,Simran Kaur,31.0,F,Cappuccino,Drink,1.0,180.0,Card,Connaught Place,5.0,180.0
2,2025-10-02,T003,Rahul Sharma,,M,Blueberry Muffin,Food,1.0,120.0,UPI,Saket,3.0,120.0
3,2025-10-02,T004,Neha Singh,27.0,F,Espresso,Drink,1.0,,Cash,Hauz Khas,4.0,166.153846
4,2025-10-03,T005,Manish Gupta,45.0,M,Americano,Drink,3.0,140.0,Card,Hauz Khas,,140.0


In [8]:
df['Amount_int']=df['Amount_new'].astype(int)
df.head()

Unnamed: 0,Date,Transaction_ID,Customer_Name,Age,Gender,Item,Category,Quantity,Amount,Payment_Method,Branch,Rating,Amount_new,Amount_int
0,2025-10-01,T001,Arjun Mehta,25.0,M,Latte,Drink,2.0,150.0,Cash,Connaught Place,4.0,150.0,150
1,2025-10-01,T002,Simran Kaur,31.0,F,Cappuccino,Drink,1.0,180.0,Card,Connaught Place,5.0,180.0,180
2,2025-10-02,T003,Rahul Sharma,,M,Blueberry Muffin,Food,1.0,120.0,UPI,Saket,3.0,120.0,120
3,2025-10-02,T004,Neha Singh,27.0,F,Espresso,Drink,1.0,,Cash,Hauz Khas,4.0,166.153846,166
4,2025-10-03,T005,Manish Gupta,45.0,M,Americano,Drink,3.0,140.0,Card,Hauz Khas,,140.0,140


In [10]:
df['Increased_Price'] = df['Amount'].apply(lambda x:x*2)
df.head()

Unnamed: 0,Date,Transaction_ID,Customer_Name,Age,Gender,Item,Category,Quantity,Amount,Payment_Method,Branch,Rating,Amount_new,Amount_int,Increased_Price
0,2025-10-01,T001,Arjun Mehta,25.0,M,Latte,Drink,2.0,150.0,Cash,Connaught Place,4.0,150.0,150,300.0
1,2025-10-01,T002,Simran Kaur,31.0,F,Cappuccino,Drink,1.0,180.0,Card,Connaught Place,5.0,180.0,180,360.0
2,2025-10-02,T003,Rahul Sharma,,M,Blueberry Muffin,Food,1.0,120.0,UPI,Saket,3.0,120.0,120,240.0
3,2025-10-02,T004,Neha Singh,27.0,F,Espresso,Drink,1.0,,Cash,Hauz Khas,4.0,166.153846,166,
4,2025-10-03,T005,Manish Gupta,45.0,M,Americano,Drink,3.0,140.0,Card,Hauz Khas,,140.0,140,280.0


In [11]:
## Data Aggregating and grouping

grouped_mean = df.groupby('Item')['Amount'].mean()
print(grouped_mean)

Item
Americano              140.0
Blueberry Muffin       120.0
Cappuccino             180.0
Cheese Sandwich        220.0
Chocolate Cake           NaN
Chocolate Croissant    200.0
Cold Brew              250.0
Espresso               100.0
Latte                  150.0
Mocha                  190.0
Name: Amount, dtype: float64


In [12]:
grouped_sum = df.groupby(['Item', 'Branch'])['Amount'].sum()
print(grouped_sum)

Item                 Branch         
Americano            Hauz Khas          140.0
Blueberry Muffin     Connaught Place    120.0
                     Saket              120.0
Cappuccino           Connaught Place    180.0
Cheese Sandwich      Hauz Khas          220.0
Chocolate Cake       Connaught Place      0.0
Chocolate Croissant  Saket              200.0
Cold Brew            Connaught Place    250.0
Espresso             Hauz Khas          100.0
Latte                Connaught Place    150.0
                     Saket              150.0
Mocha                Connaught Place    190.0
                     Saket              190.0
Name: Amount, dtype: float64


In [13]:
## aggregate multiple functions

grouped_agg = df.groupby('Branch')['Amount'].agg(['mean','sum','count'])
print(grouped_agg)

                       mean    sum  count
Branch                                   
Connaught Place  178.000000  890.0      5
Hauz Khas        153.333333  460.0      3
Saket            165.000000  660.0      4


In [16]:
## Merging and joining data frames

df1 = pd.DataFrame({'Id': [1,2,3], 'Name': ['anushka','puneet','jack']})
df2 = pd.DataFrame({'Id': [1,2,3], 'City': ['Menton','Delhi','Bengalore']})
pd.merge(df1,df2,on = 'Id')

Unnamed: 0,Id,Name,City
0,1,anushka,Menton
1,2,puneet,Delhi
2,3,jack,Bengalore


In [18]:
df1 = pd.DataFrame({'Id': [1,2,4], 'Name': ['anushka','puneet','jack']})
df2 = pd.DataFrame({'Id': [1,2,3], 'City': ['Menton','Delhi','Bengalore']})
pd.merge(df1,df2,on = 'Id', how ='outer')

Unnamed: 0,Id,Name,City
0,1,anushka,Menton
1,2,puneet,Delhi
2,3,,Bengalore
3,4,jack,


In [19]:
pd.merge(df1,df2,on = 'Id', how ='left')

Unnamed: 0,Id,Name,City
0,1,anushka,Menton
1,2,puneet,Delhi
2,4,jack,


In [20]:
pd.merge(df1,df2,on = 'Id', how ='right')

Unnamed: 0,Id,Name,City
0,1,anushka,Menton
1,2,puneet,Delhi
2,3,,Bengalore
