[Pandas Data Cleaning and Preprocessing](https://www.w3resource.com/python-exercises/pandas/pandas-data-cleaning-and-preprocessing.php)

In [1]:
import pandas as pd
import numpy as np

<div class="alert alert-warning">

**1. Handling Missing Data in Pandas**

**Write a Pandas program to fill missing values (NaN) in a DataFrame using fillna().**

</div>

In [2]:
# Create date range
date_range = pd.date_range(start='2023-01-01', periods=35, freq='D')

# Create a dictionary of data
data = {
    'date': date_range,
    'customer_id': np.random.randint(1000, 2000, size=35),
    'product_id': np.random.randint(100, 200, size=35),
    'quantity': np.random.randint(1, 10, size=35),
    'price': np.round(np.random.uniform(10.0, 100.0, size=35), 2),
    'discount': np.round(np.random.uniform(0, 0.3, size=35), 2),
    'revenue': lambda df: df['quantity'] * df['price'] * (1 - df['discount']),
    'store_id': np.random.randint(1, 10, size=35),
    'region': np.random.choice(['North', 'South', 'East', 'West'], size=35),
    'payment_method': np.random.choice(['Credit Card', 'Cash', 'Online'], size=35),
    'is_member': np.random.choice([True, False], size=35),
    'delivery_days': np.random.randint(1, 7, size=35),
    'customer_age': np.random.randint(18, 70, size=35),
    'gender': np.random.choice(['Male', 'Female', 'Other'], size=35),
    'product_category': np.random.choice(['Electronics', 'Clothing', 'Books', 'Food'], size=35),
    'rating': np.round(np.random.uniform(1.0, 5.0, size=35), 1),
    'return_status': np.random.choice(['Returned', 'Not Returned'], size=35),
    'feedback': np.random.choice(['Positive', 'Neutral', 'Negative'], size=35),
}

# Create DataFrame
df = pd.DataFrame(data)

# Compute revenue using the lambda function above
df['revenue'] = df['quantity'] * df['price'] * (1 - df['discount'])

# Add 'shipping_cost' with some missing values
df['shipping_cost'] = np.where(np.random.rand(len(df)) < 0.2, np.nan, np.round(np.random.uniform(5.0, 20.0, size=len(df)), 2))

# Add 'coupon_code' with some missing values
coupon_options = ['SAVE10', 'WELCOME5', 'FREESHIP', np.nan]
df['coupon_code'] = np.random.choice(coupon_options, size=len(df), p=[0.2, 0.2, 0.2, 0.4])

# Add 'delivery_rating' with some missing values
df['delivery_rating'] = np.where(np.random.rand(len(df)) < 0.3, np.nan, np.round(np.random.uniform(1.0, 5.0, size=len(df)), 1))

# Display the updated DataFrame
display(df)

Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,...,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating
0,2023-01-01,1641,185,7,94.98,0.1,598.374,4,West,Cash,...,2,36,Female,Electronics,3.7,Returned,Positive,6.77,,4.9
1,2023-01-02,1317,121,6,79.28,0.06,447.1392,5,North,Cash,...,5,50,Female,Clothing,1.1,Returned,Neutral,13.78,SAVE10,
2,2023-01-03,1046,175,5,61.0,0.04,292.8,6,West,Online,...,4,26,Male,Electronics,2.6,Returned,Neutral,6.86,,
3,2023-01-04,1396,144,1,18.67,0.29,13.2557,8,East,Online,...,5,65,Female,Food,4.5,Not Returned,Positive,7.05,WELCOME5,1.9
4,2023-01-05,1068,112,9,48.52,0.15,371.178,6,West,Cash,...,2,59,Female,Clothing,3.2,Returned,Negative,,WELCOME5,4.9
5,2023-01-06,1028,173,4,88.06,0.29,250.0904,5,North,Cash,...,3,41,Other,Food,4.8,Returned,Negative,16.95,,
6,2023-01-07,1559,194,3,42.36,0.25,95.31,1,East,Cash,...,1,69,Male,Electronics,4.5,Returned,Negative,19.98,,
7,2023-01-08,1773,101,3,26.8,0.03,77.988,7,East,Online,...,5,52,Female,Books,3.0,Not Returned,Neutral,6.43,FREESHIP,1.5
8,2023-01-09,1253,114,6,78.57,0.02,461.9916,9,North,Cash,...,3,64,Other,Clothing,3.9,Not Returned,Neutral,18.66,SAVE10,2.9
9,2023-01-10,1296,198,3,50.02,0.01,148.5594,2,South,Cash,...,5,25,Female,Books,4.5,Not Returned,Neutral,11.24,,4.7


<div class="alert alert-success">

**Solution 01:**
</div>

In [3]:
# 1. Handling Missing Data in Pandas
# Write a Pandas program to fill missing values (NaN) in a DataFrame using fillna().
result_df = df.fillna({
    'shipping_cost': df['shipping_cost'].mean(),
    'coupon_code': 'No Coupon',
    'delivery_rating': df['delivery_rating'].mean()
})

result_df

Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,...,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating
0,2023-01-01,1641,185,7,94.98,0.1,598.374,4,West,Cash,...,2,36,Female,Electronics,3.7,Returned,Positive,6.77,,4.9
1,2023-01-02,1317,121,6,79.28,0.06,447.1392,5,North,Cash,...,5,50,Female,Clothing,1.1,Returned,Neutral,13.78,SAVE10,3.321053
2,2023-01-03,1046,175,5,61.0,0.04,292.8,6,West,Online,...,4,26,Male,Electronics,2.6,Returned,Neutral,6.86,,3.321053
3,2023-01-04,1396,144,1,18.67,0.29,13.2557,8,East,Online,...,5,65,Female,Food,4.5,Not Returned,Positive,7.05,WELCOME5,1.9
4,2023-01-05,1068,112,9,48.52,0.15,371.178,6,West,Cash,...,2,59,Female,Clothing,3.2,Returned,Negative,12.10963,WELCOME5,4.9
5,2023-01-06,1028,173,4,88.06,0.29,250.0904,5,North,Cash,...,3,41,Other,Food,4.8,Returned,Negative,16.95,,3.321053
6,2023-01-07,1559,194,3,42.36,0.25,95.31,1,East,Cash,...,1,69,Male,Electronics,4.5,Returned,Negative,19.98,,3.321053
7,2023-01-08,1773,101,3,26.8,0.03,77.988,7,East,Online,...,5,52,Female,Books,3.0,Not Returned,Neutral,6.43,FREESHIP,1.5
8,2023-01-09,1253,114,6,78.57,0.02,461.9916,9,North,Cash,...,3,64,Other,Clothing,3.9,Not Returned,Neutral,18.66,SAVE10,2.9
9,2023-01-10,1296,198,3,50.02,0.01,148.5594,2,South,Cash,...,5,25,Female,Books,4.5,Not Returned,Neutral,11.24,,4.7


<div class="alert alert-warning">

**2. Removing Missing Data observations in Pandas**

**Write a Pandas program to drop rows with missing data.**

</div>

In [4]:
# Create date range
date_range = pd.date_range(start='2023-01-01', periods=35, freq='D')

# Create a dictionary of data
data = {
    'date': date_range,
    'customer_id': np.random.randint(1000, 2000, size=35),
    'product_id': np.random.randint(100, 200, size=35),
    'quantity': np.random.randint(1, 10, size=35),
    'price': np.round(np.random.uniform(10.0, 100.0, size=35), 2),
    'discount': np.round(np.random.uniform(0, 0.3, size=35), 2),
    'revenue': lambda df: df['quantity'] * df['price'] * (1 - df['discount']),
    'store_id': np.random.randint(1, 10, size=35),
    'region': np.random.choice(['North', 'South', 'East', 'West'], size=35),
    'payment_method': np.random.choice(['Credit Card', 'Cash', 'Online'], size=35),
    'is_member': np.random.choice([True, False], size=35),
    'delivery_days': np.random.randint(1, 7, size=35),
    'customer_age': np.random.randint(18, 70, size=35),
    'gender': np.random.choice(['Male', 'Female', 'Other'], size=35),
    'product_category': np.random.choice(['Electronics', 'Clothing', 'Books', 'Food'], size=35),
    'rating': np.round(np.random.uniform(1.0, 5.0, size=35), 1),
    'return_status': np.random.choice(['Returned', 'Not Returned'], size=35),
    'feedback': np.random.choice(['Positive', 'Neutral', 'Negative'], size=35),
}

# Create DataFrame
df = pd.DataFrame(data)

# Compute revenue using the lambda function above
df['revenue'] = df['quantity'] * df['price'] * (1 - df['discount'])

# Add 'shipping_cost' with some missing values
df['shipping_cost'] = np.where(np.random.rand(len(df)) < 0.2, np.nan, np.round(np.random.uniform(5.0, 20.0, size=len(df)), 2))

# Add 'coupon_code' with some missing values
coupon_options = ['SAVE10', 'WELCOME5', 'FREESHIP', np.nan]
df['coupon_code'] = np.random.choice(coupon_options, size=len(df), p=[0.2, 0.2, 0.2, 0.4])

# Add 'delivery_rating' with some missing values
df['delivery_rating'] = np.where(np.random.rand(len(df)) < 0.3, np.nan, np.round(np.random.uniform(1.0, 5.0, size=len(df)), 1))

# Display the updated DataFrame
display(df)

Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,...,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating
0,2023-01-01,1991,118,7,39.46,0.21,218.2138,6,East,Credit Card,...,5,27,Male,Electronics,1.4,Not Returned,Positive,8.63,FREESHIP,
1,2023-01-02,1958,156,6,32.69,0.2,156.912,1,East,Cash,...,6,24,Female,Electronics,1.1,Returned,Positive,14.82,,2.2
2,2023-01-03,1594,169,9,18.02,0.14,139.4748,5,West,Credit Card,...,1,65,Female,Clothing,4.9,Not Returned,Neutral,5.27,WELCOME5,
3,2023-01-04,1390,141,2,23.68,0.21,37.4144,7,South,Credit Card,...,5,22,Female,Electronics,1.6,Returned,Positive,17.2,FREESHIP,3.1
4,2023-01-05,1543,165,8,17.42,0.12,122.6368,2,East,Online,...,3,45,Female,Food,4.7,Not Returned,Negative,7.38,,3.8
5,2023-01-06,1099,186,6,32.75,0.21,155.235,2,East,Cash,...,2,46,Other,Food,3.1,Returned,Positive,19.15,SAVE10,3.0
6,2023-01-07,1156,120,1,40.19,0.07,37.3767,4,North,Credit Card,...,6,30,Male,Books,2.4,Not Returned,Neutral,13.28,,4.0
7,2023-01-08,1737,139,1,12.27,0.24,9.3252,8,South,Cash,...,1,27,Female,Food,3.3,Not Returned,Neutral,,WELCOME5,
8,2023-01-09,1440,154,8,38.23,0.05,290.548,4,West,Online,...,4,34,Male,Electronics,2.0,Not Returned,Neutral,16.25,WELCOME5,
9,2023-01-10,1931,107,9,18.92,0.18,139.6296,7,West,Cash,...,5,67,Other,Clothing,3.4,Not Returned,Neutral,14.34,,1.8


<div class="alert alert-success">

**Solution 02:**
</div>

In [5]:
# 2. Handling Duplicates in Pandas
# Write a Pandas program to drop rows with missing data.

result_df = df.dropna()
result_df

Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,...,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating
1,2023-01-02,1958,156,6,32.69,0.2,156.912,1,East,Cash,...,6,24,Female,Electronics,1.1,Returned,Positive,14.82,,2.2
3,2023-01-04,1390,141,2,23.68,0.21,37.4144,7,South,Credit Card,...,5,22,Female,Electronics,1.6,Returned,Positive,17.2,FREESHIP,3.1
4,2023-01-05,1543,165,8,17.42,0.12,122.6368,2,East,Online,...,3,45,Female,Food,4.7,Not Returned,Negative,7.38,,3.8
5,2023-01-06,1099,186,6,32.75,0.21,155.235,2,East,Cash,...,2,46,Other,Food,3.1,Returned,Positive,19.15,SAVE10,3.0
6,2023-01-07,1156,120,1,40.19,0.07,37.3767,4,North,Credit Card,...,6,30,Male,Books,2.4,Not Returned,Neutral,13.28,,4.0
9,2023-01-10,1931,107,9,18.92,0.18,139.6296,7,West,Cash,...,5,67,Other,Clothing,3.4,Not Returned,Neutral,14.34,,1.8
10,2023-01-11,1323,177,5,13.49,0.26,49.913,9,West,Cash,...,5,24,Male,Clothing,4.1,Not Returned,Neutral,16.37,,4.7
11,2023-01-12,1533,102,6,69.34,0.29,295.3884,8,South,Cash,...,1,26,Male,Food,3.1,Returned,Positive,17.8,SAVE10,3.6
13,2023-01-14,1293,108,2,50.85,0.03,98.649,2,East,Cash,...,3,48,Female,Electronics,2.3,Returned,Neutral,6.35,,3.6
14,2023-01-15,1871,188,8,98.74,0.19,639.8352,4,East,Credit Card,...,6,28,Female,Electronics,3.8,Returned,Negative,11.53,FREESHIP,3.2


<div class="alert alert-warning">

**3. Detecting Duplicates in Pandas**

**Write a Pandas program to detect duplicates using duplicated() method.**

</div>

In [6]:
# Create date range
date_range = pd.date_range(start='2023-01-01', periods=35, freq='D')

# Create a dictionary of data
data = {
    'date': date_range,
    'customer_id': np.random.randint(1000, 2000, size=35),
    'product_id': np.random.randint(100, 200, size=35),
    'quantity': np.random.randint(1, 10, size=35),
    'price': np.round(np.random.uniform(10.0, 100.0, size=35), 2),
    'discount': np.round(np.random.uniform(0, 0.3, size=35), 2),
    'revenue': lambda df: df['quantity'] * df['price'] * (1 - df['discount']),
    'store_id': np.random.randint(1, 10, size=35),
    'region': np.random.choice(['North', 'South', 'East', 'West'], size=35),
    'payment_method': np.random.choice(['Credit Card', 'Cash', 'Online'], size=35),
    'is_member': np.random.choice([True, False], size=35),
    'delivery_days': np.random.randint(1, 7, size=35),
    'customer_age': np.random.randint(18, 70, size=35),
    'gender': np.random.choice(['Male', 'Female', 'Other'], size=35),
    'product_category': np.random.choice(['Electronics', 'Clothing', 'Books', 'Food'], size=35),
    'rating': np.round(np.random.uniform(1.0, 5.0, size=35), 1),
    'return_status': np.random.choice(['Returned', 'Not Returned'], size=35),
    'feedback': np.random.choice(['Positive', 'Neutral', 'Negative'], size=35),
}

# Create DataFrame
df = pd.DataFrame(data)

# Compute revenue using the lambda function above
df['revenue'] = df['quantity'] * df['price'] * (1 - df['discount'])

# Add 'shipping_cost' with some missing values
df['shipping_cost'] = np.where(np.random.rand(len(df)) < 0.2, np.nan, np.round(np.random.uniform(5.0, 20.0, size=len(df)), 2))

# Add 'coupon_code' with some missing values
coupon_options = ['SAVE10', 'WELCOME5', 'FREESHIP', np.nan]
df['coupon_code'] = np.random.choice(coupon_options, size=len(df), p=[0.2, 0.2, 0.2, 0.4])

# Add 'delivery_rating' with some missing values
df['delivery_rating'] = np.where(np.random.rand(len(df)) < 0.3, np.nan, np.round(np.random.uniform(1.0, 5.0, size=len(df)), 1))


duplicates = df.sample(n=5, replace=False)  # Select 5 random rows without replacement
df = pd.concat([df, duplicates], ignore_index=True)

# Display the updated DataFrame
display(df)

Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,...,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating
0,2023-01-01,1390,135,9,66.86,0.19,487.4094,3,East,Online,...,4,60,Other,Electronics,1.1,Not Returned,Positive,,SAVE10,1.8
1,2023-01-02,1096,140,1,68.74,0.09,62.5534,2,West,Online,...,6,32,Other,Food,2.1,Returned,Negative,5.84,SAVE10,4.2
2,2023-01-03,1762,110,9,12.28,0.24,83.9952,4,South,Online,...,4,29,Male,Clothing,1.5,Returned,Positive,5.37,WELCOME5,2.9
3,2023-01-04,1118,199,5,17.1,0.26,63.27,4,South,Online,...,3,54,Other,Food,2.6,Not Returned,Neutral,,,
4,2023-01-05,1816,199,4,56.86,0.24,172.8544,2,East,Cash,...,5,21,Female,Clothing,1.2,Returned,Negative,13.37,,3.6
5,2023-01-06,1543,146,8,15.58,0.09,113.4224,8,North,Credit Card,...,1,32,Female,Books,2.5,Not Returned,Negative,,FREESHIP,2.8
6,2023-01-07,1517,161,5,82.93,0.07,385.6245,6,North,Cash,...,2,64,Other,Clothing,1.9,Not Returned,Positive,,,
7,2023-01-08,1445,138,8,64.93,0.28,373.9968,3,South,Online,...,3,41,Female,Clothing,3.2,Returned,Neutral,17.89,FREESHIP,
8,2023-01-09,1785,164,6,74.82,0.1,404.028,9,East,Online,...,5,18,Other,Electronics,4.7,Returned,Neutral,,,4.1
9,2023-01-10,1777,168,2,57.84,0.22,90.2304,5,West,Credit Card,...,5,24,Other,Electronics,3.7,Returned,Neutral,12.6,FREESHIP,4.8


<div class="alert alert-success">

**Solution 03:**
</div>

In [7]:
# 3. Data Cleaning Techniques
# Write a Pandas program to detect duplicates using duplicated() method.
duplicates = df.duplicated()
duplicates_df = df[duplicates]
duplicates_df

Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,...,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating
35,2023-01-27,1407,108,5,29.25,0.03,141.8625,8,North,Cash,...,6,29,Male,Books,4.1,Not Returned,Positive,10.55,SAVE10,4.2
36,2023-01-09,1785,164,6,74.82,0.1,404.028,9,East,Online,...,5,18,Other,Electronics,4.7,Returned,Neutral,,,4.1
37,2023-02-02,1440,156,6,64.78,0.27,283.7364,7,South,Online,...,4,36,Female,Clothing,4.1,Not Returned,Negative,,FREESHIP,3.9
38,2023-02-04,1094,194,3,50.7,0.23,117.117,9,East,Cash,...,5,68,Male,Food,1.2,Not Returned,Negative,11.64,,
39,2023-01-17,1730,109,9,14.99,0.27,98.4843,9,South,Cash,...,5,48,Female,Clothing,1.8,Not Returned,Negative,,FREESHIP,


<div class="alert alert-warning">

**4. Handling Duplicates in Pandas**

**Write a Pandas program to remove duplicates rows from a DataFrame.**

</div>

In [8]:
# Create date range
date_range = pd.date_range(start='2023-01-01', periods=35, freq='D')

# Create a dictionary of data
data = {
    'date': date_range,
    'customer_id': np.random.randint(1000, 2000, size=35),
    'product_id': np.random.randint(100, 200, size=35),
    'quantity': np.random.randint(1, 10, size=35),
    'price': np.round(np.random.uniform(10.0, 100.0, size=35), 2),
    'discount': np.round(np.random.uniform(0, 0.3, size=35), 2),
    'revenue': lambda df: df['quantity'] * df['price'] * (1 - df['discount']),
    'store_id': np.random.randint(1, 10, size=35),
    'region': np.random.choice(['North', 'South', 'East', 'West'], size=35),
    'payment_method': np.random.choice(['Credit Card', 'Cash', 'Online'], size=35),
    'is_member': np.random.choice([True, False], size=35),
    'delivery_days': np.random.randint(1, 7, size=35),
    'customer_age': np.random.randint(18, 70, size=35),
    'gender': np.random.choice(['Male', 'Female', 'Other'], size=35),
    'product_category': np.random.choice(['Electronics', 'Clothing', 'Books', 'Food'], size=35),
    'rating': np.round(np.random.uniform(1.0, 5.0, size=35), 1),
    'return_status': np.random.choice(['Returned', 'Not Returned'], size=35),
    'feedback': np.random.choice(['Positive', 'Neutral', 'Negative'], size=35),
}

# Create DataFrame
df = pd.DataFrame(data)

# Compute revenue using the lambda function above
df['revenue'] = df['quantity'] * df['price'] * (1 - df['discount'])

# Add 'shipping_cost' with some missing values
df['shipping_cost'] = np.where(np.random.rand(len(df)) < 0.2, np.nan, np.round(np.random.uniform(5.0, 20.0, size=len(df)), 2))

# Add 'coupon_code' with some missing values
coupon_options = ['SAVE10', 'WELCOME5', 'FREESHIP', np.nan]
df['coupon_code'] = np.random.choice(coupon_options, size=len(df), p=[0.2, 0.2, 0.2, 0.4])

# Add 'delivery_rating' with some missing values
df['delivery_rating'] = np.where(np.random.rand(len(df)) < 0.3, np.nan, np.round(np.random.uniform(1.0, 5.0, size=len(df)), 1))


duplicates = df.sample(n=5, replace=False)  # Select 5 random rows without replacement
df = pd.concat([df, duplicates], ignore_index=True)

# Display the updated DataFrame
display(df)

Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,...,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating
0,2023-01-01,1913,132,9,13.57,0.0,122.13,4,North,Credit Card,...,4,55,Male,Books,1.5,Not Returned,Neutral,12.67,FREESHIP,3.0
1,2023-01-02,1649,156,6,56.51,0.04,325.4976,2,South,Credit Card,...,2,37,Other,Food,1.3,Returned,Positive,9.35,FREESHIP,4.3
2,2023-01-03,1760,184,1,65.14,0.21,51.4606,1,South,Credit Card,...,1,42,Other,Food,3.4,Returned,Neutral,16.71,,
3,2023-01-04,1696,169,6,84.96,0.16,428.1984,5,East,Online,...,2,43,Male,Clothing,4.4,Returned,Neutral,16.29,WELCOME5,1.6
4,2023-01-05,1852,102,9,52.15,0.12,413.028,9,East,Credit Card,...,6,44,Other,Clothing,1.9,Not Returned,Positive,16.16,,
5,2023-01-06,1872,102,4,14.52,0.26,42.9792,5,East,Online,...,6,26,Female,Food,2.2,Not Returned,Neutral,16.78,FREESHIP,2.7
6,2023-01-07,1782,154,9,96.0,0.24,656.64,6,North,Online,...,3,52,Female,Food,1.5,Returned,Positive,15.44,FREESHIP,3.4
7,2023-01-08,1376,109,6,86.36,0.24,393.8016,8,West,Credit Card,...,1,36,Other,Electronics,4.1,Returned,Neutral,,FREESHIP,1.9
8,2023-01-09,1261,102,2,10.1,0.28,14.544,7,West,Credit Card,...,4,58,Other,Food,3.4,Not Returned,Negative,16.59,,1.1
9,2023-01-10,1414,173,9,18.09,0.16,136.7604,1,East,Online,...,1,67,Male,Books,2.1,Not Returned,Positive,10.83,,3.5


<div class="alert alert-success">

**Solution 04:**
</div>

In [9]:
df.drop_duplicates()

Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,...,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating
0,2023-01-01,1913,132,9,13.57,0.0,122.13,4,North,Credit Card,...,4,55,Male,Books,1.5,Not Returned,Neutral,12.67,FREESHIP,3.0
1,2023-01-02,1649,156,6,56.51,0.04,325.4976,2,South,Credit Card,...,2,37,Other,Food,1.3,Returned,Positive,9.35,FREESHIP,4.3
2,2023-01-03,1760,184,1,65.14,0.21,51.4606,1,South,Credit Card,...,1,42,Other,Food,3.4,Returned,Neutral,16.71,,
3,2023-01-04,1696,169,6,84.96,0.16,428.1984,5,East,Online,...,2,43,Male,Clothing,4.4,Returned,Neutral,16.29,WELCOME5,1.6
4,2023-01-05,1852,102,9,52.15,0.12,413.028,9,East,Credit Card,...,6,44,Other,Clothing,1.9,Not Returned,Positive,16.16,,
5,2023-01-06,1872,102,4,14.52,0.26,42.9792,5,East,Online,...,6,26,Female,Food,2.2,Not Returned,Neutral,16.78,FREESHIP,2.7
6,2023-01-07,1782,154,9,96.0,0.24,656.64,6,North,Online,...,3,52,Female,Food,1.5,Returned,Positive,15.44,FREESHIP,3.4
7,2023-01-08,1376,109,6,86.36,0.24,393.8016,8,West,Credit Card,...,1,36,Other,Electronics,4.1,Returned,Neutral,,FREESHIP,1.9
8,2023-01-09,1261,102,2,10.1,0.28,14.544,7,West,Credit Card,...,4,58,Other,Food,3.4,Not Returned,Negative,16.59,,1.1
9,2023-01-10,1414,173,9,18.09,0.16,136.7604,1,East,Online,...,1,67,Male,Books,2.1,Not Returned,Positive,10.83,,3.5


In [10]:
# 4. Handling Duplicates in Pandas
# Write a Pandas program to remove duplicates rows from a DataFrame.

df = df.drop_duplicates()
df.reset_index(drop=True, inplace=True)

<div class="alert alert-warning">

**5. Handling Outliers with Z-Score Method**

**Write a Pandas program to handle outliers in a DataFrame with Z-Score method.**

</div>

In [11]:
# Create date range
date_range = pd.date_range(start='2023-01-01', periods=35, freq='D')

# Create a dictionary of data
data = {
    'date': date_range,
    'customer_id': np.random.randint(1000, 2000, size=35),
    'product_id': np.random.randint(100, 200, size=35),
    'quantity': np.random.randint(1, 10, size=35),
    'price': np.round(np.random.uniform(10.0, 10000.0, size=35), 2),
    'discount': np.round(np.random.uniform(0, 0.3, size=35), 2),
    'revenue': lambda df: df['quantity'] * df['price'] * (1 - df['discount']),
    'store_id': np.random.randint(1, 10, size=35),
    'region': np.random.choice(['North', 'South', 'East', 'West'], size=35),
    'payment_method': np.random.choice(['Credit Card', 'Cash', 'Online'], size=35),
    'is_member': np.random.choice([True, False], size=35),
    'delivery_days': np.random.randint(1, 7, size=35),
    'customer_age': np.random.randint(18, 70, size=35),
    'gender': np.random.choice(['Male', 'Female', 'Other'], size=35),
    'product_category': np.random.choice(['Electronics', 'Clothing', 'Books', 'Food'], size=35),
    'rating': np.round(np.random.uniform(1.0, 5.0, size=35), 1),
    'return_status': np.random.choice(['Returned', 'Not Returned'], size=35),
    'feedback': np.random.choice(['Positive', 'Neutral', 'Negative'], size=35),
}

# Create DataFrame
df = pd.DataFrame(data)

# Compute revenue using the lambda function above
df['revenue'] = df['quantity'] * df['price'] * (1 - df['discount'])

# Add 'shipping_cost' with some missing values
df['shipping_cost'] = np.where(np.random.rand(len(df)) < 0.2, np.nan, np.round(np.random.uniform(5.0, 20.0, size=len(df)), 2))

# Add 'coupon_code' with some missing values
coupon_options = ['SAVE10', 'WELCOME5', 'FREESHIP', np.nan]
df['coupon_code'] = np.random.choice(coupon_options, size=len(df), p=[0.2, 0.2, 0.2, 0.4])

# Add 'delivery_rating' with some missing values
df['delivery_rating'] = np.where(np.random.rand(len(df)) < 0.3, np.nan, np.round(np.random.uniform(1.0, 5.0, size=len(df)), 1))


duplicates = df.sample(n=5, replace=False)  # Select 5 random rows without replacement
df = pd.concat([df, duplicates], ignore_index=True)


# Display the updated DataFrame
display(df)

Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,...,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating
0,2023-01-01,1944,194,5,6183.04,0.24,23495.552,5,North,Online,...,3,61,Male,Food,1.7,Not Returned,Negative,,SAVE10,2.2
1,2023-01-02,1369,100,7,2687.88,0.17,15616.5828,1,West,Online,...,5,56,Female,Food,4.9,Returned,Positive,10.05,,2.3
2,2023-01-03,1289,119,3,5551.75,0.04,15989.04,1,North,Cash,...,5,28,Female,Clothing,2.0,Not Returned,Positive,16.72,SAVE10,2.3
3,2023-01-04,1450,161,1,75.3,0.11,67.017,7,East,Cash,...,3,56,Female,Clothing,4.0,Returned,Positive,19.5,,1.7
4,2023-01-05,1619,134,1,5182.29,0.16,4353.1236,7,North,Online,...,4,46,Male,Food,3.3,Not Returned,Neutral,18.93,SAVE10,
5,2023-01-06,1266,176,5,8482.91,0.08,39021.386,2,East,Cash,...,4,24,Other,Books,3.9,Not Returned,Neutral,9.48,,4.9
6,2023-01-07,1165,118,8,5801.56,0.22,36201.7344,2,West,Online,...,3,41,Other,Clothing,4.3,Returned,Negative,,FREESHIP,3.2
7,2023-01-08,1972,145,4,9678.11,0.13,33679.8228,3,West,Online,...,2,56,Female,Clothing,2.3,Returned,Positive,7.57,,4.7
8,2023-01-09,1894,102,4,5355.71,0.19,17352.5004,3,South,Online,...,5,43,Male,Electronics,2.4,Not Returned,Negative,6.95,,2.6
9,2023-01-10,1221,163,4,1138.46,0.07,4235.0712,7,West,Online,...,4,35,Female,Clothing,3.6,Returned,Negative,9.62,SAVE10,4.6


<div class="alert alert-success">

**Solution 05:**
</div>

In [12]:
# 5. Handling Outliers with Z-Score Method
# Write a Pandas program to handle outliers in a DataFrame with Z-Score method.

def handle_outliers_z_score(df, column):
    from scipy import stats
    z_scores = np.abs(stats.zscore(df[column]))
    threshold = 1.5
    return df[(z_scores < threshold)]

result_df = handle_outliers_z_score(df, 'price')

In [13]:
result_df

Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,...,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating
0,2023-01-01,1944,194,5,6183.04,0.24,23495.552,5,North,Online,...,3,61,Male,Food,1.7,Not Returned,Negative,,SAVE10,2.2
1,2023-01-02,1369,100,7,2687.88,0.17,15616.5828,1,West,Online,...,5,56,Female,Food,4.9,Returned,Positive,10.05,,2.3
2,2023-01-03,1289,119,3,5551.75,0.04,15989.04,1,North,Cash,...,5,28,Female,Clothing,2.0,Not Returned,Positive,16.72,SAVE10,2.3
4,2023-01-05,1619,134,1,5182.29,0.16,4353.1236,7,North,Online,...,4,46,Male,Food,3.3,Not Returned,Neutral,18.93,SAVE10,
5,2023-01-06,1266,176,5,8482.91,0.08,39021.386,2,East,Cash,...,4,24,Other,Books,3.9,Not Returned,Neutral,9.48,,4.9
6,2023-01-07,1165,118,8,5801.56,0.22,36201.7344,2,West,Online,...,3,41,Other,Clothing,4.3,Returned,Negative,,FREESHIP,3.2
8,2023-01-09,1894,102,4,5355.71,0.19,17352.5004,3,South,Online,...,5,43,Male,Electronics,2.4,Not Returned,Negative,6.95,,2.6
9,2023-01-10,1221,163,4,1138.46,0.07,4235.0712,7,West,Online,...,4,35,Female,Clothing,3.6,Returned,Negative,9.62,SAVE10,4.6
10,2023-01-11,1366,178,5,9101.57,0.08,41867.222,4,East,Online,...,2,42,Male,Books,3.2,Not Returned,Neutral,,WELCOME5,
11,2023-01-12,1770,178,6,6675.22,0.17,33242.5956,7,West,Cash,...,1,61,Female,Clothing,4.2,Returned,Negative,8.74,,4.0


<div class="alert alert-warning">

**6. Normalizing Data with Min-Max Scaling**

**Write a Pandas program that normalizes data with Min-Max scaling.**

</div>

In [14]:
# Create date range
date_range = pd.date_range(start='2023-01-01', periods=35, freq='D')

# Create a dictionary of data
data = {
    'date': date_range,
    'customer_id': np.random.randint(1000, 2000, size=35),
    'product_id': np.random.randint(100, 200, size=35),
    'quantity': np.random.randint(1, 10, size=35),
    'price': np.round(np.random.uniform(10.0, 10000.0, size=35), 2),
    'discount': np.round(np.random.uniform(0, 0.3, size=35), 2),
    'revenue': lambda df: df['quantity'] * df['price'] * (1 - df['discount']),
    'store_id': np.random.randint(1, 10, size=35),
    'region': np.random.choice(['North', 'South', 'East', 'West'], size=35),
    'payment_method': np.random.choice(['Credit Card', 'Cash', 'Online'], size=35),
    'is_member': np.random.choice([True, False], size=35),
    'delivery_days': np.random.randint(1, 7, size=35),
    'customer_age': np.random.randint(18, 70, size=35),
    'gender': np.random.choice(['Male', 'Female', 'Other'], size=35),
    'product_category': np.random.choice(['Electronics', 'Clothing', 'Books', 'Food'], size=35),
    'rating': np.round(np.random.uniform(1.0, 5.0, size=35), 1),
    'return_status': np.random.choice(['Returned', 'Not Returned'], size=35),
    'feedback': np.random.choice(['Positive', 'Neutral', 'Negative'], size=35),
}

# Create DataFrame
df = pd.DataFrame(data)

# Compute revenue using the lambda function above
df['revenue'] = df['quantity'] * df['price'] * (1 - df['discount'])

# Add 'shipping_cost' with some missing values
df['shipping_cost'] = np.where(np.random.rand(len(df)) < 0.2, np.nan, np.round(np.random.uniform(5.0, 20.0, size=len(df)), 2))

# Add 'coupon_code' with some missing values
coupon_options = ['SAVE10', 'WELCOME5', 'FREESHIP', np.nan]
df['coupon_code'] = np.random.choice(coupon_options, size=len(df), p=[0.2, 0.2, 0.2, 0.4])

# Add 'delivery_rating' with some missing values
df['delivery_rating'] = np.where(np.random.rand(len(df)) < 0.3, np.nan, np.round(np.random.uniform(1.0, 5.0, size=len(df)), 1))


duplicates = df.sample(n=5, replace=False)  # Select 5 random rows without replacement
df = pd.concat([df, duplicates], ignore_index=True)


# Display the updated DataFrame
display(df)

Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,...,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating
0,2023-01-01,1753,169,7,5481.41,0.16,32230.6908,7,North,Cash,...,5,63,Other,Clothing,1.3,Not Returned,Negative,13.66,FREESHIP,2.1
1,2023-01-02,1090,131,9,3671.46,0.06,31060.5516,8,North,Credit Card,...,5,29,Other,Clothing,4.4,Not Returned,Neutral,13.7,WELCOME5,
2,2023-01-03,1587,121,3,5099.81,0.09,13922.4813,2,East,Credit Card,...,3,55,Female,Electronics,3.6,Returned,Neutral,17.98,SAVE10,3.4
3,2023-01-04,1483,197,9,3737.62,0.29,23883.3918,2,South,Cash,...,6,61,Male,Food,3.7,Returned,Positive,,,
4,2023-01-05,1285,111,7,4443.74,0.23,23951.7586,4,East,Cash,...,6,40,Other,Clothing,4.8,Not Returned,Positive,6.23,SAVE10,
5,2023-01-06,1727,132,5,6326.89,0.13,27521.9715,9,South,Credit Card,...,6,35,Other,Books,2.1,Returned,Negative,14.39,FREESHIP,3.1
6,2023-01-07,1752,179,6,7250.37,0.01,43067.1978,1,West,Cash,...,3,65,Female,Books,4.6,Returned,Neutral,12.97,WELCOME5,2.0
7,2023-01-08,1840,131,3,6001.29,0.07,16743.5991,8,West,Online,...,2,39,Other,Food,4.7,Returned,Negative,7.85,SAVE10,
8,2023-01-09,1481,149,6,230.58,0.23,1065.2796,5,North,Online,...,2,58,Male,Electronics,4.8,Returned,Neutral,15.89,,2.8
9,2023-01-10,1037,133,5,6961.55,0.21,27498.1225,6,South,Credit Card,...,3,39,Male,Clothing,3.9,Not Returned,Positive,8.54,SAVE10,3.5


<div class="alert alert-success">

**Solution 06:**
</div>

In [15]:
# 6. Normalizing Data with Min-Max Scaling
# Write a Pandas program that normalizes data with Min-Max scaling.

def normalize_min_max(df, column):
    min_val = df[column].min()
    max_val = df[column].max()
    df[column] = (df[column] - min_val) / (max_val - min_val)
    return df

result_df = normalize_min_max(df.copy(), 'price')

result_df


Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,...,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating
0,2023-01-01,1753,169,7,0.542702,0.16,32230.6908,7,North,Cash,...,5,63,Other,Clothing,1.3,Not Returned,Negative,13.66,FREESHIP,2.1
1,2023-01-02,1090,131,9,0.358411,0.06,31060.5516,8,North,Credit Card,...,5,29,Other,Clothing,4.4,Not Returned,Neutral,13.7,WELCOME5,
2,2023-01-03,1587,121,3,0.503847,0.09,13922.4813,2,East,Credit Card,...,3,55,Female,Electronics,3.6,Returned,Neutral,17.98,SAVE10,3.4
3,2023-01-04,1483,197,9,0.365148,0.29,23883.3918,2,South,Cash,...,6,61,Male,Food,3.7,Returned,Positive,,,
4,2023-01-05,1285,111,7,0.437046,0.23,23951.7586,4,East,Cash,...,6,40,Other,Clothing,4.8,Not Returned,Positive,6.23,SAVE10,
5,2023-01-06,1727,132,5,0.62879,0.13,27521.9715,9,South,Credit Card,...,6,35,Other,Books,2.1,Returned,Negative,14.39,FREESHIP,3.1
6,2023-01-07,1752,179,6,0.722819,0.01,43067.1978,1,West,Cash,...,3,65,Female,Books,4.6,Returned,Neutral,12.97,WELCOME5,2.0
7,2023-01-08,1840,131,3,0.595637,0.07,16743.5991,8,West,Online,...,2,39,Other,Food,4.7,Returned,Negative,7.85,SAVE10,
8,2023-01-09,1481,149,6,0.008058,0.23,1065.2796,5,North,Online,...,2,58,Male,Electronics,4.8,Returned,Neutral,15.89,,2.8
9,2023-01-10,1037,133,5,0.693411,0.21,27498.1225,6,South,Credit Card,...,3,39,Male,Clothing,3.9,Not Returned,Positive,8.54,SAVE10,3.5


<div class="alert alert-warning">

**7. Binning Data into Categories**

**Write a Pandas program to bin data into categories.**

</div>

In [16]:
# Create date range
date_range = pd.date_range(start='2023-01-01', periods=35, freq='D')

# Create a dictionary of data
data = {
    'date': date_range,
    'customer_id': np.random.randint(1000, 2000, size=35),
    'product_id': np.random.randint(100, 200, size=35),
    'quantity': np.random.randint(1, 10, size=35),
    'price': np.round(np.random.uniform(10.0, 10000.0, size=35), 2),
    'discount': np.round(np.random.uniform(0, 0.3, size=35), 2),
    'revenue': lambda df: df['quantity'] * df['price'] * (1 - df['discount']),
    'store_id': np.random.randint(1, 10, size=35),
    'region': np.random.choice(['North', 'South', 'East', 'West'], size=35),
    'payment_method': np.random.choice(['Credit Card', 'Cash', 'Online'], size=35),
    'is_member': np.random.choice([True, False], size=35),
    'delivery_days': np.random.randint(1, 7, size=35),
    'customer_age': np.random.randint(18, 70, size=35),
    'gender': np.random.choice(['Male', 'Female', 'Other'], size=35),
    'product_category': np.random.choice(['Electronics', 'Clothing', 'Books', 'Food'], size=35),
    'rating': np.round(np.random.uniform(1.0, 5.0, size=35), 1),
    'return_status': np.random.choice(['Returned', 'Not Returned'], size=35),
    'feedback': np.random.choice(['Positive', 'Neutral', 'Negative'], size=35),
}

# Create DataFrame
df = pd.DataFrame(data)

# Compute revenue using the lambda function above
df['revenue'] = df['quantity'] * df['price'] * (1 - df['discount'])

# Add 'shipping_cost' with some missing values
df['shipping_cost'] = np.where(np.random.rand(len(df)) < 0.2, np.nan, np.round(np.random.uniform(5.0, 20.0, size=len(df)), 2))

# Add 'coupon_code' with some missing values
coupon_options = ['SAVE10', 'WELCOME5', 'FREESHIP', np.nan]
df['coupon_code'] = np.random.choice(coupon_options, size=len(df), p=[0.2, 0.2, 0.2, 0.4])

# Add 'delivery_rating' with some missing values
df['delivery_rating'] = np.where(np.random.rand(len(df)) < 0.3, np.nan, np.round(np.random.uniform(1.0, 5.0, size=len(df)), 1))


duplicates = df.sample(n=5, replace=False)  # Select 5 random rows without replacement
df = pd.concat([df, duplicates], ignore_index=True)


# Display the updated DataFrame
display(df)

Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,...,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating
0,2023-01-01,1523,117,1,8616.23,0.07,8013.0939,6,East,Credit Card,...,1,18,Male,Electronics,3.3,Not Returned,Negative,11.82,FREESHIP,2.4
1,2023-01-02,1486,155,1,5844.62,0.28,4208.1264,9,North,Cash,...,6,54,Other,Books,1.3,Not Returned,Positive,10.56,,1.9
2,2023-01-03,1283,182,3,7505.68,0.29,15987.0984,4,North,Online,...,2,60,Male,Books,2.9,Returned,Negative,,SAVE10,2.9
3,2023-01-04,1302,104,5,8948.97,0.08,41165.262,8,South,Online,...,5,50,Female,Clothing,1.2,Returned,Positive,17.15,,
4,2023-01-05,1876,190,4,6604.55,0.14,22719.652,1,East,Online,...,4,59,Other,Food,2.7,Returned,Neutral,7.03,SAVE10,3.7
5,2023-01-06,1836,116,2,3505.92,0.01,6941.7216,5,North,Credit Card,...,6,25,Male,Books,3.1,Not Returned,Positive,19.31,FREESHIP,3.7
6,2023-01-07,1960,111,3,9001.91,0.14,23224.9278,8,East,Online,...,2,66,Female,Electronics,5.0,Not Returned,Positive,,,3.8
7,2023-01-08,1218,177,4,7068.81,0.17,23468.4492,3,North,Cash,...,3,35,Male,Clothing,1.3,Not Returned,Positive,,FREESHIP,2.9
8,2023-01-09,1340,117,1,8301.34,0.07,7720.2462,1,South,Online,...,4,23,Female,Books,4.0,Returned,Neutral,15.4,SAVE10,2.3
9,2023-01-10,1639,191,9,2423.11,0.07,20281.4307,3,North,Online,...,5,58,Female,Clothing,1.5,Not Returned,Neutral,,SAVE10,4.0


<div class="alert alert-success">

**Solution 07:**
</div>

In [17]:
# binning price
bins = [0, 100, 500, 1000, 5000, 10000]
labels = ['0-100', '100-500', '500-1000', '1000-5000', '5000-10000']
df['price_binned'] = pd.cut(
    df['price']
    , bins=bins
    , labels=labels
    , right=False
)

df['price_binned_q'] = pd.qcut(
    df['price']
    , q=[0, 0.2, 0.4, 0.6, 0.8, 1.0], labels=['Very Low', 'Low', 'Medium', 'High', 'Very High']
    , duplicates='drop'
)
df['price_binned_q_no_labels'] = pd.qcut(
    df['price']
    , q=[0, 0.2, 0.4, 0.6, 0.8, 1.0]
    , labels=False
)

df

Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,...,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating,price_binned,price_binned_q,price_binned_q_no_labels
0,2023-01-01,1523,117,1,8616.23,0.07,8013.0939,6,East,Credit Card,...,Electronics,3.3,Not Returned,Negative,11.82,FREESHIP,2.4,5000-10000,High,3
1,2023-01-02,1486,155,1,5844.62,0.28,4208.1264,9,North,Cash,...,Books,1.3,Not Returned,Positive,10.56,,1.9,5000-10000,Medium,2
2,2023-01-03,1283,182,3,7505.68,0.29,15987.0984,4,North,Online,...,Books,2.9,Returned,Negative,,SAVE10,2.9,5000-10000,High,3
3,2023-01-04,1302,104,5,8948.97,0.08,41165.262,8,South,Online,...,Clothing,1.2,Returned,Positive,17.15,,,5000-10000,Very High,4
4,2023-01-05,1876,190,4,6604.55,0.14,22719.652,1,East,Online,...,Food,2.7,Returned,Neutral,7.03,SAVE10,3.7,5000-10000,Medium,2
5,2023-01-06,1836,116,2,3505.92,0.01,6941.7216,5,North,Credit Card,...,Books,3.1,Not Returned,Positive,19.31,FREESHIP,3.7,1000-5000,Low,1
6,2023-01-07,1960,111,3,9001.91,0.14,23224.9278,8,East,Online,...,Electronics,5.0,Not Returned,Positive,,,3.8,5000-10000,Very High,4
7,2023-01-08,1218,177,4,7068.81,0.17,23468.4492,3,North,Cash,...,Clothing,1.3,Not Returned,Positive,,FREESHIP,2.9,5000-10000,Medium,2
8,2023-01-09,1340,117,1,8301.34,0.07,7720.2462,1,South,Online,...,Books,4.0,Returned,Neutral,15.4,SAVE10,2.3,5000-10000,High,3
9,2023-01-10,1639,191,9,2423.11,0.07,20281.4307,3,North,Online,...,Clothing,1.5,Not Returned,Neutral,,SAVE10,4.0,1000-5000,Very Low,0


<div class="alert alert-warning">

**8. Converting Data Types and Column Operations**

**Write a Pandas program that handles text data with str.replace().**

</div>

In [18]:
# Create date range
date_range = pd.date_range(start='2023-01-01', periods=35, freq='D')

# Create a dictionary of data
data = {
    'date': date_range,
    'customer_id': np.random.randint(1000, 2000, size=35),
    'product_id': np.random.randint(100, 200, size=35),
    'quantity': np.random.randint(1, 10, size=35),
    'price': np.round(np.random.uniform(10.0, 10000.0, size=35), 2),
    'discount': np.round(np.random.uniform(0, 0.3, size=35), 2),
    'revenue': lambda df: df['quantity'] * df['price'] * (1 - df['discount']),
    'store_id': np.random.randint(1, 10, size=35),
    'region': np.random.choice(['North', 'South', 'East', 'West'], size=35),
    'payment_method': np.random.choice(['Credit Card', 'Cash', 'Online'], size=35),
    'is_member': np.random.choice([True, False], size=35),
    'delivery_days': np.random.randint(1, 7, size=35),
    'customer_age': np.random.randint(18, 70, size=35),
    'gender': np.random.choice(['Male', 'Female', 'Other'], size=35),
    'product_category': np.random.choice(['Electronics', 'Clothing', 'Books', 'Food'], size=35),
    'rating': np.round(np.random.uniform(1.0, 5.0, size=35), 1),
    'return_status': np.random.choice(['Returned', 'Not Returned'], size=35),
    'feedback': np.random.choice(['Positive', 'Neutral', 'Negative'], size=35),
}

# Create DataFrame
df = pd.DataFrame(data)

# Compute revenue using the lambda function above
df['revenue'] = df['quantity'] * df['price'] * (1 - df['discount'])

# Add 'shipping_cost' with some missing values
df['shipping_cost'] = np.where(np.random.rand(len(df)) < 0.2, np.nan, np.round(np.random.uniform(5.0, 20.0, size=len(df)), 2))

# Add 'coupon_code' with some missing values
coupon_options = ['SAVE10', 'WELCOME5', 'FREESHIP', np.nan]
df['coupon_code'] = np.random.choice(coupon_options, size=len(df), p=[0.2, 0.2, 0.2, 0.4])

# Add 'delivery_rating' with some missing values
df['delivery_rating'] = np.where(np.random.rand(len(df)) < 0.3, np.nan, np.round(np.random.uniform(1.0, 5.0, size=len(df)), 1))


duplicates = df.sample(n=5, replace=False)  # Select 5 random rows without replacement
df = pd.concat([df, duplicates], ignore_index=True)


# Display the updated DataFrame
display(df)

Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,...,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating
0,2023-01-01,1667,178,6,4302.45,0.2,20651.76,3,North,Cash,...,6,50,Female,Books,4.2,Returned,Positive,13.88,,4.8
1,2023-01-02,1699,184,1,46.8,0.26,34.632,2,East,Cash,...,1,44,Male,Clothing,4.8,Not Returned,Negative,16.9,SAVE10,2.7
2,2023-01-03,1641,139,5,2817.4,0.14,12114.82,3,South,Cash,...,1,67,Other,Food,2.4,Not Returned,Positive,,,
3,2023-01-04,1876,177,2,3982.94,0.2,6372.704,4,East,Online,...,6,19,Male,Electronics,1.0,Returned,Negative,18.17,SAVE10,3.1
4,2023-01-05,1232,141,6,7841.24,0.24,35756.0544,3,East,Online,...,4,35,Female,Books,4.7,Returned,Neutral,17.53,WELCOME5,
5,2023-01-06,1390,187,2,7184.91,0.19,11639.5542,9,North,Credit Card,...,1,45,Female,Books,1.4,Not Returned,Positive,,,
6,2023-01-07,1689,146,4,8082.59,0.16,27157.5024,9,South,Credit Card,...,6,64,Other,Clothing,2.9,Not Returned,Neutral,,,
7,2023-01-08,1558,131,5,5143.9,0.05,24433.525,2,South,Online,...,1,34,Other,Clothing,4.4,Not Returned,Positive,19.03,FREESHIP,1.4
8,2023-01-09,1952,146,9,6805.04,0.06,57570.6384,3,North,Cash,...,5,35,Male,Books,2.3,Returned,Positive,,WELCOME5,
9,2023-01-10,1667,112,6,8814.56,0.25,39665.52,9,North,Cash,...,4,41,Female,Books,3.7,Returned,Positive,,,3.2


<div class="alert alert-success">

**Solution 08:**
</div>

In [19]:
# 8. Converting Data Types and Column Operations
# Write a Pandas program that handles text data with str.replace().

df['region_arrows'] = df['region'].str.replace('North', '↑')
df['region_arrows'] = df['region_arrows'].str.replace('South', '↓')
df['region_arrows'] = df['region_arrows'].str.replace('East', '→')
df['region_arrows'] = df['region_arrows'].str.replace('West', '←')

df

Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,...,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating,region_arrows
0,2023-01-01,1667,178,6,4302.45,0.2,20651.76,3,North,Cash,...,50,Female,Books,4.2,Returned,Positive,13.88,,4.8,↑
1,2023-01-02,1699,184,1,46.8,0.26,34.632,2,East,Cash,...,44,Male,Clothing,4.8,Not Returned,Negative,16.9,SAVE10,2.7,→
2,2023-01-03,1641,139,5,2817.4,0.14,12114.82,3,South,Cash,...,67,Other,Food,2.4,Not Returned,Positive,,,,↓
3,2023-01-04,1876,177,2,3982.94,0.2,6372.704,4,East,Online,...,19,Male,Electronics,1.0,Returned,Negative,18.17,SAVE10,3.1,→
4,2023-01-05,1232,141,6,7841.24,0.24,35756.0544,3,East,Online,...,35,Female,Books,4.7,Returned,Neutral,17.53,WELCOME5,,→
5,2023-01-06,1390,187,2,7184.91,0.19,11639.5542,9,North,Credit Card,...,45,Female,Books,1.4,Not Returned,Positive,,,,↑
6,2023-01-07,1689,146,4,8082.59,0.16,27157.5024,9,South,Credit Card,...,64,Other,Clothing,2.9,Not Returned,Neutral,,,,↓
7,2023-01-08,1558,131,5,5143.9,0.05,24433.525,2,South,Online,...,34,Other,Clothing,4.4,Not Returned,Positive,19.03,FREESHIP,1.4,↓
8,2023-01-09,1952,146,9,6805.04,0.06,57570.6384,3,North,Cash,...,35,Male,Books,2.3,Returned,Positive,,WELCOME5,,↑
9,2023-01-10,1667,112,6,8814.56,0.25,39665.52,9,North,Cash,...,41,Female,Books,3.7,Returned,Positive,,,3.2,↑


<div class="alert alert-warning">

**9. Replacing Missing Data with Mean Value**

**Replace Missing Data with Mean Value**

</div>

In [20]:
# Create date range
date_range = pd.date_range(start='2023-01-01', periods=35, freq='D')

# Create a dictionary of data
data = {
    'date': date_range,
    'customer_id': np.random.randint(1000, 2000, size=35),
    'product_id': np.random.randint(100, 200, size=35),
    'quantity': np.random.randint(1, 10, size=35),
    'price': np.round(np.random.uniform(10.0, 10000.0, size=35), 2),
    'discount': np.round(np.random.uniform(0, 0.3, size=35), 2),
    'revenue': lambda df: df['quantity'] * df['price'] * (1 - df['discount']),
    'store_id': np.random.randint(1, 10, size=35),
    'region': np.random.choice(['North', 'South', 'East', 'West'], size=35),
    'payment_method': np.random.choice(['Credit Card', 'Cash', 'Online'], size=35),
    'is_member': np.random.choice([True, False], size=35),
    'delivery_days': np.random.randint(1, 7, size=35),
    'customer_age': np.random.randint(18, 70, size=35),
    'gender': np.random.choice(['Male', 'Female', 'Other'], size=35),
    'product_category': np.random.choice(['Electronics', 'Clothing', 'Books', 'Food'], size=35),
    'rating': np.round(np.random.uniform(1.0, 5.0, size=35), 1),
    'return_status': np.random.choice(['Returned', 'Not Returned'], size=35),
    'feedback': np.random.choice(['Positive', 'Neutral', 'Negative'], size=35),
}

# Create DataFrame
df = pd.DataFrame(data)

# Compute revenue using the lambda function above
df['revenue'] = df['quantity'] * df['price'] * (1 - df['discount'])

# Add 'shipping_cost' with some missing values
df['shipping_cost'] = np.where(np.random.rand(len(df)) < 0.2, np.nan, np.round(np.random.uniform(5.0, 20.0, size=len(df)), 2))

# Add 'coupon_code' with some missing values
coupon_options = ['SAVE10', 'WELCOME5', 'FREESHIP', np.nan]
df['coupon_code'] = np.random.choice(coupon_options, size=len(df), p=[0.2, 0.2, 0.2, 0.4])

# Add 'delivery_rating' with some missing values
df['delivery_rating'] = np.where(np.random.rand(len(df)) < 0.3, np.nan, np.round(np.random.uniform(1.0, 5.0, size=len(df)), 1))


duplicates = df.sample(n=5, replace=False)  # Select 5 random rows without replacement
df = pd.concat([df, duplicates], ignore_index=True)


# Display the updated DataFrame
display(df)

Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,...,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating
0,2023-01-01,1076,175,5,8160.36,0.04,39169.728,1,West,Online,...,1,37,Other,Books,2.3,Not Returned,Negative,5.85,FREESHIP,2.5
1,2023-01-02,1694,129,3,7892.31,0.24,17994.4668,2,East,Cash,...,6,53,Male,Electronics,2.3,Returned,Positive,5.06,SAVE10,2.5
2,2023-01-03,1943,188,8,2365.08,0.12,16650.1632,1,West,Cash,...,6,50,Other,Books,1.9,Not Returned,Neutral,8.29,SAVE10,
3,2023-01-04,1130,178,9,8823.65,0.04,76236.336,1,East,Online,...,2,58,Male,Food,1.8,Not Returned,Positive,19.0,SAVE10,1.7
4,2023-01-05,1637,198,6,5445.64,0.23,25158.8568,1,East,Cash,...,5,65,Female,Electronics,2.7,Not Returned,Positive,,FREESHIP,2.5
5,2023-01-06,1017,150,4,3664.42,0.27,10700.1064,4,North,Online,...,1,23,Male,Electronics,4.2,Returned,Neutral,18.89,WELCOME5,1.1
6,2023-01-07,1177,175,9,4548.4,0.13,35613.972,4,South,Online,...,5,46,Female,Electronics,4.5,Returned,Negative,,FREESHIP,
7,2023-01-08,1773,146,6,6875.22,0.26,30525.9768,2,South,Credit Card,...,2,65,Female,Electronics,3.5,Returned,Neutral,8.14,,4.9
8,2023-01-09,1271,192,4,5573.87,0.21,17613.4292,9,North,Credit Card,...,1,68,Female,Electronics,1.4,Not Returned,Negative,14.78,FREESHIP,1.3
9,2023-01-10,1346,104,1,6006.09,0.09,5465.5419,3,South,Credit Card,...,3,63,Female,Food,3.9,Not Returned,Positive,5.85,,2.6


<div class="alert alert-success">

**Solution 09:**
</div>

In [21]:
# 9. Replacing Missing Data with Mean Value
# replace missing values in 'delivery_rating' column with mean
df['delivery_rating'] = df['delivery_rating'].fillna(df['delivery_rating'].mean())

df

Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,...,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating
0,2023-01-01,1076,175,5,8160.36,0.04,39169.728,1,West,Online,...,1,37,Other,Books,2.3,Not Returned,Negative,5.85,FREESHIP,2.5
1,2023-01-02,1694,129,3,7892.31,0.24,17994.4668,2,East,Cash,...,6,53,Male,Electronics,2.3,Returned,Positive,5.06,SAVE10,2.5
2,2023-01-03,1943,188,8,2365.08,0.12,16650.1632,1,West,Cash,...,6,50,Other,Books,1.9,Not Returned,Neutral,8.29,SAVE10,3.052778
3,2023-01-04,1130,178,9,8823.65,0.04,76236.336,1,East,Online,...,2,58,Male,Food,1.8,Not Returned,Positive,19.0,SAVE10,1.7
4,2023-01-05,1637,198,6,5445.64,0.23,25158.8568,1,East,Cash,...,5,65,Female,Electronics,2.7,Not Returned,Positive,,FREESHIP,2.5
5,2023-01-06,1017,150,4,3664.42,0.27,10700.1064,4,North,Online,...,1,23,Male,Electronics,4.2,Returned,Neutral,18.89,WELCOME5,1.1
6,2023-01-07,1177,175,9,4548.4,0.13,35613.972,4,South,Online,...,5,46,Female,Electronics,4.5,Returned,Negative,,FREESHIP,3.052778
7,2023-01-08,1773,146,6,6875.22,0.26,30525.9768,2,South,Credit Card,...,2,65,Female,Electronics,3.5,Returned,Neutral,8.14,,4.9
8,2023-01-09,1271,192,4,5573.87,0.21,17613.4292,9,North,Credit Card,...,1,68,Female,Electronics,1.4,Not Returned,Negative,14.78,FREESHIP,1.3
9,2023-01-10,1346,104,1,6006.09,0.09,5465.5419,3,South,Credit Card,...,3,63,Female,Food,3.9,Not Returned,Positive,5.85,,2.6


<div class="alert alert-warning">

**10. Removing Columns with Too Many Missing Values**

**Remove columns with 5 or more missing values**

</div>

In [22]:
# Create date range
date_range = pd.date_range(start='2023-01-01', periods=35, freq='D')

# Create a dictionary of data
data = {
    'date': date_range,
    'customer_id': np.random.randint(1000, 2000, size=35),
    'product_id': np.random.randint(100, 200, size=35),
    'quantity': np.random.randint(1, 10, size=35),
    'price': np.round(np.random.uniform(10.0, 10000.0, size=35), 2),
    'discount': np.round(np.random.uniform(0, 0.3, size=35), 2),
    'revenue': lambda df: df['quantity'] * df['price'] * (1 - df['discount']),
    'store_id': np.random.randint(1, 10, size=35),
    'region': np.random.choice(['North', 'South', 'East', 'West'], size=35),
    'payment_method': np.random.choice(['Credit Card', 'Cash', 'Online'], size=35),
    'is_member': np.random.choice([True, False], size=35),
    'delivery_days': np.random.randint(1, 7, size=35),
    'customer_age': np.random.randint(18, 70, size=35),
    'gender': np.random.choice(['Male', 'Female', 'Other'], size=35),
    'product_category': np.random.choice(['Electronics', 'Clothing', 'Books', 'Food'], size=35),
    'rating': np.round(np.random.uniform(1.0, 5.0, size=35), 1),
    'return_status': np.random.choice(['Returned', 'Not Returned'], size=35),
    'feedback': np.random.choice(['Positive', 'Neutral', 'Negative'], size=35),
}

# Create DataFrame
df = pd.DataFrame(data)

# Compute revenue using the lambda function above
df['revenue'] = df['quantity'] * df['price'] * (1 - df['discount'])

# Add 'shipping_cost' with some missing values
df['shipping_cost'] = np.where(np.random.rand(len(df)) < 0.2, np.nan, np.round(np.random.uniform(5.0, 20.0, size=len(df)), 2))

# Add 'coupon_code' with some missing values
coupon_options = ['SAVE10', 'WELCOME5', 'FREESHIP', np.nan]
df['coupon_code'] = np.random.choice(coupon_options, size=len(df), p=[0.2, 0.2, 0.2, 0.4])

# Add 'delivery_rating' with some missing values
df['delivery_rating'] = np.where(np.random.rand(len(df)) < 0.3, np.nan, np.round(np.random.uniform(1.0, 5.0, size=len(df)), 1))


duplicates = df.sample(n=5, replace=False)  # Select 5 random rows without replacement
df = pd.concat([df, duplicates], ignore_index=True)


# Display the updated DataFrame
display(df)

Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,...,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating
0,2023-01-01,1837,138,7,4652.69,0.07,30289.0119,1,West,Online,...,6,57,Female,Clothing,2.7,Not Returned,Neutral,,FREESHIP,4.9
1,2023-01-02,1979,139,8,2796.89,0.14,19242.6032,7,South,Cash,...,1,37,Male,Electronics,2.1,Returned,Positive,17.6,,1.2
2,2023-01-03,1546,170,8,8519.55,0.19,55206.684,2,South,Cash,...,5,47,Other,Electronics,2.8,Returned,Negative,15.75,,
3,2023-01-04,1077,166,4,1684.54,0.02,6603.3968,6,East,Online,...,4,50,Male,Clothing,2.5,Not Returned,Positive,,SAVE10,4.9
4,2023-01-05,1358,173,6,4306.66,0.19,20930.3676,3,North,Credit Card,...,4,34,Male,Electronics,1.9,Returned,Neutral,12.02,WELCOME5,4.6
5,2023-01-06,1610,155,4,2763.29,0.28,7958.2752,6,South,Cash,...,6,21,Other,Books,2.3,Returned,Negative,19.99,SAVE10,2.4
6,2023-01-07,1307,120,4,8292.11,0.02,32505.0712,8,West,Credit Card,...,6,64,Male,Clothing,3.7,Returned,Negative,8.75,,4.1
7,2023-01-08,1561,198,9,2848.06,0.05,24350.913,3,West,Credit Card,...,3,51,Other,Food,4.0,Not Returned,Negative,5.46,WELCOME5,3.5
8,2023-01-09,1260,127,8,9224.88,0.28,53135.3088,1,West,Online,...,6,39,Other,Clothing,3.5,Not Returned,Negative,6.99,WELCOME5,2.9
9,2023-01-10,1910,119,7,2112.94,0.27,10797.1234,6,North,Online,...,3,50,Female,Electronics,1.8,Returned,Positive,,WELCOME5,2.8


<div class="alert alert-success">

**Solution 10:**
</div>

In [23]:
# 10. Removing Columns with Too Many Missing Values
# Remove columns with 5 or more missing values
df.dropna(thresh=len(df) - 5, axis=1, inplace=True)

df

Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,is_member,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code
0,2023-01-01,1837,138,7,4652.69,0.07,30289.0119,1,West,Online,True,6,57,Female,Clothing,2.7,Not Returned,Neutral,,FREESHIP
1,2023-01-02,1979,139,8,2796.89,0.14,19242.6032,7,South,Cash,False,1,37,Male,Electronics,2.1,Returned,Positive,17.6,
2,2023-01-03,1546,170,8,8519.55,0.19,55206.684,2,South,Cash,False,5,47,Other,Electronics,2.8,Returned,Negative,15.75,
3,2023-01-04,1077,166,4,1684.54,0.02,6603.3968,6,East,Online,False,4,50,Male,Clothing,2.5,Not Returned,Positive,,SAVE10
4,2023-01-05,1358,173,6,4306.66,0.19,20930.3676,3,North,Credit Card,False,4,34,Male,Electronics,1.9,Returned,Neutral,12.02,WELCOME5
5,2023-01-06,1610,155,4,2763.29,0.28,7958.2752,6,South,Cash,True,6,21,Other,Books,2.3,Returned,Negative,19.99,SAVE10
6,2023-01-07,1307,120,4,8292.11,0.02,32505.0712,8,West,Credit Card,True,6,64,Male,Clothing,3.7,Returned,Negative,8.75,
7,2023-01-08,1561,198,9,2848.06,0.05,24350.913,3,West,Credit Card,False,3,51,Other,Food,4.0,Not Returned,Negative,5.46,WELCOME5
8,2023-01-09,1260,127,8,9224.88,0.28,53135.3088,1,West,Online,False,6,39,Other,Clothing,3.5,Not Returned,Negative,6.99,WELCOME5
9,2023-01-10,1910,119,7,2112.94,0.27,10797.1234,6,North,Online,True,3,50,Female,Electronics,1.8,Returned,Positive,,WELCOME5
