[Pandas Data Cleaning and Preprocessing](https://www.w3resource.com/python-exercises/pandas/pandas-data-cleaning-and-preprocessing.php)

In [1]:
import pandas as pd
import numpy as np

<div class="alert alert-warning">

**1. Handling Missing Data in Pandas**

**Write a Pandas program to fill missing values (NaN) in a DataFrame using fillna().**

</div>

In [2]:
# Create date range
date_range = pd.date_range(start='2023-01-01', periods=35, freq='D')

# Create a dictionary of data
data = {
    'date': date_range,
    'customer_id': np.random.randint(1000, 2000, size=35),
    'product_id': np.random.randint(100, 200, size=35),
    'quantity': np.random.randint(1, 10, size=35),
    'price': np.round(np.random.uniform(10.0, 100.0, size=35), 2),
    'discount': np.round(np.random.uniform(0, 0.3, size=35), 2),
    'revenue': lambda df: df['quantity'] * df['price'] * (1 - df['discount']),
    'store_id': np.random.randint(1, 10, size=35),
    'region': np.random.choice(['North', 'South', 'East', 'West'], size=35),
    'payment_method': np.random.choice(['Credit Card', 'Cash', 'Online'], size=35),
    'is_member': np.random.choice([True, False], size=35),
    'delivery_days': np.random.randint(1, 7, size=35),
    'customer_age': np.random.randint(18, 70, size=35),
    'gender': np.random.choice(['Male', 'Female', 'Other'], size=35),
    'product_category': np.random.choice(['Electronics', 'Clothing', 'Books', 'Food'], size=35),
    'rating': np.round(np.random.uniform(1.0, 5.0, size=35), 1),
    'return_status': np.random.choice(['Returned', 'Not Returned'], size=35),
    'feedback': np.random.choice(['Positive', 'Neutral', 'Negative'], size=35),
}

# Create DataFrame
df = pd.DataFrame(data)

# Compute revenue using the lambda function above
df['revenue'] = df['quantity'] * df['price'] * (1 - df['discount'])

# Add 'shipping_cost' with some missing values
df['shipping_cost'] = np.where(np.random.rand(len(df)) < 0.2, np.nan, np.round(np.random.uniform(5.0, 20.0, size=len(df)), 2))

# Add 'coupon_code' with some missing values
coupon_options = ['SAVE10', 'WELCOME5', 'FREESHIP', np.nan]
df['coupon_code'] = np.random.choice(coupon_options, size=len(df), p=[0.2, 0.2, 0.2, 0.4])

# Add 'delivery_rating' with some missing values
df['delivery_rating'] = np.where(np.random.rand(len(df)) < 0.3, np.nan, np.round(np.random.uniform(1.0, 5.0, size=len(df)), 1))

# Display the updated DataFrame
display(df)

Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,...,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating
0,2023-01-01,1578,133,7,49.86,0.28,251.2944,8,North,Online,...,4,19,Male,Food,4.4,Not Returned,Neutral,,SAVE10,3.7
1,2023-01-02,1248,120,3,32.08,0.26,71.2176,6,South,Online,...,4,29,Male,Electronics,2.8,Not Returned,Positive,13.48,FREESHIP,
2,2023-01-03,1462,115,2,93.38,0.26,138.2024,2,South,Credit Card,...,6,57,Other,Electronics,3.9,Not Returned,Neutral,14.39,,4.2
3,2023-01-04,1749,180,9,95.71,0.06,809.7066,9,North,Cash,...,2,28,Male,Electronics,4.3,Not Returned,Positive,10.14,,
4,2023-01-05,1564,109,9,81.4,0.05,695.97,7,East,Credit Card,...,4,31,Female,Food,4.6,Not Returned,Neutral,7.75,,2.5
5,2023-01-06,1399,145,6,44.36,0.22,207.6048,4,North,Online,...,6,36,Male,Food,4.5,Not Returned,Negative,5.54,,
6,2023-01-07,1256,179,6,56.06,0.3,235.452,3,South,Online,...,5,64,Male,Food,4.3,Returned,Negative,14.21,,4.7
7,2023-01-08,1582,111,8,11.68,0.29,66.3424,7,East,Cash,...,5,64,Female,Books,1.4,Not Returned,Negative,12.02,,
8,2023-01-09,1329,180,2,35.63,0.07,66.2718,1,North,Online,...,5,33,Other,Books,2.8,Returned,Positive,8.75,SAVE10,2.0
9,2023-01-10,1670,142,4,82.37,0.04,316.3008,6,North,Online,...,5,21,Male,Electronics,5.0,Not Returned,Negative,19.87,FREESHIP,


<div class="alert alert-success">

**Solution 01:**
</div>

In [3]:
# 1. Handling Missing Data in Pandas
# Write a Pandas program to fill missing values (NaN) in a DataFrame using fillna().
result_df = df.fillna({
    'shipping_cost': df['shipping_cost'].mean(),
    'coupon_code': 'No Coupon',
    'delivery_rating': df['delivery_rating'].mean()
})

result_df

Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,...,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating
0,2023-01-01,1578,133,7,49.86,0.28,251.2944,8,North,Online,...,4,19,Male,Food,4.4,Not Returned,Neutral,11.56697,SAVE10,3.7
1,2023-01-02,1248,120,3,32.08,0.26,71.2176,6,South,Online,...,4,29,Male,Electronics,2.8,Not Returned,Positive,13.48,FREESHIP,3.166667
2,2023-01-03,1462,115,2,93.38,0.26,138.2024,2,South,Credit Card,...,6,57,Other,Electronics,3.9,Not Returned,Neutral,14.39,,4.2
3,2023-01-04,1749,180,9,95.71,0.06,809.7066,9,North,Cash,...,2,28,Male,Electronics,4.3,Not Returned,Positive,10.14,,3.166667
4,2023-01-05,1564,109,9,81.4,0.05,695.97,7,East,Credit Card,...,4,31,Female,Food,4.6,Not Returned,Neutral,7.75,,2.5
5,2023-01-06,1399,145,6,44.36,0.22,207.6048,4,North,Online,...,6,36,Male,Food,4.5,Not Returned,Negative,5.54,,3.166667
6,2023-01-07,1256,179,6,56.06,0.3,235.452,3,South,Online,...,5,64,Male,Food,4.3,Returned,Negative,14.21,,4.7
7,2023-01-08,1582,111,8,11.68,0.29,66.3424,7,East,Cash,...,5,64,Female,Books,1.4,Not Returned,Negative,12.02,,3.166667
8,2023-01-09,1329,180,2,35.63,0.07,66.2718,1,North,Online,...,5,33,Other,Books,2.8,Returned,Positive,8.75,SAVE10,2.0
9,2023-01-10,1670,142,4,82.37,0.04,316.3008,6,North,Online,...,5,21,Male,Electronics,5.0,Not Returned,Negative,19.87,FREESHIP,3.166667


<div class="alert alert-warning">

**2. Removing Missing Data observations in Pandas**

**Write a Pandas program to drop rows with missing data.**

</div>

In [4]:
# Create date range
date_range = pd.date_range(start='2023-01-01', periods=35, freq='D')

# Create a dictionary of data
data = {
    'date': date_range,
    'customer_id': np.random.randint(1000, 2000, size=35),
    'product_id': np.random.randint(100, 200, size=35),
    'quantity': np.random.randint(1, 10, size=35),
    'price': np.round(np.random.uniform(10.0, 100.0, size=35), 2),
    'discount': np.round(np.random.uniform(0, 0.3, size=35), 2),
    'revenue': lambda df: df['quantity'] * df['price'] * (1 - df['discount']),
    'store_id': np.random.randint(1, 10, size=35),
    'region': np.random.choice(['North', 'South', 'East', 'West'], size=35),
    'payment_method': np.random.choice(['Credit Card', 'Cash', 'Online'], size=35),
    'is_member': np.random.choice([True, False], size=35),
    'delivery_days': np.random.randint(1, 7, size=35),
    'customer_age': np.random.randint(18, 70, size=35),
    'gender': np.random.choice(['Male', 'Female', 'Other'], size=35),
    'product_category': np.random.choice(['Electronics', 'Clothing', 'Books', 'Food'], size=35),
    'rating': np.round(np.random.uniform(1.0, 5.0, size=35), 1),
    'return_status': np.random.choice(['Returned', 'Not Returned'], size=35),
    'feedback': np.random.choice(['Positive', 'Neutral', 'Negative'], size=35),
}

# Create DataFrame
df = pd.DataFrame(data)

# Compute revenue using the lambda function above
df['revenue'] = df['quantity'] * df['price'] * (1 - df['discount'])

# Add 'shipping_cost' with some missing values
df['shipping_cost'] = np.where(np.random.rand(len(df)) < 0.2, np.nan, np.round(np.random.uniform(5.0, 20.0, size=len(df)), 2))

# Add 'coupon_code' with some missing values
coupon_options = ['SAVE10', 'WELCOME5', 'FREESHIP', np.nan]
df['coupon_code'] = np.random.choice(coupon_options, size=len(df), p=[0.2, 0.2, 0.2, 0.4])

# Add 'delivery_rating' with some missing values
df['delivery_rating'] = np.where(np.random.rand(len(df)) < 0.3, np.nan, np.round(np.random.uniform(1.0, 5.0, size=len(df)), 1))

# Display the updated DataFrame
display(df)

Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,...,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating
0,2023-01-01,1866,185,5,36.22,0.15,153.935,2,North,Credit Card,...,5,21,Male,Books,3.9,Not Returned,Negative,13.67,,3.9
1,2023-01-02,1002,121,2,98.76,0.17,163.9416,1,West,Cash,...,5,37,Male,Clothing,2.5,Returned,Neutral,13.0,SAVE10,1.2
2,2023-01-03,1456,113,2,36.4,0.12,64.064,1,South,Online,...,1,36,Male,Clothing,2.7,Not Returned,Negative,,FREESHIP,
3,2023-01-04,1592,174,1,78.62,0.06,73.9028,2,West,Cash,...,4,28,Other,Food,4.3,Returned,Neutral,10.74,,
4,2023-01-05,1743,160,5,17.81,0.07,82.8165,1,South,Credit Card,...,5,60,Male,Food,4.7,Not Returned,Positive,13.02,,4.5
5,2023-01-06,1024,122,1,47.71,0.2,38.168,2,North,Cash,...,1,27,Male,Books,3.1,Not Returned,Positive,16.43,,3.6
6,2023-01-07,1956,134,4,74.02,0.11,263.5112,6,East,Online,...,6,40,Other,Electronics,3.5,Not Returned,Positive,16.29,FREESHIP,3.2
7,2023-01-08,1321,113,4,97.76,0.05,371.488,6,East,Online,...,5,38,Female,Clothing,5.0,Returned,Positive,,FREESHIP,
8,2023-01-09,1620,166,4,54.32,0.18,178.1696,4,North,Credit Card,...,4,23,Other,Electronics,1.8,Not Returned,Negative,9.12,WELCOME5,1.5
9,2023-01-10,1035,106,8,32.23,0.24,195.9584,6,West,Online,...,3,38,Female,Electronics,3.3,Returned,Neutral,,WELCOME5,


<div class="alert alert-success">

**Solution 02:**
</div>

In [5]:
# 2. Handling Duplicates in Pandas
# Write a Pandas program to drop rows with missing data.

result_df = df.dropna()
result_df

Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,...,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating
0,2023-01-01,1866,185,5,36.22,0.15,153.935,2,North,Credit Card,...,5,21,Male,Books,3.9,Not Returned,Negative,13.67,,3.9
1,2023-01-02,1002,121,2,98.76,0.17,163.9416,1,West,Cash,...,5,37,Male,Clothing,2.5,Returned,Neutral,13.0,SAVE10,1.2
4,2023-01-05,1743,160,5,17.81,0.07,82.8165,1,South,Credit Card,...,5,60,Male,Food,4.7,Not Returned,Positive,13.02,,4.5
5,2023-01-06,1024,122,1,47.71,0.2,38.168,2,North,Cash,...,1,27,Male,Books,3.1,Not Returned,Positive,16.43,,3.6
6,2023-01-07,1956,134,4,74.02,0.11,263.5112,6,East,Online,...,6,40,Other,Electronics,3.5,Not Returned,Positive,16.29,FREESHIP,3.2
8,2023-01-09,1620,166,4,54.32,0.18,178.1696,4,North,Credit Card,...,4,23,Other,Electronics,1.8,Not Returned,Negative,9.12,WELCOME5,1.5
10,2023-01-11,1176,120,2,76.5,0.23,117.81,7,West,Cash,...,6,41,Female,Books,3.3,Returned,Neutral,11.11,SAVE10,4.1
11,2023-01-12,1607,195,4,20.88,0.27,60.9696,9,North,Online,...,6,32,Female,Books,4.0,Returned,Positive,10.09,FREESHIP,1.5
13,2023-01-14,1669,176,9,52.03,0.23,360.5679,4,South,Cash,...,4,63,Other,Food,3.0,Returned,Negative,17.09,SAVE10,2.3
16,2023-01-17,1422,172,7,65.43,0.07,425.9493,2,East,Cash,...,2,37,Other,Food,4.3,Returned,Neutral,9.96,WELCOME5,2.1


<div class="alert alert-warning">

**3. Detecting Duplicates in Pandas**

**Write a Pandas program to detect duplicates using duplicated() method.**

</div>

In [6]:
# Create date range
date_range = pd.date_range(start='2023-01-01', periods=35, freq='D')

# Create a dictionary of data
data = {
    'date': date_range,
    'customer_id': np.random.randint(1000, 2000, size=35),
    'product_id': np.random.randint(100, 200, size=35),
    'quantity': np.random.randint(1, 10, size=35),
    'price': np.round(np.random.uniform(10.0, 100.0, size=35), 2),
    'discount': np.round(np.random.uniform(0, 0.3, size=35), 2),
    'revenue': lambda df: df['quantity'] * df['price'] * (1 - df['discount']),
    'store_id': np.random.randint(1, 10, size=35),
    'region': np.random.choice(['North', 'South', 'East', 'West'], size=35),
    'payment_method': np.random.choice(['Credit Card', 'Cash', 'Online'], size=35),
    'is_member': np.random.choice([True, False], size=35),
    'delivery_days': np.random.randint(1, 7, size=35),
    'customer_age': np.random.randint(18, 70, size=35),
    'gender': np.random.choice(['Male', 'Female', 'Other'], size=35),
    'product_category': np.random.choice(['Electronics', 'Clothing', 'Books', 'Food'], size=35),
    'rating': np.round(np.random.uniform(1.0, 5.0, size=35), 1),
    'return_status': np.random.choice(['Returned', 'Not Returned'], size=35),
    'feedback': np.random.choice(['Positive', 'Neutral', 'Negative'], size=35),
}

# Create DataFrame
df = pd.DataFrame(data)

# Compute revenue using the lambda function above
df['revenue'] = df['quantity'] * df['price'] * (1 - df['discount'])

# Add 'shipping_cost' with some missing values
df['shipping_cost'] = np.where(np.random.rand(len(df)) < 0.2, np.nan, np.round(np.random.uniform(5.0, 20.0, size=len(df)), 2))

# Add 'coupon_code' with some missing values
coupon_options = ['SAVE10', 'WELCOME5', 'FREESHIP', np.nan]
df['coupon_code'] = np.random.choice(coupon_options, size=len(df), p=[0.2, 0.2, 0.2, 0.4])

# Add 'delivery_rating' with some missing values
df['delivery_rating'] = np.where(np.random.rand(len(df)) < 0.3, np.nan, np.round(np.random.uniform(1.0, 5.0, size=len(df)), 1))


duplicates = df.sample(n=5, replace=False)  # Select 5 random rows without replacement
df = pd.concat([df, duplicates], ignore_index=True)

# Display the updated DataFrame
display(df)

Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,...,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating
0,2023-01-01,1499,131,6,53.43,0.09,291.7278,9,South,Cash,...,2,36,Other,Electronics,3.9,Returned,Positive,8.49,,1.5
1,2023-01-02,1865,163,1,82.34,0.02,80.6932,6,East,Online,...,3,69,Male,Books,3.7,Returned,Negative,13.01,,4.4
2,2023-01-03,1045,175,9,30.65,0.07,256.5405,8,East,Credit Card,...,3,66,Female,Books,4.2,Not Returned,Neutral,14.75,,3.9
3,2023-01-04,1626,169,4,46.0,0.02,180.32,1,North,Credit Card,...,1,36,Male,Food,1.7,Returned,Positive,8.5,,3.4
4,2023-01-05,1875,104,4,80.0,0.29,227.2,6,South,Online,...,2,43,Other,Clothing,3.1,Not Returned,Negative,16.75,FREESHIP,1.8
5,2023-01-06,1256,162,8,44.49,0.04,341.6832,6,North,Online,...,2,63,Female,Clothing,2.1,Returned,Positive,,SAVE10,
6,2023-01-07,1176,120,9,24.52,0.26,163.3032,4,West,Online,...,6,19,Other,Clothing,1.6,Not Returned,Positive,12.81,,
7,2023-01-08,1766,159,1,31.22,0.18,25.6004,1,North,Cash,...,4,18,Other,Food,1.3,Not Returned,Neutral,13.83,WELCOME5,4.6
8,2023-01-09,1472,187,8,88.41,0.1,636.552,3,South,Cash,...,1,19,Female,Books,4.0,Not Returned,Neutral,12.12,,1.1
9,2023-01-10,1390,146,8,43.6,0.28,251.136,1,North,Cash,...,5,66,Male,Clothing,1.5,Returned,Negative,,,


<div class="alert alert-success">

**Solution 03:**
</div>

In [7]:
# 3. Data Cleaning Techniques
# Write a Pandas program to detect duplicates using duplicated() method.
duplicates = df.duplicated()
duplicates_df = df[duplicates]
duplicates_df

Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,...,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating
35,2023-01-25,1587,176,8,70.35,0.23,433.356,5,East,Credit Card,...,1,23,Male,Clothing,4.6,Not Returned,Negative,,FREESHIP,
36,2023-01-24,1915,111,7,52.26,0.13,318.2634,4,South,Credit Card,...,2,68,Female,Clothing,3.3,Returned,Neutral,7.38,,3.7
37,2023-01-01,1499,131,6,53.43,0.09,291.7278,9,South,Cash,...,2,36,Other,Electronics,3.9,Returned,Positive,8.49,,1.5
38,2023-01-03,1045,175,9,30.65,0.07,256.5405,8,East,Credit Card,...,3,66,Female,Books,4.2,Not Returned,Neutral,14.75,,3.9
39,2023-01-30,1673,174,3,49.1,0.2,117.84,4,West,Cash,...,5,40,Female,Food,4.9,Returned,Positive,15.87,FREESHIP,2.6


<div class="alert alert-warning">

**4. Handling Duplicates in Pandas**

**Write a Pandas program to remove duplicates rows from a DataFrame.**

</div>

In [8]:
# Create date range
date_range = pd.date_range(start='2023-01-01', periods=35, freq='D')

# Create a dictionary of data
data = {
    'date': date_range,
    'customer_id': np.random.randint(1000, 2000, size=35),
    'product_id': np.random.randint(100, 200, size=35),
    'quantity': np.random.randint(1, 10, size=35),
    'price': np.round(np.random.uniform(10.0, 100.0, size=35), 2),
    'discount': np.round(np.random.uniform(0, 0.3, size=35), 2),
    'revenue': lambda df: df['quantity'] * df['price'] * (1 - df['discount']),
    'store_id': np.random.randint(1, 10, size=35),
    'region': np.random.choice(['North', 'South', 'East', 'West'], size=35),
    'payment_method': np.random.choice(['Credit Card', 'Cash', 'Online'], size=35),
    'is_member': np.random.choice([True, False], size=35),
    'delivery_days': np.random.randint(1, 7, size=35),
    'customer_age': np.random.randint(18, 70, size=35),
    'gender': np.random.choice(['Male', 'Female', 'Other'], size=35),
    'product_category': np.random.choice(['Electronics', 'Clothing', 'Books', 'Food'], size=35),
    'rating': np.round(np.random.uniform(1.0, 5.0, size=35), 1),
    'return_status': np.random.choice(['Returned', 'Not Returned'], size=35),
    'feedback': np.random.choice(['Positive', 'Neutral', 'Negative'], size=35),
}

# Create DataFrame
df = pd.DataFrame(data)

# Compute revenue using the lambda function above
df['revenue'] = df['quantity'] * df['price'] * (1 - df['discount'])

# Add 'shipping_cost' with some missing values
df['shipping_cost'] = np.where(np.random.rand(len(df)) < 0.2, np.nan, np.round(np.random.uniform(5.0, 20.0, size=len(df)), 2))

# Add 'coupon_code' with some missing values
coupon_options = ['SAVE10', 'WELCOME5', 'FREESHIP', np.nan]
df['coupon_code'] = np.random.choice(coupon_options, size=len(df), p=[0.2, 0.2, 0.2, 0.4])

# Add 'delivery_rating' with some missing values
df['delivery_rating'] = np.where(np.random.rand(len(df)) < 0.3, np.nan, np.round(np.random.uniform(1.0, 5.0, size=len(df)), 1))


duplicates = df.sample(n=5, replace=False)  # Select 5 random rows without replacement
df = pd.concat([df, duplicates], ignore_index=True)

# Display the updated DataFrame
display(df)

Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,...,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating
0,2023-01-01,1771,106,5,64.55,0.27,235.6075,6,East,Cash,...,1,67,Male,Clothing,3.9,Returned,Neutral,16.17,,
1,2023-01-02,1814,152,6,55.81,0.15,284.631,8,East,Cash,...,1,60,Other,Books,2.0,Not Returned,Positive,14.19,WELCOME5,2.4
2,2023-01-03,1414,129,9,83.72,0.21,595.2492,4,North,Cash,...,5,40,Male,Clothing,4.7,Returned,Positive,18.58,,1.3
3,2023-01-04,1497,155,8,88.98,0.1,640.656,5,North,Online,...,4,35,Female,Clothing,2.1,Not Returned,Positive,6.77,,
4,2023-01-05,1631,197,4,78.09,0.06,293.6184,8,North,Credit Card,...,3,23,Male,Clothing,4.8,Not Returned,Neutral,8.16,FREESHIP,2.0
5,2023-01-06,1023,169,9,97.62,0.19,711.6498,2,North,Credit Card,...,4,62,Other,Electronics,4.6,Not Returned,Positive,17.69,WELCOME5,
6,2023-01-07,1344,161,1,23.48,0.24,17.8448,5,East,Credit Card,...,5,55,Other,Food,1.5,Returned,Neutral,6.45,,3.4
7,2023-01-08,1443,149,8,89.52,0.08,658.8672,6,South,Credit Card,...,5,48,Male,Books,1.6,Returned,Neutral,14.44,SAVE10,3.2
8,2023-01-09,1444,160,8,88.66,0.23,546.1456,3,South,Online,...,1,66,Female,Books,4.7,Returned,Neutral,15.73,SAVE10,
9,2023-01-10,1619,187,2,98.17,0.3,137.438,4,East,Cash,...,4,69,Male,Books,1.3,Returned,Positive,18.27,,1.3


<div class="alert alert-success">

**Solution 04:**
</div>

In [9]:
df.drop_duplicates()

Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,...,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating
0,2023-01-01,1771,106,5,64.55,0.27,235.6075,6,East,Cash,...,1,67,Male,Clothing,3.9,Returned,Neutral,16.17,,
1,2023-01-02,1814,152,6,55.81,0.15,284.631,8,East,Cash,...,1,60,Other,Books,2.0,Not Returned,Positive,14.19,WELCOME5,2.4
2,2023-01-03,1414,129,9,83.72,0.21,595.2492,4,North,Cash,...,5,40,Male,Clothing,4.7,Returned,Positive,18.58,,1.3
3,2023-01-04,1497,155,8,88.98,0.1,640.656,5,North,Online,...,4,35,Female,Clothing,2.1,Not Returned,Positive,6.77,,
4,2023-01-05,1631,197,4,78.09,0.06,293.6184,8,North,Credit Card,...,3,23,Male,Clothing,4.8,Not Returned,Neutral,8.16,FREESHIP,2.0
5,2023-01-06,1023,169,9,97.62,0.19,711.6498,2,North,Credit Card,...,4,62,Other,Electronics,4.6,Not Returned,Positive,17.69,WELCOME5,
6,2023-01-07,1344,161,1,23.48,0.24,17.8448,5,East,Credit Card,...,5,55,Other,Food,1.5,Returned,Neutral,6.45,,3.4
7,2023-01-08,1443,149,8,89.52,0.08,658.8672,6,South,Credit Card,...,5,48,Male,Books,1.6,Returned,Neutral,14.44,SAVE10,3.2
8,2023-01-09,1444,160,8,88.66,0.23,546.1456,3,South,Online,...,1,66,Female,Books,4.7,Returned,Neutral,15.73,SAVE10,
9,2023-01-10,1619,187,2,98.17,0.3,137.438,4,East,Cash,...,4,69,Male,Books,1.3,Returned,Positive,18.27,,1.3


In [10]:
# 4. Handling Duplicates in Pandas
# Write a Pandas program to remove duplicates rows from a DataFrame.

df = df.drop_duplicates()
df.reset_index(drop=True, inplace=True)

<div class="alert alert-warning">

**5. Handling Outliers with Z-Score Method**

**Write a Pandas program to handle outliers in a DataFrame with Z-Score method.**

</div>

In [11]:
# Create date range
date_range = pd.date_range(start='2023-01-01', periods=35, freq='D')

# Create a dictionary of data
data = {
    'date': date_range,
    'customer_id': np.random.randint(1000, 2000, size=35),
    'product_id': np.random.randint(100, 200, size=35),
    'quantity': np.random.randint(1, 10, size=35),
    'price': np.round(np.random.uniform(10.0, 10000.0, size=35), 2),
    'discount': np.round(np.random.uniform(0, 0.3, size=35), 2),
    'revenue': lambda df: df['quantity'] * df['price'] * (1 - df['discount']),
    'store_id': np.random.randint(1, 10, size=35),
    'region': np.random.choice(['North', 'South', 'East', 'West'], size=35),
    'payment_method': np.random.choice(['Credit Card', 'Cash', 'Online'], size=35),
    'is_member': np.random.choice([True, False], size=35),
    'delivery_days': np.random.randint(1, 7, size=35),
    'customer_age': np.random.randint(18, 70, size=35),
    'gender': np.random.choice(['Male', 'Female', 'Other'], size=35),
    'product_category': np.random.choice(['Electronics', 'Clothing', 'Books', 'Food'], size=35),
    'rating': np.round(np.random.uniform(1.0, 5.0, size=35), 1),
    'return_status': np.random.choice(['Returned', 'Not Returned'], size=35),
    'feedback': np.random.choice(['Positive', 'Neutral', 'Negative'], size=35),
}

# Create DataFrame
df = pd.DataFrame(data)

# Compute revenue using the lambda function above
df['revenue'] = df['quantity'] * df['price'] * (1 - df['discount'])

# Add 'shipping_cost' with some missing values
df['shipping_cost'] = np.where(np.random.rand(len(df)) < 0.2, np.nan, np.round(np.random.uniform(5.0, 20.0, size=len(df)), 2))

# Add 'coupon_code' with some missing values
coupon_options = ['SAVE10', 'WELCOME5', 'FREESHIP', np.nan]
df['coupon_code'] = np.random.choice(coupon_options, size=len(df), p=[0.2, 0.2, 0.2, 0.4])

# Add 'delivery_rating' with some missing values
df['delivery_rating'] = np.where(np.random.rand(len(df)) < 0.3, np.nan, np.round(np.random.uniform(1.0, 5.0, size=len(df)), 1))


duplicates = df.sample(n=5, replace=False)  # Select 5 random rows without replacement
df = pd.concat([df, duplicates], ignore_index=True)


# Display the updated DataFrame
display(df)

Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,...,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating
0,2023-01-01,1552,144,5,8733.34,0.14,37553.362,8,West,Online,...,3,67,Male,Books,3.2,Not Returned,Positive,11.98,,2.1
1,2023-01-02,1773,117,6,8408.4,0.07,46918.872,7,North,Credit Card,...,1,53,Other,Food,1.9,Returned,Positive,18.04,,4.9
2,2023-01-03,1462,103,5,3943.25,0.01,19519.0875,9,West,Online,...,1,33,Other,Books,3.3,Not Returned,Positive,,,3.0
3,2023-01-04,1875,166,6,6184.76,0.1,33397.704,7,East,Credit Card,...,1,20,Female,Books,3.6,Returned,Neutral,13.64,FREESHIP,
4,2023-01-05,1279,186,9,240.53,0.22,1688.5206,9,North,Credit Card,...,4,34,Other,Food,3.4,Not Returned,Negative,7.22,FREESHIP,
5,2023-01-06,1125,199,9,9033.93,0.01,80492.3163,4,West,Cash,...,4,55,Other,Books,3.4,Not Returned,Negative,13.61,SAVE10,2.6
6,2023-01-07,1698,104,5,8925.68,0.28,32132.448,1,East,Credit Card,...,4,65,Female,Food,3.2,Returned,Neutral,13.26,WELCOME5,2.0
7,2023-01-08,1752,160,5,1300.75,0.0,6503.75,3,West,Cash,...,3,22,Female,Clothing,2.5,Not Returned,Negative,16.18,FREESHIP,3.9
8,2023-01-09,1508,144,1,6648.23,0.14,5717.4778,8,East,Online,...,5,48,Other,Clothing,1.8,Not Returned,Negative,,SAVE10,4.8
9,2023-01-10,1896,157,7,5830.61,0.09,37140.9857,2,West,Online,...,2,42,Male,Food,3.1,Returned,Negative,,FREESHIP,


<div class="alert alert-success">

**Solution 05:**
</div>

In [12]:
# 5. Handling Outliers with Z-Score Method
# Write a Pandas program to handle outliers in a DataFrame with Z-Score method.

def handle_outliers_z_score(df, column):
    from scipy import stats
    z_scores = np.abs(stats.zscore(df[column]))
    threshold = 1.5
    return df[(z_scores < threshold)]

result_df = handle_outliers_z_score(df, 'price')

In [13]:
result_df

Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,...,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating
0,2023-01-01,1552,144,5,8733.34,0.14,37553.362,8,West,Online,...,3,67,Male,Books,3.2,Not Returned,Positive,11.98,,2.1
1,2023-01-02,1773,117,6,8408.4,0.07,46918.872,7,North,Credit Card,...,1,53,Other,Food,1.9,Returned,Positive,18.04,,4.9
2,2023-01-03,1462,103,5,3943.25,0.01,19519.0875,9,West,Online,...,1,33,Other,Books,3.3,Not Returned,Positive,,,3.0
3,2023-01-04,1875,166,6,6184.76,0.1,33397.704,7,East,Credit Card,...,1,20,Female,Books,3.6,Returned,Neutral,13.64,FREESHIP,
5,2023-01-06,1125,199,9,9033.93,0.01,80492.3163,4,West,Cash,...,4,55,Other,Books,3.4,Not Returned,Negative,13.61,SAVE10,2.6
6,2023-01-07,1698,104,5,8925.68,0.28,32132.448,1,East,Credit Card,...,4,65,Female,Food,3.2,Returned,Neutral,13.26,WELCOME5,2.0
8,2023-01-09,1508,144,1,6648.23,0.14,5717.4778,8,East,Online,...,5,48,Other,Clothing,1.8,Not Returned,Negative,,SAVE10,4.8
9,2023-01-10,1896,157,7,5830.61,0.09,37140.9857,2,West,Online,...,2,42,Male,Food,3.1,Returned,Negative,,FREESHIP,
10,2023-01-11,1174,143,2,7074.05,0.22,11035.518,9,South,Online,...,6,30,Male,Food,4.9,Not Returned,Negative,9.93,,2.3
11,2023-01-12,1297,164,8,8803.51,0.01,69723.7992,7,West,Cash,...,1,22,Male,Electronics,4.2,Returned,Positive,14.15,,4.7


<div class="alert alert-warning">

**6. Normalizing Data with Min-Max Scaling**

**Write a Pandas program that normalizes data with Min-Max scaling.**

</div>

In [14]:
# Create date range
date_range = pd.date_range(start='2023-01-01', periods=35, freq='D')

# Create a dictionary of data
data = {
    'date': date_range,
    'customer_id': np.random.randint(1000, 2000, size=35),
    'product_id': np.random.randint(100, 200, size=35),
    'quantity': np.random.randint(1, 10, size=35),
    'price': np.round(np.random.uniform(10.0, 10000.0, size=35), 2),
    'discount': np.round(np.random.uniform(0, 0.3, size=35), 2),
    'revenue': lambda df: df['quantity'] * df['price'] * (1 - df['discount']),
    'store_id': np.random.randint(1, 10, size=35),
    'region': np.random.choice(['North', 'South', 'East', 'West'], size=35),
    'payment_method': np.random.choice(['Credit Card', 'Cash', 'Online'], size=35),
    'is_member': np.random.choice([True, False], size=35),
    'delivery_days': np.random.randint(1, 7, size=35),
    'customer_age': np.random.randint(18, 70, size=35),
    'gender': np.random.choice(['Male', 'Female', 'Other'], size=35),
    'product_category': np.random.choice(['Electronics', 'Clothing', 'Books', 'Food'], size=35),
    'rating': np.round(np.random.uniform(1.0, 5.0, size=35), 1),
    'return_status': np.random.choice(['Returned', 'Not Returned'], size=35),
    'feedback': np.random.choice(['Positive', 'Neutral', 'Negative'], size=35),
}

# Create DataFrame
df = pd.DataFrame(data)

# Compute revenue using the lambda function above
df['revenue'] = df['quantity'] * df['price'] * (1 - df['discount'])

# Add 'shipping_cost' with some missing values
df['shipping_cost'] = np.where(np.random.rand(len(df)) < 0.2, np.nan, np.round(np.random.uniform(5.0, 20.0, size=len(df)), 2))

# Add 'coupon_code' with some missing values
coupon_options = ['SAVE10', 'WELCOME5', 'FREESHIP', np.nan]
df['coupon_code'] = np.random.choice(coupon_options, size=len(df), p=[0.2, 0.2, 0.2, 0.4])

# Add 'delivery_rating' with some missing values
df['delivery_rating'] = np.where(np.random.rand(len(df)) < 0.3, np.nan, np.round(np.random.uniform(1.0, 5.0, size=len(df)), 1))


duplicates = df.sample(n=5, replace=False)  # Select 5 random rows without replacement
df = pd.concat([df, duplicates], ignore_index=True)


# Display the updated DataFrame
display(df)

Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,...,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating
0,2023-01-01,1712,185,1,5259.93,0.11,4681.3377,6,East,Online,...,2,49,Other,Food,3.9,Not Returned,Negative,19.93,,
1,2023-01-02,1555,194,9,4070.12,0.29,26008.0668,3,South,Credit Card,...,3,54,Other,Food,3.3,Returned,Positive,,,4.6
2,2023-01-03,1134,101,8,6113.21,0.05,46460.396,9,East,Credit Card,...,6,52,Other,Electronics,3.4,Not Returned,Positive,14.97,,
3,2023-01-04,1065,127,1,8121.53,0.0,8121.53,5,North,Cash,...,5,49,Male,Food,4.0,Not Returned,Positive,18.75,,
4,2023-01-05,1475,120,6,827.18,0.21,3920.8332,5,West,Online,...,1,41,Other,Clothing,3.4,Returned,Negative,13.47,FREESHIP,
5,2023-01-06,1406,133,3,7058.6,0.12,18634.704,6,North,Online,...,2,22,Male,Books,3.9,Not Returned,Positive,19.1,FREESHIP,
6,2023-01-07,1316,172,9,5416.84,0.04,46801.4976,8,East,Credit Card,...,5,35,Male,Clothing,2.8,Not Returned,Negative,5.48,SAVE10,2.4
7,2023-01-08,1144,101,6,4350.42,0.15,22187.142,2,South,Credit Card,...,3,28,Female,Clothing,1.6,Not Returned,Negative,13.67,WELCOME5,2.9
8,2023-01-09,1385,136,4,8116.38,0.26,24024.4848,6,North,Cash,...,1,55,Female,Electronics,4.2,Not Returned,Neutral,8.03,WELCOME5,4.0
9,2023-01-10,1470,177,2,1638.87,0.17,2720.5242,5,South,Credit Card,...,5,33,Other,Books,1.7,Returned,Positive,16.81,,


<div class="alert alert-success">

**Solution 06:**
</div>

In [15]:
# 6. Normalizing Data with Min-Max Scaling
# Write a Pandas program that normalizes data with Min-Max scaling.

def normalize_min_max(df, column):
    min_val = df[column].min()
    max_val = df[column].max()
    df[column] = (df[column] - min_val) / (max_val - min_val)
    return df

result_df = normalize_min_max(df.copy(), 'price')

result_df


Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,...,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating
0,2023-01-01,1712,185,1,0.578847,0.11,4681.3377,6,East,Online,...,2,49,Other,Food,3.9,Not Returned,Negative,19.93,,
1,2023-01-02,1555,194,9,0.43703,0.29,26008.0668,3,South,Credit Card,...,3,54,Other,Food,3.3,Returned,Positive,,,4.6
2,2023-01-03,1134,101,8,0.680552,0.05,46460.396,9,East,Credit Card,...,6,52,Other,Electronics,3.4,Not Returned,Positive,14.97,,
3,2023-01-04,1065,127,1,0.919931,0.0,8121.53,5,North,Cash,...,5,49,Male,Food,4.0,Not Returned,Positive,18.75,,
4,2023-01-05,1475,120,6,0.050493,0.21,3920.8332,5,West,Online,...,1,41,Other,Clothing,3.4,Returned,Negative,13.47,FREESHIP,
5,2023-01-06,1406,133,3,0.793236,0.12,18634.704,6,North,Online,...,2,22,Male,Books,3.9,Not Returned,Positive,19.1,FREESHIP,
6,2023-01-07,1316,172,9,0.59755,0.04,46801.4976,8,East,Credit Card,...,5,35,Male,Clothing,2.8,Not Returned,Negative,5.48,SAVE10,2.4
7,2023-01-08,1144,101,6,0.470439,0.15,22187.142,2,South,Credit Card,...,3,28,Female,Clothing,1.6,Not Returned,Negative,13.67,WELCOME5,2.9
8,2023-01-09,1385,136,4,0.919317,0.26,24024.4848,6,North,Cash,...,1,55,Female,Electronics,4.2,Not Returned,Neutral,8.03,WELCOME5,4.0
9,2023-01-10,1470,177,2,0.147241,0.17,2720.5242,5,South,Credit Card,...,5,33,Other,Books,1.7,Returned,Positive,16.81,,


<div class="alert alert-warning">

**7. Binning Data into Categories**

**Write a Pandas program to bin data into categories.**

</div>

In [16]:
# Create date range
date_range = pd.date_range(start='2023-01-01', periods=35, freq='D')

# Create a dictionary of data
data = {
    'date': date_range,
    'customer_id': np.random.randint(1000, 2000, size=35),
    'product_id': np.random.randint(100, 200, size=35),
    'quantity': np.random.randint(1, 10, size=35),
    'price': np.round(np.random.uniform(10.0, 10000.0, size=35), 2),
    'discount': np.round(np.random.uniform(0, 0.3, size=35), 2),
    'revenue': lambda df: df['quantity'] * df['price'] * (1 - df['discount']),
    'store_id': np.random.randint(1, 10, size=35),
    'region': np.random.choice(['North', 'South', 'East', 'West'], size=35),
    'payment_method': np.random.choice(['Credit Card', 'Cash', 'Online'], size=35),
    'is_member': np.random.choice([True, False], size=35),
    'delivery_days': np.random.randint(1, 7, size=35),
    'customer_age': np.random.randint(18, 70, size=35),
    'gender': np.random.choice(['Male', 'Female', 'Other'], size=35),
    'product_category': np.random.choice(['Electronics', 'Clothing', 'Books', 'Food'], size=35),
    'rating': np.round(np.random.uniform(1.0, 5.0, size=35), 1),
    'return_status': np.random.choice(['Returned', 'Not Returned'], size=35),
    'feedback': np.random.choice(['Positive', 'Neutral', 'Negative'], size=35),
}

# Create DataFrame
df = pd.DataFrame(data)

# Compute revenue using the lambda function above
df['revenue'] = df['quantity'] * df['price'] * (1 - df['discount'])

# Add 'shipping_cost' with some missing values
df['shipping_cost'] = np.where(np.random.rand(len(df)) < 0.2, np.nan, np.round(np.random.uniform(5.0, 20.0, size=len(df)), 2))

# Add 'coupon_code' with some missing values
coupon_options = ['SAVE10', 'WELCOME5', 'FREESHIP', np.nan]
df['coupon_code'] = np.random.choice(coupon_options, size=len(df), p=[0.2, 0.2, 0.2, 0.4])

# Add 'delivery_rating' with some missing values
df['delivery_rating'] = np.where(np.random.rand(len(df)) < 0.3, np.nan, np.round(np.random.uniform(1.0, 5.0, size=len(df)), 1))


duplicates = df.sample(n=5, replace=False)  # Select 5 random rows without replacement
df = pd.concat([df, duplicates], ignore_index=True)


# Display the updated DataFrame
display(df)

Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,...,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating
0,2023-01-01,1177,110,5,378.22,0.23,1456.147,8,East,Cash,...,6,55,Male,Books,1.9,Returned,Positive,6.14,SAVE10,2.2
1,2023-01-02,1440,154,8,2206.7,0.0,17653.6,6,South,Credit Card,...,4,35,Other,Electronics,2.2,Not Returned,Negative,14.13,WELCOME5,
2,2023-01-03,1993,105,9,3250.09,0.15,24863.1885,6,West,Online,...,6,35,Male,Clothing,2.9,Returned,Negative,6.52,,4.0
3,2023-01-04,1703,104,1,2735.25,0.22,2133.495,4,West,Credit Card,...,5,61,Male,Electronics,1.0,Not Returned,Neutral,12.26,WELCOME5,
4,2023-01-05,1294,104,9,9515.46,0.14,73649.6604,2,East,Credit Card,...,6,49,Male,Books,3.8,Not Returned,Negative,7.08,,
5,2023-01-06,1888,199,9,7838.53,0.09,64197.5607,5,West,Credit Card,...,2,48,Male,Food,4.0,Not Returned,Positive,7.95,WELCOME5,2.2
6,2023-01-07,1397,126,4,5005.51,0.03,19421.3788,7,East,Online,...,5,25,Female,Electronics,3.0,Returned,Positive,10.98,,
7,2023-01-08,1077,183,1,4183.08,0.14,3597.4488,3,West,Online,...,3,50,Other,Electronics,3.1,Not Returned,Neutral,13.03,WELCOME5,4.8
8,2023-01-09,1153,126,1,9810.7,0.22,7652.346,4,North,Credit Card,...,3,54,Female,Clothing,1.3,Not Returned,Neutral,,FREESHIP,2.0
9,2023-01-10,1591,141,9,8502.35,0.23,58921.2855,3,East,Credit Card,...,5,20,Female,Clothing,3.2,Returned,Neutral,,WELCOME5,


<div class="alert alert-success">

**Solution 07:**
</div>

In [17]:
# binning price
bins = [0, 100, 500, 1000, 5000, 10000]
labels = ['0-100', '100-500', '500-1000', '1000-5000', '5000-10000']
df['price_binned'] = pd.cut(
    df['price']
    , bins=bins
    , labels=labels
    , right=False
)

df['price_binned_q'] = pd.qcut(
    df['price']
    , q=[0, 0.2, 0.4, 0.6, 0.8, 1.0], labels=['Very Low', 'Low', 'Medium', 'High', 'Very High']
    , duplicates='drop'
)
df['price_binned_q_no_labels'] = pd.qcut(
    df['price']
    , q=[0, 0.2, 0.4, 0.6, 0.8, 1.0]
    , labels=False
)

df

Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,...,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating,price_binned,price_binned_q,price_binned_q_no_labels
0,2023-01-01,1177,110,5,378.22,0.23,1456.147,8,East,Cash,...,Books,1.9,Returned,Positive,6.14,SAVE10,2.2,100-500,Very Low,0
1,2023-01-02,1440,154,8,2206.7,0.0,17653.6,6,South,Credit Card,...,Electronics,2.2,Not Returned,Negative,14.13,WELCOME5,,1000-5000,Very Low,0
2,2023-01-03,1993,105,9,3250.09,0.15,24863.1885,6,West,Online,...,Clothing,2.9,Returned,Negative,6.52,,4.0,1000-5000,Low,1
3,2023-01-04,1703,104,1,2735.25,0.22,2133.495,4,West,Credit Card,...,Electronics,1.0,Not Returned,Neutral,12.26,WELCOME5,,1000-5000,Low,1
4,2023-01-05,1294,104,9,9515.46,0.14,73649.6604,2,East,Credit Card,...,Books,3.8,Not Returned,Negative,7.08,,,5000-10000,Very High,4
5,2023-01-06,1888,199,9,7838.53,0.09,64197.5607,5,West,Credit Card,...,Food,4.0,Not Returned,Positive,7.95,WELCOME5,2.2,5000-10000,High,3
6,2023-01-07,1397,126,4,5005.51,0.03,19421.3788,7,East,Online,...,Electronics,3.0,Returned,Positive,10.98,,,5000-10000,Medium,2
7,2023-01-08,1077,183,1,4183.08,0.14,3597.4488,3,West,Online,...,Electronics,3.1,Not Returned,Neutral,13.03,WELCOME5,4.8,1000-5000,Low,1
8,2023-01-09,1153,126,1,9810.7,0.22,7652.346,4,North,Credit Card,...,Clothing,1.3,Not Returned,Neutral,,FREESHIP,2.0,5000-10000,Very High,4
9,2023-01-10,1591,141,9,8502.35,0.23,58921.2855,3,East,Credit Card,...,Clothing,3.2,Returned,Neutral,,WELCOME5,,5000-10000,Very High,4


<div class="alert alert-warning">

**8. Converting Data Types and Column Operations**

**Write a Pandas program that handles text data with str.replace().**

</div>

In [18]:
# Create date range
date_range = pd.date_range(start='2023-01-01', periods=35, freq='D')

# Create a dictionary of data
data = {
    'date': date_range,
    'customer_id': np.random.randint(1000, 2000, size=35),
    'product_id': np.random.randint(100, 200, size=35),
    'quantity': np.random.randint(1, 10, size=35),
    'price': np.round(np.random.uniform(10.0, 10000.0, size=35), 2),
    'discount': np.round(np.random.uniform(0, 0.3, size=35), 2),
    'revenue': lambda df: df['quantity'] * df['price'] * (1 - df['discount']),
    'store_id': np.random.randint(1, 10, size=35),
    'region': np.random.choice(['North', 'South', 'East', 'West'], size=35),
    'payment_method': np.random.choice(['Credit Card', 'Cash', 'Online'], size=35),
    'is_member': np.random.choice([True, False], size=35),
    'delivery_days': np.random.randint(1, 7, size=35),
    'customer_age': np.random.randint(18, 70, size=35),
    'gender': np.random.choice(['Male', 'Female', 'Other'], size=35),
    'product_category': np.random.choice(['Electronics', 'Clothing', 'Books', 'Food'], size=35),
    'rating': np.round(np.random.uniform(1.0, 5.0, size=35), 1),
    'return_status': np.random.choice(['Returned', 'Not Returned'], size=35),
    'feedback': np.random.choice(['Positive', 'Neutral', 'Negative'], size=35),
}

# Create DataFrame
df = pd.DataFrame(data)

# Compute revenue using the lambda function above
df['revenue'] = df['quantity'] * df['price'] * (1 - df['discount'])

# Add 'shipping_cost' with some missing values
df['shipping_cost'] = np.where(np.random.rand(len(df)) < 0.2, np.nan, np.round(np.random.uniform(5.0, 20.0, size=len(df)), 2))

# Add 'coupon_code' with some missing values
coupon_options = ['SAVE10', 'WELCOME5', 'FREESHIP', np.nan]
df['coupon_code'] = np.random.choice(coupon_options, size=len(df), p=[0.2, 0.2, 0.2, 0.4])

# Add 'delivery_rating' with some missing values
df['delivery_rating'] = np.where(np.random.rand(len(df)) < 0.3, np.nan, np.round(np.random.uniform(1.0, 5.0, size=len(df)), 1))


duplicates = df.sample(n=5, replace=False)  # Select 5 random rows without replacement
df = pd.concat([df, duplicates], ignore_index=True)


# Display the updated DataFrame
display(df)

Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,...,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating
0,2023-01-01,1358,101,3,6115.88,0.02,17980.6872,5,West,Online,...,5,27,Male,Clothing,2.4,Returned,Positive,8.28,,4.8
1,2023-01-02,1435,134,9,3928.43,0.26,26163.3438,7,South,Online,...,5,30,Female,Clothing,2.5,Not Returned,Neutral,15.77,,1.6
2,2023-01-03,1758,111,8,562.62,0.05,4275.912,4,East,Online,...,4,42,Male,Food,1.4,Returned,Neutral,5.73,,2.6
3,2023-01-04,1735,165,6,1136.93,0.1,6139.422,3,West,Online,...,3,52,Male,Books,2.0,Not Returned,Positive,9.54,WELCOME5,2.8
4,2023-01-05,1119,115,3,6199.44,0.1,16738.488,8,West,Cash,...,1,44,Female,Clothing,2.1,Returned,Positive,6.92,FREESHIP,1.5
5,2023-01-06,1270,182,1,8738.49,0.02,8563.7202,7,North,Credit Card,...,1,68,Male,Food,2.6,Returned,Negative,5.75,WELCOME5,2.6
6,2023-01-07,1395,137,6,2520.14,0.13,13155.1308,2,North,Online,...,6,21,Male,Clothing,1.2,Returned,Neutral,,WELCOME5,1.5
7,2023-01-08,1273,130,8,3210.69,0.05,24401.244,2,North,Online,...,4,23,Other,Food,1.3,Not Returned,Neutral,10.68,WELCOME5,
8,2023-01-09,1612,157,3,4347.62,0.17,10825.5738,4,South,Credit Card,...,2,59,Female,Books,2.6,Not Returned,Neutral,15.68,,
9,2023-01-10,1222,179,4,8073.41,0.29,22928.4844,9,North,Credit Card,...,3,18,Other,Books,4.9,Not Returned,Neutral,,SAVE10,


<div class="alert alert-success">

**Solution 08:**
</div>

In [19]:
# 8. Converting Data Types and Column Operations
# Write a Pandas program that handles text data with str.replace().

df['region_arrows'] = df['region'].str.replace('North', '↑')
df['region_arrows'] = df['region_arrows'].str.replace('South', '↓')
df['region_arrows'] = df['region_arrows'].str.replace('East', '→')
df['region_arrows'] = df['region_arrows'].str.replace('West', '←')

df

Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,...,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating,region_arrows
0,2023-01-01,1358,101,3,6115.88,0.02,17980.6872,5,West,Online,...,27,Male,Clothing,2.4,Returned,Positive,8.28,,4.8,←
1,2023-01-02,1435,134,9,3928.43,0.26,26163.3438,7,South,Online,...,30,Female,Clothing,2.5,Not Returned,Neutral,15.77,,1.6,↓
2,2023-01-03,1758,111,8,562.62,0.05,4275.912,4,East,Online,...,42,Male,Food,1.4,Returned,Neutral,5.73,,2.6,→
3,2023-01-04,1735,165,6,1136.93,0.1,6139.422,3,West,Online,...,52,Male,Books,2.0,Not Returned,Positive,9.54,WELCOME5,2.8,←
4,2023-01-05,1119,115,3,6199.44,0.1,16738.488,8,West,Cash,...,44,Female,Clothing,2.1,Returned,Positive,6.92,FREESHIP,1.5,←
5,2023-01-06,1270,182,1,8738.49,0.02,8563.7202,7,North,Credit Card,...,68,Male,Food,2.6,Returned,Negative,5.75,WELCOME5,2.6,↑
6,2023-01-07,1395,137,6,2520.14,0.13,13155.1308,2,North,Online,...,21,Male,Clothing,1.2,Returned,Neutral,,WELCOME5,1.5,↑
7,2023-01-08,1273,130,8,3210.69,0.05,24401.244,2,North,Online,...,23,Other,Food,1.3,Not Returned,Neutral,10.68,WELCOME5,,↑
8,2023-01-09,1612,157,3,4347.62,0.17,10825.5738,4,South,Credit Card,...,59,Female,Books,2.6,Not Returned,Neutral,15.68,,,↓
9,2023-01-10,1222,179,4,8073.41,0.29,22928.4844,9,North,Credit Card,...,18,Other,Books,4.9,Not Returned,Neutral,,SAVE10,,↑
