[Pandas Data Cleaning and Preprocessing](https://www.w3resource.com/python-exercises/pandas/pandas-data-cleaning-and-preprocessing.php)

In [1]:
import pandas as pd
import numpy as np

<div class="alert alert-warning">

**1. Handling Missing Data in Pandas**

**Write a Pandas program to fill missing values (NaN) in a DataFrame using fillna().**

</div>

In [2]:
# Create date range
date_range = pd.date_range(start='2023-01-01', periods=35, freq='D')

# Create a dictionary of data
data = {
    'date': date_range,
    'customer_id': np.random.randint(1000, 2000, size=35),
    'product_id': np.random.randint(100, 200, size=35),
    'quantity': np.random.randint(1, 10, size=35),
    'price': np.round(np.random.uniform(10.0, 100.0, size=35), 2),
    'discount': np.round(np.random.uniform(0, 0.3, size=35), 2),
    'revenue': lambda df: df['quantity'] * df['price'] * (1 - df['discount']),
    'store_id': np.random.randint(1, 10, size=35),
    'region': np.random.choice(['North', 'South', 'East', 'West'], size=35),
    'payment_method': np.random.choice(['Credit Card', 'Cash', 'Online'], size=35),
    'is_member': np.random.choice([True, False], size=35),
    'delivery_days': np.random.randint(1, 7, size=35),
    'customer_age': np.random.randint(18, 70, size=35),
    'gender': np.random.choice(['Male', 'Female', 'Other'], size=35),
    'product_category': np.random.choice(['Electronics', 'Clothing', 'Books', 'Food'], size=35),
    'rating': np.round(np.random.uniform(1.0, 5.0, size=35), 1),
    'return_status': np.random.choice(['Returned', 'Not Returned'], size=35),
    'feedback': np.random.choice(['Positive', 'Neutral', 'Negative'], size=35),
}

# Create DataFrame
df = pd.DataFrame(data)

# Compute revenue using the lambda function above
df['revenue'] = df['quantity'] * df['price'] * (1 - df['discount'])

# Add 'shipping_cost' with some missing values
df['shipping_cost'] = np.where(np.random.rand(len(df)) < 0.2, np.nan, np.round(np.random.uniform(5.0, 20.0, size=len(df)), 2))

# Add 'coupon_code' with some missing values
coupon_options = ['SAVE10', 'WELCOME5', 'FREESHIP', np.nan]
df['coupon_code'] = np.random.choice(coupon_options, size=len(df), p=[0.2, 0.2, 0.2, 0.4])

# Add 'delivery_rating' with some missing values
df['delivery_rating'] = np.where(np.random.rand(len(df)) < 0.3, np.nan, np.round(np.random.uniform(1.0, 5.0, size=len(df)), 1))

# Display the updated DataFrame
display(df)

Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,...,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating
0,2023-01-01,1131,100,9,32.55,0.12,257.796,1,North,Cash,...,2,42,Female,Clothing,2.5,Returned,Positive,10.65,FREESHIP,
1,2023-01-02,1214,161,1,23.39,0.17,19.4137,3,East,Credit Card,...,5,18,Male,Clothing,4.8,Not Returned,Positive,7.89,,
2,2023-01-03,1533,163,4,54.45,0.06,204.732,9,South,Online,...,3,69,Female,Electronics,2.2,Returned,Neutral,18.38,SAVE10,4.3
3,2023-01-04,1291,186,2,54.57,0.09,99.3174,8,South,Cash,...,2,27,Female,Food,3.2,Not Returned,Positive,14.46,,3.5
4,2023-01-05,1002,105,3,63.4,0.01,188.298,4,East,Cash,...,4,57,Other,Clothing,4.5,Not Returned,Neutral,16.97,,1.6
5,2023-01-06,1068,108,9,41.79,0.16,315.9324,2,South,Cash,...,4,66,Other,Books,2.2,Not Returned,Positive,8.57,,4.5
6,2023-01-07,1402,162,4,97.49,0.22,304.1688,6,East,Online,...,3,47,Male,Food,1.7,Not Returned,Negative,15.54,SAVE10,3.8
7,2023-01-08,1752,107,7,98.2,0.08,632.408,7,East,Credit Card,...,6,39,Female,Food,1.7,Not Returned,Negative,16.63,WELCOME5,
8,2023-01-09,1186,145,8,34.0,0.17,225.76,9,North,Credit Card,...,3,54,Female,Books,4.2,Returned,Positive,9.62,FREESHIP,5.0
9,2023-01-10,1944,195,5,37.2,0.13,161.82,4,North,Online,...,5,53,Female,Electronics,1.8,Returned,Positive,9.22,SAVE10,2.8


<div class="alert alert-success">

**Solution 01:**
</div>

In [3]:
# 1. Handling Missing Data in Pandas
# Write a Pandas program to fill missing values (NaN) in a DataFrame using fillna().
result_df = df.fillna({
    'shipping_cost': df['shipping_cost'].mean(),
    'coupon_code': 'No Coupon',
    'delivery_rating': df['delivery_rating'].mean()
})

result_df

Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,...,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating
0,2023-01-01,1131,100,9,32.55,0.12,257.796,1,North,Cash,...,2,42,Female,Clothing,2.5,Returned,Positive,10.65,FREESHIP,3.366667
1,2023-01-02,1214,161,1,23.39,0.17,19.4137,3,East,Credit Card,...,5,18,Male,Clothing,4.8,Not Returned,Positive,7.89,,3.366667
2,2023-01-03,1533,163,4,54.45,0.06,204.732,9,South,Online,...,3,69,Female,Electronics,2.2,Returned,Neutral,18.38,SAVE10,4.3
3,2023-01-04,1291,186,2,54.57,0.09,99.3174,8,South,Cash,...,2,27,Female,Food,3.2,Not Returned,Positive,14.46,,3.5
4,2023-01-05,1002,105,3,63.4,0.01,188.298,4,East,Cash,...,4,57,Other,Clothing,4.5,Not Returned,Neutral,16.97,,1.6
5,2023-01-06,1068,108,9,41.79,0.16,315.9324,2,South,Cash,...,4,66,Other,Books,2.2,Not Returned,Positive,8.57,,4.5
6,2023-01-07,1402,162,4,97.49,0.22,304.1688,6,East,Online,...,3,47,Male,Food,1.7,Not Returned,Negative,15.54,SAVE10,3.8
7,2023-01-08,1752,107,7,98.2,0.08,632.408,7,East,Credit Card,...,6,39,Female,Food,1.7,Not Returned,Negative,16.63,WELCOME5,3.366667
8,2023-01-09,1186,145,8,34.0,0.17,225.76,9,North,Credit Card,...,3,54,Female,Books,4.2,Returned,Positive,9.62,FREESHIP,5.0
9,2023-01-10,1944,195,5,37.2,0.13,161.82,4,North,Online,...,5,53,Female,Electronics,1.8,Returned,Positive,9.22,SAVE10,2.8


<div class="alert alert-warning">

**2. Removing Missing Data observations in Pandas**

**Write a Pandas program to drop rows with missing data.**

</div>

In [4]:
# Create date range
date_range = pd.date_range(start='2023-01-01', periods=35, freq='D')

# Create a dictionary of data
data = {
    'date': date_range,
    'customer_id': np.random.randint(1000, 2000, size=35),
    'product_id': np.random.randint(100, 200, size=35),
    'quantity': np.random.randint(1, 10, size=35),
    'price': np.round(np.random.uniform(10.0, 100.0, size=35), 2),
    'discount': np.round(np.random.uniform(0, 0.3, size=35), 2),
    'revenue': lambda df: df['quantity'] * df['price'] * (1 - df['discount']),
    'store_id': np.random.randint(1, 10, size=35),
    'region': np.random.choice(['North', 'South', 'East', 'West'], size=35),
    'payment_method': np.random.choice(['Credit Card', 'Cash', 'Online'], size=35),
    'is_member': np.random.choice([True, False], size=35),
    'delivery_days': np.random.randint(1, 7, size=35),
    'customer_age': np.random.randint(18, 70, size=35),
    'gender': np.random.choice(['Male', 'Female', 'Other'], size=35),
    'product_category': np.random.choice(['Electronics', 'Clothing', 'Books', 'Food'], size=35),
    'rating': np.round(np.random.uniform(1.0, 5.0, size=35), 1),
    'return_status': np.random.choice(['Returned', 'Not Returned'], size=35),
    'feedback': np.random.choice(['Positive', 'Neutral', 'Negative'], size=35),
}

# Create DataFrame
df = pd.DataFrame(data)

# Compute revenue using the lambda function above
df['revenue'] = df['quantity'] * df['price'] * (1 - df['discount'])

# Add 'shipping_cost' with some missing values
df['shipping_cost'] = np.where(np.random.rand(len(df)) < 0.2, np.nan, np.round(np.random.uniform(5.0, 20.0, size=len(df)), 2))

# Add 'coupon_code' with some missing values
coupon_options = ['SAVE10', 'WELCOME5', 'FREESHIP', np.nan]
df['coupon_code'] = np.random.choice(coupon_options, size=len(df), p=[0.2, 0.2, 0.2, 0.4])

# Add 'delivery_rating' with some missing values
df['delivery_rating'] = np.where(np.random.rand(len(df)) < 0.3, np.nan, np.round(np.random.uniform(1.0, 5.0, size=len(df)), 1))

# Display the updated DataFrame
display(df)

Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,...,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating
0,2023-01-01,1968,144,1,47.22,0.14,40.6092,4,North,Cash,...,5,56,Female,Electronics,2.9,Returned,Neutral,12.28,,
1,2023-01-02,1157,133,2,57.54,0.12,101.2704,8,West,Online,...,5,19,Other,Books,3.7,Returned,Neutral,13.14,,
2,2023-01-03,1199,191,7,42.2,0.13,256.998,2,South,Online,...,5,66,Female,Electronics,2.3,Returned,Positive,10.13,SAVE10,
3,2023-01-04,1945,166,5,27.04,0.13,117.624,2,South,Online,...,5,66,Male,Books,1.7,Returned,Neutral,,SAVE10,5.0
4,2023-01-05,1543,135,8,82.99,0.13,577.6104,7,North,Credit Card,...,5,33,Female,Books,3.4,Not Returned,Negative,14.88,,1.3
5,2023-01-06,1682,168,9,51.98,0.27,341.5086,8,South,Cash,...,2,54,Other,Clothing,1.5,Returned,Positive,14.34,WELCOME5,1.6
6,2023-01-07,1369,131,3,29.25,0.12,77.22,5,West,Online,...,1,24,Male,Clothing,1.7,Returned,Negative,9.9,SAVE10,1.2
7,2023-01-08,1906,114,8,82.9,0.12,583.616,9,South,Online,...,5,22,Male,Food,1.8,Returned,Negative,17.09,,
8,2023-01-09,1049,116,2,57.15,0.28,82.296,5,East,Online,...,4,22,Male,Clothing,3.0,Returned,Neutral,14.6,WELCOME5,
9,2023-01-10,1798,146,9,85.53,0.21,608.1183,2,South,Cash,...,6,22,Male,Books,3.8,Not Returned,Negative,14.6,SAVE10,


<div class="alert alert-success">

**Solution 02:**
</div>

In [5]:
# 2. Handling Duplicates in Pandas
# Write a Pandas program to drop rows with missing data.

result_df = df.dropna()
result_df

Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,...,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating
4,2023-01-05,1543,135,8,82.99,0.13,577.6104,7,North,Credit Card,...,5,33,Female,Books,3.4,Not Returned,Negative,14.88,,1.3
5,2023-01-06,1682,168,9,51.98,0.27,341.5086,8,South,Cash,...,2,54,Other,Clothing,1.5,Returned,Positive,14.34,WELCOME5,1.6
6,2023-01-07,1369,131,3,29.25,0.12,77.22,5,West,Online,...,1,24,Male,Clothing,1.7,Returned,Negative,9.9,SAVE10,1.2
10,2023-01-11,1590,165,9,70.75,0.22,496.665,7,West,Online,...,4,24,Male,Books,4.2,Returned,Negative,12.25,SAVE10,4.0
11,2023-01-12,1190,198,7,54.14,0.19,306.9738,8,North,Cash,...,1,48,Female,Books,3.8,Returned,Neutral,13.28,FREESHIP,3.2
14,2023-01-15,1424,177,5,91.56,0.27,334.194,6,East,Online,...,3,29,Female,Clothing,1.5,Not Returned,Positive,9.26,SAVE10,3.0
18,2023-01-19,1956,142,2,27.45,0.26,40.626,5,East,Credit Card,...,5,41,Female,Electronics,4.1,Not Returned,Neutral,15.72,SAVE10,4.5
20,2023-01-21,1749,135,6,36.07,0.17,179.6286,8,South,Online,...,5,43,Female,Books,4.4,Not Returned,Negative,8.54,,4.7
28,2023-01-29,1537,163,7,86.74,0.25,455.385,7,West,Online,...,1,55,Female,Clothing,3.4,Returned,Positive,16.93,SAVE10,1.2
29,2023-01-30,1722,113,9,67.59,0.13,529.2297,4,East,Credit Card,...,6,46,Female,Electronics,4.4,Not Returned,Neutral,9.5,FREESHIP,4.1


<div class="alert alert-warning">

**3. Detecting Duplicates in Pandas**

**Write a Pandas program to detect duplicates using duplicated() method.**

</div>

In [6]:
# Create date range
date_range = pd.date_range(start='2023-01-01', periods=35, freq='D')

# Create a dictionary of data
data = {
    'date': date_range,
    'customer_id': np.random.randint(1000, 2000, size=35),
    'product_id': np.random.randint(100, 200, size=35),
    'quantity': np.random.randint(1, 10, size=35),
    'price': np.round(np.random.uniform(10.0, 100.0, size=35), 2),
    'discount': np.round(np.random.uniform(0, 0.3, size=35), 2),
    'revenue': lambda df: df['quantity'] * df['price'] * (1 - df['discount']),
    'store_id': np.random.randint(1, 10, size=35),
    'region': np.random.choice(['North', 'South', 'East', 'West'], size=35),
    'payment_method': np.random.choice(['Credit Card', 'Cash', 'Online'], size=35),
    'is_member': np.random.choice([True, False], size=35),
    'delivery_days': np.random.randint(1, 7, size=35),
    'customer_age': np.random.randint(18, 70, size=35),
    'gender': np.random.choice(['Male', 'Female', 'Other'], size=35),
    'product_category': np.random.choice(['Electronics', 'Clothing', 'Books', 'Food'], size=35),
    'rating': np.round(np.random.uniform(1.0, 5.0, size=35), 1),
    'return_status': np.random.choice(['Returned', 'Not Returned'], size=35),
    'feedback': np.random.choice(['Positive', 'Neutral', 'Negative'], size=35),
}

# Create DataFrame
df = pd.DataFrame(data)

# Compute revenue using the lambda function above
df['revenue'] = df['quantity'] * df['price'] * (1 - df['discount'])

# Add 'shipping_cost' with some missing values
df['shipping_cost'] = np.where(np.random.rand(len(df)) < 0.2, np.nan, np.round(np.random.uniform(5.0, 20.0, size=len(df)), 2))

# Add 'coupon_code' with some missing values
coupon_options = ['SAVE10', 'WELCOME5', 'FREESHIP', np.nan]
df['coupon_code'] = np.random.choice(coupon_options, size=len(df), p=[0.2, 0.2, 0.2, 0.4])

# Add 'delivery_rating' with some missing values
df['delivery_rating'] = np.where(np.random.rand(len(df)) < 0.3, np.nan, np.round(np.random.uniform(1.0, 5.0, size=len(df)), 1))


duplicates = df.sample(n=5, replace=False)  # Select 5 random rows without replacement
df = pd.concat([df, duplicates], ignore_index=True)

# Display the updated DataFrame
display(df)

Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,...,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating
0,2023-01-01,1740,192,4,19.18,0.1,69.048,2,West,Cash,...,4,55,Female,Electronics,1.2,Returned,Positive,8.6,SAVE10,4.8
1,2023-01-02,1072,146,9,99.64,0.18,735.3432,9,East,Online,...,4,26,Male,Books,1.4,Not Returned,Positive,,SAVE10,
2,2023-01-03,1468,122,1,60.9,0.04,58.464,3,East,Credit Card,...,1,26,Other,Electronics,3.5,Returned,Positive,12.78,SAVE10,2.1
3,2023-01-04,1676,172,5,60.7,0.2,242.8,7,East,Online,...,4,58,Male,Books,2.5,Not Returned,Negative,17.48,,
4,2023-01-05,1827,181,5,37.79,0.27,137.9335,4,South,Cash,...,1,28,Female,Clothing,2.9,Returned,Neutral,14.45,WELCOME5,1.3
5,2023-01-06,1626,184,5,92.45,0.05,439.1375,9,South,Credit Card,...,6,51,Other,Electronics,1.6,Not Returned,Neutral,10.06,WELCOME5,
6,2023-01-07,1957,180,6,81.94,0.09,447.3924,5,East,Online,...,3,35,Female,Clothing,1.1,Not Returned,Negative,17.13,SAVE10,
7,2023-01-08,1059,115,4,62.37,0.07,232.0164,9,East,Cash,...,6,29,Female,Electronics,2.2,Not Returned,Neutral,17.37,WELCOME5,
8,2023-01-09,1464,166,6,27.92,0.07,155.7936,9,East,Cash,...,1,27,Male,Electronics,1.2,Not Returned,Neutral,15.07,SAVE10,3.0
9,2023-01-10,1039,102,6,16.87,0.29,71.8662,9,East,Online,...,1,36,Other,Books,1.7,Not Returned,Neutral,10.47,,4.2


<div class="alert alert-success">

**Solution 03:**
</div>

In [7]:
# 3. Data Cleaning Techniques
# Write a Pandas program to detect duplicates using duplicated() method.
duplicates = df.duplicated()
duplicates_df = df[duplicates]
duplicates_df

Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,...,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating
35,2023-01-17,1737,123,9,10.64,0.16,80.4384,4,North,Credit Card,...,1,68,Female,Books,1.3,Not Returned,Neutral,7.92,,4.0
36,2023-01-26,1898,169,7,16.42,0.2,91.952,4,East,Credit Card,...,6,68,Male,Electronics,1.8,Not Returned,Neutral,5.5,,1.6
37,2023-01-24,1241,137,2,70.01,0.26,103.6148,8,East,Online,...,5,60,Female,Electronics,1.7,Not Returned,Neutral,15.0,,2.4
38,2023-01-07,1957,180,6,81.94,0.09,447.3924,5,East,Online,...,3,35,Female,Clothing,1.1,Not Returned,Negative,17.13,SAVE10,
39,2023-01-29,1589,194,5,47.2,0.08,217.12,1,East,Credit Card,...,2,59,Male,Clothing,2.5,Not Returned,Negative,17.41,SAVE10,1.1


<div class="alert alert-warning">

**4. Handling Duplicates in Pandas**

**Write a Pandas program to remove duplicates rows from a DataFrame.**

</div>

In [8]:
# Create date range
date_range = pd.date_range(start='2023-01-01', periods=35, freq='D')

# Create a dictionary of data
data = {
    'date': date_range,
    'customer_id': np.random.randint(1000, 2000, size=35),
    'product_id': np.random.randint(100, 200, size=35),
    'quantity': np.random.randint(1, 10, size=35),
    'price': np.round(np.random.uniform(10.0, 100.0, size=35), 2),
    'discount': np.round(np.random.uniform(0, 0.3, size=35), 2),
    'revenue': lambda df: df['quantity'] * df['price'] * (1 - df['discount']),
    'store_id': np.random.randint(1, 10, size=35),
    'region': np.random.choice(['North', 'South', 'East', 'West'], size=35),
    'payment_method': np.random.choice(['Credit Card', 'Cash', 'Online'], size=35),
    'is_member': np.random.choice([True, False], size=35),
    'delivery_days': np.random.randint(1, 7, size=35),
    'customer_age': np.random.randint(18, 70, size=35),
    'gender': np.random.choice(['Male', 'Female', 'Other'], size=35),
    'product_category': np.random.choice(['Electronics', 'Clothing', 'Books', 'Food'], size=35),
    'rating': np.round(np.random.uniform(1.0, 5.0, size=35), 1),
    'return_status': np.random.choice(['Returned', 'Not Returned'], size=35),
    'feedback': np.random.choice(['Positive', 'Neutral', 'Negative'], size=35),
}

# Create DataFrame
df = pd.DataFrame(data)

# Compute revenue using the lambda function above
df['revenue'] = df['quantity'] * df['price'] * (1 - df['discount'])

# Add 'shipping_cost' with some missing values
df['shipping_cost'] = np.where(np.random.rand(len(df)) < 0.2, np.nan, np.round(np.random.uniform(5.0, 20.0, size=len(df)), 2))

# Add 'coupon_code' with some missing values
coupon_options = ['SAVE10', 'WELCOME5', 'FREESHIP', np.nan]
df['coupon_code'] = np.random.choice(coupon_options, size=len(df), p=[0.2, 0.2, 0.2, 0.4])

# Add 'delivery_rating' with some missing values
df['delivery_rating'] = np.where(np.random.rand(len(df)) < 0.3, np.nan, np.round(np.random.uniform(1.0, 5.0, size=len(df)), 1))


duplicates = df.sample(n=5, replace=False)  # Select 5 random rows without replacement
df = pd.concat([df, duplicates], ignore_index=True)

# Display the updated DataFrame
display(df)

Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,...,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating
0,2023-01-01,1286,142,5,62.57,0.07,290.9505,7,North,Online,...,1,53,Other,Electronics,3.7,Not Returned,Negative,10.89,SAVE10,4.8
1,2023-01-02,1541,195,7,61.41,0.23,330.9999,8,West,Online,...,5,51,Male,Food,3.5,Returned,Negative,9.43,WELCOME5,1.0
2,2023-01-03,1079,144,8,30.96,0.2,198.144,9,West,Cash,...,4,47,Male,Electronics,3.8,Not Returned,Negative,16.95,FREESHIP,1.3
3,2023-01-04,1316,131,3,27.96,0.17,69.6204,4,North,Online,...,6,67,Other,Electronics,4.0,Returned,Positive,8.03,,3.0
4,2023-01-05,1983,170,9,33.69,0.19,245.6001,5,West,Credit Card,...,3,19,Female,Clothing,3.4,Not Returned,Positive,6.9,WELCOME5,2.8
5,2023-01-06,1707,114,3,89.3,0.01,265.221,8,East,Credit Card,...,3,30,Male,Books,1.6,Returned,Negative,11.05,,3.0
6,2023-01-07,1024,101,6,55.31,0.24,252.2136,3,West,Cash,...,2,60,Other,Electronics,3.5,Returned,Neutral,,WELCOME5,1.5
7,2023-01-08,1631,124,1,29.46,0.18,24.1572,9,West,Credit Card,...,5,19,Other,Electronics,4.5,Returned,Positive,,WELCOME5,
8,2023-01-09,1345,158,8,89.7,0.18,588.432,5,South,Online,...,4,46,Male,Books,3.3,Returned,Negative,7.99,WELCOME5,4.1
9,2023-01-10,1441,182,2,18.45,0.14,31.734,9,South,Cash,...,2,64,Other,Clothing,1.9,Returned,Positive,13.89,WELCOME5,1.7


<div class="alert alert-success">

**Solution 04:**
</div>

In [9]:
df.drop_duplicates()

Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,...,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating
0,2023-01-01,1286,142,5,62.57,0.07,290.9505,7,North,Online,...,1,53,Other,Electronics,3.7,Not Returned,Negative,10.89,SAVE10,4.8
1,2023-01-02,1541,195,7,61.41,0.23,330.9999,8,West,Online,...,5,51,Male,Food,3.5,Returned,Negative,9.43,WELCOME5,1.0
2,2023-01-03,1079,144,8,30.96,0.2,198.144,9,West,Cash,...,4,47,Male,Electronics,3.8,Not Returned,Negative,16.95,FREESHIP,1.3
3,2023-01-04,1316,131,3,27.96,0.17,69.6204,4,North,Online,...,6,67,Other,Electronics,4.0,Returned,Positive,8.03,,3.0
4,2023-01-05,1983,170,9,33.69,0.19,245.6001,5,West,Credit Card,...,3,19,Female,Clothing,3.4,Not Returned,Positive,6.9,WELCOME5,2.8
5,2023-01-06,1707,114,3,89.3,0.01,265.221,8,East,Credit Card,...,3,30,Male,Books,1.6,Returned,Negative,11.05,,3.0
6,2023-01-07,1024,101,6,55.31,0.24,252.2136,3,West,Cash,...,2,60,Other,Electronics,3.5,Returned,Neutral,,WELCOME5,1.5
7,2023-01-08,1631,124,1,29.46,0.18,24.1572,9,West,Credit Card,...,5,19,Other,Electronics,4.5,Returned,Positive,,WELCOME5,
8,2023-01-09,1345,158,8,89.7,0.18,588.432,5,South,Online,...,4,46,Male,Books,3.3,Returned,Negative,7.99,WELCOME5,4.1
9,2023-01-10,1441,182,2,18.45,0.14,31.734,9,South,Cash,...,2,64,Other,Clothing,1.9,Returned,Positive,13.89,WELCOME5,1.7


In [10]:
# 4. Handling Duplicates in Pandas
# Write a Pandas program to remove duplicates rows from a DataFrame.

df = df.drop_duplicates()
df.reset_index(drop=True, inplace=True)

<div class="alert alert-warning">

**5. Handling Outliers with Z-Score Method**

**Write a Pandas program to handle outliers in a DataFrame with Z-Score method.**

</div>

In [11]:
# Create date range
date_range = pd.date_range(start='2023-01-01', periods=35, freq='D')

# Create a dictionary of data
data = {
    'date': date_range,
    'customer_id': np.random.randint(1000, 2000, size=35),
    'product_id': np.random.randint(100, 200, size=35),
    'quantity': np.random.randint(1, 10, size=35),
    'price': np.round(np.random.uniform(10.0, 10000.0, size=35), 2),
    'discount': np.round(np.random.uniform(0, 0.3, size=35), 2),
    'revenue': lambda df: df['quantity'] * df['price'] * (1 - df['discount']),
    'store_id': np.random.randint(1, 10, size=35),
    'region': np.random.choice(['North', 'South', 'East', 'West'], size=35),
    'payment_method': np.random.choice(['Credit Card', 'Cash', 'Online'], size=35),
    'is_member': np.random.choice([True, False], size=35),
    'delivery_days': np.random.randint(1, 7, size=35),
    'customer_age': np.random.randint(18, 70, size=35),
    'gender': np.random.choice(['Male', 'Female', 'Other'], size=35),
    'product_category': np.random.choice(['Electronics', 'Clothing', 'Books', 'Food'], size=35),
    'rating': np.round(np.random.uniform(1.0, 5.0, size=35), 1),
    'return_status': np.random.choice(['Returned', 'Not Returned'], size=35),
    'feedback': np.random.choice(['Positive', 'Neutral', 'Negative'], size=35),
}

# Create DataFrame
df = pd.DataFrame(data)

# Compute revenue using the lambda function above
df['revenue'] = df['quantity'] * df['price'] * (1 - df['discount'])

# Add 'shipping_cost' with some missing values
df['shipping_cost'] = np.where(np.random.rand(len(df)) < 0.2, np.nan, np.round(np.random.uniform(5.0, 20.0, size=len(df)), 2))

# Add 'coupon_code' with some missing values
coupon_options = ['SAVE10', 'WELCOME5', 'FREESHIP', np.nan]
df['coupon_code'] = np.random.choice(coupon_options, size=len(df), p=[0.2, 0.2, 0.2, 0.4])

# Add 'delivery_rating' with some missing values
df['delivery_rating'] = np.where(np.random.rand(len(df)) < 0.3, np.nan, np.round(np.random.uniform(1.0, 5.0, size=len(df)), 1))


duplicates = df.sample(n=5, replace=False)  # Select 5 random rows without replacement
df = pd.concat([df, duplicates], ignore_index=True)


# Display the updated DataFrame
display(df)

Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,...,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating
0,2023-01-01,1640,149,1,7058.37,0.16,5929.0308,3,North,Credit Card,...,3,22,Female,Food,1.1,Not Returned,Neutral,5.64,WELCOME5,
1,2023-01-02,1858,140,3,2664.07,0.24,6074.0796,3,West,Credit Card,...,5,22,Female,Clothing,3.6,Returned,Negative,19.1,,1.6
2,2023-01-03,1209,140,3,1430.43,0.16,3604.6836,8,North,Online,...,6,27,Female,Books,2.6,Not Returned,Neutral,11.54,,
3,2023-01-04,1261,118,8,8954.64,0.27,52295.0976,3,East,Online,...,6,52,Male,Clothing,3.7,Returned,Positive,,,1.1
4,2023-01-05,1051,108,7,8184.21,0.14,49268.9442,9,West,Cash,...,4,55,Female,Clothing,3.5,Not Returned,Neutral,18.9,WELCOME5,1.2
5,2023-01-06,1660,150,4,583.96,0.05,2219.048,8,West,Cash,...,2,67,Female,Food,3.3,Not Returned,Neutral,,FREESHIP,
6,2023-01-07,1647,108,1,8692.24,0.13,7562.2488,2,East,Credit Card,...,1,35,Male,Electronics,4.1,Not Returned,Negative,12.59,SAVE10,
7,2023-01-08,1105,128,9,4880.03,0.06,41285.0538,3,West,Credit Card,...,6,51,Other,Electronics,3.0,Returned,Positive,18.59,FREESHIP,2.0
8,2023-01-09,1409,177,7,5853.8,0.18,33600.812,8,West,Online,...,4,52,Other,Clothing,4.9,Returned,Negative,,FREESHIP,
9,2023-01-10,1522,116,6,7241.4,0.05,41275.98,5,West,Cash,...,3,32,Other,Clothing,3.5,Not Returned,Negative,6.03,,3.7


<div class="alert alert-success">

**Solution 05:**
</div>

In [12]:
# 5. Handling Outliers with Z-Score Method
# Write a Pandas program to handle outliers in a DataFrame with Z-Score method.

def handle_outliers_z_score(df, column):
    from scipy import stats
    z_scores = np.abs(stats.zscore(df[column]))
    threshold = 1.5
    return df[(z_scores < threshold)]

result_df = handle_outliers_z_score(df, 'price')

In [13]:
result_df

Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,...,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating
0,2023-01-01,1640,149,1,7058.37,0.16,5929.0308,3,North,Credit Card,...,3,22,Female,Food,1.1,Not Returned,Neutral,5.64,WELCOME5,
1,2023-01-02,1858,140,3,2664.07,0.24,6074.0796,3,West,Credit Card,...,5,22,Female,Clothing,3.6,Returned,Negative,19.1,,1.6
2,2023-01-03,1209,140,3,1430.43,0.16,3604.6836,8,North,Online,...,6,27,Female,Books,2.6,Not Returned,Neutral,11.54,,
3,2023-01-04,1261,118,8,8954.64,0.27,52295.0976,3,East,Online,...,6,52,Male,Clothing,3.7,Returned,Positive,,,1.1
4,2023-01-05,1051,108,7,8184.21,0.14,49268.9442,9,West,Cash,...,4,55,Female,Clothing,3.5,Not Returned,Neutral,18.9,WELCOME5,1.2
6,2023-01-07,1647,108,1,8692.24,0.13,7562.2488,2,East,Credit Card,...,1,35,Male,Electronics,4.1,Not Returned,Negative,12.59,SAVE10,
7,2023-01-08,1105,128,9,4880.03,0.06,41285.0538,3,West,Credit Card,...,6,51,Other,Electronics,3.0,Returned,Positive,18.59,FREESHIP,2.0
8,2023-01-09,1409,177,7,5853.8,0.18,33600.812,8,West,Online,...,4,52,Other,Clothing,4.9,Returned,Negative,,FREESHIP,
9,2023-01-10,1522,116,6,7241.4,0.05,41275.98,5,West,Cash,...,3,32,Other,Clothing,3.5,Not Returned,Negative,6.03,,3.7
12,2023-01-13,1945,137,2,8734.8,0.29,12403.416,5,East,Cash,...,1,63,Male,Clothing,4.0,Returned,Negative,17.45,SAVE10,2.8


<div class="alert alert-warning">

**6. Normalizing Data with Min-Max Scaling**

**Write a Pandas program that normalizes data with Min-Max scaling.**

</div>

In [14]:
# Create date range
date_range = pd.date_range(start='2023-01-01', periods=35, freq='D')

# Create a dictionary of data
data = {
    'date': date_range,
    'customer_id': np.random.randint(1000, 2000, size=35),
    'product_id': np.random.randint(100, 200, size=35),
    'quantity': np.random.randint(1, 10, size=35),
    'price': np.round(np.random.uniform(10.0, 10000.0, size=35), 2),
    'discount': np.round(np.random.uniform(0, 0.3, size=35), 2),
    'revenue': lambda df: df['quantity'] * df['price'] * (1 - df['discount']),
    'store_id': np.random.randint(1, 10, size=35),
    'region': np.random.choice(['North', 'South', 'East', 'West'], size=35),
    'payment_method': np.random.choice(['Credit Card', 'Cash', 'Online'], size=35),
    'is_member': np.random.choice([True, False], size=35),
    'delivery_days': np.random.randint(1, 7, size=35),
    'customer_age': np.random.randint(18, 70, size=35),
    'gender': np.random.choice(['Male', 'Female', 'Other'], size=35),
    'product_category': np.random.choice(['Electronics', 'Clothing', 'Books', 'Food'], size=35),
    'rating': np.round(np.random.uniform(1.0, 5.0, size=35), 1),
    'return_status': np.random.choice(['Returned', 'Not Returned'], size=35),
    'feedback': np.random.choice(['Positive', 'Neutral', 'Negative'], size=35),
}

# Create DataFrame
df = pd.DataFrame(data)

# Compute revenue using the lambda function above
df['revenue'] = df['quantity'] * df['price'] * (1 - df['discount'])

# Add 'shipping_cost' with some missing values
df['shipping_cost'] = np.where(np.random.rand(len(df)) < 0.2, np.nan, np.round(np.random.uniform(5.0, 20.0, size=len(df)), 2))

# Add 'coupon_code' with some missing values
coupon_options = ['SAVE10', 'WELCOME5', 'FREESHIP', np.nan]
df['coupon_code'] = np.random.choice(coupon_options, size=len(df), p=[0.2, 0.2, 0.2, 0.4])

# Add 'delivery_rating' with some missing values
df['delivery_rating'] = np.where(np.random.rand(len(df)) < 0.3, np.nan, np.round(np.random.uniform(1.0, 5.0, size=len(df)), 1))


duplicates = df.sample(n=5, replace=False)  # Select 5 random rows without replacement
df = pd.concat([df, duplicates], ignore_index=True)


# Display the updated DataFrame
display(df)

Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,...,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating
0,2023-01-01,1474,179,5,5204.03,0.01,25759.9485,1,North,Credit Card,...,3,53,Female,Clothing,3.0,Not Returned,Neutral,16.71,SAVE10,4.2
1,2023-01-02,1305,116,1,2245.11,0.04,2155.3056,1,South,Cash,...,2,31,Other,Food,3.3,Not Returned,Neutral,15.89,,1.4
2,2023-01-03,1506,153,9,6179.81,0.22,43382.2662,9,West,Credit Card,...,3,49,Male,Electronics,1.7,Returned,Neutral,15.28,,
3,2023-01-04,1821,197,9,9705.13,0.23,67256.5509,7,West,Credit Card,...,3,47,Male,Books,2.5,Not Returned,Neutral,,,4.0
4,2023-01-05,1680,154,2,2410.57,0.17,4001.5462,1,South,Online,...,2,61,Female,Food,3.7,Returned,Negative,,SAVE10,1.2
5,2023-01-06,1237,180,4,8895.13,0.29,25262.1692,3,South,Online,...,2,27,Female,Electronics,1.4,Returned,Negative,6.85,,1.4
6,2023-01-07,1930,185,8,7842.37,0.08,57719.8432,8,South,Online,...,5,54,Other,Electronics,4.0,Not Returned,Positive,9.96,WELCOME5,
7,2023-01-08,1629,101,1,462.59,0.15,393.2015,6,East,Credit Card,...,4,49,Female,Electronics,2.3,Returned,Neutral,,WELCOME5,
8,2023-01-09,1345,169,2,6855.48,0.09,12476.9736,9,West,Credit Card,...,4,32,Other,Clothing,4.1,Not Returned,Negative,14.99,SAVE10,1.8
9,2023-01-10,1125,138,2,8550.45,0.22,13338.702,6,East,Cash,...,3,55,Male,Clothing,1.6,Returned,Neutral,11.46,FREESHIP,3.9


<div class="alert alert-success">

**Solution 06:**
</div>

In [15]:
# 6. Normalizing Data with Min-Max Scaling
# Write a Pandas program that normalizes data with Min-Max scaling.

def normalize_min_max(df, column):
    min_val = df[column].min()
    max_val = df[column].max()
    df[column] = (df[column] - min_val) / (max_val - min_val)
    return df

result_df = normalize_min_max(df.copy(), 'price')

result_df


Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,...,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating
0,2023-01-01,1474,179,5,0.527957,0.01,25759.9485,1,North,Credit Card,...,3,53,Female,Clothing,3.0,Not Returned,Neutral,16.71,SAVE10,4.2
1,2023-01-02,1305,116,1,0.217648,0.04,2155.3056,1,South,Cash,...,2,31,Other,Food,3.3,Not Returned,Neutral,15.89,,1.4
2,2023-01-03,1506,153,9,0.63029,0.22,43382.2662,9,West,Credit Card,...,3,49,Male,Electronics,1.7,Returned,Neutral,15.28,,
3,2023-01-04,1821,197,9,1.0,0.23,67256.5509,7,West,Credit Card,...,3,47,Male,Books,2.5,Not Returned,Neutral,,,4.0
4,2023-01-05,1680,154,2,0.235,0.17,4001.5462,1,South,Online,...,2,61,Female,Food,3.7,Returned,Negative,,SAVE10,1.2
5,2023-01-06,1237,180,4,0.915053,0.29,25262.1692,3,South,Online,...,2,27,Female,Electronics,1.4,Returned,Negative,6.85,,1.4
6,2023-01-07,1930,185,8,0.804647,0.08,57719.8432,8,South,Online,...,5,54,Other,Electronics,4.0,Not Returned,Positive,9.96,WELCOME5,
7,2023-01-08,1629,101,1,0.03071,0.15,393.2015,6,East,Credit Card,...,4,49,Female,Electronics,2.3,Returned,Neutral,,WELCOME5,
8,2023-01-09,1345,169,2,0.70115,0.09,12476.9736,9,West,Credit Card,...,4,32,Other,Clothing,4.1,Not Returned,Negative,14.99,SAVE10,1.8
9,2023-01-10,1125,138,2,0.878906,0.22,13338.702,6,East,Cash,...,3,55,Male,Clothing,1.6,Returned,Neutral,11.46,FREESHIP,3.9


<div class="alert alert-warning">

**7. Binning Data into Categories**

**Write a Pandas program to bin data into categories.**

</div>

In [16]:
# Create date range
date_range = pd.date_range(start='2023-01-01', periods=35, freq='D')

# Create a dictionary of data
data = {
    'date': date_range,
    'customer_id': np.random.randint(1000, 2000, size=35),
    'product_id': np.random.randint(100, 200, size=35),
    'quantity': np.random.randint(1, 10, size=35),
    'price': np.round(np.random.uniform(10.0, 10000.0, size=35), 2),
    'discount': np.round(np.random.uniform(0, 0.3, size=35), 2),
    'revenue': lambda df: df['quantity'] * df['price'] * (1 - df['discount']),
    'store_id': np.random.randint(1, 10, size=35),
    'region': np.random.choice(['North', 'South', 'East', 'West'], size=35),
    'payment_method': np.random.choice(['Credit Card', 'Cash', 'Online'], size=35),
    'is_member': np.random.choice([True, False], size=35),
    'delivery_days': np.random.randint(1, 7, size=35),
    'customer_age': np.random.randint(18, 70, size=35),
    'gender': np.random.choice(['Male', 'Female', 'Other'], size=35),
    'product_category': np.random.choice(['Electronics', 'Clothing', 'Books', 'Food'], size=35),
    'rating': np.round(np.random.uniform(1.0, 5.0, size=35), 1),
    'return_status': np.random.choice(['Returned', 'Not Returned'], size=35),
    'feedback': np.random.choice(['Positive', 'Neutral', 'Negative'], size=35),
}

# Create DataFrame
df = pd.DataFrame(data)

# Compute revenue using the lambda function above
df['revenue'] = df['quantity'] * df['price'] * (1 - df['discount'])

# Add 'shipping_cost' with some missing values
df['shipping_cost'] = np.where(np.random.rand(len(df)) < 0.2, np.nan, np.round(np.random.uniform(5.0, 20.0, size=len(df)), 2))

# Add 'coupon_code' with some missing values
coupon_options = ['SAVE10', 'WELCOME5', 'FREESHIP', np.nan]
df['coupon_code'] = np.random.choice(coupon_options, size=len(df), p=[0.2, 0.2, 0.2, 0.4])

# Add 'delivery_rating' with some missing values
df['delivery_rating'] = np.where(np.random.rand(len(df)) < 0.3, np.nan, np.round(np.random.uniform(1.0, 5.0, size=len(df)), 1))


duplicates = df.sample(n=5, replace=False)  # Select 5 random rows without replacement
df = pd.concat([df, duplicates], ignore_index=True)


# Display the updated DataFrame
display(df)

Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,...,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating
0,2023-01-01,1658,105,7,2630.27,0.2,14729.512,9,North,Online,...,2,49,Male,Food,4.7,Returned,Neutral,19.65,FREESHIP,
1,2023-01-02,1722,124,2,5405.7,0.24,8216.664,5,North,Online,...,3,44,Female,Books,2.4,Not Returned,Neutral,19.51,FREESHIP,4.5
2,2023-01-03,1219,123,1,2833.34,0.17,2351.6722,1,South,Online,...,2,23,Other,Electronics,3.4,Returned,Positive,7.03,SAVE10,2.4
3,2023-01-04,1901,127,1,9365.62,0.14,8054.4332,5,North,Cash,...,6,38,Other,Electronics,2.3,Not Returned,Negative,12.39,FREESHIP,
4,2023-01-05,1403,124,3,5145.38,0.21,12194.5506,9,South,Cash,...,4,24,Female,Electronics,3.2,Returned,Negative,10.56,FREESHIP,2.8
5,2023-01-06,1030,132,4,6168.21,0.23,18998.0868,2,North,Credit Card,...,6,38,Other,Clothing,1.9,Not Returned,Positive,,WELCOME5,1.6
6,2023-01-07,1819,136,3,5977.36,0.12,15780.2304,6,North,Credit Card,...,6,19,Male,Electronics,3.5,Not Returned,Negative,10.99,SAVE10,4.2
7,2023-01-08,1741,124,5,1045.94,0.09,4759.027,9,West,Online,...,4,34,Female,Clothing,4.3,Not Returned,Positive,6.03,FREESHIP,3.7
8,2023-01-09,1715,153,5,8686.03,0.15,36915.6275,2,North,Cash,...,4,68,Other,Clothing,2.6,Not Returned,Negative,14.81,FREESHIP,
9,2023-01-10,1694,151,1,8352.75,0.23,6431.6175,8,East,Cash,...,1,66,Female,Books,1.1,Returned,Negative,,FREESHIP,


<div class="alert alert-success">

**Solution 07:**
</div>

In [17]:
# binning price
bins = [0, 100, 500, 1000, 5000, 10000]
labels = ['0-100', '100-500', '500-1000', '1000-5000', '5000-10000']
df['price_binned'] = pd.cut(
    df['price']
    , bins=bins
    , labels=labels
    , right=False
)

df['price_binned_q'] = pd.qcut(
    df['price']
    , q=[0, 0.2, 0.4, 0.6, 0.8, 1.0], labels=['Very Low', 'Low', 'Medium', 'High', 'Very High']
    , duplicates='drop'
)
df['price_binned_q_no_labels'] = pd.qcut(
    df['price']
    , q=[0, 0.2, 0.4, 0.6, 0.8, 1.0]
    , labels=False
)

df

Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,...,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating,price_binned,price_binned_q,price_binned_q_no_labels
0,2023-01-01,1658,105,7,2630.27,0.2,14729.512,9,North,Online,...,Food,4.7,Returned,Neutral,19.65,FREESHIP,,1000-5000,Low,1
1,2023-01-02,1722,124,2,5405.7,0.24,8216.664,5,North,Online,...,Books,2.4,Not Returned,Neutral,19.51,FREESHIP,4.5,5000-10000,Medium,2
2,2023-01-03,1219,123,1,2833.34,0.17,2351.6722,1,South,Online,...,Electronics,3.4,Returned,Positive,7.03,SAVE10,2.4,1000-5000,Low,1
3,2023-01-04,1901,127,1,9365.62,0.14,8054.4332,5,North,Cash,...,Electronics,2.3,Not Returned,Negative,12.39,FREESHIP,,5000-10000,Very High,4
4,2023-01-05,1403,124,3,5145.38,0.21,12194.5506,9,South,Cash,...,Electronics,3.2,Returned,Negative,10.56,FREESHIP,2.8,5000-10000,Medium,2
5,2023-01-06,1030,132,4,6168.21,0.23,18998.0868,2,North,Credit Card,...,Clothing,1.9,Not Returned,Positive,,WELCOME5,1.6,5000-10000,High,3
6,2023-01-07,1819,136,3,5977.36,0.12,15780.2304,6,North,Credit Card,...,Electronics,3.5,Not Returned,Negative,10.99,SAVE10,4.2,5000-10000,Medium,2
7,2023-01-08,1741,124,5,1045.94,0.09,4759.027,9,West,Online,...,Clothing,4.3,Not Returned,Positive,6.03,FREESHIP,3.7,1000-5000,Very Low,0
8,2023-01-09,1715,153,5,8686.03,0.15,36915.6275,2,North,Cash,...,Clothing,2.6,Not Returned,Negative,14.81,FREESHIP,,5000-10000,Very High,4
9,2023-01-10,1694,151,1,8352.75,0.23,6431.6175,8,East,Cash,...,Books,1.1,Returned,Negative,,FREESHIP,,5000-10000,High,3


<div class="alert alert-warning">

**8. Converting Data Types and Column Operations**

**Write a Pandas program that handles text data with str.replace().**

</div>

In [18]:
# Create date range
date_range = pd.date_range(start='2023-01-01', periods=35, freq='D')

# Create a dictionary of data
data = {
    'date': date_range,
    'customer_id': np.random.randint(1000, 2000, size=35),
    'product_id': np.random.randint(100, 200, size=35),
    'quantity': np.random.randint(1, 10, size=35),
    'price': np.round(np.random.uniform(10.0, 10000.0, size=35), 2),
    'discount': np.round(np.random.uniform(0, 0.3, size=35), 2),
    'revenue': lambda df: df['quantity'] * df['price'] * (1 - df['discount']),
    'store_id': np.random.randint(1, 10, size=35),
    'region': np.random.choice(['North', 'South', 'East', 'West'], size=35),
    'payment_method': np.random.choice(['Credit Card', 'Cash', 'Online'], size=35),
    'is_member': np.random.choice([True, False], size=35),
    'delivery_days': np.random.randint(1, 7, size=35),
    'customer_age': np.random.randint(18, 70, size=35),
    'gender': np.random.choice(['Male', 'Female', 'Other'], size=35),
    'product_category': np.random.choice(['Electronics', 'Clothing', 'Books', 'Food'], size=35),
    'rating': np.round(np.random.uniform(1.0, 5.0, size=35), 1),
    'return_status': np.random.choice(['Returned', 'Not Returned'], size=35),
    'feedback': np.random.choice(['Positive', 'Neutral', 'Negative'], size=35),
}

# Create DataFrame
df = pd.DataFrame(data)

# Compute revenue using the lambda function above
df['revenue'] = df['quantity'] * df['price'] * (1 - df['discount'])

# Add 'shipping_cost' with some missing values
df['shipping_cost'] = np.where(np.random.rand(len(df)) < 0.2, np.nan, np.round(np.random.uniform(5.0, 20.0, size=len(df)), 2))

# Add 'coupon_code' with some missing values
coupon_options = ['SAVE10', 'WELCOME5', 'FREESHIP', np.nan]
df['coupon_code'] = np.random.choice(coupon_options, size=len(df), p=[0.2, 0.2, 0.2, 0.4])

# Add 'delivery_rating' with some missing values
df['delivery_rating'] = np.where(np.random.rand(len(df)) < 0.3, np.nan, np.round(np.random.uniform(1.0, 5.0, size=len(df)), 1))


duplicates = df.sample(n=5, replace=False)  # Select 5 random rows without replacement
df = pd.concat([df, duplicates], ignore_index=True)


# Display the updated DataFrame
display(df)

Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,...,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating
0,2023-01-01,1896,165,6,9887.08,0.26,43898.6352,9,West,Cash,...,5,47,Male,Books,1.1,Not Returned,Neutral,11.83,SAVE10,2.6
1,2023-01-02,1988,124,2,7871.14,0.21,12436.4012,3,North,Credit Card,...,3,29,Female,Books,4.1,Not Returned,Negative,7.63,SAVE10,1.6
2,2023-01-03,1291,127,3,7912.38,0.16,19939.1976,9,West,Credit Card,...,4,36,Male,Clothing,4.8,Not Returned,Positive,6.72,,1.1
3,2023-01-04,1634,125,9,2635.5,0.03,23007.915,7,East,Online,...,4,47,Other,Food,2.8,Returned,Neutral,13.64,,3.5
4,2023-01-05,1654,129,5,7319.83,0.1,32939.235,8,West,Cash,...,2,41,Male,Electronics,4.2,Returned,Neutral,12.98,FREESHIP,
5,2023-01-06,1910,170,7,7481.84,0.07,48706.7784,2,West,Credit Card,...,6,36,Female,Food,3.5,Not Returned,Neutral,7.27,WELCOME5,
6,2023-01-07,1660,174,8,6453.05,0.27,37685.812,3,North,Cash,...,6,68,Other,Food,3.2,Returned,Negative,12.89,SAVE10,3.0
7,2023-01-08,1725,135,7,4564.59,0.03,30993.5661,1,East,Credit Card,...,6,38,Other,Food,3.2,Returned,Positive,19.29,FREESHIP,3.3
8,2023-01-09,1213,108,8,2010.43,0.08,14796.7648,7,South,Online,...,6,47,Female,Clothing,1.5,Returned,Negative,8.56,FREESHIP,3.4
9,2023-01-10,1325,176,9,8166.1,0.22,57326.022,4,North,Credit Card,...,2,51,Female,Food,5.0,Returned,Positive,8.24,SAVE10,3.8


<div class="alert alert-success">

**Solution 08:**
</div>

In [19]:
# 8. Converting Data Types and Column Operations
# Write a Pandas program that handles text data with str.replace().

df['region_arrows'] = df['region'].str.replace('North', '↑')
df['region_arrows'] = df['region_arrows'].str.replace('South', '↓')
df['region_arrows'] = df['region_arrows'].str.replace('East', '→')
df['region_arrows'] = df['region_arrows'].str.replace('West', '←')

df

Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,...,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating,region_arrows
0,2023-01-01,1896,165,6,9887.08,0.26,43898.6352,9,West,Cash,...,47,Male,Books,1.1,Not Returned,Neutral,11.83,SAVE10,2.6,←
1,2023-01-02,1988,124,2,7871.14,0.21,12436.4012,3,North,Credit Card,...,29,Female,Books,4.1,Not Returned,Negative,7.63,SAVE10,1.6,↑
2,2023-01-03,1291,127,3,7912.38,0.16,19939.1976,9,West,Credit Card,...,36,Male,Clothing,4.8,Not Returned,Positive,6.72,,1.1,←
3,2023-01-04,1634,125,9,2635.5,0.03,23007.915,7,East,Online,...,47,Other,Food,2.8,Returned,Neutral,13.64,,3.5,→
4,2023-01-05,1654,129,5,7319.83,0.1,32939.235,8,West,Cash,...,41,Male,Electronics,4.2,Returned,Neutral,12.98,FREESHIP,,←
5,2023-01-06,1910,170,7,7481.84,0.07,48706.7784,2,West,Credit Card,...,36,Female,Food,3.5,Not Returned,Neutral,7.27,WELCOME5,,←
6,2023-01-07,1660,174,8,6453.05,0.27,37685.812,3,North,Cash,...,68,Other,Food,3.2,Returned,Negative,12.89,SAVE10,3.0,↑
7,2023-01-08,1725,135,7,4564.59,0.03,30993.5661,1,East,Credit Card,...,38,Other,Food,3.2,Returned,Positive,19.29,FREESHIP,3.3,→
8,2023-01-09,1213,108,8,2010.43,0.08,14796.7648,7,South,Online,...,47,Female,Clothing,1.5,Returned,Negative,8.56,FREESHIP,3.4,↓
9,2023-01-10,1325,176,9,8166.1,0.22,57326.022,4,North,Credit Card,...,51,Female,Food,5.0,Returned,Positive,8.24,SAVE10,3.8,↑


<div class="alert alert-warning">

**9. Replacing Missing Data with Mean Value**

**Replace Missing Data with Mean Value**

</div>

In [20]:
# Create date range
date_range = pd.date_range(start='2023-01-01', periods=35, freq='D')

# Create a dictionary of data
data = {
    'date': date_range,
    'customer_id': np.random.randint(1000, 2000, size=35),
    'product_id': np.random.randint(100, 200, size=35),
    'quantity': np.random.randint(1, 10, size=35),
    'price': np.round(np.random.uniform(10.0, 10000.0, size=35), 2),
    'discount': np.round(np.random.uniform(0, 0.3, size=35), 2),
    'revenue': lambda df: df['quantity'] * df['price'] * (1 - df['discount']),
    'store_id': np.random.randint(1, 10, size=35),
    'region': np.random.choice(['North', 'South', 'East', 'West'], size=35),
    'payment_method': np.random.choice(['Credit Card', 'Cash', 'Online'], size=35),
    'is_member': np.random.choice([True, False], size=35),
    'delivery_days': np.random.randint(1, 7, size=35),
    'customer_age': np.random.randint(18, 70, size=35),
    'gender': np.random.choice(['Male', 'Female', 'Other'], size=35),
    'product_category': np.random.choice(['Electronics', 'Clothing', 'Books', 'Food'], size=35),
    'rating': np.round(np.random.uniform(1.0, 5.0, size=35), 1),
    'return_status': np.random.choice(['Returned', 'Not Returned'], size=35),
    'feedback': np.random.choice(['Positive', 'Neutral', 'Negative'], size=35),
}

# Create DataFrame
df = pd.DataFrame(data)

# Compute revenue using the lambda function above
df['revenue'] = df['quantity'] * df['price'] * (1 - df['discount'])

# Add 'shipping_cost' with some missing values
df['shipping_cost'] = np.where(np.random.rand(len(df)) < 0.2, np.nan, np.round(np.random.uniform(5.0, 20.0, size=len(df)), 2))

# Add 'coupon_code' with some missing values
coupon_options = ['SAVE10', 'WELCOME5', 'FREESHIP', np.nan]
df['coupon_code'] = np.random.choice(coupon_options, size=len(df), p=[0.2, 0.2, 0.2, 0.4])

# Add 'delivery_rating' with some missing values
df['delivery_rating'] = np.where(np.random.rand(len(df)) < 0.3, np.nan, np.round(np.random.uniform(1.0, 5.0, size=len(df)), 1))


duplicates = df.sample(n=5, replace=False)  # Select 5 random rows without replacement
df = pd.concat([df, duplicates], ignore_index=True)


# Display the updated DataFrame
display(df)

Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,...,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating
0,2023-01-01,1453,167,1,1262.26,0.23,971.9402,6,East,Cash,...,5,27,Other,Food,2.0,Returned,Positive,5.59,,4.0
1,2023-01-02,1398,183,6,2030.45,0.24,9258.852,9,North,Online,...,6,67,Female,Books,1.4,Not Returned,Negative,18.67,,2.0
2,2023-01-03,1553,173,3,9796.42,0.05,27919.797,8,East,Online,...,6,36,Male,Clothing,4.1,Returned,Negative,10.53,WELCOME5,2.4
3,2023-01-04,1070,183,5,1719.81,0.03,8341.0785,6,South,Credit Card,...,4,68,Male,Electronics,2.0,Not Returned,Negative,,,4.0
4,2023-01-05,1475,107,7,8912.64,0.06,58645.1712,4,East,Online,...,3,58,Other,Clothing,2.5,Returned,Positive,10.47,FREESHIP,4.9
5,2023-01-06,1173,136,7,858.98,0.14,5171.0596,8,North,Cash,...,6,68,Male,Clothing,1.0,Not Returned,Neutral,18.73,FREESHIP,1.5
6,2023-01-07,1070,181,2,1163.08,0.11,2070.2824,4,North,Cash,...,6,55,Female,Food,3.2,Not Returned,Positive,8.51,,2.1
7,2023-01-08,1736,169,7,4850.2,0.23,26142.578,3,North,Online,...,4,42,Other,Food,2.9,Returned,Negative,10.17,SAVE10,1.7
8,2023-01-09,1359,120,7,136.2,0.12,838.992,5,East,Credit Card,...,4,23,Male,Clothing,2.7,Returned,Negative,11.82,FREESHIP,
9,2023-01-10,1249,101,3,5897.93,0.27,12916.4667,9,East,Credit Card,...,5,61,Female,Books,4.3,Not Returned,Neutral,8.14,,1.1


<div class="alert alert-success">

**Solution 09:**
</div>

In [21]:
# 9. Replacing Missing Data with Mean Value
# replace missing values in 'delivery_rating' column with mean
df['delivery_rating'] = df['delivery_rating'].fillna(df['delivery_rating'].mean())

df

Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,...,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating
0,2023-01-01,1453,167,1,1262.26,0.23,971.9402,6,East,Cash,...,5,27,Other,Food,2.0,Returned,Positive,5.59,,4.0
1,2023-01-02,1398,183,6,2030.45,0.24,9258.852,9,North,Online,...,6,67,Female,Books,1.4,Not Returned,Negative,18.67,,2.0
2,2023-01-03,1553,173,3,9796.42,0.05,27919.797,8,East,Online,...,6,36,Male,Clothing,4.1,Returned,Negative,10.53,WELCOME5,2.4
3,2023-01-04,1070,183,5,1719.81,0.03,8341.0785,6,South,Credit Card,...,4,68,Male,Electronics,2.0,Not Returned,Negative,,,4.0
4,2023-01-05,1475,107,7,8912.64,0.06,58645.1712,4,East,Online,...,3,58,Other,Clothing,2.5,Returned,Positive,10.47,FREESHIP,4.9
5,2023-01-06,1173,136,7,858.98,0.14,5171.0596,8,North,Cash,...,6,68,Male,Clothing,1.0,Not Returned,Neutral,18.73,FREESHIP,1.5
6,2023-01-07,1070,181,2,1163.08,0.11,2070.2824,4,North,Cash,...,6,55,Female,Food,3.2,Not Returned,Positive,8.51,,2.1
7,2023-01-08,1736,169,7,4850.2,0.23,26142.578,3,North,Online,...,4,42,Other,Food,2.9,Returned,Negative,10.17,SAVE10,1.7
8,2023-01-09,1359,120,7,136.2,0.12,838.992,5,East,Credit Card,...,4,23,Male,Clothing,2.7,Returned,Negative,11.82,FREESHIP,3.133333
9,2023-01-10,1249,101,3,5897.93,0.27,12916.4667,9,East,Credit Card,...,5,61,Female,Books,4.3,Not Returned,Neutral,8.14,,1.1


<div class="alert alert-warning">

**10. Removing Columns with Too Many Missing Values**

**Remove columns with 5 or more missing values**

</div>

In [22]:
# Create date range
date_range = pd.date_range(start='2023-01-01', periods=35, freq='D')

# Create a dictionary of data
data = {
    'date': date_range,
    'customer_id': np.random.randint(1000, 2000, size=35),
    'product_id': np.random.randint(100, 200, size=35),
    'quantity': np.random.randint(1, 10, size=35),
    'price': np.round(np.random.uniform(10.0, 10000.0, size=35), 2),
    'discount': np.round(np.random.uniform(0, 0.3, size=35), 2),
    'revenue': lambda df: df['quantity'] * df['price'] * (1 - df['discount']),
    'store_id': np.random.randint(1, 10, size=35),
    'region': np.random.choice(['North', 'South', 'East', 'West'], size=35),
    'payment_method': np.random.choice(['Credit Card', 'Cash', 'Online'], size=35),
    'is_member': np.random.choice([True, False], size=35),
    'delivery_days': np.random.randint(1, 7, size=35),
    'customer_age': np.random.randint(18, 70, size=35),
    'gender': np.random.choice(['Male', 'Female', 'Other'], size=35),
    'product_category': np.random.choice(['Electronics', 'Clothing', 'Books', 'Food'], size=35),
    'rating': np.round(np.random.uniform(1.0, 5.0, size=35), 1),
    'return_status': np.random.choice(['Returned', 'Not Returned'], size=35),
    'feedback': np.random.choice(['Positive', 'Neutral', 'Negative'], size=35),
}

# Create DataFrame
df = pd.DataFrame(data)

# Compute revenue using the lambda function above
df['revenue'] = df['quantity'] * df['price'] * (1 - df['discount'])

# Add 'shipping_cost' with some missing values
df['shipping_cost'] = np.where(np.random.rand(len(df)) < 0.2, np.nan, np.round(np.random.uniform(5.0, 20.0, size=len(df)), 2))

# Add 'coupon_code' with some missing values
coupon_options = ['SAVE10', 'WELCOME5', 'FREESHIP', np.nan]
df['coupon_code'] = np.random.choice(coupon_options, size=len(df), p=[0.2, 0.2, 0.2, 0.4])

# Add 'delivery_rating' with some missing values
df['delivery_rating'] = np.where(np.random.rand(len(df)) < 0.3, np.nan, np.round(np.random.uniform(1.0, 5.0, size=len(df)), 1))


duplicates = df.sample(n=5, replace=False)  # Select 5 random rows without replacement
df = pd.concat([df, duplicates], ignore_index=True)


# Display the updated DataFrame
display(df)

Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,...,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating
0,2023-01-01,1169,159,6,4536.99,0.15,23138.649,1,South,Cash,...,2,39,Male,Clothing,4.9,Not Returned,Neutral,,FREESHIP,
1,2023-01-02,1539,169,1,8671.6,0.01,8584.884,6,East,Online,...,6,50,Female,Electronics,3.8,Returned,Negative,,FREESHIP,3.0
2,2023-01-03,1869,158,3,1113.05,0.2,2671.32,7,South,Credit Card,...,4,62,Other,Electronics,2.5,Returned,Negative,8.29,FREESHIP,3.8
3,2023-01-04,1171,120,6,2071.51,0.25,9321.795,7,North,Credit Card,...,2,22,Male,Clothing,1.4,Not Returned,Neutral,,WELCOME5,3.0
4,2023-01-05,1639,166,9,195.76,0.13,1532.8008,8,West,Credit Card,...,1,64,Other,Food,1.8,Returned,Positive,18.45,,
5,2023-01-06,1111,138,1,7372.74,0.18,6045.6468,4,West,Credit Card,...,2,49,Male,Food,3.9,Returned,Positive,14.55,,2.7
6,2023-01-07,1452,195,1,9665.96,0.21,7636.1084,5,South,Cash,...,2,19,Female,Books,3.4,Not Returned,Neutral,,SAVE10,2.3
7,2023-01-08,1591,148,5,615.55,0.3,2154.425,4,East,Cash,...,4,63,Female,Books,1.9,Not Returned,Neutral,,FREESHIP,2.0
8,2023-01-09,1786,195,4,1253.92,0.1,4514.112,9,East,Online,...,1,30,Other,Electronics,3.8,Returned,Neutral,,,1.8
9,2023-01-10,1111,122,6,2852.43,0.25,12835.935,8,West,Credit Card,...,4,57,Female,Clothing,4.5,Returned,Positive,12.51,FREESHIP,1.3


<div class="alert alert-success">

**Solution 10:**
</div>

In [23]:
# 10. Removing Columns with Too Many Missing Values
# Remove columns with 5 or more missing values
df.dropna(thresh=len(df) - 5, axis=1, inplace=True)

df

Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,is_member,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,coupon_code
0,2023-01-01,1169,159,6,4536.99,0.15,23138.649,1,South,Cash,True,2,39,Male,Clothing,4.9,Not Returned,Neutral,FREESHIP
1,2023-01-02,1539,169,1,8671.6,0.01,8584.884,6,East,Online,True,6,50,Female,Electronics,3.8,Returned,Negative,FREESHIP
2,2023-01-03,1869,158,3,1113.05,0.2,2671.32,7,South,Credit Card,True,4,62,Other,Electronics,2.5,Returned,Negative,FREESHIP
3,2023-01-04,1171,120,6,2071.51,0.25,9321.795,7,North,Credit Card,False,2,22,Male,Clothing,1.4,Not Returned,Neutral,WELCOME5
4,2023-01-05,1639,166,9,195.76,0.13,1532.8008,8,West,Credit Card,True,1,64,Other,Food,1.8,Returned,Positive,
5,2023-01-06,1111,138,1,7372.74,0.18,6045.6468,4,West,Credit Card,True,2,49,Male,Food,3.9,Returned,Positive,
6,2023-01-07,1452,195,1,9665.96,0.21,7636.1084,5,South,Cash,False,2,19,Female,Books,3.4,Not Returned,Neutral,SAVE10
7,2023-01-08,1591,148,5,615.55,0.3,2154.425,4,East,Cash,False,4,63,Female,Books,1.9,Not Returned,Neutral,FREESHIP
8,2023-01-09,1786,195,4,1253.92,0.1,4514.112,9,East,Online,True,1,30,Other,Electronics,3.8,Returned,Neutral,
9,2023-01-10,1111,122,6,2852.43,0.25,12835.935,8,West,Credit Card,False,4,57,Female,Clothing,4.5,Returned,Positive,FREESHIP


<div class="alert alert-warning">

**11. Reordering and Splitting Columns**

**Reorder Columns and Split columns**

</div>

In [24]:
# Create date range
date_range = pd.date_range(start='2023-01-01', periods=35, freq='D')

# Create a dictionary of data
data = {
    'Date': date_range,
    'customer_id': np.random.randint(1000, 2000, size=35),
    'product_id': np.random.randint(100, 200, size=35),
    'quantity': np.random.randint(1, 10, size=35),
    'price': np.round(np.random.uniform(10.0, 10000.0, size=35), 2),
    'discount': np.round(np.random.uniform(0, 0.3, size=35), 2),
    'REVENUE': lambda df: df['quantity'] * df['price'] * (1 - df['discount']),
    'store_id': np.random.randint(1, 10, size=35),
    'region': np.random.choice(['North', 'South', 'East', 'West'], size=35),
    'payment_method': np.random.choice(['Credit Card', 'Cash', 'Online'], size=35),
    'is_member': np.random.choice([True, False], size=35),
    'delivery_days': np.random.randint(1, 7, size=35),
    'customer_age': np.random.randint(18, 70, size=35),
    'gender': np.random.choice(['Male', 'Female', 'Other'], size=35),
    'product_category': np.random.choice(['Electronics', 'Clothing', 'Books', 'Food'], size=35),
    'rating': np.round(np.random.uniform(1.0, 5.0, size=35), 1),
    'return_status': np.random.choice(['Returned', 'Not Returned'], size=35),
    'feedback': np.random.choice(['Positive', 'Neutral', 'Negative'], size=35),
}

# Create DataFrame
df = pd.DataFrame(data)

# Compute revenue using the lambda function above
df['REVENUE'] = df['quantity'] * df['price'] * (1 - df['discount'])

# Add 'shipping_cost' with some missing values
df['shipping_cost'] = np.where(np.random.rand(len(df)) < 0.2, np.nan, np.round(np.random.uniform(5.0, 20.0, size=len(df)), 2))

# Add 'coupon_code' with some missing values
coupon_options = ['SAVE10', 'WELCOME5', 'FREESHIP', np.nan]
df['coupon_code'] = np.random.choice(coupon_options, size=len(df), p=[0.2, 0.2, 0.2, 0.4])

# Add 'delivery_rating' with some missing values
df['delivery_rating'] = np.where(np.random.rand(len(df)) < 0.3, np.nan, np.round(np.random.uniform(1.0, 5.0, size=len(df)), 1))

df['to_split'] = df['region'] + ' - ' + df['gender']

duplicates = df.sample(n=5, replace=False)  # Select 5 random rows without replacement
df = pd.concat([df, duplicates], ignore_index=True)


# Display the updated DataFrame
display(df)

Unnamed: 0,Date,customer_id,product_id,quantity,price,discount,REVENUE,store_id,region,payment_method,...,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating,to_split
0,2023-01-01,1558,174,8,7297.73,0.07,54295.1112,3,North,Online,...,65,Other,Food,1.8,Returned,Negative,18.17,,,North - Other
1,2023-01-02,1204,181,5,463.79,0.0,2318.95,5,East,Credit Card,...,47,Male,Food,3.3,Not Returned,Negative,,FREESHIP,,East - Male
2,2023-01-03,1938,172,2,4639.88,0.28,6681.4272,7,West,Online,...,21,Male,Books,2.7,Returned,Neutral,11.05,FREESHIP,2.8,West - Male
3,2023-01-04,1650,167,1,2233.48,0.04,2144.1408,7,North,Cash,...,43,Male,Food,4.3,Returned,Negative,18.27,WELCOME5,4.6,North - Male
4,2023-01-05,1099,194,9,2504.23,0.17,18706.5981,4,West,Credit Card,...,69,Other,Books,1.0,Returned,Negative,13.54,FREESHIP,,West - Other
5,2023-01-06,1699,167,3,9633.46,0.18,23698.3116,7,East,Cash,...,55,Male,Clothing,3.8,Returned,Negative,6.76,,2.1,East - Male
6,2023-01-07,1231,159,7,3460.0,0.2,19376.0,5,South,Cash,...,37,Male,Food,2.0,Returned,Positive,13.11,,,South - Male
7,2023-01-08,1396,170,3,9761.44,0.22,22841.7696,2,East,Cash,...,51,Female,Clothing,1.1,Returned,Positive,10.47,FREESHIP,,East - Female
8,2023-01-09,1613,121,4,3493.57,0.17,11598.6524,7,East,Online,...,34,Male,Food,3.5,Not Returned,Negative,19.54,WELCOME5,,East - Male
9,2023-01-10,1216,113,2,934.21,0.17,1550.7886,1,North,Cash,...,46,Female,Books,1.4,Not Returned,Neutral,14.59,FREESHIP,2.0,North - Female


<div class="alert alert-success">

**Solution 11:**
</div>

In [25]:
# 11. Reordering and Splitting Columns
# Reorder Columns and Split columns
df.columns = df.columns.str.lower()
split_values = df['to_split'].str.split(' - ', expand=True)
df['region_split'] = split_values[0]
df['gender_split'] = split_values[1]
df

Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,...,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating,to_split,region_split,gender_split
0,2023-01-01,1558,174,8,7297.73,0.07,54295.1112,3,North,Online,...,Food,1.8,Returned,Negative,18.17,,,North - Other,North,Other
1,2023-01-02,1204,181,5,463.79,0.0,2318.95,5,East,Credit Card,...,Food,3.3,Not Returned,Negative,,FREESHIP,,East - Male,East,Male
2,2023-01-03,1938,172,2,4639.88,0.28,6681.4272,7,West,Online,...,Books,2.7,Returned,Neutral,11.05,FREESHIP,2.8,West - Male,West,Male
3,2023-01-04,1650,167,1,2233.48,0.04,2144.1408,7,North,Cash,...,Food,4.3,Returned,Negative,18.27,WELCOME5,4.6,North - Male,North,Male
4,2023-01-05,1099,194,9,2504.23,0.17,18706.5981,4,West,Credit Card,...,Books,1.0,Returned,Negative,13.54,FREESHIP,,West - Other,West,Other
5,2023-01-06,1699,167,3,9633.46,0.18,23698.3116,7,East,Cash,...,Clothing,3.8,Returned,Negative,6.76,,2.1,East - Male,East,Male
6,2023-01-07,1231,159,7,3460.0,0.2,19376.0,5,South,Cash,...,Food,2.0,Returned,Positive,13.11,,,South - Male,South,Male
7,2023-01-08,1396,170,3,9761.44,0.22,22841.7696,2,East,Cash,...,Clothing,1.1,Returned,Positive,10.47,FREESHIP,,East - Female,East,Female
8,2023-01-09,1613,121,4,3493.57,0.17,11598.6524,7,East,Online,...,Food,3.5,Not Returned,Negative,19.54,WELCOME5,,East - Male,East,Male
9,2023-01-10,1216,113,2,934.21,0.17,1550.7886,1,North,Cash,...,Books,1.4,Not Returned,Neutral,14.59,FREESHIP,2.0,North - Female,North,Female


<div class="alert alert-warning">

**12. Replacing Missing Data with Mean Value**

**Write a Pandas program to replacing missing data with mean value.**

</div>

In [26]:
# Create date range
date_range = pd.date_range(start='2023-01-01', periods=35, freq='D')

# Create a dictionary of data
data = {
    'date': date_range,
    'customer_id': np.random.randint(1000, 2000, size=35),
    'product_id': np.random.randint(100, 200, size=35),
    'quantity': np.random.randint(1, 10, size=35),
    'price': np.round(np.random.uniform(10.0, 10000.0, size=35), 2),
    'discount': np.round(np.random.uniform(0, 0.3, size=35), 2),
    'revenue': lambda df: df['quantity'] * df['price'] * (1 - df['discount']),
    'store_id': np.random.randint(1, 10, size=35),
    'region': np.random.choice(['North', 'South', 'East', 'West'], size=35),
    'payment_method': np.random.choice(['Credit Card', 'Cash', 'Online'], size=35),
    'is_member': np.random.choice([True, False], size=35),
    'delivery_days': np.random.randint(1, 7, size=35),
    'customer_age': np.random.randint(18, 70, size=35),
    'gender': np.random.choice(['Male', 'Female', 'Other'], size=35),
    'product_category': np.random.choice(['Electronics', 'Clothing', 'Books', 'Food'], size=35),
    'rating': np.round(np.random.uniform(1.0, 5.0, size=35), 1),
    'return_status': np.random.choice(['Returned', 'Not Returned'], size=35),
    'feedback': np.random.choice(['Positive', 'Neutral', 'Negative'], size=35),
}

# Create DataFrame
df = pd.DataFrame(data)

# Compute revenue using the lambda function above
df['revenue'] = df['quantity'] * df['price'] * (1 - df['discount'])

# Add 'shipping_cost' with some missing values
df['shipping_cost'] = np.where(np.random.rand(len(df)) < 0.2, np.nan, np.round(np.random.uniform(5.0, 20.0, size=len(df)), 2))

# Add 'coupon_code' with some missing values
coupon_options = ['SAVE10', 'WELCOME5', 'FREESHIP', np.nan]
df['coupon_code'] = np.random.choice(coupon_options, size=len(df), p=[0.2, 0.2, 0.2, 0.4])

# Add 'delivery_rating' with some missing values
df['delivery_rating'] = np.where(np.random.rand(len(df)) < 0.3, np.nan, np.round(np.random.uniform(1.0, 5.0, size=len(df)), 1))


duplicates = df.sample(n=5, replace=False)  # Select 5 random rows without replacement
df = pd.concat([df, duplicates], ignore_index=True)


# Display the updated DataFrame
display(df)

Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,...,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating
0,2023-01-01,1293,111,1,3219.35,0.26,2382.319,5,West,Credit Card,...,3,28,Other,Books,1.8,Returned,Positive,18.9,,1.7
1,2023-01-02,1578,123,4,590.55,0.05,2244.09,2,South,Credit Card,...,3,49,Male,Electronics,3.4,Returned,Negative,7.18,WELCOME5,2.4
2,2023-01-03,1203,182,6,3008.5,0.26,13357.74,9,South,Cash,...,4,32,Male,Electronics,2.1,Not Returned,Negative,19.61,,4.0
3,2023-01-04,1087,182,5,9816.13,0.06,46135.811,9,South,Credit Card,...,4,21,Other,Food,2.1,Returned,Positive,19.57,SAVE10,4.0
4,2023-01-05,1195,143,9,5274.6,0.14,40825.404,2,North,Online,...,4,55,Male,Clothing,4.5,Not Returned,Negative,9.97,,1.3
5,2023-01-06,1293,132,9,6763.77,0.23,46872.9261,5,West,Cash,...,3,49,Male,Electronics,1.4,Not Returned,Neutral,19.34,FREESHIP,
6,2023-01-07,1607,110,4,1162.11,0.08,4276.5648,8,East,Cash,...,5,23,Other,Food,2.7,Not Returned,Negative,14.91,WELCOME5,1.9
7,2023-01-08,1612,166,2,1783.47,0.11,3174.5766,9,South,Online,...,1,32,Female,Food,1.2,Not Returned,Negative,12.48,FREESHIP,
8,2023-01-09,1220,154,1,6624.77,0.21,5233.5683,3,North,Online,...,6,20,Female,Books,2.8,Not Returned,Neutral,8.93,,1.8
9,2023-01-10,1666,138,3,586.64,0.05,1671.924,7,South,Credit Card,...,5,61,Female,Books,1.8,Not Returned,Neutral,17.55,,


<div class="alert alert-success">

**Solution 12:**
</div>

In [27]:
# 12. Replacing Missing Data with Mean Value
# Write a Pandas program to replacing missing data with mean value.

df['delivery_rating'] = df['delivery_rating'].fillna(df['delivery_rating'].mean())
df

Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,...,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating
0,2023-01-01,1293,111,1,3219.35,0.26,2382.319,5,West,Credit Card,...,3,28,Other,Books,1.8,Returned,Positive,18.9,,1.7
1,2023-01-02,1578,123,4,590.55,0.05,2244.09,2,South,Credit Card,...,3,49,Male,Electronics,3.4,Returned,Negative,7.18,WELCOME5,2.4
2,2023-01-03,1203,182,6,3008.5,0.26,13357.74,9,South,Cash,...,4,32,Male,Electronics,2.1,Not Returned,Negative,19.61,,4.0
3,2023-01-04,1087,182,5,9816.13,0.06,46135.811,9,South,Credit Card,...,4,21,Other,Food,2.1,Returned,Positive,19.57,SAVE10,4.0
4,2023-01-05,1195,143,9,5274.6,0.14,40825.404,2,North,Online,...,4,55,Male,Clothing,4.5,Not Returned,Negative,9.97,,1.3
5,2023-01-06,1293,132,9,6763.77,0.23,46872.9261,5,West,Cash,...,3,49,Male,Electronics,1.4,Not Returned,Neutral,19.34,FREESHIP,2.634483
6,2023-01-07,1607,110,4,1162.11,0.08,4276.5648,8,East,Cash,...,5,23,Other,Food,2.7,Not Returned,Negative,14.91,WELCOME5,1.9
7,2023-01-08,1612,166,2,1783.47,0.11,3174.5766,9,South,Online,...,1,32,Female,Food,1.2,Not Returned,Negative,12.48,FREESHIP,2.634483
8,2023-01-09,1220,154,1,6624.77,0.21,5233.5683,3,North,Online,...,6,20,Female,Books,2.8,Not Returned,Neutral,8.93,,1.8
9,2023-01-10,1666,138,3,586.64,0.05,1671.924,7,South,Credit Card,...,5,61,Female,Books,1.8,Not Returned,Neutral,17.55,,2.634483


<div class="alert alert-warning">

**13. Changing Column Names to Lowercase**

**Write a Pandas program to change column names to lowercase.**

</div>

In [28]:
# Create date range
date_range = pd.date_range(start='2023-01-01', periods=35, freq='D')

# Create a dictionary of data
data = {
    'Date': date_range,
    'customer_id': np.random.randint(1000, 2000, size=35),
    'product_id': np.random.randint(100, 200, size=35),
    'quantity': np.random.randint(1, 10, size=35),
    'price': np.round(np.random.uniform(10.0, 10000.0, size=35), 2),
    'discount': np.round(np.random.uniform(0, 0.3, size=35), 2),
    'REVENUE': lambda df: df['quantity'] * df['price'] * (1 - df['discount']),
    'store_id': np.random.randint(1, 10, size=35),
    'region': np.random.choice(['North', 'South', 'East', 'West'], size=35),
    'payment_method': np.random.choice(['Credit Card', 'Cash', 'Online'], size=35),
    'is_member': np.random.choice([True, False], size=35),
    'delivery_days': np.random.randint(1, 7, size=35),
    'customer_age': np.random.randint(18, 70, size=35),
    'gender': np.random.choice(['Male', 'Female', 'Other'], size=35),
    'product_category': np.random.choice(['Electronics', 'Clothing', 'Books', 'Food'], size=35),
    'rating': np.round(np.random.uniform(1.0, 5.0, size=35), 1),
    'return_status': np.random.choice(['Returned', 'Not Returned'], size=35),
    'feedback': np.random.choice(['Positive', 'Neutral', 'Negative'], size=35),
}

# Create DataFrame
df = pd.DataFrame(data)

# Compute revenue using the lambda function above
df['REVENUE'] = df['quantity'] * df['price'] * (1 - df['discount'])

# Add 'shipping_cost' with some missing values
df['shipping_cost'] = np.where(np.random.rand(len(df)) < 0.2, np.nan, np.round(np.random.uniform(5.0, 20.0, size=len(df)), 2))

# Add 'coupon_code' with some missing values
coupon_options = ['SAVE10', 'WELCOME5', 'FREESHIP', np.nan]
df['coupon_code'] = np.random.choice(coupon_options, size=len(df), p=[0.2, 0.2, 0.2, 0.4])

# Add 'delivery_rating' with some missing values
df['delivery_rating'] = np.where(np.random.rand(len(df)) < 0.3, np.nan, np.round(np.random.uniform(1.0, 5.0, size=len(df)), 1))

df['to_split'] = df['region'] + ' - ' + df['gender']

duplicates = df.sample(n=5, replace=False)  # Select 5 random rows without replacement
df = pd.concat([df, duplicates], ignore_index=True)


# Display the updated DataFrame
display(df)

Unnamed: 0,Date,customer_id,product_id,quantity,price,discount,REVENUE,store_id,region,payment_method,...,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating,to_split
0,2023-01-01,1900,105,1,1767.49,0.29,1254.9179,4,East,Credit Card,...,59,Female,Food,3.3,Not Returned,Negative,17.48,FREESHIP,3.8,East - Female
1,2023-01-02,1597,188,1,9198.77,0.29,6531.1267,4,West,Cash,...,42,Other,Food,2.0,Not Returned,Neutral,,SAVE10,1.1,West - Other
2,2023-01-03,1639,176,7,7926.37,0.23,42723.1343,7,South,Online,...,49,Male,Clothing,3.1,Returned,Negative,13.52,,,South - Male
3,2023-01-04,1246,155,8,939.61,0.21,5938.3352,6,North,Cash,...,46,Male,Books,4.2,Not Returned,Positive,,,3.7,North - Male
4,2023-01-05,1332,101,1,2094.3,0.29,1486.953,9,West,Credit Card,...,55,Male,Electronics,1.6,Not Returned,Positive,5.14,WELCOME5,,West - Male
5,2023-01-06,1445,114,5,1611.62,0.23,6204.737,2,East,Cash,...,31,Male,Electronics,1.8,Returned,Neutral,,,2.2,East - Male
6,2023-01-07,1731,177,8,488.54,0.2,3126.656,9,West,Online,...,30,Female,Electronics,2.7,Not Returned,Positive,17.73,WELCOME5,3.4,West - Female
7,2023-01-08,1724,192,6,1781.91,0.12,9408.4848,6,North,Credit Card,...,40,Female,Clothing,4.4,Returned,Positive,19.78,FREESHIP,2.4,North - Female
8,2023-01-09,1198,168,6,6370.77,0.0,38224.62,9,West,Credit Card,...,43,Other,Electronics,5.0,Not Returned,Negative,14.05,SAVE10,3.1,West - Other
9,2023-01-10,1913,167,6,6158.28,0.19,29929.2408,4,West,Online,...,27,Other,Books,4.3,Not Returned,Positive,14.65,WELCOME5,4.8,West - Other


<div class="alert alert-success">

**Solution 13:**
</div>

In [29]:
# Write a Pandas program to change column names to lowercase

df.columns = df.columns.str.lower()

df

Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,...,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating,to_split
0,2023-01-01,1900,105,1,1767.49,0.29,1254.9179,4,East,Credit Card,...,59,Female,Food,3.3,Not Returned,Negative,17.48,FREESHIP,3.8,East - Female
1,2023-01-02,1597,188,1,9198.77,0.29,6531.1267,4,West,Cash,...,42,Other,Food,2.0,Not Returned,Neutral,,SAVE10,1.1,West - Other
2,2023-01-03,1639,176,7,7926.37,0.23,42723.1343,7,South,Online,...,49,Male,Clothing,3.1,Returned,Negative,13.52,,,South - Male
3,2023-01-04,1246,155,8,939.61,0.21,5938.3352,6,North,Cash,...,46,Male,Books,4.2,Not Returned,Positive,,,3.7,North - Male
4,2023-01-05,1332,101,1,2094.3,0.29,1486.953,9,West,Credit Card,...,55,Male,Electronics,1.6,Not Returned,Positive,5.14,WELCOME5,,West - Male
5,2023-01-06,1445,114,5,1611.62,0.23,6204.737,2,East,Cash,...,31,Male,Electronics,1.8,Returned,Neutral,,,2.2,East - Male
6,2023-01-07,1731,177,8,488.54,0.2,3126.656,9,West,Online,...,30,Female,Electronics,2.7,Not Returned,Positive,17.73,WELCOME5,3.4,West - Female
7,2023-01-08,1724,192,6,1781.91,0.12,9408.4848,6,North,Credit Card,...,40,Female,Clothing,4.4,Returned,Positive,19.78,FREESHIP,2.4,North - Female
8,2023-01-09,1198,168,6,6370.77,0.0,38224.62,9,West,Credit Card,...,43,Other,Electronics,5.0,Not Returned,Negative,14.05,SAVE10,3.1,West - Other
9,2023-01-10,1913,167,6,6158.28,0.19,29929.2408,4,West,Online,...,27,Other,Books,4.3,Not Returned,Positive,14.65,WELCOME5,4.8,West - Other


<div class="alert alert-warning">

**14. Reordering Columns in a DataFrame**

**Write a Pandas program to reorder columns in a DataFrame.**

</div>

In [30]:
# Create date range
date_range = pd.date_range(start='2023-01-01', periods=35, freq='D')

# Create a dictionary of data
data = {
    'date': date_range,
    'customer_id': np.random.randint(1000, 2000, size=35),
    'product_id': np.random.randint(100, 200, size=35),
    'quantity': np.random.randint(1, 10, size=35),
    'price': np.round(np.random.uniform(10.0, 10000.0, size=35), 2),
    'discount': np.round(np.random.uniform(0, 0.3, size=35), 2),
    'revenue': lambda df: df['quantity'] * df['price'] * (1 - df['discount']),
    'store_id': np.random.randint(1, 10, size=35),
    'region': np.random.choice(['North', 'South', 'East', 'West'], size=35),
    'payment_method': np.random.choice(['Credit Card', 'Cash', 'Online'], size=35),
    'is_member': np.random.choice([True, False], size=35),
    'delivery_days': np.random.randint(1, 7, size=35),
    'customer_age': np.random.randint(18, 70, size=35),
    'gender': np.random.choice(['Male', 'Female', 'Other'], size=35),
    'product_category': np.random.choice(['Electronics', 'Clothing', 'Books', 'Food'], size=35),
    'rating': np.round(np.random.uniform(1.0, 5.0, size=35), 1),
    'return_status': np.random.choice(['Returned', 'Not Returned'], size=35),
    'feedback': np.random.choice(['Positive', 'Neutral', 'Negative'], size=35),
}

# Create DataFrame
df = pd.DataFrame(data)

# Compute revenue using the lambda function above
df['revenue'] = df['quantity'] * df['price'] * (1 - df['discount'])

# Add 'shipping_cost' with some missing values
df['shipping_cost'] = np.where(np.random.rand(len(df)) < 0.2, np.nan, np.round(np.random.uniform(5.0, 20.0, size=len(df)), 2))

# Add 'coupon_code' with some missing values
coupon_options = ['SAVE10', 'WELCOME5', 'FREESHIP', np.nan]
df['coupon_code'] = np.random.choice(coupon_options, size=len(df), p=[0.2, 0.2, 0.2, 0.4])

# Add 'delivery_rating' with some missing values
df['delivery_rating'] = np.where(np.random.rand(len(df)) < 0.3, np.nan, np.round(np.random.uniform(1.0, 5.0, size=len(df)), 1))


duplicates = df.sample(n=5, replace=False)  # Select 5 random rows without replacement
df = pd.concat([df, duplicates], ignore_index=True)


# Display the updated DataFrame
display(df)

Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,...,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating
0,2023-01-01,1172,150,1,544.16,0.28,391.7952,3,North,Online,...,4,67,Other,Clothing,2.8,Not Returned,Negative,6.96,,1.6
1,2023-01-02,1703,137,1,5857.88,0.05,5564.986,7,West,Cash,...,4,66,Female,Books,3.1,Not Returned,Neutral,15.33,,2.8
2,2023-01-03,1712,135,3,33.98,0.07,94.8042,1,West,Online,...,1,57,Female,Clothing,2.9,Returned,Positive,,WELCOME5,
3,2023-01-04,1095,129,5,6856.02,0.18,28109.682,7,East,Cash,...,3,38,Male,Food,4.3,Returned,Positive,19.19,,4.8
4,2023-01-05,1580,120,9,709.6,0.03,6194.808,4,North,Cash,...,3,49,Other,Electronics,2.0,Returned,Neutral,,WELCOME5,3.4
5,2023-01-06,1864,184,8,3095.9,0.22,19318.416,9,West,Cash,...,5,25,Male,Books,2.2,Returned,Neutral,12.77,,1.3
6,2023-01-07,1950,198,8,7422.03,0.26,43938.4176,9,North,Cash,...,1,39,Male,Electronics,1.4,Returned,Negative,10.38,SAVE10,
7,2023-01-08,1528,124,4,7391.21,0.01,29269.1916,9,West,Online,...,4,34,Male,Books,3.4,Returned,Neutral,,SAVE10,1.7
8,2023-01-09,1875,192,7,6755.53,0.2,37830.968,6,West,Credit Card,...,5,51,Female,Food,1.7,Returned,Positive,13.44,,1.9
9,2023-01-10,1186,130,6,5205.12,0.05,29669.184,1,North,Credit Card,...,5,41,Male,Books,2.6,Not Returned,Neutral,6.03,,1.8


<div class="alert alert-success">

**Solution 14:**
</div>

In [31]:
# 14. Reordering Columns in a DataFrame
# Write a Pandas program to reorder columns in a DataFrame.

sorted_columns_list = df.columns.tolist()
sorted_columns_list.sort()
df = df[sorted_columns_list]
df

Unnamed: 0,coupon_code,customer_age,customer_id,date,delivery_days,delivery_rating,discount,feedback,gender,is_member,...,price,product_category,product_id,quantity,rating,region,return_status,revenue,shipping_cost,store_id
0,,67,1172,2023-01-01,4,1.6,0.28,Negative,Other,True,...,544.16,Clothing,150,1,2.8,North,Not Returned,391.7952,6.96,3
1,,66,1703,2023-01-02,4,2.8,0.05,Neutral,Female,False,...,5857.88,Books,137,1,3.1,West,Not Returned,5564.986,15.33,7
2,WELCOME5,57,1712,2023-01-03,1,,0.07,Positive,Female,False,...,33.98,Clothing,135,3,2.9,West,Returned,94.8042,,1
3,,38,1095,2023-01-04,3,4.8,0.18,Positive,Male,True,...,6856.02,Food,129,5,4.3,East,Returned,28109.682,19.19,7
4,WELCOME5,49,1580,2023-01-05,3,3.4,0.03,Neutral,Other,False,...,709.6,Electronics,120,9,2.0,North,Returned,6194.808,,4
5,,25,1864,2023-01-06,5,1.3,0.22,Neutral,Male,False,...,3095.9,Books,184,8,2.2,West,Returned,19318.416,12.77,9
6,SAVE10,39,1950,2023-01-07,1,,0.26,Negative,Male,True,...,7422.03,Electronics,198,8,1.4,North,Returned,43938.4176,10.38,9
7,SAVE10,34,1528,2023-01-08,4,1.7,0.01,Neutral,Male,True,...,7391.21,Books,124,4,3.4,West,Returned,29269.1916,,9
8,,51,1875,2023-01-09,5,1.9,0.2,Positive,Female,True,...,6755.53,Food,192,7,1.7,West,Returned,37830.968,13.44,6
9,,41,1186,2023-01-10,5,1.8,0.05,Neutral,Male,True,...,5205.12,Books,130,6,2.6,North,Not Returned,29669.184,6.03,1


<div class="alert alert-warning">

**15. Splitting a Column into Multiple Columns**

**Write a Pandas program to split a column into multiple columns.**

</div>

In [32]:
# Create date range
date_range = pd.date_range(start='2023-01-01', periods=35, freq='D')

# Create a dictionary of data
data = {
    'date': date_range,
    'customer_id': np.random.randint(1000, 2000, size=35),
    'product_id': np.random.randint(100, 200, size=35),
    'quantity': np.random.randint(1, 10, size=35),
    'price': np.round(np.random.uniform(10.0, 10000.0, size=35), 2),
    'discount': np.round(np.random.uniform(0, 0.3, size=35), 2),
    'revenue': lambda df: df['quantity'] * df['price'] * (1 - df['discount']),
    'store_id': np.random.randint(1, 10, size=35),
    'region': np.random.choice(['North', 'South', 'East', 'West'], size=35),
    'payment_method': np.random.choice(['Credit Card', 'Cash', 'Online'], size=35),
    'is_member': np.random.choice([True, False], size=35),
    'delivery_days': np.random.randint(1, 7, size=35),
    'customer_age': np.random.randint(18, 70, size=35),
    'gender': np.random.choice(['Male', 'Female', 'Other'], size=35),
    'product_category': np.random.choice(['Electronics', 'Clothing', 'Books', 'Food'], size=35),
    'rating': np.round(np.random.uniform(1.0, 5.0, size=35), 1),
    'return_status': np.random.choice(['Returned', 'Not Returned'], size=35),
    'feedback': np.random.choice(['Positive', 'Neutral', 'Negative'], size=35),
}

# Create DataFrame
df = pd.DataFrame(data)

# Compute revenue using the lambda function above
df['revenue'] = df['quantity'] * df['price'] * (1 - df['discount'])

# Add 'shipping_cost' with some missing values
df['shipping_cost'] = np.where(np.random.rand(len(df)) < 0.2, np.nan, np.round(np.random.uniform(5.0, 20.0, size=len(df)), 2))

# Add 'coupon_code' with some missing values
coupon_options = ['SAVE10', 'WELCOME5', 'FREESHIP', np.nan]
df['coupon_code'] = np.random.choice(coupon_options, size=len(df), p=[0.2, 0.2, 0.2, 0.4])

# Add 'delivery_rating' with some missing values
df['delivery_rating'] = np.where(np.random.rand(len(df)) < 0.3, np.nan, np.round(np.random.uniform(1.0, 5.0, size=len(df)), 1))


duplicates = df.sample(n=5, replace=False)  # Select 5 random rows without replacement
df = pd.concat([df, duplicates], ignore_index=True)


# Display the updated DataFrame
display(df)

Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,...,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating
0,2023-01-01,1432,198,3,9358.45,0.14,24144.801,3,West,Credit Card,...,4,55,Male,Food,1.3,Returned,Negative,13.66,,2.9
1,2023-01-02,1279,114,8,5947.98,0.28,34260.3648,1,East,Online,...,2,37,Male,Electronics,1.5,Not Returned,Negative,12.92,,4.7
2,2023-01-03,1740,130,2,1981.89,0.02,3884.5044,1,West,Credit Card,...,2,64,Other,Books,1.3,Not Returned,Positive,,WELCOME5,
3,2023-01-04,1528,117,3,2249.82,0.17,5602.0518,3,East,Cash,...,6,24,Other,Clothing,3.2,Not Returned,Neutral,9.77,SAVE10,
4,2023-01-05,1087,115,3,6343.97,0.13,16557.7617,2,North,Credit Card,...,3,47,Other,Books,4.1,Returned,Negative,12.63,WELCOME5,3.2
5,2023-01-06,1472,144,5,8866.52,0.26,32806.124,9,West,Online,...,6,43,Other,Clothing,2.9,Not Returned,Negative,18.91,FREESHIP,3.9
6,2023-01-07,1539,137,7,891.49,0.18,5117.1526,3,East,Cash,...,5,63,Male,Clothing,2.1,Not Returned,Negative,9.36,FREESHIP,1.7
7,2023-01-08,1394,131,1,6040.43,0.01,5980.0257,9,North,Online,...,4,43,Other,Electronics,4.2,Not Returned,Negative,,,1.1
8,2023-01-09,1717,160,1,792.81,0.07,737.3133,6,North,Cash,...,4,29,Other,Clothing,2.3,Not Returned,Positive,9.81,,2.0
9,2023-01-10,1470,154,1,3594.94,0.16,3019.7496,7,West,Online,...,2,20,Male,Books,3.3,Not Returned,Neutral,7.02,SAVE10,4.0


<div class="alert alert-success">

**Solution 15:**
</div>

In [33]:
# 15. Splitting a Column into Multiple Columns
# Write a Pandas program to split a column into multiple columns.

df['column_to_split'] = df['region'] + ' - ' + df['product_category'] + ' - ' + df['gender'] + ' - ' + df['payment_method']
df[['region_from_split', 'product_category_from_split', 'gender_from_split', 'payment_method_from_split']] = df['column_to_split'].str.split(' - ', expand=True)
df

Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,...,return_status,feedback,shipping_cost,coupon_code,delivery_rating,column_to_split,region_from_split,product_category_from_split,gender_from_split,payment_method_from_split
0,2023-01-01,1432,198,3,9358.45,0.14,24144.801,3,West,Credit Card,...,Returned,Negative,13.66,,2.9,West - Food - Male - Credit Card,West,Food,Male,Credit Card
1,2023-01-02,1279,114,8,5947.98,0.28,34260.3648,1,East,Online,...,Not Returned,Negative,12.92,,4.7,East - Electronics - Male - Online,East,Electronics,Male,Online
2,2023-01-03,1740,130,2,1981.89,0.02,3884.5044,1,West,Credit Card,...,Not Returned,Positive,,WELCOME5,,West - Books - Other - Credit Card,West,Books,Other,Credit Card
3,2023-01-04,1528,117,3,2249.82,0.17,5602.0518,3,East,Cash,...,Not Returned,Neutral,9.77,SAVE10,,East - Clothing - Other - Cash,East,Clothing,Other,Cash
4,2023-01-05,1087,115,3,6343.97,0.13,16557.7617,2,North,Credit Card,...,Returned,Negative,12.63,WELCOME5,3.2,North - Books - Other - Credit Card,North,Books,Other,Credit Card
5,2023-01-06,1472,144,5,8866.52,0.26,32806.124,9,West,Online,...,Not Returned,Negative,18.91,FREESHIP,3.9,West - Clothing - Other - Online,West,Clothing,Other,Online
6,2023-01-07,1539,137,7,891.49,0.18,5117.1526,3,East,Cash,...,Not Returned,Negative,9.36,FREESHIP,1.7,East - Clothing - Male - Cash,East,Clothing,Male,Cash
7,2023-01-08,1394,131,1,6040.43,0.01,5980.0257,9,North,Online,...,Not Returned,Negative,,,1.1,North - Electronics - Other - Online,North,Electronics,Other,Online
8,2023-01-09,1717,160,1,792.81,0.07,737.3133,6,North,Cash,...,Not Returned,Positive,9.81,,2.0,North - Clothing - Other - Cash,North,Clothing,Other,Cash
9,2023-01-10,1470,154,1,3594.94,0.16,3019.7496,7,West,Online,...,Not Returned,Neutral,7.02,SAVE10,4.0,West - Books - Male - Online,West,Books,Male,Online
