[Pandas Data Cleaning and Preprocessing](https://www.w3resource.com/python-exercises/pandas/pandas-data-cleaning-and-preprocessing.php)

In [1]:
import pandas as pd
import numpy as np

<div class="alert alert-warning">

**1. Handling Missing Data in Pandas**

**Write a Pandas program to fill missing values (NaN) in a DataFrame using fillna().**

</div>

In [2]:
# Create date range
date_range = pd.date_range(start='2023-01-01', periods=35, freq='D')

# Create a dictionary of data
data = {
    'date': date_range,
    'customer_id': np.random.randint(1000, 2000, size=35),
    'product_id': np.random.randint(100, 200, size=35),
    'quantity': np.random.randint(1, 10, size=35),
    'price': np.round(np.random.uniform(10.0, 100.0, size=35), 2),
    'discount': np.round(np.random.uniform(0, 0.3, size=35), 2),
    'revenue': lambda df: df['quantity'] * df['price'] * (1 - df['discount']),
    'store_id': np.random.randint(1, 10, size=35),
    'region': np.random.choice(['North', 'South', 'East', 'West'], size=35),
    'payment_method': np.random.choice(['Credit Card', 'Cash', 'Online'], size=35),
    'is_member': np.random.choice([True, False], size=35),
    'delivery_days': np.random.randint(1, 7, size=35),
    'customer_age': np.random.randint(18, 70, size=35),
    'gender': np.random.choice(['Male', 'Female', 'Other'], size=35),
    'product_category': np.random.choice(['Electronics', 'Clothing', 'Books', 'Food'], size=35),
    'rating': np.round(np.random.uniform(1.0, 5.0, size=35), 1),
    'return_status': np.random.choice(['Returned', 'Not Returned'], size=35),
    'feedback': np.random.choice(['Positive', 'Neutral', 'Negative'], size=35),
}

# Create DataFrame
df = pd.DataFrame(data)

# Compute revenue using the lambda function above
df['revenue'] = df['quantity'] * df['price'] * (1 - df['discount'])

# Add 'shipping_cost' with some missing values
df['shipping_cost'] = np.where(np.random.rand(len(df)) < 0.2, np.nan, np.round(np.random.uniform(5.0, 20.0, size=len(df)), 2))

# Add 'coupon_code' with some missing values
coupon_options = ['SAVE10', 'WELCOME5', 'FREESHIP', np.nan]
df['coupon_code'] = np.random.choice(coupon_options, size=len(df), p=[0.2, 0.2, 0.2, 0.4])

# Add 'delivery_rating' with some missing values
df['delivery_rating'] = np.where(np.random.rand(len(df)) < 0.3, np.nan, np.round(np.random.uniform(1.0, 5.0, size=len(df)), 1))

# Display the updated DataFrame
display(df)

Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,...,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating
0,2023-01-01,1922,152,1,82.92,0.21,65.5068,9,East,Credit Card,...,1,53,Female,Clothing,3.8,Returned,Negative,,WELCOME5,
1,2023-01-02,1973,148,7,56.92,0.23,306.7988,1,East,Credit Card,...,5,35,Male,Books,2.9,Returned,Negative,12.98,,3.2
2,2023-01-03,1611,156,3,10.48,0.03,30.4968,9,West,Cash,...,4,50,Female,Clothing,3.8,Returned,Positive,,SAVE10,3.2
3,2023-01-04,1679,103,2,88.79,0.12,156.2704,3,East,Online,...,6,48,Female,Food,2.3,Not Returned,Negative,,,4.9
4,2023-01-05,1603,122,4,20.32,0.2,65.024,4,East,Online,...,1,60,Other,Electronics,3.2,Not Returned,Neutral,,,1.5
5,2023-01-06,1059,124,1,48.99,0.18,40.1718,5,West,Online,...,6,44,Other,Electronics,1.5,Not Returned,Positive,7.51,,
6,2023-01-07,1012,131,7,32.55,0.22,177.723,8,North,Credit Card,...,5,51,Female,Electronics,1.1,Returned,Positive,18.15,SAVE10,2.4
7,2023-01-08,1510,176,2,95.54,0.24,145.2208,5,South,Cash,...,1,37,Female,Clothing,4.8,Returned,Positive,,FREESHIP,4.8
8,2023-01-09,1702,110,9,15.25,0.05,130.3875,6,North,Cash,...,6,68,Other,Books,4.0,Returned,Positive,,,
9,2023-01-10,1973,173,7,61.58,0.23,331.9162,3,South,Online,...,1,64,Male,Books,4.1,Not Returned,Neutral,11.83,FREESHIP,


<div class="alert alert-success">

**Solution 01:**
</div>

In [3]:
# 1. Handling Missing Data in Pandas
# Write a Pandas program to fill missing values (NaN) in a DataFrame using fillna().
result_df = df.fillna({
    'shipping_cost': df['shipping_cost'].mean(),
    'coupon_code': 'No Coupon',
    'delivery_rating': df['delivery_rating'].mean()
})

result_df

Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,...,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating
0,2023-01-01,1922,152,1,82.92,0.21,65.5068,9,East,Credit Card,...,1,53,Female,Clothing,3.8,Returned,Negative,12.898696,WELCOME5,2.948
1,2023-01-02,1973,148,7,56.92,0.23,306.7988,1,East,Credit Card,...,5,35,Male,Books,2.9,Returned,Negative,12.98,,3.2
2,2023-01-03,1611,156,3,10.48,0.03,30.4968,9,West,Cash,...,4,50,Female,Clothing,3.8,Returned,Positive,12.898696,SAVE10,3.2
3,2023-01-04,1679,103,2,88.79,0.12,156.2704,3,East,Online,...,6,48,Female,Food,2.3,Not Returned,Negative,12.898696,,4.9
4,2023-01-05,1603,122,4,20.32,0.2,65.024,4,East,Online,...,1,60,Other,Electronics,3.2,Not Returned,Neutral,12.898696,,1.5
5,2023-01-06,1059,124,1,48.99,0.18,40.1718,5,West,Online,...,6,44,Other,Electronics,1.5,Not Returned,Positive,7.51,,2.948
6,2023-01-07,1012,131,7,32.55,0.22,177.723,8,North,Credit Card,...,5,51,Female,Electronics,1.1,Returned,Positive,18.15,SAVE10,2.4
7,2023-01-08,1510,176,2,95.54,0.24,145.2208,5,South,Cash,...,1,37,Female,Clothing,4.8,Returned,Positive,12.898696,FREESHIP,4.8
8,2023-01-09,1702,110,9,15.25,0.05,130.3875,6,North,Cash,...,6,68,Other,Books,4.0,Returned,Positive,12.898696,,2.948
9,2023-01-10,1973,173,7,61.58,0.23,331.9162,3,South,Online,...,1,64,Male,Books,4.1,Not Returned,Neutral,11.83,FREESHIP,2.948


<div class="alert alert-warning">

**2. Removing Missing Data observations in Pandas**

**Write a Pandas program to drop rows with missing data.**

</div>

In [4]:
# Create date range
date_range = pd.date_range(start='2023-01-01', periods=35, freq='D')

# Create a dictionary of data
data = {
    'date': date_range,
    'customer_id': np.random.randint(1000, 2000, size=35),
    'product_id': np.random.randint(100, 200, size=35),
    'quantity': np.random.randint(1, 10, size=35),
    'price': np.round(np.random.uniform(10.0, 100.0, size=35), 2),
    'discount': np.round(np.random.uniform(0, 0.3, size=35), 2),
    'revenue': lambda df: df['quantity'] * df['price'] * (1 - df['discount']),
    'store_id': np.random.randint(1, 10, size=35),
    'region': np.random.choice(['North', 'South', 'East', 'West'], size=35),
    'payment_method': np.random.choice(['Credit Card', 'Cash', 'Online'], size=35),
    'is_member': np.random.choice([True, False], size=35),
    'delivery_days': np.random.randint(1, 7, size=35),
    'customer_age': np.random.randint(18, 70, size=35),
    'gender': np.random.choice(['Male', 'Female', 'Other'], size=35),
    'product_category': np.random.choice(['Electronics', 'Clothing', 'Books', 'Food'], size=35),
    'rating': np.round(np.random.uniform(1.0, 5.0, size=35), 1),
    'return_status': np.random.choice(['Returned', 'Not Returned'], size=35),
    'feedback': np.random.choice(['Positive', 'Neutral', 'Negative'], size=35),
}

# Create DataFrame
df = pd.DataFrame(data)

# Compute revenue using the lambda function above
df['revenue'] = df['quantity'] * df['price'] * (1 - df['discount'])

# Add 'shipping_cost' with some missing values
df['shipping_cost'] = np.where(np.random.rand(len(df)) < 0.2, np.nan, np.round(np.random.uniform(5.0, 20.0, size=len(df)), 2))

# Add 'coupon_code' with some missing values
coupon_options = ['SAVE10', 'WELCOME5', 'FREESHIP', np.nan]
df['coupon_code'] = np.random.choice(coupon_options, size=len(df), p=[0.2, 0.2, 0.2, 0.4])

# Add 'delivery_rating' with some missing values
df['delivery_rating'] = np.where(np.random.rand(len(df)) < 0.3, np.nan, np.round(np.random.uniform(1.0, 5.0, size=len(df)), 1))

# Display the updated DataFrame
display(df)

Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,...,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating
0,2023-01-01,1955,149,3,51.04,0.29,108.7152,5,North,Credit Card,...,3,37,Male,Electronics,2.9,Returned,Negative,,,3.1
1,2023-01-02,1950,122,1,59.68,0.18,48.9376,6,South,Online,...,3,37,Female,Books,3.7,Returned,Negative,13.13,WELCOME5,
2,2023-01-03,1528,166,1,94.69,0.11,84.2741,4,North,Online,...,1,33,Female,Books,4.5,Not Returned,Neutral,15.49,,
3,2023-01-04,1259,135,9,80.47,0.22,564.8994,1,West,Cash,...,5,20,Other,Books,3.1,Not Returned,Positive,15.53,,
4,2023-01-05,1033,177,8,90.42,0.01,716.1264,7,East,Online,...,6,29,Other,Food,4.2,Not Returned,Neutral,9.12,,4.7
5,2023-01-06,1897,150,2,46.41,0.3,64.974,6,East,Cash,...,1,53,Female,Electronics,3.5,Not Returned,Positive,,,2.8
6,2023-01-07,1907,111,1,12.52,0.07,11.6436,3,East,Online,...,5,30,Male,Food,2.5,Not Returned,Positive,10.66,,
7,2023-01-08,1274,157,6,11.18,0.11,59.7012,4,East,Credit Card,...,5,20,Other,Books,1.0,Not Returned,Negative,6.0,WELCOME5,1.7
8,2023-01-09,1560,183,8,77.28,0.3,432.768,2,North,Credit Card,...,2,30,Other,Clothing,4.4,Not Returned,Positive,,WELCOME5,1.8
9,2023-01-10,1483,194,2,49.04,0.08,90.2336,5,West,Cash,...,1,42,Female,Books,4.4,Returned,Positive,,,4.6


<div class="alert alert-success">

**Solution 02:**
</div>

In [5]:
# 2. Handling Duplicates in Pandas
# Write a Pandas program to drop rows with missing data.

result_df = df.dropna()
result_df

Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,...,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating
4,2023-01-05,1033,177,8,90.42,0.01,716.1264,7,East,Online,...,6,29,Other,Food,4.2,Not Returned,Neutral,9.12,,4.7
7,2023-01-08,1274,157,6,11.18,0.11,59.7012,4,East,Credit Card,...,5,20,Other,Books,1.0,Not Returned,Negative,6.0,WELCOME5,1.7
11,2023-01-12,1030,192,5,64.2,0.15,272.85,9,West,Credit Card,...,5,63,Male,Electronics,3.3,Not Returned,Positive,13.83,,1.8
12,2023-01-13,1754,137,1,76.27,0.24,57.9652,8,East,Cash,...,2,38,Other,Clothing,4.1,Not Returned,Positive,13.56,SAVE10,4.4
13,2023-01-14,1512,105,7,21.31,0.15,126.7945,6,South,Online,...,6,62,Male,Electronics,4.7,Returned,Negative,11.04,SAVE10,3.3
15,2023-01-16,1292,129,9,33.87,0.12,268.2504,2,West,Credit Card,...,1,45,Other,Electronics,1.2,Not Returned,Neutral,5.7,WELCOME5,2.8
17,2023-01-18,1048,168,5,63.98,0.23,246.323,3,West,Online,...,5,67,Other,Food,2.3,Not Returned,Negative,6.22,,3.0
18,2023-01-19,1843,164,5,71.21,0.1,320.445,5,East,Credit Card,...,5,26,Male,Electronics,4.6,Returned,Positive,13.64,SAVE10,1.7
19,2023-01-20,1153,117,1,16.93,0.28,12.1896,5,West,Cash,...,4,37,Male,Food,4.0,Not Returned,Neutral,5.06,SAVE10,1.9
23,2023-01-24,1358,126,4,40.32,0.15,137.088,9,South,Online,...,1,59,Other,Clothing,1.2,Not Returned,Positive,16.45,FREESHIP,3.8


<div class="alert alert-warning">

**3. Detecting Duplicates in Pandas**

**Write a Pandas program to detect duplicates using duplicated() method.**

</div>

In [6]:
# Create date range
date_range = pd.date_range(start='2023-01-01', periods=35, freq='D')

# Create a dictionary of data
data = {
    'date': date_range,
    'customer_id': np.random.randint(1000, 2000, size=35),
    'product_id': np.random.randint(100, 200, size=35),
    'quantity': np.random.randint(1, 10, size=35),
    'price': np.round(np.random.uniform(10.0, 100.0, size=35), 2),
    'discount': np.round(np.random.uniform(0, 0.3, size=35), 2),
    'revenue': lambda df: df['quantity'] * df['price'] * (1 - df['discount']),
    'store_id': np.random.randint(1, 10, size=35),
    'region': np.random.choice(['North', 'South', 'East', 'West'], size=35),
    'payment_method': np.random.choice(['Credit Card', 'Cash', 'Online'], size=35),
    'is_member': np.random.choice([True, False], size=35),
    'delivery_days': np.random.randint(1, 7, size=35),
    'customer_age': np.random.randint(18, 70, size=35),
    'gender': np.random.choice(['Male', 'Female', 'Other'], size=35),
    'product_category': np.random.choice(['Electronics', 'Clothing', 'Books', 'Food'], size=35),
    'rating': np.round(np.random.uniform(1.0, 5.0, size=35), 1),
    'return_status': np.random.choice(['Returned', 'Not Returned'], size=35),
    'feedback': np.random.choice(['Positive', 'Neutral', 'Negative'], size=35),
}

# Create DataFrame
df = pd.DataFrame(data)

# Compute revenue using the lambda function above
df['revenue'] = df['quantity'] * df['price'] * (1 - df['discount'])

# Add 'shipping_cost' with some missing values
df['shipping_cost'] = np.where(np.random.rand(len(df)) < 0.2, np.nan, np.round(np.random.uniform(5.0, 20.0, size=len(df)), 2))

# Add 'coupon_code' with some missing values
coupon_options = ['SAVE10', 'WELCOME5', 'FREESHIP', np.nan]
df['coupon_code'] = np.random.choice(coupon_options, size=len(df), p=[0.2, 0.2, 0.2, 0.4])

# Add 'delivery_rating' with some missing values
df['delivery_rating'] = np.where(np.random.rand(len(df)) < 0.3, np.nan, np.round(np.random.uniform(1.0, 5.0, size=len(df)), 1))


duplicates = df.sample(n=5, replace=False)  # Select 5 random rows without replacement
df = pd.concat([df, duplicates], ignore_index=True)

# Display the updated DataFrame
display(df)

Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,...,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating
0,2023-01-01,1943,151,8,79.35,0.23,488.796,9,East,Credit Card,...,4,66,Male,Food,3.8,Not Returned,Positive,6.9,FREESHIP,3.7
1,2023-01-02,1879,106,7,73.41,0.04,493.3152,1,West,Credit Card,...,3,41,Other,Electronics,1.4,Not Returned,Neutral,14.5,FREESHIP,
2,2023-01-03,1675,149,9,25.65,0.18,189.297,7,West,Online,...,5,42,Female,Clothing,1.6,Not Returned,Neutral,,SAVE10,2.8
3,2023-01-04,1627,139,8,44.63,0.09,324.9064,7,South,Credit Card,...,5,38,Male,Electronics,2.2,Returned,Positive,18.17,SAVE10,3.0
4,2023-01-05,1145,108,2,61.29,0.1,110.322,5,South,Cash,...,3,41,Male,Food,2.5,Not Returned,Negative,5.83,WELCOME5,
5,2023-01-06,1858,111,9,38.79,0.01,345.6189,8,East,Credit Card,...,1,20,Male,Food,1.5,Not Returned,Negative,,SAVE10,
6,2023-01-07,1265,114,2,56.75,0.09,103.285,9,East,Online,...,1,61,Male,Electronics,3.9,Not Returned,Positive,5.51,FREESHIP,1.1
7,2023-01-08,1393,152,9,41.72,0.07,349.1964,3,West,Cash,...,5,56,Male,Food,3.1,Returned,Positive,12.84,WELCOME5,4.0
8,2023-01-09,1854,135,6,33.46,0.17,166.6308,5,North,Credit Card,...,3,25,Other,Clothing,3.0,Not Returned,Neutral,19.47,WELCOME5,2.7
9,2023-01-10,1575,120,1,22.54,0.24,17.1304,7,West,Cash,...,4,21,Other,Books,2.9,Returned,Negative,,FREESHIP,4.7


<div class="alert alert-success">

**Solution 03:**
</div>

In [7]:
# 3. Data Cleaning Techniques
# Write a Pandas program to detect duplicates using duplicated() method.
duplicates = df.duplicated()
duplicates_df = df[duplicates]
duplicates_df

Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,...,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating
35,2023-01-16,1476,185,5,75.55,0.11,336.1975,2,West,Online,...,2,57,Other,Books,1.5,Not Returned,Negative,17.29,,
36,2023-02-01,1302,189,1,60.6,0.25,45.45,1,North,Online,...,4,45,Other,Electronics,2.7,Not Returned,Positive,15.55,SAVE10,2.8
37,2023-01-20,1978,132,4,27.47,0.22,85.7064,2,North,Cash,...,6,52,Other,Books,2.3,Not Returned,Positive,,,4.3
38,2023-01-11,1837,104,4,15.67,0.18,51.3976,4,South,Credit Card,...,6,50,Female,Clothing,2.4,Returned,Positive,10.96,SAVE10,2.6
39,2023-01-21,1939,149,6,53.27,0.1,287.658,4,South,Cash,...,5,30,Female,Food,1.2,Not Returned,Positive,,SAVE10,3.1


<div class="alert alert-warning">

**4. Handling Duplicates in Pandas**

**Write a Pandas program to remove duplicates rows from a DataFrame.**

</div>

In [8]:
# Create date range
date_range = pd.date_range(start='2023-01-01', periods=35, freq='D')

# Create a dictionary of data
data = {
    'date': date_range,
    'customer_id': np.random.randint(1000, 2000, size=35),
    'product_id': np.random.randint(100, 200, size=35),
    'quantity': np.random.randint(1, 10, size=35),
    'price': np.round(np.random.uniform(10.0, 100.0, size=35), 2),
    'discount': np.round(np.random.uniform(0, 0.3, size=35), 2),
    'revenue': lambda df: df['quantity'] * df['price'] * (1 - df['discount']),
    'store_id': np.random.randint(1, 10, size=35),
    'region': np.random.choice(['North', 'South', 'East', 'West'], size=35),
    'payment_method': np.random.choice(['Credit Card', 'Cash', 'Online'], size=35),
    'is_member': np.random.choice([True, False], size=35),
    'delivery_days': np.random.randint(1, 7, size=35),
    'customer_age': np.random.randint(18, 70, size=35),
    'gender': np.random.choice(['Male', 'Female', 'Other'], size=35),
    'product_category': np.random.choice(['Electronics', 'Clothing', 'Books', 'Food'], size=35),
    'rating': np.round(np.random.uniform(1.0, 5.0, size=35), 1),
    'return_status': np.random.choice(['Returned', 'Not Returned'], size=35),
    'feedback': np.random.choice(['Positive', 'Neutral', 'Negative'], size=35),
}

# Create DataFrame
df = pd.DataFrame(data)

# Compute revenue using the lambda function above
df['revenue'] = df['quantity'] * df['price'] * (1 - df['discount'])

# Add 'shipping_cost' with some missing values
df['shipping_cost'] = np.where(np.random.rand(len(df)) < 0.2, np.nan, np.round(np.random.uniform(5.0, 20.0, size=len(df)), 2))

# Add 'coupon_code' with some missing values
coupon_options = ['SAVE10', 'WELCOME5', 'FREESHIP', np.nan]
df['coupon_code'] = np.random.choice(coupon_options, size=len(df), p=[0.2, 0.2, 0.2, 0.4])

# Add 'delivery_rating' with some missing values
df['delivery_rating'] = np.where(np.random.rand(len(df)) < 0.3, np.nan, np.round(np.random.uniform(1.0, 5.0, size=len(df)), 1))


duplicates = df.sample(n=5, replace=False)  # Select 5 random rows without replacement
df = pd.concat([df, duplicates], ignore_index=True)

# Display the updated DataFrame
display(df)

Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,...,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating
0,2023-01-01,1218,157,2,88.93,0.11,158.2954,3,West,Credit Card,...,4,20,Female,Electronics,4.5,Returned,Positive,9.74,,4.1
1,2023-01-02,1694,173,3,60.17,0.18,148.0182,6,East,Online,...,2,67,Male,Books,1.4,Returned,Negative,13.91,,
2,2023-01-03,1450,119,9,68.3,0.16,516.348,5,West,Credit Card,...,3,53,Female,Clothing,2.2,Not Returned,Negative,14.62,SAVE10,4.9
3,2023-01-04,1082,127,1,62.48,0.16,52.4832,7,South,Credit Card,...,5,59,Other,Clothing,1.9,Not Returned,Neutral,,WELCOME5,
4,2023-01-05,1739,123,9,61.42,0.09,503.0298,9,West,Cash,...,3,33,Female,Electronics,1.4,Not Returned,Positive,9.7,,4.3
5,2023-01-06,1896,123,6,47.59,0.22,222.7212,3,North,Cash,...,3,44,Female,Books,3.3,Returned,Negative,10.38,WELCOME5,4.6
6,2023-01-07,1542,163,2,53.22,0.02,104.3112,9,West,Cash,...,2,40,Female,Books,4.4,Not Returned,Positive,8.18,FREESHIP,3.8
7,2023-01-08,1895,196,9,14.69,0.27,96.5133,1,North,Credit Card,...,4,20,Female,Books,1.9,Returned,Positive,18.0,SAVE10,1.9
8,2023-01-09,1542,166,4,80.22,0.14,275.9568,9,East,Credit Card,...,5,30,Male,Books,3.9,Returned,Positive,8.16,SAVE10,3.9
9,2023-01-10,1671,114,9,72.69,0.09,595.3311,5,East,Credit Card,...,4,24,Female,Clothing,2.0,Not Returned,Positive,16.16,,4.3


<div class="alert alert-success">

**Solution 04:**
</div>

In [9]:
df.drop_duplicates()

Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,...,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating
0,2023-01-01,1218,157,2,88.93,0.11,158.2954,3,West,Credit Card,...,4,20,Female,Electronics,4.5,Returned,Positive,9.74,,4.1
1,2023-01-02,1694,173,3,60.17,0.18,148.0182,6,East,Online,...,2,67,Male,Books,1.4,Returned,Negative,13.91,,
2,2023-01-03,1450,119,9,68.3,0.16,516.348,5,West,Credit Card,...,3,53,Female,Clothing,2.2,Not Returned,Negative,14.62,SAVE10,4.9
3,2023-01-04,1082,127,1,62.48,0.16,52.4832,7,South,Credit Card,...,5,59,Other,Clothing,1.9,Not Returned,Neutral,,WELCOME5,
4,2023-01-05,1739,123,9,61.42,0.09,503.0298,9,West,Cash,...,3,33,Female,Electronics,1.4,Not Returned,Positive,9.7,,4.3
5,2023-01-06,1896,123,6,47.59,0.22,222.7212,3,North,Cash,...,3,44,Female,Books,3.3,Returned,Negative,10.38,WELCOME5,4.6
6,2023-01-07,1542,163,2,53.22,0.02,104.3112,9,West,Cash,...,2,40,Female,Books,4.4,Not Returned,Positive,8.18,FREESHIP,3.8
7,2023-01-08,1895,196,9,14.69,0.27,96.5133,1,North,Credit Card,...,4,20,Female,Books,1.9,Returned,Positive,18.0,SAVE10,1.9
8,2023-01-09,1542,166,4,80.22,0.14,275.9568,9,East,Credit Card,...,5,30,Male,Books,3.9,Returned,Positive,8.16,SAVE10,3.9
9,2023-01-10,1671,114,9,72.69,0.09,595.3311,5,East,Credit Card,...,4,24,Female,Clothing,2.0,Not Returned,Positive,16.16,,4.3


In [10]:
# 4. Handling Duplicates in Pandas
# Write a Pandas program to remove duplicates rows from a DataFrame.

df = df.drop_duplicates()
df.reset_index(drop=True, inplace=True)

<div class="alert alert-warning">

**5. Handling Outliers with Z-Score Method**

**Write a Pandas program to handle outliers in a DataFrame with Z-Score method.**

</div>

In [11]:
# Create date range
date_range = pd.date_range(start='2023-01-01', periods=35, freq='D')

# Create a dictionary of data
data = {
    'date': date_range,
    'customer_id': np.random.randint(1000, 2000, size=35),
    'product_id': np.random.randint(100, 200, size=35),
    'quantity': np.random.randint(1, 10, size=35),
    'price': np.round(np.random.uniform(10.0, 10000.0, size=35), 2),
    'discount': np.round(np.random.uniform(0, 0.3, size=35), 2),
    'revenue': lambda df: df['quantity'] * df['price'] * (1 - df['discount']),
    'store_id': np.random.randint(1, 10, size=35),
    'region': np.random.choice(['North', 'South', 'East', 'West'], size=35),
    'payment_method': np.random.choice(['Credit Card', 'Cash', 'Online'], size=35),
    'is_member': np.random.choice([True, False], size=35),
    'delivery_days': np.random.randint(1, 7, size=35),
    'customer_age': np.random.randint(18, 70, size=35),
    'gender': np.random.choice(['Male', 'Female', 'Other'], size=35),
    'product_category': np.random.choice(['Electronics', 'Clothing', 'Books', 'Food'], size=35),
    'rating': np.round(np.random.uniform(1.0, 5.0, size=35), 1),
    'return_status': np.random.choice(['Returned', 'Not Returned'], size=35),
    'feedback': np.random.choice(['Positive', 'Neutral', 'Negative'], size=35),
}

# Create DataFrame
df = pd.DataFrame(data)

# Compute revenue using the lambda function above
df['revenue'] = df['quantity'] * df['price'] * (1 - df['discount'])

# Add 'shipping_cost' with some missing values
df['shipping_cost'] = np.where(np.random.rand(len(df)) < 0.2, np.nan, np.round(np.random.uniform(5.0, 20.0, size=len(df)), 2))

# Add 'coupon_code' with some missing values
coupon_options = ['SAVE10', 'WELCOME5', 'FREESHIP', np.nan]
df['coupon_code'] = np.random.choice(coupon_options, size=len(df), p=[0.2, 0.2, 0.2, 0.4])

# Add 'delivery_rating' with some missing values
df['delivery_rating'] = np.where(np.random.rand(len(df)) < 0.3, np.nan, np.round(np.random.uniform(1.0, 5.0, size=len(df)), 1))


duplicates = df.sample(n=5, replace=False)  # Select 5 random rows without replacement
df = pd.concat([df, duplicates], ignore_index=True)


# Display the updated DataFrame
display(df)

Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,...,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating
0,2023-01-01,1994,159,5,7452.84,0.26,27575.508,2,West,Credit Card,...,6,31,Male,Books,3.6,Not Returned,Negative,10.94,WELCOME5,
1,2023-01-02,1772,100,2,4813.36,0.05,9145.384,8,West,Online,...,2,43,Other,Clothing,1.2,Not Returned,Neutral,9.43,,5.0
2,2023-01-03,1333,176,3,1580.43,0.14,4077.5094,7,South,Cash,...,4,19,Male,Food,4.6,Not Returned,Negative,14.03,FREESHIP,2.1
3,2023-01-04,1515,163,6,2010.57,0.0,12063.42,8,West,Online,...,6,60,Other,Books,1.4,Not Returned,Positive,10.06,WELCOME5,3.6
4,2023-01-05,1032,195,4,8267.07,0.2,26454.624,7,West,Online,...,1,67,Male,Electronics,3.5,Not Returned,Positive,7.7,WELCOME5,
5,2023-01-06,1663,151,8,1389.96,0.23,8562.1536,9,East,Cash,...,5,50,Female,Food,3.1,Not Returned,Positive,13.51,,3.2
6,2023-01-07,1479,133,7,9163.15,0.1,57727.845,3,South,Online,...,1,68,Male,Electronics,4.1,Returned,Positive,7.48,SAVE10,4.4
7,2023-01-08,1821,195,8,8620.23,0.05,65513.748,4,North,Cash,...,5,23,Female,Electronics,4.8,Not Returned,Positive,17.02,FREESHIP,4.4
8,2023-01-09,1355,102,9,991.42,0.28,6424.4016,4,North,Online,...,6,66,Other,Clothing,4.0,Not Returned,Neutral,15.71,,4.2
9,2023-01-10,1598,179,6,9211.82,0.21,43664.0268,1,West,Cash,...,6,22,Other,Clothing,4.7,Not Returned,Negative,17.9,FREESHIP,2.0


<div class="alert alert-success">

**Solution 05:**
</div>

In [12]:
# 5. Handling Outliers with Z-Score Method
# Write a Pandas program to handle outliers in a DataFrame with Z-Score method.

def handle_outliers_z_score(df, column):
    from scipy import stats
    z_scores = np.abs(stats.zscore(df[column]))
    threshold = 1.5
    return df[(z_scores < threshold)]

result_df = handle_outliers_z_score(df, 'price')

In [13]:
result_df

Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,...,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating
0,2023-01-01,1994,159,5,7452.84,0.26,27575.508,2,West,Credit Card,...,6,31,Male,Books,3.6,Not Returned,Negative,10.94,WELCOME5,
1,2023-01-02,1772,100,2,4813.36,0.05,9145.384,8,West,Online,...,2,43,Other,Clothing,1.2,Not Returned,Neutral,9.43,,5.0
2,2023-01-03,1333,176,3,1580.43,0.14,4077.5094,7,South,Cash,...,4,19,Male,Food,4.6,Not Returned,Negative,14.03,FREESHIP,2.1
3,2023-01-04,1515,163,6,2010.57,0.0,12063.42,8,West,Online,...,6,60,Other,Books,1.4,Not Returned,Positive,10.06,WELCOME5,3.6
4,2023-01-05,1032,195,4,8267.07,0.2,26454.624,7,West,Online,...,1,67,Male,Electronics,3.5,Not Returned,Positive,7.7,WELCOME5,
5,2023-01-06,1663,151,8,1389.96,0.23,8562.1536,9,East,Cash,...,5,50,Female,Food,3.1,Not Returned,Positive,13.51,,3.2
7,2023-01-08,1821,195,8,8620.23,0.05,65513.748,4,North,Cash,...,5,23,Female,Electronics,4.8,Not Returned,Positive,17.02,FREESHIP,4.4
10,2023-01-11,1540,167,2,5075.48,0.11,9034.3544,9,South,Cash,...,2,32,Other,Electronics,2.2,Returned,Positive,11.69,,3.1
11,2023-01-12,1847,138,4,3218.83,0.11,11459.0348,2,North,Cash,...,2,62,Female,Books,3.6,Not Returned,Positive,,SAVE10,
12,2023-01-13,1431,156,3,1277.7,0.18,3143.142,6,West,Credit Card,...,3,65,Female,Food,2.7,Not Returned,Negative,10.19,FREESHIP,


<div class="alert alert-warning">

**6. Normalizing Data with Min-Max Scaling**

**Write a Pandas program that normalizes data with Min-Max scaling.**

</div>

In [14]:
# Create date range
date_range = pd.date_range(start='2023-01-01', periods=35, freq='D')

# Create a dictionary of data
data = {
    'date': date_range,
    'customer_id': np.random.randint(1000, 2000, size=35),
    'product_id': np.random.randint(100, 200, size=35),
    'quantity': np.random.randint(1, 10, size=35),
    'price': np.round(np.random.uniform(10.0, 10000.0, size=35), 2),
    'discount': np.round(np.random.uniform(0, 0.3, size=35), 2),
    'revenue': lambda df: df['quantity'] * df['price'] * (1 - df['discount']),
    'store_id': np.random.randint(1, 10, size=35),
    'region': np.random.choice(['North', 'South', 'East', 'West'], size=35),
    'payment_method': np.random.choice(['Credit Card', 'Cash', 'Online'], size=35),
    'is_member': np.random.choice([True, False], size=35),
    'delivery_days': np.random.randint(1, 7, size=35),
    'customer_age': np.random.randint(18, 70, size=35),
    'gender': np.random.choice(['Male', 'Female', 'Other'], size=35),
    'product_category': np.random.choice(['Electronics', 'Clothing', 'Books', 'Food'], size=35),
    'rating': np.round(np.random.uniform(1.0, 5.0, size=35), 1),
    'return_status': np.random.choice(['Returned', 'Not Returned'], size=35),
    'feedback': np.random.choice(['Positive', 'Neutral', 'Negative'], size=35),
}

# Create DataFrame
df = pd.DataFrame(data)

# Compute revenue using the lambda function above
df['revenue'] = df['quantity'] * df['price'] * (1 - df['discount'])

# Add 'shipping_cost' with some missing values
df['shipping_cost'] = np.where(np.random.rand(len(df)) < 0.2, np.nan, np.round(np.random.uniform(5.0, 20.0, size=len(df)), 2))

# Add 'coupon_code' with some missing values
coupon_options = ['SAVE10', 'WELCOME5', 'FREESHIP', np.nan]
df['coupon_code'] = np.random.choice(coupon_options, size=len(df), p=[0.2, 0.2, 0.2, 0.4])

# Add 'delivery_rating' with some missing values
df['delivery_rating'] = np.where(np.random.rand(len(df)) < 0.3, np.nan, np.round(np.random.uniform(1.0, 5.0, size=len(df)), 1))


duplicates = df.sample(n=5, replace=False)  # Select 5 random rows without replacement
df = pd.concat([df, duplicates], ignore_index=True)


# Display the updated DataFrame
display(df)

Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,...,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating
0,2023-01-01,1455,194,6,4627.53,0.1,24988.662,2,West,Cash,...,3,31,Male,Electronics,1.9,Not Returned,Positive,18.12,,3.6
1,2023-01-02,1648,156,8,1783.23,0.29,10128.7464,2,South,Cash,...,3,55,Male,Books,2.6,Not Returned,Neutral,8.6,,
2,2023-01-03,1943,191,7,5268.74,0.25,27660.885,5,North,Online,...,6,36,Other,Clothing,2.0,Not Returned,Positive,10.61,SAVE10,2.3
3,2023-01-04,1977,153,2,1650.93,0.12,2905.6368,5,East,Online,...,1,40,Other,Electronics,2.7,Returned,Positive,,SAVE10,4.5
4,2023-01-05,1343,187,9,3831.06,0.26,25514.8596,4,South,Cash,...,3,48,Other,Clothing,2.7,Returned,Neutral,,,1.3
5,2023-01-06,1753,138,9,2167.92,0.13,16974.8136,8,West,Online,...,4,33,Female,Electronics,3.2,Not Returned,Negative,10.91,,4.3
6,2023-01-07,1012,171,5,6699.23,0.07,31151.4195,5,East,Online,...,5,39,Other,Clothing,1.4,Returned,Positive,13.59,,2.7
7,2023-01-08,1453,154,8,5685.38,0.08,41844.3968,8,West,Cash,...,6,36,Female,Books,3.8,Not Returned,Negative,8.22,,1.4
8,2023-01-09,1256,154,2,856.95,0.03,1662.483,4,West,Credit Card,...,3,65,Male,Food,3.4,Returned,Positive,11.25,,1.6
9,2023-01-10,1369,104,6,130.57,0.22,611.0676,1,North,Online,...,6,61,Other,Clothing,3.3,Returned,Positive,,,2.5


<div class="alert alert-success">

**Solution 06:**
</div>

In [15]:
# 6. Normalizing Data with Min-Max Scaling
# Write a Pandas program that normalizes data with Min-Max scaling.

def normalize_min_max(df, column):
    min_val = df[column].min()
    max_val = df[column].max()
    df[column] = (df[column] - min_val) / (max_val - min_val)
    return df

result_df = normalize_min_max(df.copy(), 'price')

result_df


Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,...,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating
0,2023-01-01,1455,194,6,0.501118,0.1,24988.662,2,West,Cash,...,3,31,Male,Electronics,1.9,Not Returned,Positive,18.12,,3.6
1,2023-01-02,1648,156,8,0.184164,0.29,10128.7464,2,South,Cash,...,3,55,Male,Books,2.6,Not Returned,Neutral,8.6,,
2,2023-01-03,1943,191,7,0.572571,0.25,27660.885,5,North,Online,...,6,36,Other,Clothing,2.0,Not Returned,Positive,10.61,SAVE10,2.3
3,2023-01-04,1977,153,2,0.169421,0.12,2905.6368,5,East,Online,...,1,40,Other,Electronics,2.7,Returned,Positive,,SAVE10,4.5
4,2023-01-05,1343,187,9,0.412363,0.26,25514.8596,4,South,Cash,...,3,48,Other,Clothing,2.7,Returned,Neutral,,,1.3
5,2023-01-06,1753,138,9,0.227032,0.13,16974.8136,8,West,Online,...,4,33,Female,Electronics,3.2,Not Returned,Negative,10.91,,4.3
6,2023-01-07,1012,171,5,0.731977,0.07,31151.4195,5,East,Online,...,5,39,Other,Clothing,1.4,Returned,Positive,13.59,,2.7
7,2023-01-08,1453,154,8,0.618999,0.08,41844.3968,8,West,Cash,...,6,36,Female,Books,3.8,Not Returned,Negative,8.22,,1.4
8,2023-01-09,1256,154,2,0.080944,0.03,1662.483,4,West,Credit Card,...,3,65,Male,Food,3.4,Returned,Positive,11.25,,1.6
9,2023-01-10,1369,104,6,0.0,0.22,611.0676,1,North,Online,...,6,61,Other,Clothing,3.3,Returned,Positive,,,2.5


<div class="alert alert-warning">

**7. Binning Data into Categories**

**Write a Pandas program to bin data into categories.**

</div>

In [16]:
# Create date range
date_range = pd.date_range(start='2023-01-01', periods=35, freq='D')

# Create a dictionary of data
data = {
    'date': date_range,
    'customer_id': np.random.randint(1000, 2000, size=35),
    'product_id': np.random.randint(100, 200, size=35),
    'quantity': np.random.randint(1, 10, size=35),
    'price': np.round(np.random.uniform(10.0, 10000.0, size=35), 2),
    'discount': np.round(np.random.uniform(0, 0.3, size=35), 2),
    'revenue': lambda df: df['quantity'] * df['price'] * (1 - df['discount']),
    'store_id': np.random.randint(1, 10, size=35),
    'region': np.random.choice(['North', 'South', 'East', 'West'], size=35),
    'payment_method': np.random.choice(['Credit Card', 'Cash', 'Online'], size=35),
    'is_member': np.random.choice([True, False], size=35),
    'delivery_days': np.random.randint(1, 7, size=35),
    'customer_age': np.random.randint(18, 70, size=35),
    'gender': np.random.choice(['Male', 'Female', 'Other'], size=35),
    'product_category': np.random.choice(['Electronics', 'Clothing', 'Books', 'Food'], size=35),
    'rating': np.round(np.random.uniform(1.0, 5.0, size=35), 1),
    'return_status': np.random.choice(['Returned', 'Not Returned'], size=35),
    'feedback': np.random.choice(['Positive', 'Neutral', 'Negative'], size=35),
}

# Create DataFrame
df = pd.DataFrame(data)

# Compute revenue using the lambda function above
df['revenue'] = df['quantity'] * df['price'] * (1 - df['discount'])

# Add 'shipping_cost' with some missing values
df['shipping_cost'] = np.where(np.random.rand(len(df)) < 0.2, np.nan, np.round(np.random.uniform(5.0, 20.0, size=len(df)), 2))

# Add 'coupon_code' with some missing values
coupon_options = ['SAVE10', 'WELCOME5', 'FREESHIP', np.nan]
df['coupon_code'] = np.random.choice(coupon_options, size=len(df), p=[0.2, 0.2, 0.2, 0.4])

# Add 'delivery_rating' with some missing values
df['delivery_rating'] = np.where(np.random.rand(len(df)) < 0.3, np.nan, np.round(np.random.uniform(1.0, 5.0, size=len(df)), 1))


duplicates = df.sample(n=5, replace=False)  # Select 5 random rows without replacement
df = pd.concat([df, duplicates], ignore_index=True)


# Display the updated DataFrame
display(df)

Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,...,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating
0,2023-01-01,1362,140,1,4176.17,0.02,4092.6466,6,South,Online,...,1,47,Male,Electronics,1.0,Returned,Positive,13.66,,
1,2023-01-02,1707,130,6,9516.43,0.09,51959.7078,5,North,Credit Card,...,5,61,Other,Clothing,3.3,Not Returned,Negative,17.82,SAVE10,
2,2023-01-03,1752,161,1,7846.44,0.02,7689.5112,9,South,Online,...,5,43,Other,Clothing,1.9,Not Returned,Positive,17.27,,
3,2023-01-04,1672,102,8,5498.38,0.3,30790.928,1,North,Credit Card,...,6,67,Other,Books,3.1,Not Returned,Negative,9.59,SAVE10,2.3
4,2023-01-05,1967,140,2,4099.32,0.05,7788.708,7,South,Cash,...,3,36,Other,Electronics,1.9,Returned,Neutral,6.97,FREESHIP,3.3
5,2023-01-06,1185,173,9,3428.29,0.24,23449.5036,6,South,Credit Card,...,4,26,Female,Books,3.8,Returned,Neutral,10.72,,4.4
6,2023-01-07,1234,192,3,5744.86,0.18,14132.3556,3,South,Online,...,6,64,Male,Electronics,3.6,Not Returned,Negative,7.88,,
7,2023-01-08,1258,176,6,4430.37,0.09,24189.8202,7,West,Credit Card,...,5,58,Other,Food,3.5,Returned,Negative,18.48,,
8,2023-01-09,1971,164,6,9518.71,0.19,46260.9306,3,West,Credit Card,...,2,67,Male,Clothing,1.6,Returned,Neutral,12.43,,4.0
9,2023-01-10,1366,160,4,4577.61,0.12,16113.1872,8,North,Cash,...,5,43,Female,Electronics,4.7,Returned,Neutral,16.92,FREESHIP,1.6


<div class="alert alert-success">

**Solution 07:**
</div>

In [17]:
# binning price
bins = [0, 100, 500, 1000, 5000, 10000]
labels = ['0-100', '100-500', '500-1000', '1000-5000', '5000-10000']
df['price_binned'] = pd.cut(
    df['price']
    , bins=bins
    , labels=labels
    , right=False
)

df['price_binned_q'] = pd.qcut(
    df['price']
    , q=[0, 0.2, 0.4, 0.6, 0.8, 1.0], labels=['Very Low', 'Low', 'Medium', 'High', 'Very High']
    , duplicates='drop'
)
df['price_binned_q_no_labels'] = pd.qcut(
    df['price']
    , q=[0, 0.2, 0.4, 0.6, 0.8, 1.0]
    , labels=False
)

df

Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,...,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating,price_binned,price_binned_q,price_binned_q_no_labels
0,2023-01-01,1362,140,1,4176.17,0.02,4092.6466,6,South,Online,...,Electronics,1.0,Returned,Positive,13.66,,,1000-5000,Low,1
1,2023-01-02,1707,130,6,9516.43,0.09,51959.7078,5,North,Credit Card,...,Clothing,3.3,Not Returned,Negative,17.82,SAVE10,,5000-10000,Very High,4
2,2023-01-03,1752,161,1,7846.44,0.02,7689.5112,9,South,Online,...,Clothing,1.9,Not Returned,Positive,17.27,,,5000-10000,High,3
3,2023-01-04,1672,102,8,5498.38,0.3,30790.928,1,North,Credit Card,...,Books,3.1,Not Returned,Negative,9.59,SAVE10,2.3,5000-10000,Medium,2
4,2023-01-05,1967,140,2,4099.32,0.05,7788.708,7,South,Cash,...,Electronics,1.9,Returned,Neutral,6.97,FREESHIP,3.3,1000-5000,Low,1
5,2023-01-06,1185,173,9,3428.29,0.24,23449.5036,6,South,Credit Card,...,Books,3.8,Returned,Neutral,10.72,,4.4,1000-5000,Low,1
6,2023-01-07,1234,192,3,5744.86,0.18,14132.3556,3,South,Online,...,Electronics,3.6,Not Returned,Negative,7.88,,,5000-10000,High,3
7,2023-01-08,1258,176,6,4430.37,0.09,24189.8202,7,West,Credit Card,...,Food,3.5,Returned,Negative,18.48,,,1000-5000,Low,1
8,2023-01-09,1971,164,6,9518.71,0.19,46260.9306,3,West,Credit Card,...,Clothing,1.6,Returned,Neutral,12.43,,4.0,5000-10000,Very High,4
9,2023-01-10,1366,160,4,4577.61,0.12,16113.1872,8,North,Cash,...,Electronics,4.7,Returned,Neutral,16.92,FREESHIP,1.6,1000-5000,Medium,2


<div class="alert alert-warning">

**8. Converting Data Types and Column Operations**

**Write a Pandas program that handles text data with str.replace().**

</div>

In [18]:
# Create date range
date_range = pd.date_range(start='2023-01-01', periods=35, freq='D')

# Create a dictionary of data
data = {
    'date': date_range,
    'customer_id': np.random.randint(1000, 2000, size=35),
    'product_id': np.random.randint(100, 200, size=35),
    'quantity': np.random.randint(1, 10, size=35),
    'price': np.round(np.random.uniform(10.0, 10000.0, size=35), 2),
    'discount': np.round(np.random.uniform(0, 0.3, size=35), 2),
    'revenue': lambda df: df['quantity'] * df['price'] * (1 - df['discount']),
    'store_id': np.random.randint(1, 10, size=35),
    'region': np.random.choice(['North', 'South', 'East', 'West'], size=35),
    'payment_method': np.random.choice(['Credit Card', 'Cash', 'Online'], size=35),
    'is_member': np.random.choice([True, False], size=35),
    'delivery_days': np.random.randint(1, 7, size=35),
    'customer_age': np.random.randint(18, 70, size=35),
    'gender': np.random.choice(['Male', 'Female', 'Other'], size=35),
    'product_category': np.random.choice(['Electronics', 'Clothing', 'Books', 'Food'], size=35),
    'rating': np.round(np.random.uniform(1.0, 5.0, size=35), 1),
    'return_status': np.random.choice(['Returned', 'Not Returned'], size=35),
    'feedback': np.random.choice(['Positive', 'Neutral', 'Negative'], size=35),
}

# Create DataFrame
df = pd.DataFrame(data)

# Compute revenue using the lambda function above
df['revenue'] = df['quantity'] * df['price'] * (1 - df['discount'])

# Add 'shipping_cost' with some missing values
df['shipping_cost'] = np.where(np.random.rand(len(df)) < 0.2, np.nan, np.round(np.random.uniform(5.0, 20.0, size=len(df)), 2))

# Add 'coupon_code' with some missing values
coupon_options = ['SAVE10', 'WELCOME5', 'FREESHIP', np.nan]
df['coupon_code'] = np.random.choice(coupon_options, size=len(df), p=[0.2, 0.2, 0.2, 0.4])

# Add 'delivery_rating' with some missing values
df['delivery_rating'] = np.where(np.random.rand(len(df)) < 0.3, np.nan, np.round(np.random.uniform(1.0, 5.0, size=len(df)), 1))


duplicates = df.sample(n=5, replace=False)  # Select 5 random rows without replacement
df = pd.concat([df, duplicates], ignore_index=True)


# Display the updated DataFrame
display(df)

Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,...,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating
0,2023-01-01,1124,148,6,2437.22,0.05,13892.154,1,North,Credit Card,...,6,41,Male,Electronics,4.3,Not Returned,Positive,14.06,,
1,2023-01-02,1257,148,7,6745.98,0.23,36360.8322,7,South,Cash,...,6,29,Female,Electronics,1.2,Not Returned,Neutral,9.83,FREESHIP,1.3
2,2023-01-03,1852,161,2,5710.27,0.21,9022.2266,2,West,Online,...,6,46,Other,Clothing,4.7,Returned,Neutral,10.65,,
3,2023-01-04,1427,149,7,1260.32,0.03,8557.5728,5,East,Credit Card,...,4,23,Other,Books,4.4,Not Returned,Neutral,9.77,FREESHIP,1.8
4,2023-01-05,1472,187,9,9527.26,0.07,79743.1662,4,South,Online,...,4,19,Female,Food,2.0,Not Returned,Positive,17.93,SAVE10,3.6
5,2023-01-06,1691,176,7,8764.74,0.18,50309.6076,4,South,Cash,...,3,26,Other,Food,1.1,Returned,Negative,5.95,,3.7
6,2023-01-07,1508,178,4,9973.55,0.25,29920.65,9,East,Cash,...,6,55,Male,Electronics,1.8,Returned,Positive,,SAVE10,3.1
7,2023-01-08,1552,109,2,971.85,0.25,1457.775,9,South,Cash,...,2,34,Male,Electronics,1.1,Returned,Negative,11.98,,1.8
8,2023-01-09,1436,149,7,9768.22,0.25,51283.155,6,South,Credit Card,...,3,64,Male,Food,1.6,Not Returned,Negative,13.33,,
9,2023-01-10,1386,102,9,7160.65,0.21,50912.2215,1,East,Cash,...,4,56,Female,Books,3.7,Not Returned,Neutral,13.62,FREESHIP,1.8


<div class="alert alert-success">

**Solution 08:**
</div>

In [19]:
# 8. Converting Data Types and Column Operations
# Write a Pandas program that handles text data with str.replace().

df['region_arrows'] = df['region'].str.replace('North', '↑')
df['region_arrows'] = df['region_arrows'].str.replace('South', '↓')
df['region_arrows'] = df['region_arrows'].str.replace('East', '→')
df['region_arrows'] = df['region_arrows'].str.replace('West', '←')

df

Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,...,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating,region_arrows
0,2023-01-01,1124,148,6,2437.22,0.05,13892.154,1,North,Credit Card,...,41,Male,Electronics,4.3,Not Returned,Positive,14.06,,,↑
1,2023-01-02,1257,148,7,6745.98,0.23,36360.8322,7,South,Cash,...,29,Female,Electronics,1.2,Not Returned,Neutral,9.83,FREESHIP,1.3,↓
2,2023-01-03,1852,161,2,5710.27,0.21,9022.2266,2,West,Online,...,46,Other,Clothing,4.7,Returned,Neutral,10.65,,,←
3,2023-01-04,1427,149,7,1260.32,0.03,8557.5728,5,East,Credit Card,...,23,Other,Books,4.4,Not Returned,Neutral,9.77,FREESHIP,1.8,→
4,2023-01-05,1472,187,9,9527.26,0.07,79743.1662,4,South,Online,...,19,Female,Food,2.0,Not Returned,Positive,17.93,SAVE10,3.6,↓
5,2023-01-06,1691,176,7,8764.74,0.18,50309.6076,4,South,Cash,...,26,Other,Food,1.1,Returned,Negative,5.95,,3.7,↓
6,2023-01-07,1508,178,4,9973.55,0.25,29920.65,9,East,Cash,...,55,Male,Electronics,1.8,Returned,Positive,,SAVE10,3.1,→
7,2023-01-08,1552,109,2,971.85,0.25,1457.775,9,South,Cash,...,34,Male,Electronics,1.1,Returned,Negative,11.98,,1.8,↓
8,2023-01-09,1436,149,7,9768.22,0.25,51283.155,6,South,Credit Card,...,64,Male,Food,1.6,Not Returned,Negative,13.33,,,↓
9,2023-01-10,1386,102,9,7160.65,0.21,50912.2215,1,East,Cash,...,56,Female,Books,3.7,Not Returned,Neutral,13.62,FREESHIP,1.8,→


<div class="alert alert-warning">

**9. Replacing Missing Data with Mean Value**

**Replace Missing Data with Mean Value**

</div>

In [20]:
# Create date range
date_range = pd.date_range(start='2023-01-01', periods=35, freq='D')

# Create a dictionary of data
data = {
    'date': date_range,
    'customer_id': np.random.randint(1000, 2000, size=35),
    'product_id': np.random.randint(100, 200, size=35),
    'quantity': np.random.randint(1, 10, size=35),
    'price': np.round(np.random.uniform(10.0, 10000.0, size=35), 2),
    'discount': np.round(np.random.uniform(0, 0.3, size=35), 2),
    'revenue': lambda df: df['quantity'] * df['price'] * (1 - df['discount']),
    'store_id': np.random.randint(1, 10, size=35),
    'region': np.random.choice(['North', 'South', 'East', 'West'], size=35),
    'payment_method': np.random.choice(['Credit Card', 'Cash', 'Online'], size=35),
    'is_member': np.random.choice([True, False], size=35),
    'delivery_days': np.random.randint(1, 7, size=35),
    'customer_age': np.random.randint(18, 70, size=35),
    'gender': np.random.choice(['Male', 'Female', 'Other'], size=35),
    'product_category': np.random.choice(['Electronics', 'Clothing', 'Books', 'Food'], size=35),
    'rating': np.round(np.random.uniform(1.0, 5.0, size=35), 1),
    'return_status': np.random.choice(['Returned', 'Not Returned'], size=35),
    'feedback': np.random.choice(['Positive', 'Neutral', 'Negative'], size=35),
}

# Create DataFrame
df = pd.DataFrame(data)

# Compute revenue using the lambda function above
df['revenue'] = df['quantity'] * df['price'] * (1 - df['discount'])

# Add 'shipping_cost' with some missing values
df['shipping_cost'] = np.where(np.random.rand(len(df)) < 0.2, np.nan, np.round(np.random.uniform(5.0, 20.0, size=len(df)), 2))

# Add 'coupon_code' with some missing values
coupon_options = ['SAVE10', 'WELCOME5', 'FREESHIP', np.nan]
df['coupon_code'] = np.random.choice(coupon_options, size=len(df), p=[0.2, 0.2, 0.2, 0.4])

# Add 'delivery_rating' with some missing values
df['delivery_rating'] = np.where(np.random.rand(len(df)) < 0.3, np.nan, np.round(np.random.uniform(1.0, 5.0, size=len(df)), 1))


duplicates = df.sample(n=5, replace=False)  # Select 5 random rows without replacement
df = pd.concat([df, duplicates], ignore_index=True)


# Display the updated DataFrame
display(df)

Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,...,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating
0,2023-01-01,1818,138,6,3760.89,0.24,17149.6584,5,West,Online,...,1,48,Female,Food,3.9,Not Returned,Positive,9.59,WELCOME5,2.6
1,2023-01-02,1573,181,9,4593.85,0.21,32662.2735,5,South,Cash,...,4,22,Male,Electronics,4.2,Returned,Neutral,9.71,WELCOME5,1.8
2,2023-01-03,1064,135,5,6825.69,0.16,28667.898,2,East,Online,...,6,38,Other,Books,3.4,Not Returned,Negative,6.83,FREESHIP,2.1
3,2023-01-04,1214,196,4,5111.75,0.3,14312.9,7,North,Online,...,1,69,Male,Electronics,3.8,Returned,Negative,8.84,,1.0
4,2023-01-05,1345,153,2,6082.0,0.23,9366.28,2,East,Credit Card,...,4,66,Male,Books,2.5,Returned,Negative,,FREESHIP,1.5
5,2023-01-06,1457,103,4,7969.26,0.27,23270.2392,9,East,Cash,...,5,55,Male,Clothing,3.5,Returned,Negative,17.41,,2.6
6,2023-01-07,1478,183,4,5382.14,0.19,17438.1336,6,South,Cash,...,4,58,Female,Electronics,2.7,Returned,Positive,17.45,,1.6
7,2023-01-08,1303,112,7,958.32,0.09,6104.4984,5,North,Online,...,5,35,Other,Electronics,2.5,Not Returned,Negative,,FREESHIP,3.1
8,2023-01-09,1725,130,9,1535.45,0.04,13266.288,8,West,Credit Card,...,5,49,Female,Food,1.9,Returned,Neutral,,FREESHIP,
9,2023-01-10,1413,159,9,9759.39,0.29,62362.5021,7,East,Cash,...,4,28,Female,Clothing,3.6,Returned,Negative,10.32,WELCOME5,1.0


<div class="alert alert-success">

**Solution 09:**
</div>

In [21]:
# 9. Replacing Missing Data with Mean Value
# replace missing values in 'delivery_rating' column with mean
df['delivery_rating'] = df['delivery_rating'].fillna(df['delivery_rating'].mean())

df

Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,...,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating
0,2023-01-01,1818,138,6,3760.89,0.24,17149.6584,5,West,Online,...,1,48,Female,Food,3.9,Not Returned,Positive,9.59,WELCOME5,2.6
1,2023-01-02,1573,181,9,4593.85,0.21,32662.2735,5,South,Cash,...,4,22,Male,Electronics,4.2,Returned,Neutral,9.71,WELCOME5,1.8
2,2023-01-03,1064,135,5,6825.69,0.16,28667.898,2,East,Online,...,6,38,Other,Books,3.4,Not Returned,Negative,6.83,FREESHIP,2.1
3,2023-01-04,1214,196,4,5111.75,0.3,14312.9,7,North,Online,...,1,69,Male,Electronics,3.8,Returned,Negative,8.84,,1.0
4,2023-01-05,1345,153,2,6082.0,0.23,9366.28,2,East,Credit Card,...,4,66,Male,Books,2.5,Returned,Negative,,FREESHIP,1.5
5,2023-01-06,1457,103,4,7969.26,0.27,23270.2392,9,East,Cash,...,5,55,Male,Clothing,3.5,Returned,Negative,17.41,,2.6
6,2023-01-07,1478,183,4,5382.14,0.19,17438.1336,6,South,Cash,...,4,58,Female,Electronics,2.7,Returned,Positive,17.45,,1.6
7,2023-01-08,1303,112,7,958.32,0.09,6104.4984,5,North,Online,...,5,35,Other,Electronics,2.5,Not Returned,Negative,,FREESHIP,3.1
8,2023-01-09,1725,130,9,1535.45,0.04,13266.288,8,West,Credit Card,...,5,49,Female,Food,1.9,Returned,Neutral,,FREESHIP,2.316667
9,2023-01-10,1413,159,9,9759.39,0.29,62362.5021,7,East,Cash,...,4,28,Female,Clothing,3.6,Returned,Negative,10.32,WELCOME5,1.0


<div class="alert alert-warning">

**10. Removing Columns with Too Many Missing Values**

**Remove columns with 5 or more missing values**

</div>

In [22]:
# Create date range
date_range = pd.date_range(start='2023-01-01', periods=35, freq='D')

# Create a dictionary of data
data = {
    'date': date_range,
    'customer_id': np.random.randint(1000, 2000, size=35),
    'product_id': np.random.randint(100, 200, size=35),
    'quantity': np.random.randint(1, 10, size=35),
    'price': np.round(np.random.uniform(10.0, 10000.0, size=35), 2),
    'discount': np.round(np.random.uniform(0, 0.3, size=35), 2),
    'revenue': lambda df: df['quantity'] * df['price'] * (1 - df['discount']),
    'store_id': np.random.randint(1, 10, size=35),
    'region': np.random.choice(['North', 'South', 'East', 'West'], size=35),
    'payment_method': np.random.choice(['Credit Card', 'Cash', 'Online'], size=35),
    'is_member': np.random.choice([True, False], size=35),
    'delivery_days': np.random.randint(1, 7, size=35),
    'customer_age': np.random.randint(18, 70, size=35),
    'gender': np.random.choice(['Male', 'Female', 'Other'], size=35),
    'product_category': np.random.choice(['Electronics', 'Clothing', 'Books', 'Food'], size=35),
    'rating': np.round(np.random.uniform(1.0, 5.0, size=35), 1),
    'return_status': np.random.choice(['Returned', 'Not Returned'], size=35),
    'feedback': np.random.choice(['Positive', 'Neutral', 'Negative'], size=35),
}

# Create DataFrame
df = pd.DataFrame(data)

# Compute revenue using the lambda function above
df['revenue'] = df['quantity'] * df['price'] * (1 - df['discount'])

# Add 'shipping_cost' with some missing values
df['shipping_cost'] = np.where(np.random.rand(len(df)) < 0.2, np.nan, np.round(np.random.uniform(5.0, 20.0, size=len(df)), 2))

# Add 'coupon_code' with some missing values
coupon_options = ['SAVE10', 'WELCOME5', 'FREESHIP', np.nan]
df['coupon_code'] = np.random.choice(coupon_options, size=len(df), p=[0.2, 0.2, 0.2, 0.4])

# Add 'delivery_rating' with some missing values
df['delivery_rating'] = np.where(np.random.rand(len(df)) < 0.3, np.nan, np.round(np.random.uniform(1.0, 5.0, size=len(df)), 1))


duplicates = df.sample(n=5, replace=False)  # Select 5 random rows without replacement
df = pd.concat([df, duplicates], ignore_index=True)


# Display the updated DataFrame
display(df)

Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,...,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating
0,2023-01-01,1693,192,3,7682.81,0.2,18438.744,9,East,Online,...,6,67,Other,Food,2.4,Not Returned,Negative,9.38,,1.2
1,2023-01-02,1541,162,9,8039.26,0.14,62223.8724,6,North,Credit Card,...,6,60,Other,Food,4.5,Not Returned,Neutral,9.05,,2.0
2,2023-01-03,1804,171,2,7893.43,0.17,13103.0938,9,North,Online,...,6,40,Female,Food,4.5,Returned,Neutral,,,1.7
3,2023-01-04,1511,122,4,2767.71,0.01,10960.1316,3,East,Cash,...,2,67,Female,Books,4.7,Returned,Positive,,FREESHIP,1.3
4,2023-01-05,1125,128,9,7727.16,0.15,59112.774,6,West,Online,...,4,62,Male,Clothing,2.0,Returned,Neutral,18.19,SAVE10,
5,2023-01-06,1425,126,3,7430.48,0.29,15826.9224,8,South,Online,...,6,36,Male,Food,2.5,Not Returned,Neutral,7.61,WELCOME5,4.4
6,2023-01-07,1325,109,5,7447.32,0.04,35747.136,5,South,Credit Card,...,4,59,Male,Clothing,3.9,Returned,Negative,,WELCOME5,2.7
7,2023-01-08,1281,155,6,7205.7,0.17,35884.386,2,South,Cash,...,3,62,Other,Clothing,1.8,Not Returned,Neutral,12.53,SAVE10,1.8
8,2023-01-09,1949,118,4,3299.54,0.25,9898.62,4,North,Cash,...,1,62,Male,Electronics,2.8,Not Returned,Neutral,14.9,FREESHIP,4.9
9,2023-01-10,1066,142,6,1925.39,0.1,10397.106,1,South,Credit Card,...,5,60,Other,Food,2.4,Returned,Negative,,WELCOME5,


<div class="alert alert-success">

**Solution 10:**
</div>

In [23]:
# 10. Removing Columns with Too Many Missing Values
# Remove columns with 5 or more missing values
df.dropna(thresh=len(df) - 5, axis=1, inplace=True)

df

Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,is_member,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,coupon_code
0,2023-01-01,1693,192,3,7682.81,0.2,18438.744,9,East,Online,False,6,67,Other,Food,2.4,Not Returned,Negative,
1,2023-01-02,1541,162,9,8039.26,0.14,62223.8724,6,North,Credit Card,True,6,60,Other,Food,4.5,Not Returned,Neutral,
2,2023-01-03,1804,171,2,7893.43,0.17,13103.0938,9,North,Online,False,6,40,Female,Food,4.5,Returned,Neutral,
3,2023-01-04,1511,122,4,2767.71,0.01,10960.1316,3,East,Cash,False,2,67,Female,Books,4.7,Returned,Positive,FREESHIP
4,2023-01-05,1125,128,9,7727.16,0.15,59112.774,6,West,Online,False,4,62,Male,Clothing,2.0,Returned,Neutral,SAVE10
5,2023-01-06,1425,126,3,7430.48,0.29,15826.9224,8,South,Online,False,6,36,Male,Food,2.5,Not Returned,Neutral,WELCOME5
6,2023-01-07,1325,109,5,7447.32,0.04,35747.136,5,South,Credit Card,True,4,59,Male,Clothing,3.9,Returned,Negative,WELCOME5
7,2023-01-08,1281,155,6,7205.7,0.17,35884.386,2,South,Cash,False,3,62,Other,Clothing,1.8,Not Returned,Neutral,SAVE10
8,2023-01-09,1949,118,4,3299.54,0.25,9898.62,4,North,Cash,True,1,62,Male,Electronics,2.8,Not Returned,Neutral,FREESHIP
9,2023-01-10,1066,142,6,1925.39,0.1,10397.106,1,South,Credit Card,False,5,60,Other,Food,2.4,Returned,Negative,WELCOME5


<div class="alert alert-warning">

**11. Changing column names**

**Write a Pandas program to change column names to lowercase.**

</div>

In [24]:
# Create date range
date_range = pd.date_range(start='2023-01-01', periods=35, freq='D')

# Create a dictionary of data
data = {
    'Date': date_range,
    'customer_id': np.random.randint(1000, 2000, size=35),
    'product_id': np.random.randint(100, 200, size=35),
    'quantity': np.random.randint(1, 10, size=35),
    'price': np.round(np.random.uniform(10.0, 10000.0, size=35), 2),
    'discount': np.round(np.random.uniform(0, 0.3, size=35), 2),
    'REVENUE': lambda df: df['quantity'] * df['price'] * (1 - df['discount']),
    'store_id': np.random.randint(1, 10, size=35),
    'region': np.random.choice(['North', 'South', 'East', 'West'], size=35),
    'payment_method': np.random.choice(['Credit Card', 'Cash', 'Online'], size=35),
    'is_member': np.random.choice([True, False], size=35),
    'delivery_days': np.random.randint(1, 7, size=35),
    'customer_age': np.random.randint(18, 70, size=35),
    'gender': np.random.choice(['Male', 'Female', 'Other'], size=35),
    'product_category': np.random.choice(['Electronics', 'Clothing', 'Books', 'Food'], size=35),
    'rating': np.round(np.random.uniform(1.0, 5.0, size=35), 1),
    'return_status': np.random.choice(['Returned', 'Not Returned'], size=35),
    'feedback': np.random.choice(['Positive', 'Neutral', 'Negative'], size=35),
}

# Create DataFrame
df = pd.DataFrame(data)

# Compute revenue using the lambda function above
df['REVENUE'] = df['quantity'] * df['price'] * (1 - df['discount'])

# Add 'shipping_cost' with some missing values
df['shipping_cost'] = np.where(np.random.rand(len(df)) < 0.2, np.nan, np.round(np.random.uniform(5.0, 20.0, size=len(df)), 2))

# Add 'coupon_code' with some missing values
coupon_options = ['SAVE10', 'WELCOME5', 'FREESHIP', np.nan]
df['coupon_code'] = np.random.choice(coupon_options, size=len(df), p=[0.2, 0.2, 0.2, 0.4])

# Add 'delivery_rating' with some missing values
df['delivery_rating'] = np.where(np.random.rand(len(df)) < 0.3, np.nan, np.round(np.random.uniform(1.0, 5.0, size=len(df)), 1))


duplicates = df.sample(n=5, replace=False)  # Select 5 random rows without replacement
df = pd.concat([df, duplicates], ignore_index=True)


# Display the updated DataFrame
display(df)

Unnamed: 0,Date,customer_id,product_id,quantity,price,discount,REVENUE,store_id,region,payment_method,...,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating
0,2023-01-01,1519,159,8,8650.88,0.29,49136.9984,4,East,Credit Card,...,6,62,Male,Clothing,2.3,Returned,Neutral,11.64,WELCOME5,1.5
1,2023-01-02,1279,124,5,7765.15,0.07,36107.9475,9,North,Credit Card,...,4,56,Male,Clothing,4.0,Not Returned,Negative,7.9,FREESHIP,3.7
2,2023-01-03,1620,168,1,229.02,0.07,212.9886,8,South,Credit Card,...,5,54,Male,Electronics,2.9,Returned,Neutral,,FREESHIP,
3,2023-01-04,1734,105,6,9831.33,0.28,42471.3456,9,North,Online,...,2,23,Other,Electronics,1.1,Returned,Neutral,6.14,WELCOME5,3.5
4,2023-01-05,1853,110,6,3272.72,0.21,15512.6928,3,West,Cash,...,4,22,Male,Clothing,2.6,Returned,Neutral,,,2.4
5,2023-01-06,1076,151,9,7861.05,0.17,58722.0435,6,West,Online,...,2,45,Other,Books,3.6,Not Returned,Positive,5.67,SAVE10,
6,2023-01-07,1039,112,1,4808.42,0.03,4664.1674,1,South,Online,...,2,32,Other,Electronics,4.2,Not Returned,Negative,12.72,SAVE10,2.2
7,2023-01-08,1534,117,1,1967.99,0.12,1731.8312,9,East,Credit Card,...,2,51,Male,Electronics,1.1,Returned,Negative,,,2.8
8,2023-01-09,1670,190,3,7129.91,0.09,19464.6543,9,South,Cash,...,1,67,Other,Books,3.8,Returned,Positive,5.75,FREESHIP,
9,2023-01-10,1361,107,8,7486.55,0.21,47314.996,2,West,Online,...,3,68,Male,Clothing,1.4,Not Returned,Neutral,6.76,,


<div class="alert alert-success">

**Solution 11:**
</div>

In [25]:
# Write a Pandas program to change column names to lowercase
df.columns = df.columns.str.lower()
df

Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,...,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating
0,2023-01-01,1519,159,8,8650.88,0.29,49136.9984,4,East,Credit Card,...,6,62,Male,Clothing,2.3,Returned,Neutral,11.64,WELCOME5,1.5
1,2023-01-02,1279,124,5,7765.15,0.07,36107.9475,9,North,Credit Card,...,4,56,Male,Clothing,4.0,Not Returned,Negative,7.9,FREESHIP,3.7
2,2023-01-03,1620,168,1,229.02,0.07,212.9886,8,South,Credit Card,...,5,54,Male,Electronics,2.9,Returned,Neutral,,FREESHIP,
3,2023-01-04,1734,105,6,9831.33,0.28,42471.3456,9,North,Online,...,2,23,Other,Electronics,1.1,Returned,Neutral,6.14,WELCOME5,3.5
4,2023-01-05,1853,110,6,3272.72,0.21,15512.6928,3,West,Cash,...,4,22,Male,Clothing,2.6,Returned,Neutral,,,2.4
5,2023-01-06,1076,151,9,7861.05,0.17,58722.0435,6,West,Online,...,2,45,Other,Books,3.6,Not Returned,Positive,5.67,SAVE10,
6,2023-01-07,1039,112,1,4808.42,0.03,4664.1674,1,South,Online,...,2,32,Other,Electronics,4.2,Not Returned,Negative,12.72,SAVE10,2.2
7,2023-01-08,1534,117,1,1967.99,0.12,1731.8312,9,East,Credit Card,...,2,51,Male,Electronics,1.1,Returned,Negative,,,2.8
8,2023-01-09,1670,190,3,7129.91,0.09,19464.6543,9,South,Cash,...,1,67,Other,Books,3.8,Returned,Positive,5.75,FREESHIP,
9,2023-01-10,1361,107,8,7486.55,0.21,47314.996,2,West,Online,...,3,68,Male,Clothing,1.4,Not Returned,Neutral,6.76,,


<div class="alert alert-warning">

**12. Replacing Missing Data with Mean Value**

**Write a Pandas program to replacing missing data with mean value.**

</div>

In [26]:
# Create date range
date_range = pd.date_range(start='2023-01-01', periods=35, freq='D')

# Create a dictionary of data
data = {
    'date': date_range,
    'customer_id': np.random.randint(1000, 2000, size=35),
    'product_id': np.random.randint(100, 200, size=35),
    'quantity': np.random.randint(1, 10, size=35),
    'price': np.round(np.random.uniform(10.0, 10000.0, size=35), 2),
    'discount': np.round(np.random.uniform(0, 0.3, size=35), 2),
    'revenue': lambda df: df['quantity'] * df['price'] * (1 - df['discount']),
    'store_id': np.random.randint(1, 10, size=35),
    'region': np.random.choice(['North', 'South', 'East', 'West'], size=35),
    'payment_method': np.random.choice(['Credit Card', 'Cash', 'Online'], size=35),
    'is_member': np.random.choice([True, False], size=35),
    'delivery_days': np.random.randint(1, 7, size=35),
    'customer_age': np.random.randint(18, 70, size=35),
    'gender': np.random.choice(['Male', 'Female', 'Other'], size=35),
    'product_category': np.random.choice(['Electronics', 'Clothing', 'Books', 'Food'], size=35),
    'rating': np.round(np.random.uniform(1.0, 5.0, size=35), 1),
    'return_status': np.random.choice(['Returned', 'Not Returned'], size=35),
    'feedback': np.random.choice(['Positive', 'Neutral', 'Negative'], size=35),
}

# Create DataFrame
df = pd.DataFrame(data)

# Compute revenue using the lambda function above
df['revenue'] = df['quantity'] * df['price'] * (1 - df['discount'])

# Add 'shipping_cost' with some missing values
df['shipping_cost'] = np.where(np.random.rand(len(df)) < 0.2, np.nan, np.round(np.random.uniform(5.0, 20.0, size=len(df)), 2))

# Add 'coupon_code' with some missing values
coupon_options = ['SAVE10', 'WELCOME5', 'FREESHIP', np.nan]
df['coupon_code'] = np.random.choice(coupon_options, size=len(df), p=[0.2, 0.2, 0.2, 0.4])

# Add 'delivery_rating' with some missing values
df['delivery_rating'] = np.where(np.random.rand(len(df)) < 0.3, np.nan, np.round(np.random.uniform(1.0, 5.0, size=len(df)), 1))


duplicates = df.sample(n=5, replace=False)  # Select 5 random rows without replacement
df = pd.concat([df, duplicates], ignore_index=True)


# Display the updated DataFrame
display(df)

Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,...,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating
0,2023-01-01,1262,113,2,378.36,0.13,658.3464,8,South,Online,...,6,45,Male,Books,3.6,Returned,Negative,14.7,WELCOME5,4.2
1,2023-01-02,1714,144,6,3611.28,0.21,17117.4672,5,North,Online,...,2,18,Other,Electronics,2.5,Returned,Negative,,,3.9
2,2023-01-03,1514,106,5,5436.47,0.04,26095.056,7,West,Online,...,4,39,Female,Food,4.6,Returned,Negative,18.34,WELCOME5,2.4
3,2023-01-04,1995,193,1,3409.35,0.18,2795.667,2,South,Credit Card,...,4,39,Other,Food,4.7,Returned,Positive,5.99,SAVE10,1.0
4,2023-01-05,1946,171,5,4679.71,0.1,21058.695,4,South,Online,...,6,42,Other,Clothing,2.4,Returned,Neutral,,SAVE10,
5,2023-01-06,1416,164,7,4834.05,0.01,33499.9665,5,West,Credit Card,...,1,62,Female,Books,4.9,Returned,Negative,18.43,SAVE10,3.6
6,2023-01-07,1885,162,4,9962.14,0.15,33871.276,7,South,Online,...,3,69,Female,Books,2.0,Returned,Positive,12.61,,3.4
7,2023-01-08,1441,140,3,4608.99,0.21,10923.3063,5,North,Credit Card,...,5,51,Female,Electronics,2.0,Returned,Negative,7.8,,2.8
8,2023-01-09,1924,158,9,1082.0,0.21,7693.02,9,South,Online,...,1,57,Male,Food,3.7,Returned,Neutral,14.75,SAVE10,4.9
9,2023-01-10,1358,153,2,9954.8,0.11,17719.544,4,North,Credit Card,...,2,57,Female,Food,1.3,Not Returned,Positive,18.03,SAVE10,4.3


<div class="alert alert-success">

**Solution 12:**
</div>

In [27]:
# 12. Replacing Missing Data with Mean Value
# Write a Pandas program to replacing missing data with mean value.

df['delivery_rating'] = df['delivery_rating'].fillna(df['delivery_rating'].mean())
df

Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,...,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating
0,2023-01-01,1262,113,2,378.36,0.13,658.3464,8,South,Online,...,6,45,Male,Books,3.6,Returned,Negative,14.7,WELCOME5,4.2
1,2023-01-02,1714,144,6,3611.28,0.21,17117.4672,5,North,Online,...,2,18,Other,Electronics,2.5,Returned,Negative,,,3.9
2,2023-01-03,1514,106,5,5436.47,0.04,26095.056,7,West,Online,...,4,39,Female,Food,4.6,Returned,Negative,18.34,WELCOME5,2.4
3,2023-01-04,1995,193,1,3409.35,0.18,2795.667,2,South,Credit Card,...,4,39,Other,Food,4.7,Returned,Positive,5.99,SAVE10,1.0
4,2023-01-05,1946,171,5,4679.71,0.1,21058.695,4,South,Online,...,6,42,Other,Clothing,2.4,Returned,Neutral,,SAVE10,3.122581
5,2023-01-06,1416,164,7,4834.05,0.01,33499.9665,5,West,Credit Card,...,1,62,Female,Books,4.9,Returned,Negative,18.43,SAVE10,3.6
6,2023-01-07,1885,162,4,9962.14,0.15,33871.276,7,South,Online,...,3,69,Female,Books,2.0,Returned,Positive,12.61,,3.4
7,2023-01-08,1441,140,3,4608.99,0.21,10923.3063,5,North,Credit Card,...,5,51,Female,Electronics,2.0,Returned,Negative,7.8,,2.8
8,2023-01-09,1924,158,9,1082.0,0.21,7693.02,9,South,Online,...,1,57,Male,Food,3.7,Returned,Neutral,14.75,SAVE10,4.9
9,2023-01-10,1358,153,2,9954.8,0.11,17719.544,4,North,Credit Card,...,2,57,Female,Food,1.3,Not Returned,Positive,18.03,SAVE10,4.3
