[Pandas Data Cleaning and Preprocessing](https://www.w3resource.com/python-exercises/pandas/pandas-data-cleaning-and-preprocessing.php)

In [1]:
import pandas as pd
import numpy as np

<div class="alert alert-warning">

**1. Handling Missing Data in Pandas**

**Write a Pandas program to fill missing values (NaN) in a DataFrame using fillna().**

</div>

In [2]:
# Create date range
date_range = pd.date_range(start='2023-01-01', periods=35, freq='D')

# Create a dictionary of data
data = {
    'date': date_range,
    'customer_id': np.random.randint(1000, 2000, size=35),
    'product_id': np.random.randint(100, 200, size=35),
    'quantity': np.random.randint(1, 10, size=35),
    'price': np.round(np.random.uniform(10.0, 100.0, size=35), 2),
    'discount': np.round(np.random.uniform(0, 0.3, size=35), 2),
    'revenue': lambda df: df['quantity'] * df['price'] * (1 - df['discount']),
    'store_id': np.random.randint(1, 10, size=35),
    'region': np.random.choice(['North', 'South', 'East', 'West'], size=35),
    'payment_method': np.random.choice(['Credit Card', 'Cash', 'Online'], size=35),
    'is_member': np.random.choice([True, False], size=35),
    'delivery_days': np.random.randint(1, 7, size=35),
    'customer_age': np.random.randint(18, 70, size=35),
    'gender': np.random.choice(['Male', 'Female', 'Other'], size=35),
    'product_category': np.random.choice(['Electronics', 'Clothing', 'Books', 'Food'], size=35),
    'rating': np.round(np.random.uniform(1.0, 5.0, size=35), 1),
    'return_status': np.random.choice(['Returned', 'Not Returned'], size=35),
    'feedback': np.random.choice(['Positive', 'Neutral', 'Negative'], size=35),
}

# Create DataFrame
df = pd.DataFrame(data)

# Compute revenue using the lambda function above
df['revenue'] = df['quantity'] * df['price'] * (1 - df['discount'])

# Add 'shipping_cost' with some missing values
df['shipping_cost'] = np.where(np.random.rand(len(df)) < 0.2, np.nan, np.round(np.random.uniform(5.0, 20.0, size=len(df)), 2))

# Add 'coupon_code' with some missing values
coupon_options = ['SAVE10', 'WELCOME5', 'FREESHIP', np.nan]
df['coupon_code'] = np.random.choice(coupon_options, size=len(df), p=[0.2, 0.2, 0.2, 0.4])

# Add 'delivery_rating' with some missing values
df['delivery_rating'] = np.where(np.random.rand(len(df)) < 0.3, np.nan, np.round(np.random.uniform(1.0, 5.0, size=len(df)), 1))

# Display the updated DataFrame
display(df)

Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,...,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating
0,2023-01-01,1272,163,2,48.16,0.12,84.7616,1,East,Online,...,6,35,Female,Clothing,4.0,Returned,Negative,10.06,FREESHIP,
1,2023-01-02,1901,133,8,19.28,0.06,144.9856,4,North,Cash,...,3,41,Other,Electronics,3.1,Returned,Positive,5.43,,4.9
2,2023-01-03,1126,133,9,63.06,0.18,465.3828,1,East,Online,...,4,36,Female,Electronics,2.8,Returned,Negative,,WELCOME5,2.9
3,2023-01-04,1526,151,2,51.02,0.27,74.4892,1,West,Online,...,2,20,Male,Electronics,4.3,Not Returned,Neutral,14.71,,
4,2023-01-05,1832,158,6,94.37,0.22,441.6516,8,East,Online,...,5,69,Male,Electronics,4.7,Not Returned,Positive,13.19,SAVE10,1.2
5,2023-01-06,1014,137,4,45.62,0.1,164.232,5,South,Credit Card,...,2,58,Female,Books,3.7,Not Returned,Negative,12.33,WELCOME5,3.9
6,2023-01-07,1312,120,5,60.6,0.07,281.79,3,East,Credit Card,...,3,58,Male,Food,3.3,Returned,Positive,,SAVE10,1.5
7,2023-01-08,1896,122,2,79.85,0.02,156.506,5,North,Online,...,3,45,Male,Electronics,4.1,Not Returned,Positive,,,3.5
8,2023-01-09,1904,148,8,57.8,0.24,351.424,2,West,Cash,...,1,43,Male,Food,4.7,Returned,Neutral,6.9,,3.2
9,2023-01-10,1555,170,6,13.34,0.01,79.2396,5,East,Cash,...,1,21,Male,Books,2.5,Not Returned,Positive,15.3,,1.4


<div class="alert alert-success">

**Solution 01:**
</div>

In [3]:
# 1. Handling Missing Data in Pandas
# Write a Pandas program to fill missing values (NaN) in a DataFrame using fillna().
result_df = df.fillna({
    'shipping_cost': df['shipping_cost'].mean(),
    'coupon_code': 'No Coupon',
    'delivery_rating': df['delivery_rating'].mean()
})

result_df

Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,...,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating
0,2023-01-01,1272,163,2,48.16,0.12,84.7616,1,East,Online,...,6,35,Female,Clothing,4.0,Returned,Negative,10.06,FREESHIP,3.023077
1,2023-01-02,1901,133,8,19.28,0.06,144.9856,4,North,Cash,...,3,41,Other,Electronics,3.1,Returned,Positive,5.43,,4.9
2,2023-01-03,1126,133,9,63.06,0.18,465.3828,1,East,Online,...,4,36,Female,Electronics,2.8,Returned,Negative,12.59,WELCOME5,2.9
3,2023-01-04,1526,151,2,51.02,0.27,74.4892,1,West,Online,...,2,20,Male,Electronics,4.3,Not Returned,Neutral,14.71,,3.023077
4,2023-01-05,1832,158,6,94.37,0.22,441.6516,8,East,Online,...,5,69,Male,Electronics,4.7,Not Returned,Positive,13.19,SAVE10,1.2
5,2023-01-06,1014,137,4,45.62,0.1,164.232,5,South,Credit Card,...,2,58,Female,Books,3.7,Not Returned,Negative,12.33,WELCOME5,3.9
6,2023-01-07,1312,120,5,60.6,0.07,281.79,3,East,Credit Card,...,3,58,Male,Food,3.3,Returned,Positive,12.59,SAVE10,1.5
7,2023-01-08,1896,122,2,79.85,0.02,156.506,5,North,Online,...,3,45,Male,Electronics,4.1,Not Returned,Positive,12.59,,3.5
8,2023-01-09,1904,148,8,57.8,0.24,351.424,2,West,Cash,...,1,43,Male,Food,4.7,Returned,Neutral,6.9,,3.2
9,2023-01-10,1555,170,6,13.34,0.01,79.2396,5,East,Cash,...,1,21,Male,Books,2.5,Not Returned,Positive,15.3,,1.4


<div class="alert alert-warning">

**2. Removing Missing Data observations in Pandas**

**Write a Pandas program to drop rows with missing data.**

</div>

In [4]:
# Create date range
date_range = pd.date_range(start='2023-01-01', periods=35, freq='D')

# Create a dictionary of data
data = {
    'date': date_range,
    'customer_id': np.random.randint(1000, 2000, size=35),
    'product_id': np.random.randint(100, 200, size=35),
    'quantity': np.random.randint(1, 10, size=35),
    'price': np.round(np.random.uniform(10.0, 100.0, size=35), 2),
    'discount': np.round(np.random.uniform(0, 0.3, size=35), 2),
    'revenue': lambda df: df['quantity'] * df['price'] * (1 - df['discount']),
    'store_id': np.random.randint(1, 10, size=35),
    'region': np.random.choice(['North', 'South', 'East', 'West'], size=35),
    'payment_method': np.random.choice(['Credit Card', 'Cash', 'Online'], size=35),
    'is_member': np.random.choice([True, False], size=35),
    'delivery_days': np.random.randint(1, 7, size=35),
    'customer_age': np.random.randint(18, 70, size=35),
    'gender': np.random.choice(['Male', 'Female', 'Other'], size=35),
    'product_category': np.random.choice(['Electronics', 'Clothing', 'Books', 'Food'], size=35),
    'rating': np.round(np.random.uniform(1.0, 5.0, size=35), 1),
    'return_status': np.random.choice(['Returned', 'Not Returned'], size=35),
    'feedback': np.random.choice(['Positive', 'Neutral', 'Negative'], size=35),
}

# Create DataFrame
df = pd.DataFrame(data)

# Compute revenue using the lambda function above
df['revenue'] = df['quantity'] * df['price'] * (1 - df['discount'])

# Add 'shipping_cost' with some missing values
df['shipping_cost'] = np.where(np.random.rand(len(df)) < 0.2, np.nan, np.round(np.random.uniform(5.0, 20.0, size=len(df)), 2))

# Add 'coupon_code' with some missing values
coupon_options = ['SAVE10', 'WELCOME5', 'FREESHIP', np.nan]
df['coupon_code'] = np.random.choice(coupon_options, size=len(df), p=[0.2, 0.2, 0.2, 0.4])

# Add 'delivery_rating' with some missing values
df['delivery_rating'] = np.where(np.random.rand(len(df)) < 0.3, np.nan, np.round(np.random.uniform(1.0, 5.0, size=len(df)), 1))

# Display the updated DataFrame
display(df)

Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,...,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating
0,2023-01-01,1598,194,6,60.13,0.16,303.0552,7,East,Online,...,1,35,Other,Electronics,2.0,Not Returned,Neutral,12.95,,
1,2023-01-02,1961,190,5,93.46,0.28,336.456,3,North,Online,...,6,22,Other,Electronics,1.9,Returned,Negative,6.56,,3.0
2,2023-01-03,1789,137,3,99.08,0.23,228.8748,4,East,Cash,...,1,35,Other,Food,2.5,Returned,Positive,,,2.3
3,2023-01-04,1570,164,1,15.52,0.26,11.4848,3,East,Online,...,5,30,Other,Electronics,4.5,Not Returned,Positive,12.2,,
4,2023-01-05,1541,170,2,88.38,0.17,146.7108,8,North,Cash,...,1,26,Other,Electronics,1.5,Returned,Neutral,,,4.5
5,2023-01-06,1841,134,3,39.69,0.08,109.5444,1,West,Cash,...,1,49,Female,Food,1.4,Not Returned,Neutral,19.67,,3.2
6,2023-01-07,1910,159,7,92.9,0.12,572.264,1,North,Cash,...,4,21,Other,Electronics,4.9,Returned,Negative,10.67,SAVE10,1.4
7,2023-01-08,1255,140,2,13.21,0.05,25.099,9,West,Online,...,4,29,Female,Books,3.5,Not Returned,Neutral,,FREESHIP,3.4
8,2023-01-09,1607,188,8,25.52,0.11,181.7024,1,East,Online,...,6,49,Male,Food,4.8,Returned,Neutral,10.08,SAVE10,
9,2023-01-10,1874,193,1,30.05,0.26,22.237,2,North,Online,...,1,55,Male,Books,1.5,Returned,Negative,,SAVE10,4.7


<div class="alert alert-success">

**Solution 02:**
</div>

In [5]:
# 2. Handling Duplicates in Pandas
# Write a Pandas program to drop rows with missing data.

result_df = df.dropna()
result_df

Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,...,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating
1,2023-01-02,1961,190,5,93.46,0.28,336.456,3,North,Online,...,6,22,Other,Electronics,1.9,Returned,Negative,6.56,,3.0
5,2023-01-06,1841,134,3,39.69,0.08,109.5444,1,West,Cash,...,1,49,Female,Food,1.4,Not Returned,Neutral,19.67,,3.2
6,2023-01-07,1910,159,7,92.9,0.12,572.264,1,North,Cash,...,4,21,Other,Electronics,4.9,Returned,Negative,10.67,SAVE10,1.4
10,2023-01-11,1424,133,8,57.51,0.28,331.2576,9,South,Cash,...,6,35,Other,Clothing,3.4,Not Returned,Positive,12.11,,1.6
11,2023-01-12,1659,103,7,27.63,0.26,143.1234,9,North,Cash,...,2,67,Female,Electronics,3.1,Returned,Positive,13.57,WELCOME5,3.4
13,2023-01-14,1277,192,9,30.11,0.1,243.891,9,East,Online,...,4,46,Female,Food,3.9,Not Returned,Neutral,17.0,FREESHIP,4.1
15,2023-01-16,1061,197,2,93.93,0.04,180.3456,2,West,Credit Card,...,6,64,Male,Food,2.4,Not Returned,Positive,9.89,SAVE10,4.4
16,2023-01-17,1610,113,7,31.22,0.0,218.54,9,North,Online,...,2,53,Female,Books,3.0,Not Returned,Positive,7.03,WELCOME5,1.4
17,2023-01-18,1703,122,6,36.29,0.02,213.3852,9,West,Credit Card,...,5,31,Male,Clothing,3.2,Not Returned,Positive,16.06,,3.0
18,2023-01-19,1651,165,1,78.89,0.06,74.1566,3,East,Online,...,3,63,Female,Electronics,4.3,Not Returned,Negative,12.55,WELCOME5,3.0


<div class="alert alert-warning">

**3. Detecting Duplicates in Pandas**

**Write a Pandas program to detect duplicates using duplicated() method.**

</div>

In [6]:
# Create date range
date_range = pd.date_range(start='2023-01-01', periods=35, freq='D')

# Create a dictionary of data
data = {
    'date': date_range,
    'customer_id': np.random.randint(1000, 2000, size=35),
    'product_id': np.random.randint(100, 200, size=35),
    'quantity': np.random.randint(1, 10, size=35),
    'price': np.round(np.random.uniform(10.0, 100.0, size=35), 2),
    'discount': np.round(np.random.uniform(0, 0.3, size=35), 2),
    'revenue': lambda df: df['quantity'] * df['price'] * (1 - df['discount']),
    'store_id': np.random.randint(1, 10, size=35),
    'region': np.random.choice(['North', 'South', 'East', 'West'], size=35),
    'payment_method': np.random.choice(['Credit Card', 'Cash', 'Online'], size=35),
    'is_member': np.random.choice([True, False], size=35),
    'delivery_days': np.random.randint(1, 7, size=35),
    'customer_age': np.random.randint(18, 70, size=35),
    'gender': np.random.choice(['Male', 'Female', 'Other'], size=35),
    'product_category': np.random.choice(['Electronics', 'Clothing', 'Books', 'Food'], size=35),
    'rating': np.round(np.random.uniform(1.0, 5.0, size=35), 1),
    'return_status': np.random.choice(['Returned', 'Not Returned'], size=35),
    'feedback': np.random.choice(['Positive', 'Neutral', 'Negative'], size=35),
}

# Create DataFrame
df = pd.DataFrame(data)

# Compute revenue using the lambda function above
df['revenue'] = df['quantity'] * df['price'] * (1 - df['discount'])

# Add 'shipping_cost' with some missing values
df['shipping_cost'] = np.where(np.random.rand(len(df)) < 0.2, np.nan, np.round(np.random.uniform(5.0, 20.0, size=len(df)), 2))

# Add 'coupon_code' with some missing values
coupon_options = ['SAVE10', 'WELCOME5', 'FREESHIP', np.nan]
df['coupon_code'] = np.random.choice(coupon_options, size=len(df), p=[0.2, 0.2, 0.2, 0.4])

# Add 'delivery_rating' with some missing values
df['delivery_rating'] = np.where(np.random.rand(len(df)) < 0.3, np.nan, np.round(np.random.uniform(1.0, 5.0, size=len(df)), 1))


duplicates = df.sample(n=5, replace=False)  # Select 5 random rows without replacement
df = pd.concat([df, duplicates], ignore_index=True)

# Display the updated DataFrame
display(df)

Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,...,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating
0,2023-01-01,1976,120,7,19.65,0.21,108.6645,6,North,Credit Card,...,5,20,Other,Books,4.6,Returned,Neutral,,,2.4
1,2023-01-02,1817,107,6,20.3,0.1,109.62,5,East,Cash,...,1,46,Male,Books,1.3,Returned,Negative,15.11,FREESHIP,1.2
2,2023-01-03,1058,173,8,66.19,0.13,460.6824,1,East,Credit Card,...,4,31,Male,Books,4.3,Not Returned,Positive,7.73,WELCOME5,3.5
3,2023-01-04,1673,161,4,38.28,0.0,153.12,5,South,Credit Card,...,1,56,Female,Food,2.8,Not Returned,Negative,12.29,,2.7
4,2023-01-05,1254,126,1,85.57,0.25,64.1775,3,East,Online,...,2,27,Other,Electronics,4.5,Not Returned,Negative,7.68,,
5,2023-01-06,1782,178,6,88.22,0.2,423.456,9,South,Credit Card,...,4,41,Other,Clothing,4.5,Returned,Negative,5.26,WELCOME5,1.6
6,2023-01-07,1673,193,1,60.03,0.03,58.2291,2,West,Credit Card,...,1,52,Other,Electronics,1.7,Returned,Negative,10.34,SAVE10,
7,2023-01-08,1769,112,9,69.08,0.07,578.1996,4,North,Credit Card,...,1,24,Other,Books,1.4,Not Returned,Positive,17.03,,1.8
8,2023-01-09,1552,124,4,81.95,0.03,317.966,3,South,Online,...,6,57,Female,Books,4.7,Not Returned,Neutral,11.83,,4.6
9,2023-01-10,1731,111,8,13.62,0.12,95.8848,5,East,Cash,...,5,54,Female,Clothing,1.2,Not Returned,Positive,10.58,,4.6


<div class="alert alert-success">

**Solution 03:**
</div>

In [7]:
# 3. Data Cleaning Techniques
# Write a Pandas program to detect duplicates using duplicated() method.
duplicates = df.duplicated()
duplicates_df = df[duplicates]
duplicates_df

Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,...,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating
35,2023-01-18,1377,106,1,80.21,0.24,60.9596,6,East,Credit Card,...,6,50,Male,Food,2.6,Not Returned,Neutral,,SAVE10,3.9
36,2023-01-28,1978,119,9,91.34,0.23,632.9862,6,West,Credit Card,...,5,48,Female,Food,3.5,Returned,Negative,,,
37,2023-01-01,1976,120,7,19.65,0.21,108.6645,6,North,Credit Card,...,5,20,Other,Books,4.6,Returned,Neutral,,,2.4
38,2023-01-14,1137,104,3,45.03,0.04,129.6864,9,North,Credit Card,...,6,68,Other,Clothing,4.0,Returned,Negative,12.36,SAVE10,4.3
39,2023-01-12,1231,114,5,16.46,0.14,70.778,5,East,Online,...,4,55,Other,Books,1.7,Returned,Negative,,SAVE10,


<div class="alert alert-warning">

**4. Handling Duplicates in Pandas**

**Write a Pandas program to remove duplicates rows from a DataFrame.**

</div>

In [8]:
# Create date range
date_range = pd.date_range(start='2023-01-01', periods=35, freq='D')

# Create a dictionary of data
data = {
    'date': date_range,
    'customer_id': np.random.randint(1000, 2000, size=35),
    'product_id': np.random.randint(100, 200, size=35),
    'quantity': np.random.randint(1, 10, size=35),
    'price': np.round(np.random.uniform(10.0, 100.0, size=35), 2),
    'discount': np.round(np.random.uniform(0, 0.3, size=35), 2),
    'revenue': lambda df: df['quantity'] * df['price'] * (1 - df['discount']),
    'store_id': np.random.randint(1, 10, size=35),
    'region': np.random.choice(['North', 'South', 'East', 'West'], size=35),
    'payment_method': np.random.choice(['Credit Card', 'Cash', 'Online'], size=35),
    'is_member': np.random.choice([True, False], size=35),
    'delivery_days': np.random.randint(1, 7, size=35),
    'customer_age': np.random.randint(18, 70, size=35),
    'gender': np.random.choice(['Male', 'Female', 'Other'], size=35),
    'product_category': np.random.choice(['Electronics', 'Clothing', 'Books', 'Food'], size=35),
    'rating': np.round(np.random.uniform(1.0, 5.0, size=35), 1),
    'return_status': np.random.choice(['Returned', 'Not Returned'], size=35),
    'feedback': np.random.choice(['Positive', 'Neutral', 'Negative'], size=35),
}

# Create DataFrame
df = pd.DataFrame(data)

# Compute revenue using the lambda function above
df['revenue'] = df['quantity'] * df['price'] * (1 - df['discount'])

# Add 'shipping_cost' with some missing values
df['shipping_cost'] = np.where(np.random.rand(len(df)) < 0.2, np.nan, np.round(np.random.uniform(5.0, 20.0, size=len(df)), 2))

# Add 'coupon_code' with some missing values
coupon_options = ['SAVE10', 'WELCOME5', 'FREESHIP', np.nan]
df['coupon_code'] = np.random.choice(coupon_options, size=len(df), p=[0.2, 0.2, 0.2, 0.4])

# Add 'delivery_rating' with some missing values
df['delivery_rating'] = np.where(np.random.rand(len(df)) < 0.3, np.nan, np.round(np.random.uniform(1.0, 5.0, size=len(df)), 1))


duplicates = df.sample(n=5, replace=False)  # Select 5 random rows without replacement
df = pd.concat([df, duplicates], ignore_index=True)

# Display the updated DataFrame
display(df)

Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,...,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating
0,2023-01-01,1890,168,9,39.7,0.17,296.559,4,East,Online,...,5,53,Female,Books,4.9,Not Returned,Negative,5.37,,
1,2023-01-02,1113,149,9,60.2,0.2,433.44,9,East,Cash,...,2,37,Male,Books,3.8,Returned,Neutral,,SAVE10,2.6
2,2023-01-03,1821,187,7,24.36,0.28,122.7744,1,North,Online,...,5,19,Male,Books,2.4,Not Returned,Neutral,9.58,FREESHIP,1.6
3,2023-01-04,1356,165,4,19.95,0.07,74.214,6,East,Online,...,4,67,Other,Electronics,1.7,Not Returned,Neutral,11.2,,3.3
4,2023-01-05,1397,177,9,29.23,0.18,215.7174,9,North,Credit Card,...,4,34,Other,Books,1.9,Not Returned,Negative,6.06,,
5,2023-01-06,1487,165,2,19.51,0.06,36.6788,4,East,Cash,...,6,46,Other,Books,1.4,Not Returned,Negative,,,
6,2023-01-07,1989,187,1,75.36,0.11,67.0704,1,South,Cash,...,3,58,Female,Clothing,1.1,Not Returned,Neutral,13.25,,1.3
7,2023-01-08,1561,188,5,37.38,0.28,134.568,9,North,Credit Card,...,1,28,Female,Electronics,2.8,Returned,Negative,,,
8,2023-01-09,1061,117,7,79.99,0.22,436.7454,6,East,Cash,...,6,46,Female,Clothing,1.2,Not Returned,Positive,9.7,,1.8
9,2023-01-10,1813,191,2,12.95,0.11,23.051,7,North,Credit Card,...,5,22,Female,Books,3.1,Not Returned,Neutral,,,


<div class="alert alert-success">

**Solution 04:**
</div>

In [9]:
df.drop_duplicates()

Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,...,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating
0,2023-01-01,1890,168,9,39.7,0.17,296.559,4,East,Online,...,5,53,Female,Books,4.9,Not Returned,Negative,5.37,,
1,2023-01-02,1113,149,9,60.2,0.2,433.44,9,East,Cash,...,2,37,Male,Books,3.8,Returned,Neutral,,SAVE10,2.6
2,2023-01-03,1821,187,7,24.36,0.28,122.7744,1,North,Online,...,5,19,Male,Books,2.4,Not Returned,Neutral,9.58,FREESHIP,1.6
3,2023-01-04,1356,165,4,19.95,0.07,74.214,6,East,Online,...,4,67,Other,Electronics,1.7,Not Returned,Neutral,11.2,,3.3
4,2023-01-05,1397,177,9,29.23,0.18,215.7174,9,North,Credit Card,...,4,34,Other,Books,1.9,Not Returned,Negative,6.06,,
5,2023-01-06,1487,165,2,19.51,0.06,36.6788,4,East,Cash,...,6,46,Other,Books,1.4,Not Returned,Negative,,,
6,2023-01-07,1989,187,1,75.36,0.11,67.0704,1,South,Cash,...,3,58,Female,Clothing,1.1,Not Returned,Neutral,13.25,,1.3
7,2023-01-08,1561,188,5,37.38,0.28,134.568,9,North,Credit Card,...,1,28,Female,Electronics,2.8,Returned,Negative,,,
8,2023-01-09,1061,117,7,79.99,0.22,436.7454,6,East,Cash,...,6,46,Female,Clothing,1.2,Not Returned,Positive,9.7,,1.8
9,2023-01-10,1813,191,2,12.95,0.11,23.051,7,North,Credit Card,...,5,22,Female,Books,3.1,Not Returned,Neutral,,,


In [10]:
# 4. Handling Duplicates in Pandas
# Write a Pandas program to remove duplicates rows from a DataFrame.

df = df.drop_duplicates()
df.reset_index(drop=True, inplace=True)

<div class="alert alert-warning">

**5. Handling Outliers with Z-Score Method**

**Write a Pandas program to handle outliers in a DataFrame with Z-Score method.**

</div>

In [11]:
# Create date range
date_range = pd.date_range(start='2023-01-01', periods=35, freq='D')

# Create a dictionary of data
data = {
    'date': date_range,
    'customer_id': np.random.randint(1000, 2000, size=35),
    'product_id': np.random.randint(100, 200, size=35),
    'quantity': np.random.randint(1, 10, size=35),
    'price': np.round(np.random.uniform(10.0, 10000.0, size=35), 2),
    'discount': np.round(np.random.uniform(0, 0.3, size=35), 2),
    'revenue': lambda df: df['quantity'] * df['price'] * (1 - df['discount']),
    'store_id': np.random.randint(1, 10, size=35),
    'region': np.random.choice(['North', 'South', 'East', 'West'], size=35),
    'payment_method': np.random.choice(['Credit Card', 'Cash', 'Online'], size=35),
    'is_member': np.random.choice([True, False], size=35),
    'delivery_days': np.random.randint(1, 7, size=35),
    'customer_age': np.random.randint(18, 70, size=35),
    'gender': np.random.choice(['Male', 'Female', 'Other'], size=35),
    'product_category': np.random.choice(['Electronics', 'Clothing', 'Books', 'Food'], size=35),
    'rating': np.round(np.random.uniform(1.0, 5.0, size=35), 1),
    'return_status': np.random.choice(['Returned', 'Not Returned'], size=35),
    'feedback': np.random.choice(['Positive', 'Neutral', 'Negative'], size=35),
}

# Create DataFrame
df = pd.DataFrame(data)

# Compute revenue using the lambda function above
df['revenue'] = df['quantity'] * df['price'] * (1 - df['discount'])

# Add 'shipping_cost' with some missing values
df['shipping_cost'] = np.where(np.random.rand(len(df)) < 0.2, np.nan, np.round(np.random.uniform(5.0, 20.0, size=len(df)), 2))

# Add 'coupon_code' with some missing values
coupon_options = ['SAVE10', 'WELCOME5', 'FREESHIP', np.nan]
df['coupon_code'] = np.random.choice(coupon_options, size=len(df), p=[0.2, 0.2, 0.2, 0.4])

# Add 'delivery_rating' with some missing values
df['delivery_rating'] = np.where(np.random.rand(len(df)) < 0.3, np.nan, np.round(np.random.uniform(1.0, 5.0, size=len(df)), 1))


duplicates = df.sample(n=5, replace=False)  # Select 5 random rows without replacement
df = pd.concat([df, duplicates], ignore_index=True)


# Display the updated DataFrame
display(df)

Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,...,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating
0,2023-01-01,1729,149,3,3622.35,0.25,8150.2875,4,South,Credit Card,...,2,67,Other,Books,4.5,Returned,Negative,6.51,FREESHIP,
1,2023-01-02,1653,192,3,4684.6,0.28,10118.736,7,South,Cash,...,6,51,Other,Clothing,3.8,Not Returned,Negative,8.35,SAVE10,
2,2023-01-03,1176,193,4,6627.12,0.1,23857.632,3,North,Online,...,6,69,Male,Books,2.7,Not Returned,Positive,8.89,FREESHIP,3.7
3,2023-01-04,1201,153,4,213.67,0.23,658.1036,2,North,Credit Card,...,3,34,Female,Clothing,4.3,Returned,Positive,,SAVE10,3.8
4,2023-01-05,1053,133,6,2033.34,0.01,12078.0396,5,West,Cash,...,1,45,Female,Clothing,1.9,Not Returned,Positive,12.02,FREESHIP,2.1
5,2023-01-06,1966,168,4,8109.96,0.21,25627.4736,6,East,Online,...,6,52,Female,Food,2.1,Returned,Positive,,SAVE10,4.6
6,2023-01-07,1242,179,6,4136.89,0.29,17623.1514,9,North,Cash,...,3,55,Other,Electronics,4.9,Not Returned,Positive,8.91,SAVE10,
7,2023-01-08,1633,171,7,5432.41,0.28,27379.3464,9,North,Online,...,6,35,Other,Books,4.2,Not Returned,Neutral,18.39,SAVE10,
8,2023-01-09,1598,128,4,1652.57,0.1,5949.252,6,North,Cash,...,1,57,Female,Clothing,3.7,Not Returned,Negative,9.32,FREESHIP,3.0
9,2023-01-10,1554,175,3,721.19,0.2,1730.856,3,South,Online,...,4,59,Female,Clothing,1.9,Returned,Neutral,9.66,,


<div class="alert alert-success">

**Solution 05:**
</div>

In [12]:
# 5. Handling Outliers with Z-Score Method
# Write a Pandas program to handle outliers in a DataFrame with Z-Score method.

def handle_outliers_z_score(df, column):
    from scipy import stats
    z_scores = np.abs(stats.zscore(df[column]))
    threshold = 1.5
    return df[(z_scores < threshold)]

result_df = handle_outliers_z_score(df, 'price')

In [13]:
result_df

Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,...,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating
0,2023-01-01,1729,149,3,3622.35,0.25,8150.2875,4,South,Credit Card,...,2,67,Other,Books,4.5,Returned,Negative,6.51,FREESHIP,
1,2023-01-02,1653,192,3,4684.6,0.28,10118.736,7,South,Cash,...,6,51,Other,Clothing,3.8,Not Returned,Negative,8.35,SAVE10,
2,2023-01-03,1176,193,4,6627.12,0.1,23857.632,3,North,Online,...,6,69,Male,Books,2.7,Not Returned,Positive,8.89,FREESHIP,3.7
3,2023-01-04,1201,153,4,213.67,0.23,658.1036,2,North,Credit Card,...,3,34,Female,Clothing,4.3,Returned,Positive,,SAVE10,3.8
4,2023-01-05,1053,133,6,2033.34,0.01,12078.0396,5,West,Cash,...,1,45,Female,Clothing,1.9,Not Returned,Positive,12.02,FREESHIP,2.1
5,2023-01-06,1966,168,4,8109.96,0.21,25627.4736,6,East,Online,...,6,52,Female,Food,2.1,Returned,Positive,,SAVE10,4.6
6,2023-01-07,1242,179,6,4136.89,0.29,17623.1514,9,North,Cash,...,3,55,Other,Electronics,4.9,Not Returned,Positive,8.91,SAVE10,
7,2023-01-08,1633,171,7,5432.41,0.28,27379.3464,9,North,Online,...,6,35,Other,Books,4.2,Not Returned,Neutral,18.39,SAVE10,
8,2023-01-09,1598,128,4,1652.57,0.1,5949.252,6,North,Cash,...,1,57,Female,Clothing,3.7,Not Returned,Negative,9.32,FREESHIP,3.0
9,2023-01-10,1554,175,3,721.19,0.2,1730.856,3,South,Online,...,4,59,Female,Clothing,1.9,Returned,Neutral,9.66,,


<div class="alert alert-warning">

**6. Normalizing Data with Min-Max Scaling**

**Write a Pandas program that normalizes data with Min-Max scaling.**

</div>

In [14]:
# Create date range
date_range = pd.date_range(start='2023-01-01', periods=35, freq='D')

# Create a dictionary of data
data = {
    'date': date_range,
    'customer_id': np.random.randint(1000, 2000, size=35),
    'product_id': np.random.randint(100, 200, size=35),
    'quantity': np.random.randint(1, 10, size=35),
    'price': np.round(np.random.uniform(10.0, 10000.0, size=35), 2),
    'discount': np.round(np.random.uniform(0, 0.3, size=35), 2),
    'revenue': lambda df: df['quantity'] * df['price'] * (1 - df['discount']),
    'store_id': np.random.randint(1, 10, size=35),
    'region': np.random.choice(['North', 'South', 'East', 'West'], size=35),
    'payment_method': np.random.choice(['Credit Card', 'Cash', 'Online'], size=35),
    'is_member': np.random.choice([True, False], size=35),
    'delivery_days': np.random.randint(1, 7, size=35),
    'customer_age': np.random.randint(18, 70, size=35),
    'gender': np.random.choice(['Male', 'Female', 'Other'], size=35),
    'product_category': np.random.choice(['Electronics', 'Clothing', 'Books', 'Food'], size=35),
    'rating': np.round(np.random.uniform(1.0, 5.0, size=35), 1),
    'return_status': np.random.choice(['Returned', 'Not Returned'], size=35),
    'feedback': np.random.choice(['Positive', 'Neutral', 'Negative'], size=35),
}

# Create DataFrame
df = pd.DataFrame(data)

# Compute revenue using the lambda function above
df['revenue'] = df['quantity'] * df['price'] * (1 - df['discount'])

# Add 'shipping_cost' with some missing values
df['shipping_cost'] = np.where(np.random.rand(len(df)) < 0.2, np.nan, np.round(np.random.uniform(5.0, 20.0, size=len(df)), 2))

# Add 'coupon_code' with some missing values
coupon_options = ['SAVE10', 'WELCOME5', 'FREESHIP', np.nan]
df['coupon_code'] = np.random.choice(coupon_options, size=len(df), p=[0.2, 0.2, 0.2, 0.4])

# Add 'delivery_rating' with some missing values
df['delivery_rating'] = np.where(np.random.rand(len(df)) < 0.3, np.nan, np.round(np.random.uniform(1.0, 5.0, size=len(df)), 1))


duplicates = df.sample(n=5, replace=False)  # Select 5 random rows without replacement
df = pd.concat([df, duplicates], ignore_index=True)


# Display the updated DataFrame
display(df)

Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,...,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating
0,2023-01-01,1738,165,5,2575.21,0.23,9914.5585,2,West,Cash,...,2,31,Other,Books,3.3,Returned,Negative,5.24,FREESHIP,4.9
1,2023-01-02,1629,168,7,9530.91,0.08,61379.0604,7,South,Credit Card,...,2,48,Male,Electronics,2.5,Returned,Positive,19.58,SAVE10,3.5
2,2023-01-03,1322,184,1,8626.8,0.25,6470.1,9,North,Cash,...,5,64,Male,Electronics,1.9,Returned,Negative,15.84,FREESHIP,
3,2023-01-04,1498,179,8,8114.23,0.22,50632.7952,9,East,Online,...,5,24,Male,Food,1.9,Returned,Neutral,15.59,WELCOME5,1.5
4,2023-01-05,1517,168,1,1013.42,0.1,912.078,1,North,Online,...,1,57,Female,Electronics,1.0,Returned,Negative,8.32,WELCOME5,3.7
5,2023-01-06,1017,119,6,2895.39,0.23,13376.7018,2,South,Online,...,4,48,Female,Clothing,1.8,Not Returned,Positive,8.34,,1.2
6,2023-01-07,1848,122,2,3019.76,0.19,4892.0112,6,West,Credit Card,...,4,47,Female,Books,5.0,Not Returned,Neutral,6.18,FREESHIP,4.6
7,2023-01-08,1092,198,3,3991.89,0.18,9820.0494,5,East,Credit Card,...,3,50,Other,Books,1.7,Returned,Positive,,,2.3
8,2023-01-09,1149,181,1,548.1,0.13,476.847,1,South,Online,...,2,53,Female,Electronics,4.2,Returned,Neutral,14.07,SAVE10,3.0
9,2023-01-10,1244,144,8,3367.05,0.21,21279.756,9,South,Credit Card,...,3,65,Male,Books,1.3,Not Returned,Neutral,11.35,,2.1


<div class="alert alert-success">

**Solution 06:**
</div>

In [15]:
# 6. Normalizing Data with Min-Max Scaling
# Write a Pandas program that normalizes data with Min-Max scaling.

def normalize_min_max(df, column):
    min_val = df[column].min()
    max_val = df[column].max()
    df[column] = (df[column] - min_val) / (max_val - min_val)
    return df

result_df = normalize_min_max(df.copy(), 'price')

result_df


Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,...,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating
0,2023-01-01,1738,165,5,0.251311,0.23,9914.5585,2,West,Cash,...,2,31,Other,Books,3.3,Returned,Negative,5.24,FREESHIP,4.9
1,2023-01-02,1629,168,7,0.963163,0.08,61379.0604,7,South,Credit Card,...,2,48,Male,Electronics,2.5,Returned,Positive,19.58,SAVE10,3.5
2,2023-01-03,1322,184,1,0.870636,0.25,6470.1,9,North,Cash,...,5,64,Male,Electronics,1.9,Returned,Negative,15.84,FREESHIP,
3,2023-01-04,1498,179,8,0.818179,0.22,50632.7952,9,East,Online,...,5,24,Male,Food,1.9,Returned,Neutral,15.59,WELCOME5,1.5
4,2023-01-05,1517,168,1,0.091476,0.1,912.078,1,North,Online,...,1,57,Female,Electronics,1.0,Returned,Negative,8.32,WELCOME5,3.7
5,2023-01-06,1017,119,6,0.284079,0.23,13376.7018,2,South,Online,...,4,48,Female,Clothing,1.8,Not Returned,Positive,8.34,,1.2
6,2023-01-07,1848,122,2,0.296807,0.19,4892.0112,6,West,Credit Card,...,4,47,Female,Books,5.0,Not Returned,Neutral,6.18,FREESHIP,4.6
7,2023-01-08,1092,198,3,0.396295,0.18,9820.0494,5,East,Credit Card,...,3,50,Other,Books,1.7,Returned,Positive,,,2.3
8,2023-01-09,1149,181,1,0.043855,0.13,476.847,1,South,Online,...,2,53,Female,Electronics,4.2,Returned,Neutral,14.07,SAVE10,3.0
9,2023-01-10,1244,144,8,0.332349,0.21,21279.756,9,South,Credit Card,...,3,65,Male,Books,1.3,Not Returned,Neutral,11.35,,2.1
