[Pandas Data Cleaning and Preprocessing](https://www.w3resource.com/python-exercises/pandas/pandas-data-cleaning-and-preprocessing.php)

In [1]:
import pandas as pd
import numpy as np

<div class="alert alert-warning">

**1. Handling Missing Data in Pandas**

**Write a Pandas program to fill missing values (NaN) in a DataFrame using fillna().**

</div>

In [2]:
# Create date range
date_range = pd.date_range(start='2023-01-01', periods=35, freq='D')

# Create a dictionary of data
data = {
    'date': date_range,
    'customer_id': np.random.randint(1000, 2000, size=35),
    'product_id': np.random.randint(100, 200, size=35),
    'quantity': np.random.randint(1, 10, size=35),
    'price': np.round(np.random.uniform(10.0, 100.0, size=35), 2),
    'discount': np.round(np.random.uniform(0, 0.3, size=35), 2),
    'revenue': lambda df: df['quantity'] * df['price'] * (1 - df['discount']),
    'store_id': np.random.randint(1, 10, size=35),
    'region': np.random.choice(['North', 'South', 'East', 'West'], size=35),
    'payment_method': np.random.choice(['Credit Card', 'Cash', 'Online'], size=35),
    'is_member': np.random.choice([True, False], size=35),
    'delivery_days': np.random.randint(1, 7, size=35),
    'customer_age': np.random.randint(18, 70, size=35),
    'gender': np.random.choice(['Male', 'Female', 'Other'], size=35),
    'product_category': np.random.choice(['Electronics', 'Clothing', 'Books', 'Food'], size=35),
    'rating': np.round(np.random.uniform(1.0, 5.0, size=35), 1),
    'return_status': np.random.choice(['Returned', 'Not Returned'], size=35),
    'feedback': np.random.choice(['Positive', 'Neutral', 'Negative'], size=35),
}

# Create DataFrame
df = pd.DataFrame(data)

# Compute revenue using the lambda function above
df['revenue'] = df['quantity'] * df['price'] * (1 - df['discount'])

# Add 'shipping_cost' with some missing values
df['shipping_cost'] = np.where(np.random.rand(len(df)) < 0.2, np.nan, np.round(np.random.uniform(5.0, 20.0, size=len(df)), 2))

# Add 'coupon_code' with some missing values
coupon_options = ['SAVE10', 'WELCOME5', 'FREESHIP', np.nan]
df['coupon_code'] = np.random.choice(coupon_options, size=len(df), p=[0.2, 0.2, 0.2, 0.4])

# Add 'delivery_rating' with some missing values
df['delivery_rating'] = np.where(np.random.rand(len(df)) < 0.3, np.nan, np.round(np.random.uniform(1.0, 5.0, size=len(df)), 1))

# Display the updated DataFrame
display(df)

Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,...,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating
0,2023-01-01,1243,123,3,43.94,0.05,125.229,3,North,Online,...,4,62,Male,Books,1.7,Returned,Positive,,SAVE10,1.5
1,2023-01-02,1258,168,6,37.39,0.05,213.123,1,North,Credit Card,...,4,41,Male,Clothing,4.2,Not Returned,Negative,,,
2,2023-01-03,1840,105,4,78.92,0.08,290.4256,4,South,Credit Card,...,6,18,Female,Books,1.6,Not Returned,Positive,17.29,WELCOME5,
3,2023-01-04,1575,167,2,71.43,0.2,114.288,2,South,Cash,...,2,52,Male,Electronics,4.5,Returned,Positive,15.42,SAVE10,2.6
4,2023-01-05,1790,148,5,78.59,0.02,385.091,8,East,Online,...,4,57,Female,Clothing,4.8,Returned,Positive,,SAVE10,4.5
5,2023-01-06,1381,116,1,62.33,0.25,46.7475,8,North,Credit Card,...,3,68,Female,Electronics,1.6,Returned,Negative,16.04,SAVE10,3.6
6,2023-01-07,1236,155,5,12.76,0.12,56.144,2,West,Online,...,1,63,Female,Food,3.0,Not Returned,Negative,,WELCOME5,2.1
7,2023-01-08,1408,150,7,29.87,0.13,181.9083,2,North,Online,...,3,41,Female,Clothing,4.3,Returned,Neutral,17.41,,
8,2023-01-09,1336,105,9,93.19,0.17,696.1293,2,West,Cash,...,1,52,Male,Electronics,3.2,Not Returned,Neutral,16.46,,
9,2023-01-10,1807,175,2,32.74,0.23,50.4196,1,South,Cash,...,3,45,Female,Clothing,2.2,Not Returned,Neutral,9.45,FREESHIP,


<div class="alert alert-success">

**Solution 01:**
</div>

In [3]:
# 1. Handling Missing Data in Pandas
# Write a Pandas program to fill missing values (NaN) in a DataFrame using fillna().
result_df = df.fillna({
    'shipping_cost': df['shipping_cost'].mean(),
    'coupon_code': 'No Coupon',
    'delivery_rating': df['delivery_rating'].mean()
})

result_df

Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,...,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating
0,2023-01-01,1243,123,3,43.94,0.05,125.229,3,North,Online,...,4,62,Male,Books,1.7,Returned,Positive,12.884483,SAVE10,1.5
1,2023-01-02,1258,168,6,37.39,0.05,213.123,1,North,Credit Card,...,4,41,Male,Clothing,4.2,Not Returned,Negative,12.884483,,3.073913
2,2023-01-03,1840,105,4,78.92,0.08,290.4256,4,South,Credit Card,...,6,18,Female,Books,1.6,Not Returned,Positive,17.29,WELCOME5,3.073913
3,2023-01-04,1575,167,2,71.43,0.2,114.288,2,South,Cash,...,2,52,Male,Electronics,4.5,Returned,Positive,15.42,SAVE10,2.6
4,2023-01-05,1790,148,5,78.59,0.02,385.091,8,East,Online,...,4,57,Female,Clothing,4.8,Returned,Positive,12.884483,SAVE10,4.5
5,2023-01-06,1381,116,1,62.33,0.25,46.7475,8,North,Credit Card,...,3,68,Female,Electronics,1.6,Returned,Negative,16.04,SAVE10,3.6
6,2023-01-07,1236,155,5,12.76,0.12,56.144,2,West,Online,...,1,63,Female,Food,3.0,Not Returned,Negative,12.884483,WELCOME5,2.1
7,2023-01-08,1408,150,7,29.87,0.13,181.9083,2,North,Online,...,3,41,Female,Clothing,4.3,Returned,Neutral,17.41,,3.073913
8,2023-01-09,1336,105,9,93.19,0.17,696.1293,2,West,Cash,...,1,52,Male,Electronics,3.2,Not Returned,Neutral,16.46,,3.073913
9,2023-01-10,1807,175,2,32.74,0.23,50.4196,1,South,Cash,...,3,45,Female,Clothing,2.2,Not Returned,Neutral,9.45,FREESHIP,3.073913


<div class="alert alert-warning">

**2. Handling Duplicates in Pandas**

**Write a Pandas program to drop rows with missing data.**

</div>

In [4]:
# Create date range
date_range = pd.date_range(start='2023-01-01', periods=35, freq='D')

# Create a dictionary of data
data = {
    'date': date_range,
    'customer_id': np.random.randint(1000, 2000, size=35),
    'product_id': np.random.randint(100, 200, size=35),
    'quantity': np.random.randint(1, 10, size=35),
    'price': np.round(np.random.uniform(10.0, 100.0, size=35), 2),
    'discount': np.round(np.random.uniform(0, 0.3, size=35), 2),
    'revenue': lambda df: df['quantity'] * df['price'] * (1 - df['discount']),
    'store_id': np.random.randint(1, 10, size=35),
    'region': np.random.choice(['North', 'South', 'East', 'West'], size=35),
    'payment_method': np.random.choice(['Credit Card', 'Cash', 'Online'], size=35),
    'is_member': np.random.choice([True, False], size=35),
    'delivery_days': np.random.randint(1, 7, size=35),
    'customer_age': np.random.randint(18, 70, size=35),
    'gender': np.random.choice(['Male', 'Female', 'Other'], size=35),
    'product_category': np.random.choice(['Electronics', 'Clothing', 'Books', 'Food'], size=35),
    'rating': np.round(np.random.uniform(1.0, 5.0, size=35), 1),
    'return_status': np.random.choice(['Returned', 'Not Returned'], size=35),
    'feedback': np.random.choice(['Positive', 'Neutral', 'Negative'], size=35),
}

# Create DataFrame
df = pd.DataFrame(data)

# Compute revenue using the lambda function above
df['revenue'] = df['quantity'] * df['price'] * (1 - df['discount'])

# Add 'shipping_cost' with some missing values
df['shipping_cost'] = np.where(np.random.rand(len(df)) < 0.2, np.nan, np.round(np.random.uniform(5.0, 20.0, size=len(df)), 2))

# Add 'coupon_code' with some missing values
coupon_options = ['SAVE10', 'WELCOME5', 'FREESHIP', np.nan]
df['coupon_code'] = np.random.choice(coupon_options, size=len(df), p=[0.2, 0.2, 0.2, 0.4])

# Add 'delivery_rating' with some missing values
df['delivery_rating'] = np.where(np.random.rand(len(df)) < 0.3, np.nan, np.round(np.random.uniform(1.0, 5.0, size=len(df)), 1))

# Display the updated DataFrame
display(df)

Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,...,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating
0,2023-01-01,1116,124,1,62.16,0.23,47.8632,6,North,Cash,...,4,43,Male,Books,1.1,Not Returned,Positive,7.71,WELCOME5,1.4
1,2023-01-02,1506,141,9,14.64,0.24,100.1376,3,South,Credit Card,...,4,57,Male,Books,4.8,Returned,Negative,,FREESHIP,3.2
2,2023-01-03,1927,151,7,66.26,0.24,352.5032,1,West,Cash,...,5,20,Other,Electronics,4.6,Not Returned,Neutral,8.13,FREESHIP,
3,2023-01-04,1966,127,8,73.18,0.02,573.7312,9,East,Online,...,5,27,Male,Electronics,2.5,Not Returned,Positive,,FREESHIP,2.2
4,2023-01-05,1015,193,1,50.62,0.22,39.4836,6,South,Cash,...,6,42,Other,Electronics,3.4,Not Returned,Positive,17.14,SAVE10,3.9
5,2023-01-06,1305,164,6,69.46,0.12,366.7488,9,West,Credit Card,...,5,50,Other,Books,3.0,Returned,Neutral,11.73,,3.2
6,2023-01-07,1321,110,2,56.55,0.04,108.576,5,West,Credit Card,...,4,28,Male,Clothing,1.3,Returned,Negative,12.18,SAVE10,
7,2023-01-08,1280,193,1,45.2,0.17,37.516,9,East,Online,...,6,20,Female,Clothing,3.3,Not Returned,Negative,5.76,SAVE10,5.0
8,2023-01-09,1641,163,8,11.66,0.14,80.2208,8,North,Cash,...,1,32,Male,Clothing,3.6,Not Returned,Neutral,5.92,FREESHIP,1.8
9,2023-01-10,1053,185,4,86.63,0.01,343.0548,7,East,Cash,...,6,50,Female,Clothing,3.5,Returned,Negative,8.85,WELCOME5,


<div class="alert alert-success">

**Solution 02:**
</div>

In [5]:
# 2. Handling Duplicates in Pandas
# Write a Pandas program to drop rows with missing data.

result_df = df.dropna()
result_df

Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,...,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating
0,2023-01-01,1116,124,1,62.16,0.23,47.8632,6,North,Cash,...,4,43,Male,Books,1.1,Not Returned,Positive,7.71,WELCOME5,1.4
4,2023-01-05,1015,193,1,50.62,0.22,39.4836,6,South,Cash,...,6,42,Other,Electronics,3.4,Not Returned,Positive,17.14,SAVE10,3.9
5,2023-01-06,1305,164,6,69.46,0.12,366.7488,9,West,Credit Card,...,5,50,Other,Books,3.0,Returned,Neutral,11.73,,3.2
7,2023-01-08,1280,193,1,45.2,0.17,37.516,9,East,Online,...,6,20,Female,Clothing,3.3,Not Returned,Negative,5.76,SAVE10,5.0
8,2023-01-09,1641,163,8,11.66,0.14,80.2208,8,North,Cash,...,1,32,Male,Clothing,3.6,Not Returned,Neutral,5.92,FREESHIP,1.8
10,2023-01-11,1465,173,8,92.78,0.07,690.2832,9,West,Cash,...,5,34,Male,Electronics,2.6,Returned,Positive,15.1,,4.5
12,2023-01-13,1456,104,1,25.35,0.14,21.801,9,West,Cash,...,2,25,Male,Clothing,5.0,Returned,Neutral,18.77,,1.6
15,2023-01-16,1169,117,5,48.47,0.16,203.574,9,East,Credit Card,...,1,66,Male,Food,2.8,Not Returned,Negative,8.27,SAVE10,4.8
16,2023-01-17,1662,109,5,28.82,0.17,119.603,9,South,Cash,...,3,55,Female,Books,1.6,Not Returned,Positive,18.59,,3.0
17,2023-01-18,1292,129,8,58.1,0.27,339.304,9,West,Credit Card,...,3,43,Female,Electronics,3.3,Returned,Neutral,14.28,SAVE10,4.8
