[Pandas Data Validation](https://www.w3resource.com/python-exercises/pandas/pandas-data-validation.php)

In [1]:
import pandas as pd
import numpy as np

pd.set_option('display.max_columns', None)

<div class="alert alert-warning">

**1. Checking Missing Values in a DataFrame**

**Write a Pandas program to check for missing values in a DataFrame.**

</div>

In [2]:
# Create date range
date_range = pd.date_range(start='2023-01-01', periods=35, freq='D')

# Create a dictionary of data
data = {
    'date': date_range,
    'customer_id': np.random.randint(1000, 2000, size=35),
    'product_id': np.random.randint(100, 200, size=35),
    'quantity': np.random.randint(1, 10, size=35),
    'price': np.round(np.random.uniform(10.0, 100.0, size=35), 2),
    'discount': np.round(np.random.uniform(0, 0.3, size=35), 2),
    'revenue': lambda df: df['quantity'] * df['price'] * (1 - df['discount']),
    'store_id': np.random.randint(1, 10, size=35),
    'region': np.random.choice(['North', 'South', 'East', 'West'], size=35),
    'payment_method': np.random.choice(['Credit Card', 'Cash', 'Online'], size=35),
    'is_member': np.random.choice([True, False], size=35),
    'delivery_days': np.random.randint(1, 7, size=35),
    'customer_age': np.random.randint(18, 70, size=35),
    'gender': np.random.choice(['Male', 'Female', 'Other'], size=35),
    'product_category': np.random.choice(['Electronics', 'Clothing', 'Books', 'Food'], size=35),
    'rating': np.round(np.random.uniform(1.0, 5.0, size=35), 1),
    'return_status': np.random.choice(['Returned', 'Not Returned'], size=35),
    'feedback': np.random.choice(['Positive', 'Neutral', 'Negative'], size=35),
}

# Create DataFrame
df = pd.DataFrame(data)

# Compute revenue using the lambda function above
df['revenue'] = df['quantity'] * df['price'] * (1 - df['discount'])

# Add 'shipping_cost' with some missing values
df['shipping_cost'] = np.where(np.random.rand(len(df)) < 0.2, np.nan, np.round(np.random.uniform(5.0, 20.0, size=len(df)), 2))

# Add 'coupon_code' with some missing values
coupon_options = ['SAVE10', 'WELCOME5', 'FREESHIP', np.nan]
df['coupon_code'] = np.random.choice(coupon_options, size=len(df), p=[0.2, 0.2, 0.2, 0.4])

# Add 'delivery_rating' with some missing values
df['delivery_rating'] = np.where(np.random.rand(len(df)) < 0.3, np.nan, np.round(np.random.uniform(1.0, 5.0, size=len(df)), 1))

# Display the updated DataFrame
display(df)

Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,is_member,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating
0,2023-01-01,1699,198,5,15.09,0.24,57.342,9,South,Credit Card,False,2,46,Female,Clothing,3.4,Not Returned,Neutral,8.29,SAVE10,
1,2023-01-02,1923,128,9,18.66,0.04,161.2224,8,South,Credit Card,False,1,47,Other,Food,2.8,Not Returned,Positive,18.87,,2.9
2,2023-01-03,1942,188,7,60.24,0.03,409.0296,7,South,Cash,True,3,58,Male,Food,2.4,Not Returned,Negative,,SAVE10,1.1
3,2023-01-04,1404,140,1,70.1,0.16,58.884,9,West,Online,False,6,56,Male,Electronics,2.6,Not Returned,Neutral,18.04,FREESHIP,3.3
4,2023-01-05,1463,186,9,31.26,0.05,267.273,3,West,Credit Card,False,4,65,Male,Food,2.5,Not Returned,Neutral,15.79,FREESHIP,
5,2023-01-06,1211,184,3,49.61,0.13,129.4821,8,East,Cash,True,4,24,Female,Clothing,3.1,Not Returned,Positive,12.56,FREESHIP,3.4
6,2023-01-07,1057,139,2,78.59,0.16,132.0312,2,South,Credit Card,False,3,24,Other,Books,1.6,Returned,Neutral,19.37,FREESHIP,3.7
7,2023-01-08,1300,145,5,25.01,0.1,112.545,5,South,Cash,False,5,68,Other,Clothing,2.9,Not Returned,Neutral,13.63,FREESHIP,1.4
8,2023-01-09,1307,175,5,51.19,0.28,184.284,3,North,Cash,False,5,69,Male,Clothing,1.4,Not Returned,Positive,10.29,WELCOME5,1.1
9,2023-01-10,1770,158,8,50.84,0.06,382.3168,3,West,Cash,True,4,37,Female,Clothing,1.9,Not Returned,Negative,16.28,SAVE10,1.0


<div class="alert alert-success">

**Solution 01:**
</div>

In [3]:
# 1. Checking Missing Values in a DataFrame
# Write a Pandas program to check for missing values in a DataFrame.

# Check for missing values
missing_values = df.isnull().sum()
missing_values[missing_values > 0]

shipping_cost      7
delivery_rating    9
dtype: int64

<div class="alert alert-warning">

**2. Ensuring Consistent Data Types of All Columns**

**Write a Pandas program to check and ensure that the data types of all columns are consistent.**

</div>

In [4]:
# Create date range
date_range = pd.date_range(start='2023-01-01', periods=35, freq='D')

# Create a dictionary of data
data = {
    'date': date_range,
    'customer_id': np.random.randint(1000, 2000, size=35),
    'product_id': np.random.randint(100, 200, size=35),
    'quantity': np.random.randint(1, 10, size=35),
    'price': np.round(np.random.uniform(10.0, 100.0, size=35), 2),
    'discount': np.round(np.random.uniform(0, 0.3, size=35), 2),
    'revenue': lambda df: df['quantity'] * df['price'] * (1 - df['discount']),
    'store_id': np.random.randint(1, 10, size=35),
    'region': np.random.choice(['North', 'South', 'East', 'West'], size=35),
    'payment_method': np.random.choice(['Credit Card', 'Cash', 'Online'], size=35),
    'is_member': np.random.choice([True, False], size=35),
    'delivery_days': np.random.randint(1, 7, size=35),
    'customer_age': np.random.randint(18, 70, size=35),
    'gender': np.random.choice(['Male', 'Female', 'Other'], size=35),
    'product_category': np.random.choice(['Electronics', 'Clothing', 'Books', 'Food'], size=35),
    'rating': np.round(np.random.uniform(1.0, 5.0, size=35), 1),
    'return_status': np.random.choice(['Returned', 'Not Returned'], size=35),
    'feedback': np.random.choice(['Positive', 'Neutral', 'Negative'], size=35),
}

# Create DataFrame
df = pd.DataFrame(data)

# Compute revenue using the lambda function above
df['revenue'] = df['quantity'] * df['price'] * (1 - df['discount'])

# Add 'shipping_cost' with some missing values
df['shipping_cost'] = np.where(np.random.rand(len(df)) < 0.2, np.nan, np.round(np.random.uniform(5.0, 20.0, size=len(df)), 2))

# Add 'coupon_code' with some missing values
coupon_options = ['SAVE10', 'WELCOME5', 'FREESHIP', np.nan]
df['coupon_code'] = np.random.choice(coupon_options, size=len(df), p=[0.2, 0.2, 0.2, 0.4])

# Add 'delivery_rating' with some missing values
df['delivery_rating'] = np.where(np.random.rand(len(df)) < 0.3, np.nan, np.round(np.random.uniform(1.0, 5.0, size=len(df)), 1))

# Display the updated DataFrame
display(df)

Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,is_member,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating
0,2023-01-01,1976,117,4,53.65,0.18,175.972,3,South,Cash,False,1,66,Other,Books,1.5,Not Returned,Neutral,7.39,FREESHIP,
1,2023-01-02,1886,134,1,21.5,0.01,21.285,9,South,Cash,False,1,38,Other,Food,1.5,Not Returned,Negative,11.72,SAVE10,4.5
2,2023-01-03,1235,153,4,90.61,0.15,308.074,9,North,Credit Card,True,3,41,Male,Books,5.0,Returned,Negative,8.64,SAVE10,
3,2023-01-04,1772,181,7,53.83,0.3,263.767,4,South,Credit Card,False,3,36,Male,Food,5.0,Not Returned,Negative,,SAVE10,
4,2023-01-05,1568,177,9,25.13,0.12,199.0296,1,East,Credit Card,True,6,51,Other,Clothing,2.3,Returned,Positive,18.69,SAVE10,4.7
5,2023-01-06,1420,174,4,96.14,0.09,349.9496,2,South,Online,True,4,53,Other,Clothing,2.1,Not Returned,Positive,7.13,SAVE10,2.5
6,2023-01-07,1612,123,6,60.43,0.05,344.451,8,West,Cash,True,1,38,Female,Clothing,1.3,Not Returned,Neutral,,,2.0
7,2023-01-08,1767,164,5,88.05,0.13,383.0175,9,North,Cash,False,4,47,Male,Electronics,4.8,Not Returned,Negative,6.2,FREESHIP,4.5
8,2023-01-09,1939,145,9,48.69,0.11,390.0069,3,South,Online,True,4,68,Male,Books,3.6,Not Returned,Positive,9.56,WELCOME5,4.4
9,2023-01-10,1749,118,9,98.71,0.06,835.0866,5,North,Cash,True,1,69,Female,Books,1.7,Not Returned,Negative,8.82,,1.5


<div class="alert alert-success">

**Solution 02:**
</div>

In [5]:
# 2. Ensuring Consistent Data Types of All Columns
# Write a Pandas program to check and ensure that the data types of all columns are consistent.

# Check data types
data_types = df.dtypes
data_types

date                datetime64[ns]
customer_id                  int64
product_id                   int64
quantity                     int64
price                      float64
discount                   float64
revenue                    float64
store_id                     int64
region                      object
payment_method              object
is_member                     bool
delivery_days                int64
customer_age                 int64
gender                      object
product_category            object
rating                     float64
return_status               object
feedback                    object
shipping_cost              float64
coupon_code                 object
delivery_rating            float64
dtype: object

<div class="alert alert-warning">

**3. Validating the Data Type of a Specific Column**

**Write a Pandas program to validate the data type of a specific column in a DataFrame.**

</div>

In [6]:
# Create date range
date_range = pd.date_range(start='2023-01-01', periods=35, freq='D')

# Create a dictionary of data
data = {
    'date': date_range,
    'customer_id': np.random.randint(1000, 2000, size=35),
    'product_id': np.random.randint(100, 200, size=35),
    'quantity': np.random.randint(1, 10, size=35),
    'price': np.round(np.random.uniform(10.0, 100.0, size=35), 2),
    'discount': np.round(np.random.uniform(0, 0.3, size=35), 2),
    'revenue': lambda df: df['quantity'] * df['price'] * (1 - df['discount']),
    'store_id': np.random.randint(1, 10, size=35),
    'region': np.random.choice(['North', 'South', 'East', 'West'], size=35),
    'payment_method': np.random.choice(['Credit Card', 'Cash', 'Online'], size=35),
    'is_member': np.random.choice([True, False], size=35),
    'delivery_days': np.random.randint(1, 7, size=35),
    'customer_age': np.random.randint(18, 70, size=35),
    'gender': np.random.choice(['Male', 'Female', 'Other'], size=35),
    'product_category': np.random.choice(['Electronics', 'Clothing', 'Books', 'Food'], size=35),
    'rating': np.round(np.random.uniform(1.0, 5.0, size=35), 1),
    'return_status': np.random.choice(['Returned', 'Not Returned'], size=35),
    'feedback': np.random.choice(['Positive', 'Neutral', 'Negative'], size=35),
}

# Create DataFrame
df = pd.DataFrame(data)

# Compute revenue using the lambda function above
df['revenue'] = df['quantity'] * df['price'] * (1 - df['discount'])

# Add 'shipping_cost' with some missing values
df['shipping_cost'] = np.where(np.random.rand(len(df)) < 0.2, np.nan, np.round(np.random.uniform(5.0, 20.0, size=len(df)), 2))

# Add 'coupon_code' with some missing values
coupon_options = ['SAVE10', 'WELCOME5', 'FREESHIP', np.nan]
df['coupon_code'] = np.random.choice(coupon_options, size=len(df), p=[0.2, 0.2, 0.2, 0.4])

# Add 'delivery_rating' with some missing values
df['delivery_rating'] = np.where(np.random.rand(len(df)) < 0.3, np.nan, np.round(np.random.uniform(1.0, 5.0, size=len(df)), 1))

# Display the updated DataFrame
display(df)

Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,is_member,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating
0,2023-01-01,1423,105,9,92.88,0.13,727.2504,4,South,Credit Card,True,2,36,Female,Clothing,4.7,Not Returned,Positive,18.96,,3.6
1,2023-01-02,1930,134,5,65.43,0.02,320.607,3,North,Cash,True,6,36,Other,Food,4.1,Returned,Positive,12.5,,1.3
2,2023-01-03,1874,169,8,14.34,0.06,107.8368,8,South,Cash,False,1,39,Female,Books,4.6,Returned,Negative,8.09,WELCOME5,
3,2023-01-04,1498,106,7,95.08,0.18,545.7592,1,West,Online,True,6,25,Other,Electronics,2.4,Returned,Neutral,19.16,SAVE10,2.8
4,2023-01-05,1536,186,3,10.62,0.2,25.488,7,East,Online,False,4,57,Other,Electronics,4.7,Not Returned,Positive,16.09,SAVE10,3.3
5,2023-01-06,1551,175,3,62.11,0.23,143.4741,3,East,Credit Card,True,2,56,Other,Books,4.9,Not Returned,Negative,7.69,,1.7
6,2023-01-07,1816,103,8,79.47,0.01,629.4024,2,West,Online,True,3,53,Female,Electronics,3.3,Not Returned,Positive,6.18,WELCOME5,
7,2023-01-08,1921,161,8,56.2,0.08,413.632,2,North,Cash,True,2,48,Other,Books,4.0,Not Returned,Neutral,,WELCOME5,
8,2023-01-09,1148,116,8,73.09,0.06,549.6368,4,South,Cash,False,6,29,Other,Electronics,1.6,Returned,Positive,8.81,FREESHIP,3.1
9,2023-01-10,1644,123,1,44.92,0.03,43.5724,8,West,Cash,True,2,48,Male,Food,4.1,Returned,Negative,,,2.4


<div class="alert alert-success">

**Solution 03:**
</div>

In [7]:
# 3. Validating the Data Type of a Specific Column
# Write a Pandas program to validate the data type of a specific column in a DataFrame.

# Check if 'price' column is of type float
is_of_type = 'float64'
is_price_float = df['price'].dtype == is_of_type
is_price_float

# Validate the data type of 'price' column
if is_price_float:
    print("The 'price' column is of type {}.".format(df['price'].dtype))
else:
    print("The 'price' column is not of type {}, it is {}.".format(is_of_type, df['price'].dtype))

The 'price' column is of type float64.


<div class="alert alert-warning">

**4. Checking Duplicate Rows in a DataFrame**

**Write a Pandas program to check duplicate rows in a DataFrame.**

</div>

In [8]:
# Create date range
date_range = pd.date_range(start='2023-01-01', periods=35, freq='D')

# Create a dictionary of data
data = {
    'date': date_range,
    'customer_id': np.random.randint(1000, 2000, size=35),
    'product_id': np.random.randint(100, 200, size=35),
    'quantity': np.random.randint(1, 10, size=35),
    'price': np.round(np.random.uniform(10.0, 100.0, size=35), 2),
    'discount': np.round(np.random.uniform(0, 0.3, size=35), 2),
    'revenue': lambda df: df['quantity'] * df['price'] * (1 - df['discount']),
    'store_id': np.random.randint(1, 10, size=35),
    'region': np.random.choice(['North', 'South', 'East', 'West'], size=35),
    'payment_method': np.random.choice(['Credit Card', 'Cash', 'Online'], size=35),
    'is_member': np.random.choice([True, False], size=35),
    'delivery_days': np.random.randint(1, 7, size=35),
    'customer_age': np.random.randint(18, 70, size=35),
    'gender': np.random.choice(['Male', 'Female', 'Other'], size=35),
    'product_category': np.random.choice(['Electronics', 'Clothing', 'Books', 'Food'], size=35),
    'rating': np.round(np.random.uniform(1.0, 5.0, size=35), 1),
    'return_status': np.random.choice(['Returned', 'Not Returned'], size=35),
    'feedback': np.random.choice(['Positive', 'Neutral', 'Negative'], size=35),
}

# Create DataFrame
df = pd.DataFrame(data)

# Compute revenue using the lambda function above
df['revenue'] = df['quantity'] * df['price'] * (1 - df['discount'])

# Add 'shipping_cost' with some missing values
df['shipping_cost'] = np.where(np.random.rand(len(df)) < 0.2, np.nan, np.round(np.random.uniform(5.0, 20.0, size=len(df)), 2))

# Add 'coupon_code' with some missing values
coupon_options = ['SAVE10', 'WELCOME5', 'FREESHIP', np.nan]
df['coupon_code'] = np.random.choice(coupon_options, size=len(df), p=[0.2, 0.2, 0.2, 0.4])

# Add 'delivery_rating' with some missing values
df['delivery_rating'] = np.where(np.random.rand(len(df)) < 0.3, np.nan, np.round(np.random.uniform(1.0, 5.0, size=len(df)), 1))

# Add 3 duplicate rows
df = pd.concat([df, df.sample(n=3, random_state=42)], ignore_index=True)

# Display the updated DataFrame
display(df)

Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,is_member,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating
0,2023-01-01,1397,139,1,12.5,0.16,10.5,8,North,Credit Card,True,1,38,Female,Electronics,1.4,Not Returned,Positive,,SAVE10,3.0
1,2023-01-02,1303,127,3,98.02,0.26,217.6044,7,West,Credit Card,False,5,29,Female,Food,4.8,Not Returned,Positive,9.78,FREESHIP,3.9
2,2023-01-03,1974,114,5,73.82,0.22,287.898,6,East,Credit Card,False,3,57,Female,Electronics,2.2,Not Returned,Positive,10.95,,
3,2023-01-04,1175,147,9,88.48,0.03,772.4304,1,East,Cash,False,6,50,Other,Books,1.4,Returned,Negative,5.09,SAVE10,1.3
4,2023-01-05,1567,184,9,16.22,0.21,115.3242,4,East,Credit Card,False,2,66,Male,Food,3.8,Not Returned,Negative,9.48,,3.5
5,2023-01-06,1132,159,7,50.19,0.07,326.7369,1,East,Online,True,3,58,Female,Electronics,3.8,Not Returned,Neutral,14.33,WELCOME5,
6,2023-01-07,1428,182,5,88.4,0.01,437.58,8,North,Online,True,5,67,Other,Clothing,3.6,Returned,Negative,15.0,,
7,2023-01-08,1473,155,2,23.24,0.26,34.3952,2,North,Cash,True,2,49,Male,Books,4.0,Returned,Positive,19.33,FREESHIP,3.2
8,2023-01-09,1355,162,5,21.31,0.16,89.502,5,North,Online,False,6,62,Other,Electronics,1.0,Not Returned,Neutral,14.13,WELCOME5,1.2
9,2023-01-10,1649,147,2,36.87,0.2,58.992,7,North,Online,False,2,34,Other,Books,3.8,Not Returned,Negative,,WELCOME5,


<div class="alert alert-success">

**Solution 04:**
</div>

In [9]:
# 4. Checking Duplicate Rows in a DataFrame
# Write a Pandas program to check duplicate rows in a DataFrame.

# Check for duplicate rows
duplicate_rows = df[df.duplicated()]
duplicate_rows

Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,is_member,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating
35,2023-01-27,1521,132,5,75.2,0.02,368.48,5,East,Cash,True,3,37,Female,Electronics,2.1,Not Returned,Positive,6.0,,
36,2023-01-14,1294,103,7,81.07,0.25,425.6175,5,South,Online,True,4,55,Female,Food,1.8,Returned,Positive,,SAVE10,
37,2023-01-25,1801,155,1,15.89,0.29,11.2819,6,East,Online,True,1,22,Female,Food,1.9,Not Returned,Negative,12.16,SAVE10,2.0


<div class="alert alert-warning">

**5. Removing Duplicate Rows from a DataFrame**

**Write a Pandas program to remove duplicate rows from a DataFrame.**

</div>

In [10]:
# Create date range
date_range = pd.date_range(start='2023-01-01', periods=35, freq='D')

# Create a dictionary of data
data = {
    'date': date_range,
    'customer_id': np.random.randint(1000, 2000, size=35),
    'product_id': np.random.randint(100, 200, size=35),
    'quantity': np.random.randint(1, 10, size=35),
    'price': np.round(np.random.uniform(10.0, 100.0, size=35), 2),
    'discount': np.round(np.random.uniform(0, 0.3, size=35), 2),
    'revenue': lambda df: df['quantity'] * df['price'] * (1 - df['discount']),
    'store_id': np.random.randint(1, 10, size=35),
    'region': np.random.choice(['North', 'South', 'East', 'West'], size=35),
    'payment_method': np.random.choice(['Credit Card', 'Cash', 'Online'], size=35),
    'is_member': np.random.choice([True, False], size=35),
    'delivery_days': np.random.randint(1, 7, size=35),
    'customer_age': np.random.randint(18, 70, size=35),
    'gender': np.random.choice(['Male', 'Female', 'Other'], size=35),
    'product_category': np.random.choice(['Electronics', 'Clothing', 'Books', 'Food'], size=35),
    'rating': np.round(np.random.uniform(1.0, 5.0, size=35), 1),
    'return_status': np.random.choice(['Returned', 'Not Returned'], size=35),
    'feedback': np.random.choice(['Positive', 'Neutral', 'Negative'], size=35),
}

# Create DataFrame
df = pd.DataFrame(data)

# Compute revenue using the lambda function above
df['revenue'] = df['quantity'] * df['price'] * (1 - df['discount'])

# Add 'shipping_cost' with some missing values
df['shipping_cost'] = np.where(np.random.rand(len(df)) < 0.2, np.nan, np.round(np.random.uniform(5.0, 20.0, size=len(df)), 2))

# Add 'coupon_code' with some missing values
coupon_options = ['SAVE10', 'WELCOME5', 'FREESHIP', np.nan]
df['coupon_code'] = np.random.choice(coupon_options, size=len(df), p=[0.2, 0.2, 0.2, 0.4])

# Add 'delivery_rating' with some missing values
df['delivery_rating'] = np.where(np.random.rand(len(df)) < 0.3, np.nan, np.round(np.random.uniform(1.0, 5.0, size=len(df)), 1))

# Add 3 duplicate rows
df = pd.concat([df, df.sample(n=3, random_state=42)], ignore_index=True)

# Display the updated DataFrame
display(df)

Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,is_member,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating
0,2023-01-01,1758,187,2,36.76,0.15,62.492,8,West,Credit Card,False,3,62,Female,Clothing,4.8,Not Returned,Neutral,,WELCOME5,1.2
1,2023-01-02,1737,119,5,44.01,0.01,217.8495,3,North,Credit Card,False,3,18,Male,Food,3.4,Not Returned,Neutral,16.14,,
2,2023-01-03,1570,134,4,57.15,0.05,217.17,2,North,Online,True,5,35,Female,Food,1.1,Not Returned,Positive,,WELCOME5,1.6
3,2023-01-04,1893,169,3,70.22,0.05,200.127,5,North,Online,True,1,59,Other,Clothing,1.5,Not Returned,Negative,,,2.6
4,2023-01-05,1158,197,9,36.93,0.1,299.133,3,South,Cash,False,2,30,Male,Food,1.5,Not Returned,Negative,9.78,,3.8
5,2023-01-06,1071,189,7,64.72,0.13,394.1448,5,West,Cash,False,3,63,Female,Food,4.2,Not Returned,Negative,13.82,WELCOME5,2.5
6,2023-01-07,1627,119,2,18.06,0.19,29.2572,1,East,Online,False,1,52,Other,Clothing,1.5,Not Returned,Negative,14.56,FREESHIP,4.6
7,2023-01-08,1558,111,5,16.45,0.06,77.315,7,West,Credit Card,True,3,25,Male,Books,3.9,Not Returned,Positive,10.62,WELCOME5,
8,2023-01-09,1134,198,6,95.78,0.07,534.4524,2,South,Online,False,6,28,Male,Food,3.6,Returned,Neutral,5.94,FREESHIP,3.7
9,2023-01-10,1232,187,5,98.54,0.03,477.919,9,North,Credit Card,True,4,30,Female,Electronics,1.6,Returned,Positive,10.45,,1.2


<div class="alert alert-success">

**Solution 05:**
</div>

In [11]:
# 5. Removing Duplicate Rows from a DataFrame
# Write a Pandas program to remove duplicate rows from a DataFrame.

print("Shape of DataFrame before removing duplicates:\n", df.shape)
# Remove duplicate rows
df = df.drop_duplicates()
print("Shape of DataFrame after removing duplicates:\n", df.shape)

Shape of DataFrame before removing duplicates:
 (38, 21)
Shape of DataFrame after removing duplicates:
 (35, 21)


<div class="alert alert-warning">

**6. Validating Data Based on Custom Conditions**

**Write a Pandas program to validate data based on custom conditions.**

</div>

In [12]:
# Create date range
date_range = pd.date_range(start='2023-01-01', periods=35, freq='D')

# Create a dictionary of data
data = {
    'date': date_range,
    'customer_id': np.random.randint(1000, 2000, size=35),
    'product_id': np.random.randint(100, 200, size=35),
    'quantity': np.random.randint(1, 10, size=35),
    'price': np.round(np.random.uniform(10.0, 100.0, size=35), 2),
    'discount': np.round(np.random.uniform(0, 0.3, size=35), 2),
    'revenue': lambda df: df['quantity'] * df['price'] * (1 - df['discount']),
    'store_id': np.random.randint(1, 10, size=35),
    'region': np.random.choice(['North', 'South', 'East', 'West'], size=35),
    'payment_method': np.random.choice(['Credit Card', 'Cash', 'Online'], size=35),
    'is_member': np.random.choice([True, False], size=35),
    'delivery_days': np.random.randint(1, 7, size=35),
    'customer_age': np.random.randint(18, 70, size=35),
    'gender': np.random.choice(['Male', 'Female', 'Other'], size=35),
    'product_category': np.random.choice(['Electronics', 'Clothing', 'Books', 'Food'], size=35),
    'rating': np.round(np.random.uniform(1.0, 5.0, size=35), 1),
    'return_status': np.random.choice(['Returned', 'Not Returned'], size=35),
    'feedback': np.random.choice(['Positive', 'Neutral', 'Negative'], size=35),
}

# Create DataFrame
df = pd.DataFrame(data)

# Compute revenue using the lambda function above
df['revenue'] = df['quantity'] * df['price'] * (1 - df['discount'])

# Add 'shipping_cost' with some missing values
df['shipping_cost'] = np.where(np.random.rand(len(df)) < 0.2, np.nan, np.round(np.random.uniform(5.0, 20.0, size=len(df)), 2))

# Add 'coupon_code' with some missing values
coupon_options = ['SAVE10', 'WELCOME5', 'FREESHIP', np.nan]
df['coupon_code'] = np.random.choice(coupon_options, size=len(df), p=[0.2, 0.2, 0.2, 0.4])

# Add 'delivery_rating' with some missing values
df['delivery_rating'] = np.where(np.random.rand(len(df)) < 0.3, np.nan, np.round(np.random.uniform(1.0, 5.0, size=len(df)), 1))

# Add 3 duplicate rows
df = pd.concat([df, df.sample(n=3, random_state=42)], ignore_index=True)

# Add 3 more rows with negative price
df = pd.concat([df, pd.DataFrame({
    'date': pd.date_range(start='2023-02-05', periods=3, freq='D'),
    'customer_id': [1500, 1501, 1502],
    'product_id': [150, 151, 152],
    'quantity': [2, 3, 1],
    'price': [-20.00, -15.00, -30.00],  # Negative prices
    'discount': [0.1, 0.2, 0.15],
    'revenue': [0, 0, 0],  # Placeholder; will be recalculated
    'store_id': [5, 6, 7],
    'region': ['North', 'South', 'East'],
    'payment_method': ['Credit Card', 'Cash', 'Online'],
    'is_member': [True, False, True],
    'delivery_days': [3, 4, 2],
    'customer_age': [25, 34, 45],
    'gender': ['Male', 'Female', 'Other'],
    'product_category': ['Electronics', 'Clothing', 'Books'],
    'rating': [4.5, 3.0, 5.0],
    'return_status': ['Not Returned', 'Returned', 'Not Returned'],
    'feedback': ['Positive', 'Neutral', 'Negative'],
})], ignore_index=True)

# Display the updated DataFrame
display(df)

Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,is_member,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating
0,2023-01-01,1788,198,6,67.64,0.26,300.3216,4,North,Cash,False,6,59,Other,Clothing,3.9,Returned,Positive,17.42,FREESHIP,4.2
1,2023-01-02,1643,156,2,93.52,0.2,149.632,4,North,Cash,False,2,47,Other,Electronics,1.7,Not Returned,Positive,8.47,SAVE10,3.1
2,2023-01-03,1178,198,8,83.42,0.16,560.5824,3,North,Cash,False,4,61,Female,Books,1.4,Returned,Negative,16.6,FREESHIP,2.1
3,2023-01-04,1535,130,9,84.61,0.19,616.8069,8,East,Credit Card,False,6,52,Female,Books,2.3,Not Returned,Neutral,5.33,FREESHIP,1.6
4,2023-01-05,1515,185,5,93.51,0.02,458.199,9,East,Cash,False,6,46,Male,Books,2.1,Returned,Negative,12.45,,
5,2023-01-06,1474,129,1,56.15,0.24,42.674,3,East,Credit Card,True,4,38,Male,Books,4.8,Returned,Positive,,FREESHIP,3.9
6,2023-01-07,1951,192,4,11.77,0.1,42.372,2,East,Online,False,3,59,Female,Electronics,2.6,Not Returned,Positive,17.6,FREESHIP,2.3
7,2023-01-08,1966,151,1,41.08,0.22,32.0424,2,West,Credit Card,False,4,34,Female,Books,2.2,Not Returned,Neutral,9.32,SAVE10,
8,2023-01-09,1119,137,9,76.79,0.18,566.7102,4,North,Online,False,3,64,Female,Books,2.2,Returned,Negative,19.82,,1.0
9,2023-01-10,1941,183,4,93.48,0.13,325.3104,7,North,Online,False,6,39,Male,Clothing,4.3,Returned,Neutral,12.19,FREESHIP,3.4


<div class="alert alert-success">

**Solution 06:**
</div>

In [13]:
# 6. Validating Data Based on Custom Conditions
# Write a Pandas program to validate data based on custom conditions.

# Check if 'price' is greater than 0
valid_price = df['price'] > 0
if valid_price.all():
    print("All prices are valid.")
else:
    print("Some prices are invalid:")
    display(df[~valid_price])

Some prices are invalid:


Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,is_member,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating
38,2023-02-05,1500,150,2,-20.0,0.1,0.0,5,North,Credit Card,True,3,25,Male,Electronics,4.5,Not Returned,Positive,,,
39,2023-02-06,1501,151,3,-15.0,0.2,0.0,6,South,Cash,False,4,34,Female,Clothing,3.0,Returned,Neutral,,,
40,2023-02-07,1502,152,1,-30.0,0.15,0.0,7,East,Online,True,2,45,Other,Books,5.0,Not Returned,Negative,,,


<div class="alert alert-warning">

**7. Checking That a Specific Column Contains Only Unique Values**

**Write a Pandas program to check that a specific column contains only unique values.**

</div>

In [14]:
# Create date range
date_range = pd.date_range(start='2023-01-01', periods=35, freq='D')

# Create a dictionary of data
data = {
    'date': date_range,
    'customer_id': np.random.randint(1000, 2000, size=35),
    'product_id': np.random.randint(100, 200, size=35),
    'quantity': np.random.randint(1, 10, size=35),
    'price': np.round(np.random.uniform(10.0, 100.0, size=35), 2),
    'discount': np.round(np.random.uniform(0, 0.3, size=35), 2),
    'revenue': lambda df: df['quantity'] * df['price'] * (1 - df['discount']),
    'store_id': np.random.randint(1, 10, size=35),
    'region': np.random.choice(['North', 'South', 'East', 'West'], size=35),
    'payment_method': np.random.choice(['Credit Card', 'Cash', 'Online'], size=35),
    'is_member': np.random.choice([True, False], size=35),
    'delivery_days': np.random.randint(1, 7, size=35),
    'customer_age': np.random.randint(18, 70, size=35),
    'gender': np.random.choice(['Male', 'Female', 'Other'], size=35),
    'product_category': np.random.choice(['Electronics', 'Clothing', 'Books', 'Food'], size=35),
    'rating': np.round(np.random.uniform(1.0, 5.0, size=35), 1),
    'return_status': np.random.choice(['Returned', 'Not Returned'], size=35),
    'feedback': np.random.choice(['Positive', 'Neutral', 'Negative'], size=35),
}

# Create DataFrame
df = pd.DataFrame(data)

# Compute revenue using the lambda function above
df['revenue'] = df['quantity'] * df['price'] * (1 - df['discount'])

# Add 'shipping_cost' with some missing values
df['shipping_cost'] = np.where(np.random.rand(len(df)) < 0.2, np.nan, np.round(np.random.uniform(5.0, 20.0, size=len(df)), 2))

# Add 'coupon_code' with some missing values
coupon_options = ['SAVE10', 'WELCOME5', 'FREESHIP', np.nan]
df['coupon_code'] = np.random.choice(coupon_options, size=len(df), p=[0.2, 0.2, 0.2, 0.4])

# Add 'delivery_rating' with some missing values
df['delivery_rating'] = np.where(np.random.rand(len(df)) < 0.3, np.nan, np.round(np.random.uniform(1.0, 5.0, size=len(df)), 1))

# Add 3 duplicate rows
df = pd.concat([df, df.sample(n=3, random_state=42)], ignore_index=True)

# Add 3 more rows with negative price
df = pd.concat([df, pd.DataFrame({
    'date': pd.date_range(start='2023-02-05', periods=3, freq='D'),
    'customer_id': [1500, 1501, 1502],
    'product_id': [150, 151, 152],
    'quantity': [2, 3, 1],
    'price': [-20.00, -15.00, -30.00],  # Negative prices
    'discount': [0.1, 0.2, 0.15],
    'revenue': [0, 0, 0],  # Placeholder; will be recalculated
    'store_id': [5, 6, 7],
    'region': ['North', 'South', 'East'],
    'payment_method': ['Credit Card', 'Cash', 'Online'],
    'is_member': [True, False, True],
    'delivery_days': [3, 4, 2],
    'customer_age': [25, 34, 45],
    'gender': ['Male', 'Female', 'Other'],
    'product_category': ['Electronics', 'Clothing', 'Books'],
    'rating': [4.5, 3.0, 5.0],
    'return_status': ['Not Returned', 'Returned', 'Not Returned'],
    'feedback': ['Positive', 'Neutral', 'Negative'],
})], ignore_index=True)

# add column with unique value
df['unique_id'] = pd.Series(range(1, len(df) + 1))

# Display the updated DataFrame
display(df)

Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,is_member,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating,unique_id
0,2023-01-01,1252,105,2,54.65,0.24,83.068,5,East,Cash,False,4,52,Female,Electronics,4.7,Returned,Neutral,15.96,,2.3,1
1,2023-01-02,1754,165,3,63.11,0.16,159.0372,1,North,Cash,True,1,43,Male,Food,4.3,Not Returned,Negative,14.79,,,2
2,2023-01-03,1602,169,4,26.94,0.17,89.4408,9,East,Cash,True,2,29,Male,Electronics,3.2,Not Returned,Neutral,8.72,WELCOME5,3.9,3
3,2023-01-04,1483,133,5,89.76,0.15,381.48,8,North,Online,True,3,68,Male,Clothing,2.4,Not Returned,Positive,16.15,SAVE10,2.1,4
4,2023-01-05,1519,102,1,69.28,0.18,56.8096,1,North,Credit Card,True,5,25,Female,Books,2.0,Returned,Positive,16.21,,,5
5,2023-01-06,1947,114,4,71.38,0.23,219.8504,7,North,Online,False,6,39,Other,Electronics,1.4,Returned,Negative,17.43,WELCOME5,1.6,6
6,2023-01-07,1371,150,3,70.0,0.03,203.7,2,South,Credit Card,False,2,68,Male,Books,2.3,Not Returned,Positive,17.56,SAVE10,,7
7,2023-01-08,1051,100,7,56.6,0.0,396.2,6,North,Credit Card,False,1,38,Other,Food,1.8,Returned,Negative,,SAVE10,4.9,8
8,2023-01-09,1798,161,6,90.35,0.26,401.154,2,West,Cash,False,1,52,Female,Books,2.7,Not Returned,Positive,9.91,SAVE10,1.0,9
9,2023-01-10,1707,154,6,28.0,0.29,119.28,5,West,Credit Card,True,3,66,Female,Clothing,4.4,Returned,Positive,,,1.7,10


<div class="alert alert-success">

**Solution 07:**
</div>

In [15]:
pd.Series.is_unique?

[0;31mType:[0m        property
[0;31mString form:[0m <property object at 0x116f60220>
[0;31mDocstring:[0m  
Return boolean if values in the object are unique.

Returns
-------
bool

Examples
--------
>>> s = pd.Series([1, 2, 3])
>>> s.is_unique
True

>>> s = pd.Series([1, 2, 3, 1])
>>> s.is_unique
False

In [16]:
# 7. Checking That a Specific Column Contains Only Unique Values
# Write a Pandas program to check that a specific column contains only unique values.


def is_unique(column_name, dataframe):
    if dataframe[column_name].is_unique:
        print("The column {:<20} contains only unique values.".format(column_name))
    else:
        print("The column {:<20} has more than one unique value - has duplicates.".format(column_name))

for col in df.columns:
    is_unique(col, df)

The column date                 has more than one unique value - has duplicates.
The column customer_id          has more than one unique value - has duplicates.
The column product_id           has more than one unique value - has duplicates.
The column quantity             has more than one unique value - has duplicates.
The column price                has more than one unique value - has duplicates.
The column discount             has more than one unique value - has duplicates.
The column revenue              has more than one unique value - has duplicates.
The column store_id             has more than one unique value - has duplicates.
The column region               has more than one unique value - has duplicates.
The column payment_method       has more than one unique value - has duplicates.
The column is_member            has more than one unique value - has duplicates.
The column delivery_days        has more than one unique value - has duplicates.
The column customer_age     

<div class="alert alert-warning">

**8. Detecting Outliers in a DataFrame**

**Write a Pandas program to detect outliers in a DataFrame.**

</div>

In [17]:
# Create date range
date_range = pd.date_range(start='2023-01-01', periods=35, freq='D')

# Create a dictionary of data
data = {
    'date': date_range,
    'customer_id': np.random.randint(1000, 2000, size=35),
    'product_id': np.random.randint(100, 200, size=35),
    'quantity': np.random.randint(1, 10, size=35),
    'price': np.round(np.random.uniform(10.0, 100.0, size=35), 2),
    'discount': np.round(np.random.uniform(0, 0.3, size=35), 2),
    'revenue': lambda df: df['quantity'] * df['price'] * (1 - df['discount']),
    'store_id': np.random.randint(1, 10, size=35),
    'region': np.random.choice(['North', 'South', 'East', 'West'], size=35),
    'payment_method': np.random.choice(['Credit Card', 'Cash', 'Online'], size=35),
    'is_member': np.random.choice([True, False], size=35),
    'delivery_days': np.random.randint(1, 7, size=35),
    'customer_age': np.random.randint(18, 70, size=35),
    'gender': np.random.choice(['Male', 'Female', 'Other'], size=35),
    'product_category': np.random.choice(['Electronics', 'Clothing', 'Books', 'Food'], size=35),
    'rating': np.round(np.random.uniform(1.0, 5.0, size=35), 1),
    'return_status': np.random.choice(['Returned', 'Not Returned'], size=35),
    'feedback': np.random.choice(['Positive', 'Neutral', 'Negative'], size=35),
}

# Create DataFrame
df = pd.DataFrame(data)

# Compute revenue using the lambda function above
df['revenue'] = df['quantity'] * df['price'] * (1 - df['discount'])

# Add 'shipping_cost' with some missing values
df['shipping_cost'] = np.where(np.random.rand(len(df)) < 0.2, np.nan, np.round(np.random.uniform(5.0, 20.0, size=len(df)), 2))

# Add 'coupon_code' with some missing values
coupon_options = ['SAVE10', 'WELCOME5', 'FREESHIP', np.nan]
df['coupon_code'] = np.random.choice(coupon_options, size=len(df), p=[0.2, 0.2, 0.2, 0.4])

# Add 'delivery_rating' with some missing values
df['delivery_rating'] = np.where(np.random.rand(len(df)) < 0.3, np.nan, np.round(np.random.uniform(1.0, 5.0, size=len(df)), 1))

# Add 3 duplicate rows
df = pd.concat([df, df.sample(n=3, random_state=42)], ignore_index=True)

# Add 3 more rows with negative price
df = pd.concat([df, pd.DataFrame({
    'date': pd.date_range(start='2023-02-05', periods=3, freq='D'),
    'customer_id': [1500, 1501, 1502],
    'product_id': [150, 151, 152],
    'quantity': [2, 3, 1],
    'price': [-20.00, -15.00, -30.00],  # Negative prices
    'discount': [0.1, 0.2, 0.15],
    'revenue': [0, 0, 0],  # Placeholder; will be recalculated
    'store_id': [5, 6, 7],
    'region': ['North', 'South', 'East'],
    'payment_method': ['Credit Card', 'Cash', 'Online'],
    'is_member': [True, False, True],
    'delivery_days': [3, 4, 2],
    'customer_age': [25, 34, 45],
    'gender': ['Male', 'Female', 'Other'],
    'product_category': ['Electronics', 'Clothing', 'Books'],
    'rating': [4.5, 3.0, 5.0],
    'return_status': ['Not Returned', 'Returned', 'Not Returned'],
    'feedback': ['Positive', 'Neutral', 'Negative'],
})], ignore_index=True)

# add column with unique value
df['unique_id'] = pd.Series(range(1, len(df) + 1))

# add outliers
df.loc[[0,3,5], ['price']] = [9999.99, 8888.88, 7777.77]

# Display the updated DataFrame
display(df)

Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,is_member,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating,unique_id
0,2023-01-01,1267,150,2,9999.99,0.2,45.36,1,North,Cash,True,5,30,Other,Clothing,2.1,Returned,Positive,16.86,FREESHIP,1.0,1
1,2023-01-02,1593,185,4,19.19,0.29,54.4996,3,West,Cash,False,2,53,Male,Books,2.5,Not Returned,Negative,13.02,SAVE10,2.7,2
2,2023-01-03,1044,154,3,20.1,0.08,55.476,5,North,Cash,False,2,29,Female,Clothing,3.3,Not Returned,Neutral,14.96,SAVE10,1.1,3
3,2023-01-04,1010,113,5,8888.88,0.03,156.8005,6,South,Credit Card,False,1,25,Other,Electronics,2.8,Returned,Neutral,7.75,WELCOME5,4.9,4
4,2023-01-05,1734,107,5,90.5,0.28,325.8,3,West,Cash,True,5,19,Male,Books,2.9,Returned,Negative,,WELCOME5,1.4,5
5,2023-01-06,1296,140,1,7777.77,0.05,44.6215,2,South,Online,False,5,21,Male,Food,4.5,Not Returned,Neutral,,WELCOME5,4.1,6
6,2023-01-07,1869,157,9,20.12,0.25,135.81,3,North,Online,False,1,53,Male,Food,1.4,Returned,Neutral,11.95,FREESHIP,2.2,7
7,2023-01-08,1241,135,6,84.37,0.29,359.4162,3,North,Credit Card,True,4,34,Female,Clothing,1.8,Returned,Negative,15.98,FREESHIP,2.4,8
8,2023-01-09,1310,183,1,60.21,0.15,51.1785,5,East,Online,True,4,56,Female,Electronics,1.3,Returned,Positive,7.35,WELCOME5,2.4,9
9,2023-01-10,1759,177,1,62.53,0.15,53.1505,4,East,Cash,False,2,24,Male,Clothing,4.6,Returned,Negative,11.69,WELCOME5,,10


<div class="alert alert-success">

**Solution 08:**
</div>

In [18]:
# 8. Detecting Outliers in a DataFrame
# Write a Pandas program to detect outliers in a DataFrame.

def detect_outliers(column_name, dataframe):
    if dataframe[column_name].dtype not in [np.float64, np.int64]:
        print("Column {:>30} is not numeric and will be skipped.".format(column_name))
        return
    else:
        Q1 = dataframe[column_name].quantile(0.25)
        Q3 = dataframe[column_name].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        outliers = dataframe[(dataframe[column_name] < lower_bound) | (dataframe[column_name] > upper_bound)]
    if not outliers.empty:
        print("Column {:>30} OUTLIERS DETECTED.\n####### OUTLIERS #######".format(column_name))
        display(outliers)
    else:
        print("Column {:>30} has no outliers detected.".format(column_name))

for col in df.columns:
    detect_outliers(col, df)

Column                           date is not numeric and will be skipped.
Column                    customer_id has no outliers detected.
Column                     product_id has no outliers detected.
Column                       quantity has no outliers detected.
Column                          price OUTLIERS DETECTED.
####### OUTLIERS #######


Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,is_member,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating,unique_id
0,2023-01-01,1267,150,2,9999.99,0.2,45.36,1,North,Cash,True,5,30,Other,Clothing,2.1,Returned,Positive,16.86,FREESHIP,1.0,1
3,2023-01-04,1010,113,5,8888.88,0.03,156.8005,6,South,Credit Card,False,1,25,Other,Electronics,2.8,Returned,Neutral,7.75,WELCOME5,4.9,4
5,2023-01-06,1296,140,1,7777.77,0.05,44.6215,2,South,Online,False,5,21,Male,Food,4.5,Not Returned,Neutral,,WELCOME5,4.1,6


Column                       discount has no outliers detected.
Column                        revenue has no outliers detected.
Column                       store_id has no outliers detected.
Column                         region is not numeric and will be skipped.
Column                 payment_method is not numeric and will be skipped.
Column                      is_member is not numeric and will be skipped.
Column                  delivery_days has no outliers detected.
Column                   customer_age has no outliers detected.
Column                         gender is not numeric and will be skipped.
Column               product_category is not numeric and will be skipped.
Column                         rating has no outliers detected.
Column                  return_status is not numeric and will be skipped.
Column                       feedback is not numeric and will be skipped.
Column                  shipping_cost has no outliers detected.
Column                    coupon_c

In [19]:
# finding outliers using the 3-σ rule

def detect_outliers_3sigma(column_name, dataframe):
    if dataframe[column_name].dtype not in [np.float64, np.int64]:
        print("Column {:>30} is not numeric and will be skipped.".format(column_name))
        return
    else:
        mean = dataframe[column_name].mean()
        std = dataframe[column_name].std()
        lower_bound = mean - 3 * std
        upper_bound = mean + 3 * std
        outliers = dataframe[(dataframe[column_name] < lower_bound) | (dataframe[column_name] > upper_bound)]
    
    if not outliers.empty:
        print("Column {:>30} OUTLIERS DETECTED (3σ rule).\n####### OUTLIERS #######".format(column_name))
        display(outliers)
    else:
        print("Column {:>30} has no outliers detected.".format(column_name))

# Apply to all columns
for col in df.columns:
    detect_outliers_3sigma(col, df)


Column                           date is not numeric and will be skipped.
Column                    customer_id has no outliers detected.
Column                     product_id has no outliers detected.
Column                       quantity has no outliers detected.
Column                          price OUTLIERS DETECTED (3σ rule).
####### OUTLIERS #######


Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,is_member,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating,unique_id
0,2023-01-01,1267,150,2,9999.99,0.2,45.36,1,North,Cash,True,5,30,Other,Clothing,2.1,Returned,Positive,16.86,FREESHIP,1.0,1
3,2023-01-04,1010,113,5,8888.88,0.03,156.8005,6,South,Credit Card,False,1,25,Other,Electronics,2.8,Returned,Neutral,7.75,WELCOME5,4.9,4
5,2023-01-06,1296,140,1,7777.77,0.05,44.6215,2,South,Online,False,5,21,Male,Food,4.5,Not Returned,Neutral,,WELCOME5,4.1,6


Column                       discount has no outliers detected.
Column                        revenue has no outliers detected.
Column                       store_id has no outliers detected.
Column                         region is not numeric and will be skipped.
Column                 payment_method is not numeric and will be skipped.
Column                      is_member is not numeric and will be skipped.
Column                  delivery_days has no outliers detected.
Column                   customer_age has no outliers detected.
Column                         gender is not numeric and will be skipped.
Column               product_category is not numeric and will be skipped.
Column                         rating has no outliers detected.
Column                  return_status is not numeric and will be skipped.
Column                       feedback is not numeric and will be skipped.
Column                  shipping_cost has no outliers detected.
Column                    coupon_c

<div class="alert alert-warning">

**9. Validating Date Formats in a DataFrame**

**Write a Pandas program to validate date formats in a DataFrame.**

</div>

In [20]:
# Create date range
date_range = pd.date_range(start='2023-01-01', periods=35, freq='D')

# Create a dictionary of data
data = {
    'date': date_range,
    'customer_id': np.random.randint(1000, 2000, size=35),
    'product_id': np.random.randint(100, 200, size=35),
    'quantity': np.random.randint(1, 10, size=35),
    'price': np.round(np.random.uniform(10.0, 100.0, size=35), 2),
    'discount': np.round(np.random.uniform(0, 0.3, size=35), 2),
    'revenue': lambda df: df['quantity'] * df['price'] * (1 - df['discount']),
    'store_id': np.random.randint(1, 10, size=35),
    'region': np.random.choice(['North', 'South', 'East', 'West'], size=35),
    'payment_method': np.random.choice(['Credit Card', 'Cash', 'Online'], size=35),
    'is_member': np.random.choice([True, False], size=35),
    'delivery_days': np.random.randint(1, 7, size=35),
    'customer_age': np.random.randint(18, 70, size=35),
    'gender': np.random.choice(['Male', 'Female', 'Other'], size=35),
    'product_category': np.random.choice(['Electronics', 'Clothing', 'Books', 'Food'], size=35),
    'rating': np.round(np.random.uniform(1.0, 5.0, size=35), 1),
    'return_status': np.random.choice(['Returned', 'Not Returned'], size=35),
    'feedback': np.random.choice(['Positive', 'Neutral', 'Negative'], size=35),
}

# Create DataFrame
df = pd.DataFrame(data)

# Compute revenue using the lambda function above
df['revenue'] = df['quantity'] * df['price'] * (1 - df['discount'])

# Add 'shipping_cost' with some missing values
df['shipping_cost'] = np.where(np.random.rand(len(df)) < 0.2, np.nan, np.round(np.random.uniform(5.0, 20.0, size=len(df)), 2))

# Add 'coupon_code' with some missing values
coupon_options = ['SAVE10', 'WELCOME5', 'FREESHIP', np.nan]
df['coupon_code'] = np.random.choice(coupon_options, size=len(df), p=[0.2, 0.2, 0.2, 0.4])

# Add 'delivery_rating' with some missing values
df['delivery_rating'] = np.where(np.random.rand(len(df)) < 0.3, np.nan, np.round(np.random.uniform(1.0, 5.0, size=len(df)), 1))

# Add 3 duplicate rows
df = pd.concat([df, df.sample(n=3, random_state=42)], ignore_index=True)

# Add 3 more rows with negative price
df = pd.concat([df, pd.DataFrame({
    'date': pd.date_range(start='2023-02-05', periods=3, freq='D'),
    'customer_id': [1500, 1501, 1502],
    'product_id': [150, 151, 152],
    'quantity': [2, 3, 1],
    'price': [-20.00, -15.00, -30.00],  # Negative prices
    'discount': [0.1, 0.2, 0.15],
    'revenue': [0, 0, 0],  # Placeholder; will be recalculated
    'store_id': [5, 6, 7],
    'region': ['North', 'South', 'East'],
    'payment_method': ['Credit Card', 'Cash', 'Online'],
    'is_member': [True, False, True],
    'delivery_days': [3, 4, 2],
    'customer_age': [25, 34, 45],
    'gender': ['Male', 'Female', 'Other'],
    'product_category': ['Electronics', 'Clothing', 'Books'],
    'rating': [4.5, 3.0, 5.0],
    'return_status': ['Not Returned', 'Returned', 'Not Returned'],
    'feedback': ['Positive', 'Neutral', 'Negative'],
})], ignore_index=True)

# add column with unique value
df['unique_id'] = pd.Series(range(1, len(df) + 1))

# add outliers
df.loc[[0,3,5], ['price']] = [9999.99, 8888.88, 7777.77]

# Display the updated DataFrame
display(df)

Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,is_member,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating,unique_id
0,2023-01-01,1433,168,4,9999.99,0.04,326.7456,2,South,Online,False,2,30,Female,Books,1.4,Not Returned,Negative,11.24,SAVE10,,1
1,2023-01-02,1978,195,5,17.96,0.01,88.902,9,South,Credit Card,True,4,57,Other,Food,3.8,Returned,Neutral,19.75,SAVE10,,2
2,2023-01-03,1444,178,4,51.02,0.22,159.1824,4,West,Cash,False,5,19,Male,Clothing,3.3,Not Returned,Negative,7.62,SAVE10,4.2,3
3,2023-01-04,1276,174,1,8888.88,0.05,71.9245,1,South,Credit Card,True,2,20,Male,Clothing,4.8,Not Returned,Neutral,5.09,FREESHIP,4.3,4
4,2023-01-05,1530,138,8,10.03,0.27,58.5752,1,South,Cash,False,6,51,Other,Clothing,1.4,Returned,Neutral,,WELCOME5,1.8,5
5,2023-01-06,1567,103,7,7777.77,0.27,468.1271,9,West,Online,False,2,59,Male,Food,1.0,Returned,Positive,5.37,FREESHIP,3.3,6
6,2023-01-07,1948,141,9,91.8,0.2,660.96,3,North,Credit Card,True,2,33,Male,Books,2.5,Not Returned,Positive,12.71,WELCOME5,4.5,7
7,2023-01-08,1788,145,1,79.13,0.03,76.7561,4,East,Cash,False,5,55,Male,Food,2.7,Returned,Positive,,FREESHIP,4.4,8
8,2023-01-09,1328,174,5,92.12,0.29,327.026,2,South,Cash,False,6,48,Female,Electronics,2.3,Not Returned,Neutral,6.27,SAVE10,3.9,9
9,2023-01-10,1173,121,4,41.08,0.13,142.9584,1,West,Cash,True,6,45,Male,Clothing,3.0,Returned,Positive,,FREESHIP,,10


<div class="alert alert-success">

**Solution 09:**
</div>

In [21]:
# 9. Validating Date Formats in a DataFrame
# Write a Pandas program to validate date formats in a DataFrame.

def validate_date_formats(date_column, dataframe):
    if not pd.api.types.is_datetime64_any_dtype(dataframe[date_column]):
        print("Column {:>30} is not a datetime type and will be skipped.".format(date_column))
        return
    else:
        invalid_dates = dataframe[~dataframe[date_column].dt.strftime('%Y-%m-%d').str.match(r'^\d{4}-\d{2}-\d{2}$')]
    if not invalid_dates.empty:
        print("Column {:>30} INVALID DATES DETECTED.\n####### INVALID DATES #######".format(date_column))
        display(invalid_dates)
    else:
        print("Column {:>30} has no invalid dates detected.".format(date_column))

# Apply to all columns
for col in df.columns:
    validate_date_formats(col, df)


Column                           date has no invalid dates detected.
Column                    customer_id is not a datetime type and will be skipped.
Column                     product_id is not a datetime type and will be skipped.
Column                       quantity is not a datetime type and will be skipped.
Column                          price is not a datetime type and will be skipped.
Column                       discount is not a datetime type and will be skipped.
Column                        revenue is not a datetime type and will be skipped.
Column                       store_id is not a datetime type and will be skipped.
Column                         region is not a datetime type and will be skipped.
Column                 payment_method is not a datetime type and will be skipped.
Column                      is_member is not a datetime type and will be skipped.
Column                  delivery_days is not a datetime type and will be skipped.
Column                   cust

<div class="alert alert-warning">

**10. Ensuring No Missing Values in a Critical Column**

**Write a Pandas program that ensures no missing values in a critical column.**

</div>

In [22]:
# Create date range
date_range = pd.date_range(start='2023-01-01', periods=35, freq='D')

# Create a dictionary of data
data = {
    'date': date_range,
    'customer_id': np.random.randint(1000, 2000, size=35),
    'product_id': np.random.randint(100, 200, size=35),
    'quantity': np.random.randint(1, 10, size=35),
    'price': np.round(np.random.uniform(10.0, 100.0, size=35), 2),
    'discount': np.round(np.random.uniform(0, 0.3, size=35), 2),
    'revenue': lambda df: df['quantity'] * df['price'] * (1 - df['discount']),
    'store_id': np.random.randint(1, 10, size=35),
    'region': np.random.choice(['North', 'South', 'East', 'West'], size=35),
    'payment_method': np.random.choice(['Credit Card', 'Cash', 'Online'], size=35),
    'is_member': np.random.choice([True, False], size=35),
    'delivery_days': np.random.randint(1, 7, size=35),
    'customer_age': np.random.randint(18, 70, size=35),
    'gender': np.random.choice(['Male', 'Female', 'Other'], size=35),
    'product_category': np.random.choice(['Electronics', 'Clothing', 'Books', 'Food'], size=35),
    'rating': np.round(np.random.uniform(1.0, 5.0, size=35), 1),
    'return_status': np.random.choice(['Returned', 'Not Returned'], size=35),
    'feedback': np.random.choice(['Positive', 'Neutral', 'Negative'], size=35),
}

# Create DataFrame
df = pd.DataFrame(data)

# Compute revenue using the lambda function above
df['revenue'] = df['quantity'] * df['price'] * (1 - df['discount'])

# Add 'shipping_cost' with some missing values
df['shipping_cost'] = np.where(np.random.rand(len(df)) < 0.2, np.nan, np.round(np.random.uniform(5.0, 20.0, size=len(df)), 2))

# Add 'coupon_code' with some missing values
coupon_options = ['SAVE10', 'WELCOME5', 'FREESHIP', np.nan]
df['coupon_code'] = np.random.choice(coupon_options, size=len(df), p=[0.2, 0.2, 0.2, 0.4])

# Add 'delivery_rating' with some missing values
df['delivery_rating'] = np.where(np.random.rand(len(df)) < 0.3, np.nan, np.round(np.random.uniform(1.0, 5.0, size=len(df)), 1))

# Add 3 duplicate rows
df = pd.concat([df, df.sample(n=3, random_state=42)], ignore_index=True)

# Add 3 more rows with negative price
df = pd.concat([df, pd.DataFrame({
    'date': pd.date_range(start='2023-02-05', periods=3, freq='D'),
    'customer_id': [1500, 1501, 1502],
    'product_id': [150, 151, 152],
    'quantity': [2, 3, 1],
    'price': [-20.00, -15.00, -30.00],  # Negative prices
    'discount': [0.1, 0.2, 0.15],
    'revenue': [0, 0, 0],  # Placeholder; will be recalculated
    'store_id': [5, 6, 7],
    'region': ['North', 'South', 'East'],
    'payment_method': ['Credit Card', 'Cash', 'Online'],
    'is_member': [True, False, True],
    'delivery_days': [3, 4, 2],
    'customer_age': [25, 34, 45],
    'gender': ['Male', 'Female', 'Other'],
    'product_category': ['Electronics', 'Clothing', 'Books'],
    'rating': [4.5, 3.0, 5.0],
    'return_status': ['Not Returned', 'Returned', 'Not Returned'],
    'feedback': ['Positive', 'Neutral', 'Negative'],
})], ignore_index=True)

# add column with unique value
df['unique_id'] = pd.Series(range(1, len(df) + 1))

# add outliers
df.loc[[0,3,5], ['price']] = [9999.99, 8888.88, 7777.77]

# Display the updated DataFrame
display(df)

Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,is_member,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating,unique_id
0,2023-01-01,1220,137,1,9999.99,0.05,42.2085,8,East,Cash,False,4,61,Female,Books,2.4,Returned,Positive,,WELCOME5,4.9,1
1,2023-01-02,1530,119,4,67.63,0.15,229.942,7,East,Online,True,1,43,Other,Food,3.9,Returned,Neutral,10.5,WELCOME5,2.0,2
2,2023-01-03,1013,186,4,15.32,0.03,59.4416,6,West,Online,False,2,22,Other,Books,3.0,Returned,Neutral,6.68,,,3
3,2023-01-04,1438,130,6,8888.88,0.17,164.589,2,South,Credit Card,False,4,39,Male,Food,2.7,Not Returned,Negative,9.82,WELCOME5,1.7,4
4,2023-01-05,1618,147,7,76.5,0.22,417.69,8,East,Cash,True,2,22,Male,Food,2.4,Returned,Neutral,,FREESHIP,2.5,5
5,2023-01-06,1618,164,3,7777.77,0.29,97.554,4,North,Online,False,1,34,Female,Books,1.3,Returned,Neutral,,,1.4,6
6,2023-01-07,1997,152,2,52.18,0.01,103.3164,6,West,Credit Card,False,1,20,Male,Electronics,4.1,Not Returned,Positive,18.04,,1.2,7
7,2023-01-08,1599,154,8,87.87,0.22,548.3088,9,West,Credit Card,True,1,68,Other,Books,2.1,Not Returned,Positive,,SAVE10,1.9,8
8,2023-01-09,1298,101,8,17.58,0.23,108.2928,1,South,Credit Card,False,3,28,Male,Clothing,4.1,Not Returned,Neutral,12.14,WELCOME5,2.5,9
9,2023-01-10,1790,125,1,28.16,0.03,27.3152,4,South,Cash,True,1,61,Male,Books,3.7,Not Returned,Positive,13.99,FREESHIP,1.9,10


<div class="alert alert-success">

**Solution 10:**
</div>

In [23]:
# 10. Ensuring No Missing Values in a Critical Column
# Write a Pandas program that ensures no missing values in a critical column.

def check_missing_values(column_name, dataframe):
    missing_values = dataframe[column_name].isnull().sum()
    if missing_values > 0:
        print("Column {:>30} has {} missing values.".format(column_name, missing_values))
        display(dataframe[dataframe[column_name].isnull()])
    else:
        print("Column {:>30} has no missing values.".format(column_name))

# Apply to all columns
for col in df.columns:
    check_missing_values(col, df)

Column                           date has no missing values.
Column                    customer_id has no missing values.
Column                     product_id has no missing values.
Column                       quantity has no missing values.
Column                          price has no missing values.
Column                       discount has no missing values.
Column                        revenue has no missing values.
Column                       store_id has no missing values.
Column                         region has no missing values.
Column                 payment_method has no missing values.
Column                      is_member has no missing values.
Column                  delivery_days has no missing values.
Column                   customer_age has no missing values.
Column                         gender has no missing values.
Column               product_category has no missing values.
Column                         rating has no missing values.
Column                  

Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,is_member,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating,unique_id
0,2023-01-01,1220,137,1,9999.99,0.05,42.2085,8,East,Cash,False,4,61,Female,Books,2.4,Returned,Positive,,WELCOME5,4.9,1
4,2023-01-05,1618,147,7,76.5,0.22,417.69,8,East,Cash,True,2,22,Male,Food,2.4,Returned,Neutral,,FREESHIP,2.5,5
5,2023-01-06,1618,164,3,7777.77,0.29,97.554,4,North,Online,False,1,34,Female,Books,1.3,Returned,Neutral,,,1.4,6
7,2023-01-08,1599,154,8,87.87,0.22,548.3088,9,West,Credit Card,True,1,68,Other,Books,2.1,Not Returned,Positive,,SAVE10,1.9,8
17,2023-01-18,1077,129,2,64.36,0.09,117.1352,1,South,Credit Card,False,1,19,Other,Electronics,3.7,Returned,Negative,,,4.1,18
21,2023-01-22,1821,181,7,32.68,0.21,180.7204,7,East,Cash,True,5,47,Female,Food,2.6,Returned,Negative,,FREESHIP,1.0,22
27,2023-01-28,1743,163,5,36.97,0.1,166.365,6,East,Cash,True,3,34,Other,Electronics,3.2,Returned,Neutral,,WELCOME5,2.8,28
28,2023-01-29,1500,104,5,13.3,0.27,48.545,3,West,Online,True,4,45,Other,Clothing,5.0,Not Returned,Negative,,FREESHIP,4.7,29
32,2023-02-02,1084,103,5,85.2,0.29,302.46,7,East,Cash,True,2,33,Other,Books,4.8,Not Returned,Negative,,,,33
38,2023-02-05,1500,150,2,-20.0,0.1,0.0,5,North,Credit Card,True,3,25,Male,Electronics,4.5,Not Returned,Positive,,,,39


Column                    coupon_code has 3 missing values.


Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,is_member,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating,unique_id
38,2023-02-05,1500,150,2,-20.0,0.1,0.0,5,North,Credit Card,True,3,25,Male,Electronics,4.5,Not Returned,Positive,,,,39
39,2023-02-06,1501,151,3,-15.0,0.2,0.0,6,South,Cash,False,4,34,Female,Clothing,3.0,Returned,Neutral,,,,40
40,2023-02-07,1502,152,1,-30.0,0.15,0.0,7,East,Online,True,2,45,Other,Books,5.0,Not Returned,Negative,,,,41


Column                delivery_rating has 11 missing values.


Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,is_member,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating,unique_id
2,2023-01-03,1013,186,4,15.32,0.03,59.4416,6,West,Online,False,2,22,Other,Books,3.0,Returned,Neutral,6.68,,,3
18,2023-01-19,1058,173,5,52.52,0.26,194.324,3,East,Credit Card,True,2,39,Other,Food,2.7,Not Returned,Negative,9.12,FREESHIP,,19
20,2023-01-21,1604,156,9,40.2,0.14,311.148,5,West,Online,True,5,66,Other,Electronics,1.5,Not Returned,Negative,5.92,SAVE10,,21
24,2023-01-25,1560,129,3,17.27,0.03,50.2557,4,East,Online,True,1,31,Other,Electronics,3.5,Returned,Negative,5.19,SAVE10,,25
25,2023-01-26,1307,101,2,86.18,0.22,134.4408,9,West,Credit Card,True,1,20,Male,Books,4.6,Returned,Neutral,9.94,SAVE10,,26
32,2023-02-02,1084,103,5,85.2,0.29,302.46,7,East,Cash,True,2,33,Other,Books,4.8,Not Returned,Negative,,,,33
33,2023-02-03,1164,161,5,43.56,0.3,152.46,9,West,Online,True,2,32,Male,Electronics,2.4,Not Returned,Positive,19.42,,,34
37,2023-01-25,1560,129,3,17.27,0.03,50.2557,4,East,Online,True,1,31,Other,Electronics,3.5,Returned,Negative,5.19,SAVE10,,38
38,2023-02-05,1500,150,2,-20.0,0.1,0.0,5,North,Credit Card,True,3,25,Male,Electronics,4.5,Not Returned,Positive,,,,39
39,2023-02-06,1501,151,3,-15.0,0.2,0.0,6,South,Cash,False,4,34,Female,Clothing,3.0,Returned,Neutral,,,,40


Column                      unique_id has no missing values.
