[Pandas Data Validation](https://www.w3resource.com/python-exercises/pandas/pandas-data-validation.php)

In [1]:
import pandas as pd
import numpy as np

pd.set_option('display.max_columns', None)

<div class="alert alert-warning">

**1. Checking Missing Values in a DataFrame**

**Write a Pandas program to check for missing values in a DataFrame.**

</div>

In [2]:
# Create date range
date_range = pd.date_range(start='2023-01-01', periods=35, freq='D')

# Create a dictionary of data
data = {
    'date': date_range,
    'customer_id': np.random.randint(1000, 2000, size=35),
    'product_id': np.random.randint(100, 200, size=35),
    'quantity': np.random.randint(1, 10, size=35),
    'price': np.round(np.random.uniform(10.0, 100.0, size=35), 2),
    'discount': np.round(np.random.uniform(0, 0.3, size=35), 2),
    'revenue': lambda df: df['quantity'] * df['price'] * (1 - df['discount']),
    'store_id': np.random.randint(1, 10, size=35),
    'region': np.random.choice(['North', 'South', 'East', 'West'], size=35),
    'payment_method': np.random.choice(['Credit Card', 'Cash', 'Online'], size=35),
    'is_member': np.random.choice([True, False], size=35),
    'delivery_days': np.random.randint(1, 7, size=35),
    'customer_age': np.random.randint(18, 70, size=35),
    'gender': np.random.choice(['Male', 'Female', 'Other'], size=35),
    'product_category': np.random.choice(['Electronics', 'Clothing', 'Books', 'Food'], size=35),
    'rating': np.round(np.random.uniform(1.0, 5.0, size=35), 1),
    'return_status': np.random.choice(['Returned', 'Not Returned'], size=35),
    'feedback': np.random.choice(['Positive', 'Neutral', 'Negative'], size=35),
}

# Create DataFrame
df = pd.DataFrame(data)

# Compute revenue using the lambda function above
df['revenue'] = df['quantity'] * df['price'] * (1 - df['discount'])

# Add 'shipping_cost' with some missing values
df['shipping_cost'] = np.where(np.random.rand(len(df)) < 0.2, np.nan, np.round(np.random.uniform(5.0, 20.0, size=len(df)), 2))

# Add 'coupon_code' with some missing values
coupon_options = ['SAVE10', 'WELCOME5', 'FREESHIP', np.nan]
df['coupon_code'] = np.random.choice(coupon_options, size=len(df), p=[0.2, 0.2, 0.2, 0.4])

# Add 'delivery_rating' with some missing values
df['delivery_rating'] = np.where(np.random.rand(len(df)) < 0.3, np.nan, np.round(np.random.uniform(1.0, 5.0, size=len(df)), 1))

# Display the updated DataFrame
display(df)

Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,is_member,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating
0,2023-01-01,1398,166,5,38.13,0.09,173.4915,2,West,Cash,False,3,64,Female,Electronics,4.8,Not Returned,Neutral,8.27,SAVE10,
1,2023-01-02,1773,171,7,77.66,0.23,418.5874,4,West,Credit Card,True,4,44,Other,Clothing,1.3,Not Returned,Neutral,12.3,,4.7
2,2023-01-03,1631,146,4,25.76,0.04,98.9184,3,West,Cash,True,5,56,Male,Electronics,2.6,Returned,Positive,9.23,FREESHIP,
3,2023-01-04,1262,169,6,15.02,0.27,65.7876,9,West,Credit Card,True,6,30,Female,Books,3.0,Returned,Neutral,11.39,SAVE10,3.9
4,2023-01-05,1747,196,6,82.9,0.2,397.92,8,North,Credit Card,False,4,59,Female,Books,2.1,Returned,Negative,7.13,FREESHIP,4.6
5,2023-01-06,1520,174,9,64.29,0.29,410.8131,6,North,Cash,False,3,52,Other,Food,4.6,Not Returned,Negative,5.8,FREESHIP,4.6
6,2023-01-07,1184,118,5,69.69,0.07,324.0585,6,West,Cash,True,5,32,Female,Food,4.0,Returned,Negative,16.42,FREESHIP,1.6
7,2023-01-08,1838,152,6,18.42,0.01,109.4148,5,North,Cash,False,2,62,Male,Books,1.3,Not Returned,Positive,9.59,WELCOME5,3.4
8,2023-01-09,1634,169,7,16.24,0.06,106.8592,6,West,Online,False,5,68,Female,Electronics,1.7,Returned,Neutral,,SAVE10,
9,2023-01-10,1614,164,8,79.16,0.23,487.6256,8,South,Online,False,5,18,Female,Electronics,3.9,Not Returned,Negative,,SAVE10,1.1


<div class="alert alert-success">

**Solution 01:**
</div>

In [3]:
# 1. Checking Missing Values in a DataFrame
# Write a Pandas program to check for missing values in a DataFrame.

# Check for missing values
missing_values = df.isnull().sum()
missing_values[missing_values > 0]

shipping_cost       5
delivery_rating    11
dtype: int64

<div class="alert alert-warning">

**2. Ensuring Consistent Data Types of All Columns**

**Write a Pandas program to check and ensure that the data types of all columns are consistent.**

</div>

In [4]:
# Create date range
date_range = pd.date_range(start='2023-01-01', periods=35, freq='D')

# Create a dictionary of data
data = {
    'date': date_range,
    'customer_id': np.random.randint(1000, 2000, size=35),
    'product_id': np.random.randint(100, 200, size=35),
    'quantity': np.random.randint(1, 10, size=35),
    'price': np.round(np.random.uniform(10.0, 100.0, size=35), 2),
    'discount': np.round(np.random.uniform(0, 0.3, size=35), 2),
    'revenue': lambda df: df['quantity'] * df['price'] * (1 - df['discount']),
    'store_id': np.random.randint(1, 10, size=35),
    'region': np.random.choice(['North', 'South', 'East', 'West'], size=35),
    'payment_method': np.random.choice(['Credit Card', 'Cash', 'Online'], size=35),
    'is_member': np.random.choice([True, False], size=35),
    'delivery_days': np.random.randint(1, 7, size=35),
    'customer_age': np.random.randint(18, 70, size=35),
    'gender': np.random.choice(['Male', 'Female', 'Other'], size=35),
    'product_category': np.random.choice(['Electronics', 'Clothing', 'Books', 'Food'], size=35),
    'rating': np.round(np.random.uniform(1.0, 5.0, size=35), 1),
    'return_status': np.random.choice(['Returned', 'Not Returned'], size=35),
    'feedback': np.random.choice(['Positive', 'Neutral', 'Negative'], size=35),
}

# Create DataFrame
df = pd.DataFrame(data)

# Compute revenue using the lambda function above
df['revenue'] = df['quantity'] * df['price'] * (1 - df['discount'])

# Add 'shipping_cost' with some missing values
df['shipping_cost'] = np.where(np.random.rand(len(df)) < 0.2, np.nan, np.round(np.random.uniform(5.0, 20.0, size=len(df)), 2))

# Add 'coupon_code' with some missing values
coupon_options = ['SAVE10', 'WELCOME5', 'FREESHIP', np.nan]
df['coupon_code'] = np.random.choice(coupon_options, size=len(df), p=[0.2, 0.2, 0.2, 0.4])

# Add 'delivery_rating' with some missing values
df['delivery_rating'] = np.where(np.random.rand(len(df)) < 0.3, np.nan, np.round(np.random.uniform(1.0, 5.0, size=len(df)), 1))

# Display the updated DataFrame
display(df)

Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,is_member,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating
0,2023-01-01,1720,104,7,23.78,0.26,123.1804,2,East,Cash,True,6,27,Other,Electronics,1.8,Returned,Neutral,11.76,,4.6
1,2023-01-02,1217,126,6,15.5,0.3,65.1,3,South,Credit Card,False,1,50,Male,Electronics,3.5,Returned,Neutral,,,3.5
2,2023-01-03,1954,117,3,19.27,0.24,43.9356,2,West,Online,True,1,47,Male,Food,2.2,Not Returned,Positive,,,3.0
3,2023-01-04,1283,132,4,13.37,0.08,49.2016,9,East,Online,True,4,43,Male,Clothing,2.8,Returned,Positive,10.76,FREESHIP,4.7
4,2023-01-05,1666,174,7,44.82,0.14,269.8164,4,South,Online,True,5,40,Male,Food,3.0,Not Returned,Neutral,12.29,FREESHIP,
5,2023-01-06,1261,170,7,25.92,0.24,137.8944,6,West,Online,False,2,36,Female,Food,2.2,Not Returned,Negative,,FREESHIP,3.8
6,2023-01-07,1205,196,8,25.24,0.18,165.5744,1,North,Online,True,2,21,Male,Books,3.3,Returned,Positive,15.93,,
7,2023-01-08,1560,168,6,49.09,0.22,229.7412,3,East,Online,False,4,38,Male,Books,2.8,Not Returned,Negative,8.32,WELCOME5,2.0
8,2023-01-09,1298,195,6,35.01,0.24,159.6456,4,East,Credit Card,False,3,66,Male,Clothing,4.6,Returned,Positive,7.27,WELCOME5,2.4
9,2023-01-10,1846,119,9,30.15,0.07,252.3555,1,South,Online,True,3,19,Male,Electronics,3.1,Returned,Neutral,17.98,SAVE10,4.4


<div class="alert alert-success">

**Solution 02:**
</div>

In [5]:
# 2. Ensuring Consistent Data Types of All Columns
# Write a Pandas program to check and ensure that the data types of all columns are consistent.

# Check data types
data_types = df.dtypes
data_types

date                datetime64[ns]
customer_id                  int64
product_id                   int64
quantity                     int64
price                      float64
discount                   float64
revenue                    float64
store_id                     int64
region                      object
payment_method              object
is_member                     bool
delivery_days                int64
customer_age                 int64
gender                      object
product_category            object
rating                     float64
return_status               object
feedback                    object
shipping_cost              float64
coupon_code                 object
delivery_rating            float64
dtype: object

<div class="alert alert-warning">

**3. Validating the Data Type of a Specific Column**

**Write a Pandas program to validate the data type of a specific column in a DataFrame.**

</div>

In [6]:
# Create date range
date_range = pd.date_range(start='2023-01-01', periods=35, freq='D')

# Create a dictionary of data
data = {
    'date': date_range,
    'customer_id': np.random.randint(1000, 2000, size=35),
    'product_id': np.random.randint(100, 200, size=35),
    'quantity': np.random.randint(1, 10, size=35),
    'price': np.round(np.random.uniform(10.0, 100.0, size=35), 2),
    'discount': np.round(np.random.uniform(0, 0.3, size=35), 2),
    'revenue': lambda df: df['quantity'] * df['price'] * (1 - df['discount']),
    'store_id': np.random.randint(1, 10, size=35),
    'region': np.random.choice(['North', 'South', 'East', 'West'], size=35),
    'payment_method': np.random.choice(['Credit Card', 'Cash', 'Online'], size=35),
    'is_member': np.random.choice([True, False], size=35),
    'delivery_days': np.random.randint(1, 7, size=35),
    'customer_age': np.random.randint(18, 70, size=35),
    'gender': np.random.choice(['Male', 'Female', 'Other'], size=35),
    'product_category': np.random.choice(['Electronics', 'Clothing', 'Books', 'Food'], size=35),
    'rating': np.round(np.random.uniform(1.0, 5.0, size=35), 1),
    'return_status': np.random.choice(['Returned', 'Not Returned'], size=35),
    'feedback': np.random.choice(['Positive', 'Neutral', 'Negative'], size=35),
}

# Create DataFrame
df = pd.DataFrame(data)

# Compute revenue using the lambda function above
df['revenue'] = df['quantity'] * df['price'] * (1 - df['discount'])

# Add 'shipping_cost' with some missing values
df['shipping_cost'] = np.where(np.random.rand(len(df)) < 0.2, np.nan, np.round(np.random.uniform(5.0, 20.0, size=len(df)), 2))

# Add 'coupon_code' with some missing values
coupon_options = ['SAVE10', 'WELCOME5', 'FREESHIP', np.nan]
df['coupon_code'] = np.random.choice(coupon_options, size=len(df), p=[0.2, 0.2, 0.2, 0.4])

# Add 'delivery_rating' with some missing values
df['delivery_rating'] = np.where(np.random.rand(len(df)) < 0.3, np.nan, np.round(np.random.uniform(1.0, 5.0, size=len(df)), 1))

# Display the updated DataFrame
display(df)

Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,is_member,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating
0,2023-01-01,1522,190,2,19.57,0.03,37.9658,1,West,Cash,True,3,67,Other,Food,2.9,Not Returned,Neutral,12.5,,
1,2023-01-02,1037,183,4,16.11,0.17,53.4852,9,North,Credit Card,False,4,40,Male,Books,1.2,Returned,Neutral,,WELCOME5,1.9
2,2023-01-03,1704,122,9,29.11,0.13,227.9313,2,East,Cash,False,4,53,Other,Food,2.9,Not Returned,Positive,17.63,,4.0
3,2023-01-04,1090,167,7,49.52,0.14,298.1104,8,West,Cash,True,6,62,Other,Food,2.7,Returned,Negative,,FREESHIP,4.0
4,2023-01-05,1404,147,6,49.77,0.28,215.0064,3,East,Online,True,5,64,Other,Books,3.2,Returned,Neutral,12.95,WELCOME5,
5,2023-01-06,1053,125,6,67.69,0.21,320.8506,4,East,Credit Card,False,1,50,Other,Food,4.9,Returned,Neutral,,,2.1
6,2023-01-07,1931,169,2,19.84,0.04,38.0928,6,North,Online,False,2,56,Female,Food,4.6,Not Returned,Neutral,13.67,,
7,2023-01-08,1724,106,5,80.13,0.06,376.611,6,North,Online,False,3,40,Female,Electronics,1.2,Not Returned,Neutral,15.9,WELCOME5,
8,2023-01-09,1141,173,4,38.24,0.04,146.8416,6,South,Cash,False,4,31,Other,Electronics,4.5,Returned,Neutral,,,
9,2023-01-10,1167,124,2,20.59,0.14,35.4148,3,South,Online,False,2,21,Female,Clothing,3.5,Not Returned,Negative,16.03,,


<div class="alert alert-success">

**Solution 03:**
</div>

In [7]:
# 3. Validating the Data Type of a Specific Column
# Write a Pandas program to validate the data type of a specific column in a DataFrame.

# Check if 'price' column is of type float
is_of_type = 'float64'
is_price_float = df['price'].dtype == is_of_type
is_price_float

# Validate the data type of 'price' column
if is_price_float:
    print("The 'price' column is of type {}.".format(df['price'].dtype))
else:
    print("The 'price' column is not of type {}, it is {}.".format(is_of_type, df['price'].dtype))

The 'price' column is of type float64.


<div class="alert alert-warning">

**4. Checking Duplicate Rows in a DataFrame**

**Write a Pandas program to check duplicate rows in a DataFrame.**

</div>

In [8]:
# Create date range
date_range = pd.date_range(start='2023-01-01', periods=35, freq='D')

# Create a dictionary of data
data = {
    'date': date_range,
    'customer_id': np.random.randint(1000, 2000, size=35),
    'product_id': np.random.randint(100, 200, size=35),
    'quantity': np.random.randint(1, 10, size=35),
    'price': np.round(np.random.uniform(10.0, 100.0, size=35), 2),
    'discount': np.round(np.random.uniform(0, 0.3, size=35), 2),
    'revenue': lambda df: df['quantity'] * df['price'] * (1 - df['discount']),
    'store_id': np.random.randint(1, 10, size=35),
    'region': np.random.choice(['North', 'South', 'East', 'West'], size=35),
    'payment_method': np.random.choice(['Credit Card', 'Cash', 'Online'], size=35),
    'is_member': np.random.choice([True, False], size=35),
    'delivery_days': np.random.randint(1, 7, size=35),
    'customer_age': np.random.randint(18, 70, size=35),
    'gender': np.random.choice(['Male', 'Female', 'Other'], size=35),
    'product_category': np.random.choice(['Electronics', 'Clothing', 'Books', 'Food'], size=35),
    'rating': np.round(np.random.uniform(1.0, 5.0, size=35), 1),
    'return_status': np.random.choice(['Returned', 'Not Returned'], size=35),
    'feedback': np.random.choice(['Positive', 'Neutral', 'Negative'], size=35),
}

# Create DataFrame
df = pd.DataFrame(data)

# Compute revenue using the lambda function above
df['revenue'] = df['quantity'] * df['price'] * (1 - df['discount'])

# Add 'shipping_cost' with some missing values
df['shipping_cost'] = np.where(np.random.rand(len(df)) < 0.2, np.nan, np.round(np.random.uniform(5.0, 20.0, size=len(df)), 2))

# Add 'coupon_code' with some missing values
coupon_options = ['SAVE10', 'WELCOME5', 'FREESHIP', np.nan]
df['coupon_code'] = np.random.choice(coupon_options, size=len(df), p=[0.2, 0.2, 0.2, 0.4])

# Add 'delivery_rating' with some missing values
df['delivery_rating'] = np.where(np.random.rand(len(df)) < 0.3, np.nan, np.round(np.random.uniform(1.0, 5.0, size=len(df)), 1))

# Add 3 duplicate rows
df = pd.concat([df, df.sample(n=3, random_state=42)], ignore_index=True)

# Display the updated DataFrame
display(df)

Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,is_member,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating
0,2023-01-01,1584,138,4,74.05,0.22,231.036,5,South,Online,False,1,40,Female,Clothing,4.5,Returned,Positive,10.63,,
1,2023-01-02,1292,174,7,21.82,0.13,132.8838,6,North,Cash,False,2,18,Female,Electronics,4.8,Returned,Neutral,19.88,SAVE10,
2,2023-01-03,1145,187,8,62.82,0.07,467.3808,5,North,Cash,True,3,55,Male,Clothing,1.3,Not Returned,Negative,8.39,WELCOME5,4.7
3,2023-01-04,1200,184,5,26.93,0.21,106.3735,4,East,Online,True,4,49,Male,Food,2.1,Returned,Neutral,18.78,SAVE10,2.7
4,2023-01-05,1942,155,5,47.32,0.07,220.038,1,North,Credit Card,True,3,61,Male,Food,3.3,Returned,Positive,14.22,,
5,2023-01-06,1938,141,8,91.66,0.03,711.2816,1,East,Credit Card,False,2,38,Other,Books,2.3,Not Returned,Neutral,15.52,,4.4
6,2023-01-07,1694,179,2,40.0,0.13,69.6,6,South,Online,True,3,43,Male,Clothing,4.4,Not Returned,Negative,19.06,FREESHIP,1.1
7,2023-01-08,1453,178,6,52.31,0.11,279.3354,6,West,Cash,False,3,55,Male,Food,3.2,Returned,Negative,12.63,,
8,2023-01-09,1357,107,4,77.75,0.15,264.35,8,North,Cash,True,5,60,Female,Electronics,4.5,Not Returned,Negative,7.87,SAVE10,1.4
9,2023-01-10,1048,172,7,40.36,0.25,211.89,1,East,Credit Card,False,4,27,Other,Books,2.5,Not Returned,Negative,17.92,FREESHIP,3.5


<div class="alert alert-success">

**Solution 04:**
</div>

In [9]:
# 4. Checking Duplicate Rows in a DataFrame
# Write a Pandas program to check duplicate rows in a DataFrame.

# Check for duplicate rows
duplicate_rows = df[df.duplicated()]
duplicate_rows

Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,is_member,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating
35,2023-01-27,1669,195,8,68.59,0.03,532.2584,2,South,Credit Card,True,1,24,Male,Electronics,3.9,Returned,Negative,,WELCOME5,3.1
36,2023-01-14,1811,148,1,54.8,0.07,50.964,9,South,Cash,True,4,23,Female,Electronics,1.5,Returned,Neutral,8.14,FREESHIP,
37,2023-01-25,1289,128,7,42.04,0.07,273.6804,8,North,Cash,False,1,35,Other,Books,1.9,Returned,Positive,6.48,SAVE10,2.0


<div class="alert alert-warning">

**5. Removing Duplicate Rows from a DataFrame**

**Write a Pandas program to remove duplicate rows from a DataFrame.**

</div>

In [10]:
# Create date range
date_range = pd.date_range(start='2023-01-01', periods=35, freq='D')

# Create a dictionary of data
data = {
    'date': date_range,
    'customer_id': np.random.randint(1000, 2000, size=35),
    'product_id': np.random.randint(100, 200, size=35),
    'quantity': np.random.randint(1, 10, size=35),
    'price': np.round(np.random.uniform(10.0, 100.0, size=35), 2),
    'discount': np.round(np.random.uniform(0, 0.3, size=35), 2),
    'revenue': lambda df: df['quantity'] * df['price'] * (1 - df['discount']),
    'store_id': np.random.randint(1, 10, size=35),
    'region': np.random.choice(['North', 'South', 'East', 'West'], size=35),
    'payment_method': np.random.choice(['Credit Card', 'Cash', 'Online'], size=35),
    'is_member': np.random.choice([True, False], size=35),
    'delivery_days': np.random.randint(1, 7, size=35),
    'customer_age': np.random.randint(18, 70, size=35),
    'gender': np.random.choice(['Male', 'Female', 'Other'], size=35),
    'product_category': np.random.choice(['Electronics', 'Clothing', 'Books', 'Food'], size=35),
    'rating': np.round(np.random.uniform(1.0, 5.0, size=35), 1),
    'return_status': np.random.choice(['Returned', 'Not Returned'], size=35),
    'feedback': np.random.choice(['Positive', 'Neutral', 'Negative'], size=35),
}

# Create DataFrame
df = pd.DataFrame(data)

# Compute revenue using the lambda function above
df['revenue'] = df['quantity'] * df['price'] * (1 - df['discount'])

# Add 'shipping_cost' with some missing values
df['shipping_cost'] = np.where(np.random.rand(len(df)) < 0.2, np.nan, np.round(np.random.uniform(5.0, 20.0, size=len(df)), 2))

# Add 'coupon_code' with some missing values
coupon_options = ['SAVE10', 'WELCOME5', 'FREESHIP', np.nan]
df['coupon_code'] = np.random.choice(coupon_options, size=len(df), p=[0.2, 0.2, 0.2, 0.4])

# Add 'delivery_rating' with some missing values
df['delivery_rating'] = np.where(np.random.rand(len(df)) < 0.3, np.nan, np.round(np.random.uniform(1.0, 5.0, size=len(df)), 1))

# Add 3 duplicate rows
df = pd.concat([df, df.sample(n=3, random_state=42)], ignore_index=True)

# Display the updated DataFrame
display(df)

Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,is_member,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating
0,2023-01-01,1765,149,8,23.92,0.11,170.3104,2,South,Online,False,1,28,Male,Clothing,5.0,Returned,Neutral,16.4,WELCOME5,1.7
1,2023-01-02,1745,133,2,84.79,0.12,149.2304,2,North,Online,True,2,36,Male,Books,2.7,Not Returned,Negative,6.65,,4.5
2,2023-01-03,1936,183,6,48.04,0.12,253.6512,5,South,Cash,True,3,50,Other,Books,5.0,Not Returned,Negative,5.93,,4.4
3,2023-01-04,1637,142,4,24.17,0.18,79.2776,2,East,Credit Card,False,3,59,Other,Books,3.1,Not Returned,Neutral,,,
4,2023-01-05,1623,168,5,10.99,0.29,39.0145,3,East,Cash,False,2,41,Female,Clothing,3.1,Returned,Positive,15.72,,1.1
5,2023-01-06,1751,101,3,78.57,0.12,207.4248,3,East,Online,False,3,32,Other,Electronics,1.1,Returned,Positive,,WELCOME5,
6,2023-01-07,1391,160,6,67.33,0.21,319.1442,3,South,Credit Card,False,4,49,Female,Books,4.2,Not Returned,Positive,17.12,SAVE10,5.0
7,2023-01-08,1780,106,8,94.96,0.15,645.728,2,West,Cash,True,5,43,Other,Electronics,2.8,Not Returned,Positive,14.2,,
8,2023-01-09,1348,137,1,27.33,0.21,21.5907,8,West,Cash,False,3,65,Male,Food,3.9,Not Returned,Negative,5.2,FREESHIP,
9,2023-01-10,1579,196,1,81.71,0.12,71.9048,2,East,Credit Card,False,2,32,Other,Food,3.8,Returned,Neutral,10.61,,4.5


<div class="alert alert-success">

**Solution 05:**
</div>

In [11]:
# 5. Removing Duplicate Rows from a DataFrame
# Write a Pandas program to remove duplicate rows from a DataFrame.

print("Shape of DataFrame before removing duplicates:\n", df.shape)
# Remove duplicate rows
df = df.drop_duplicates()
print("Shape of DataFrame after removing duplicates:\n", df.shape)

Shape of DataFrame before removing duplicates:
 (38, 21)
Shape of DataFrame after removing duplicates:
 (35, 21)


<div class="alert alert-warning">

**6. Validating Data Based on Custom Conditions**

**Write a Pandas program to validate data based on custom conditions.**

</div>

In [12]:
# Create date range
date_range = pd.date_range(start='2023-01-01', periods=35, freq='D')

# Create a dictionary of data
data = {
    'date': date_range,
    'customer_id': np.random.randint(1000, 2000, size=35),
    'product_id': np.random.randint(100, 200, size=35),
    'quantity': np.random.randint(1, 10, size=35),
    'price': np.round(np.random.uniform(10.0, 100.0, size=35), 2),
    'discount': np.round(np.random.uniform(0, 0.3, size=35), 2),
    'revenue': lambda df: df['quantity'] * df['price'] * (1 - df['discount']),
    'store_id': np.random.randint(1, 10, size=35),
    'region': np.random.choice(['North', 'South', 'East', 'West'], size=35),
    'payment_method': np.random.choice(['Credit Card', 'Cash', 'Online'], size=35),
    'is_member': np.random.choice([True, False], size=35),
    'delivery_days': np.random.randint(1, 7, size=35),
    'customer_age': np.random.randint(18, 70, size=35),
    'gender': np.random.choice(['Male', 'Female', 'Other'], size=35),
    'product_category': np.random.choice(['Electronics', 'Clothing', 'Books', 'Food'], size=35),
    'rating': np.round(np.random.uniform(1.0, 5.0, size=35), 1),
    'return_status': np.random.choice(['Returned', 'Not Returned'], size=35),
    'feedback': np.random.choice(['Positive', 'Neutral', 'Negative'], size=35),
}

# Create DataFrame
df = pd.DataFrame(data)

# Compute revenue using the lambda function above
df['revenue'] = df['quantity'] * df['price'] * (1 - df['discount'])

# Add 'shipping_cost' with some missing values
df['shipping_cost'] = np.where(np.random.rand(len(df)) < 0.2, np.nan, np.round(np.random.uniform(5.0, 20.0, size=len(df)), 2))

# Add 'coupon_code' with some missing values
coupon_options = ['SAVE10', 'WELCOME5', 'FREESHIP', np.nan]
df['coupon_code'] = np.random.choice(coupon_options, size=len(df), p=[0.2, 0.2, 0.2, 0.4])

# Add 'delivery_rating' with some missing values
df['delivery_rating'] = np.where(np.random.rand(len(df)) < 0.3, np.nan, np.round(np.random.uniform(1.0, 5.0, size=len(df)), 1))

# Add 3 duplicate rows
df = pd.concat([df, df.sample(n=3, random_state=42)], ignore_index=True)

# Add 3 more rows with negative price
df = pd.concat([df, pd.DataFrame({
    'date': pd.date_range(start='2023-02-05', periods=3, freq='D'),
    'customer_id': [1500, 1501, 1502],
    'product_id': [150, 151, 152],
    'quantity': [2, 3, 1],
    'price': [-20.00, -15.00, -30.00],  # Negative prices
    'discount': [0.1, 0.2, 0.15],
    'revenue': [0, 0, 0],  # Placeholder; will be recalculated
    'store_id': [5, 6, 7],
    'region': ['North', 'South', 'East'],
    'payment_method': ['Credit Card', 'Cash', 'Online'],
    'is_member': [True, False, True],
    'delivery_days': [3, 4, 2],
    'customer_age': [25, 34, 45],
    'gender': ['Male', 'Female', 'Other'],
    'product_category': ['Electronics', 'Clothing', 'Books'],
    'rating': [4.5, 3.0, 5.0],
    'return_status': ['Not Returned', 'Returned', 'Not Returned'],
    'feedback': ['Positive', 'Neutral', 'Negative'],
})], ignore_index=True)

# Display the updated DataFrame
display(df)

Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,is_member,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating
0,2023-01-01,1255,119,7,67.95,0.22,371.007,5,West,Online,False,6,45,Male,Food,2.2,Not Returned,Neutral,18.47,WELCOME5,4.2
1,2023-01-02,1942,173,8,20.39,0.18,133.7584,7,East,Credit Card,True,2,61,Male,Clothing,2.7,Not Returned,Negative,5.16,FREESHIP,4.0
2,2023-01-03,1773,198,5,48.06,0.08,221.076,5,West,Cash,False,1,30,Other,Clothing,2.8,Returned,Neutral,15.26,FREESHIP,
3,2023-01-04,1724,166,8,29.25,0.1,210.6,6,North,Cash,True,5,44,Female,Clothing,4.6,Returned,Negative,15.19,,
4,2023-01-05,1164,188,3,59.38,0.02,174.5772,6,South,Cash,True,2,46,Other,Electronics,4.4,Returned,Positive,6.1,,4.8
5,2023-01-06,1214,113,6,79.59,0.24,362.9304,5,North,Online,False,4,56,Female,Electronics,1.8,Not Returned,Negative,,FREESHIP,3.9
6,2023-01-07,1044,174,8,73.06,0.05,555.256,6,South,Cash,False,2,65,Female,Electronics,1.3,Returned,Neutral,14.11,SAVE10,
7,2023-01-08,1436,189,5,60.45,0.01,299.2275,4,West,Online,True,2,33,Female,Electronics,1.0,Not Returned,Negative,8.11,FREESHIP,3.3
8,2023-01-09,1987,131,6,68.56,0.28,296.1792,7,West,Online,True,6,40,Male,Books,3.2,Returned,Neutral,14.35,,
9,2023-01-10,1524,121,6,22.28,0.27,97.5864,4,South,Credit Card,True,1,35,Female,Electronics,3.5,Not Returned,Positive,11.45,WELCOME5,3.1


<div class="alert alert-success">

**Solution 06:**
</div>

In [13]:
# 6. Validating Data Based on Custom Conditions
# Write a Pandas program to validate data based on custom conditions.

# Check if 'price' is greater than 0
valid_price = df['price'] > 0
if valid_price.all():
    print("All prices are valid.")
else:
    print("Some prices are invalid:")
    display(df[~valid_price])

Some prices are invalid:


Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,is_member,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating
38,2023-02-05,1500,150,2,-20.0,0.1,0.0,5,North,Credit Card,True,3,25,Male,Electronics,4.5,Not Returned,Positive,,,
39,2023-02-06,1501,151,3,-15.0,0.2,0.0,6,South,Cash,False,4,34,Female,Clothing,3.0,Returned,Neutral,,,
40,2023-02-07,1502,152,1,-30.0,0.15,0.0,7,East,Online,True,2,45,Other,Books,5.0,Not Returned,Negative,,,


<div class="alert alert-warning">

**7. Checking That a Specific Column Contains Only Unique Values**

**Write a Pandas program to check that a specific column contains only unique values.**

</div>

In [14]:
# Create date range
date_range = pd.date_range(start='2023-01-01', periods=35, freq='D')

# Create a dictionary of data
data = {
    'date': date_range,
    'customer_id': np.random.randint(1000, 2000, size=35),
    'product_id': np.random.randint(100, 200, size=35),
    'quantity': np.random.randint(1, 10, size=35),
    'price': np.round(np.random.uniform(10.0, 100.0, size=35), 2),
    'discount': np.round(np.random.uniform(0, 0.3, size=35), 2),
    'revenue': lambda df: df['quantity'] * df['price'] * (1 - df['discount']),
    'store_id': np.random.randint(1, 10, size=35),
    'region': np.random.choice(['North', 'South', 'East', 'West'], size=35),
    'payment_method': np.random.choice(['Credit Card', 'Cash', 'Online'], size=35),
    'is_member': np.random.choice([True, False], size=35),
    'delivery_days': np.random.randint(1, 7, size=35),
    'customer_age': np.random.randint(18, 70, size=35),
    'gender': np.random.choice(['Male', 'Female', 'Other'], size=35),
    'product_category': np.random.choice(['Electronics', 'Clothing', 'Books', 'Food'], size=35),
    'rating': np.round(np.random.uniform(1.0, 5.0, size=35), 1),
    'return_status': np.random.choice(['Returned', 'Not Returned'], size=35),
    'feedback': np.random.choice(['Positive', 'Neutral', 'Negative'], size=35),
}

# Create DataFrame
df = pd.DataFrame(data)

# Compute revenue using the lambda function above
df['revenue'] = df['quantity'] * df['price'] * (1 - df['discount'])

# Add 'shipping_cost' with some missing values
df['shipping_cost'] = np.where(np.random.rand(len(df)) < 0.2, np.nan, np.round(np.random.uniform(5.0, 20.0, size=len(df)), 2))

# Add 'coupon_code' with some missing values
coupon_options = ['SAVE10', 'WELCOME5', 'FREESHIP', np.nan]
df['coupon_code'] = np.random.choice(coupon_options, size=len(df), p=[0.2, 0.2, 0.2, 0.4])

# Add 'delivery_rating' with some missing values
df['delivery_rating'] = np.where(np.random.rand(len(df)) < 0.3, np.nan, np.round(np.random.uniform(1.0, 5.0, size=len(df)), 1))

# Add 3 duplicate rows
df = pd.concat([df, df.sample(n=3, random_state=42)], ignore_index=True)

# Add 3 more rows with negative price
df = pd.concat([df, pd.DataFrame({
    'date': pd.date_range(start='2023-02-05', periods=3, freq='D'),
    'customer_id': [1500, 1501, 1502],
    'product_id': [150, 151, 152],
    'quantity': [2, 3, 1],
    'price': [-20.00, -15.00, -30.00],  # Negative prices
    'discount': [0.1, 0.2, 0.15],
    'revenue': [0, 0, 0],  # Placeholder; will be recalculated
    'store_id': [5, 6, 7],
    'region': ['North', 'South', 'East'],
    'payment_method': ['Credit Card', 'Cash', 'Online'],
    'is_member': [True, False, True],
    'delivery_days': [3, 4, 2],
    'customer_age': [25, 34, 45],
    'gender': ['Male', 'Female', 'Other'],
    'product_category': ['Electronics', 'Clothing', 'Books'],
    'rating': [4.5, 3.0, 5.0],
    'return_status': ['Not Returned', 'Returned', 'Not Returned'],
    'feedback': ['Positive', 'Neutral', 'Negative'],
})], ignore_index=True)

# add column with unique value
df['unique_id'] = pd.Series(range(1, len(df) + 1))

# Display the updated DataFrame
display(df)

Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,is_member,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating,unique_id
0,2023-01-01,1066,199,1,30.6,0.25,22.95,4,East,Cash,False,6,66,Male,Food,1.0,Not Returned,Positive,15.45,FREESHIP,,1
1,2023-01-02,1160,106,9,61.1,0.2,439.92,6,North,Cash,True,5,19,Female,Clothing,1.9,Returned,Neutral,10.89,SAVE10,2.5,2
2,2023-01-03,1853,136,5,99.43,0.17,412.6345,3,East,Online,False,4,48,Female,Electronics,3.3,Not Returned,Positive,,FREESHIP,3.1,3
3,2023-01-04,1458,187,3,74.92,0.26,166.3224,6,North,Credit Card,False,6,67,Female,Clothing,1.3,Not Returned,Positive,17.36,,1.2,4
4,2023-01-05,1668,107,8,26.32,0.3,147.392,4,North,Cash,False,2,20,Other,Food,2.7,Returned,Negative,,FREESHIP,3.8,5
5,2023-01-06,1417,144,4,40.21,0.11,143.1476,7,East,Credit Card,False,3,65,Male,Food,1.1,Returned,Negative,12.25,,1.1,6
6,2023-01-07,1971,129,9,86.45,0.1,700.245,6,South,Cash,True,4,49,Other,Electronics,4.2,Returned,Positive,10.71,,3.6,7
7,2023-01-08,1846,153,1,50.04,0.19,40.5324,1,East,Online,True,1,56,Male,Books,1.1,Not Returned,Neutral,14.95,FREESHIP,2.9,8
8,2023-01-09,1676,164,3,80.92,0.01,240.3324,3,East,Online,True,6,37,Other,Books,1.9,Returned,Neutral,,SAVE10,,9
9,2023-01-10,1391,154,8,48.46,0.27,283.0064,5,West,Online,False,5,46,Male,Books,4.5,Returned,Neutral,,FREESHIP,3.8,10


<div class="alert alert-success">

**Solution 07:**
</div>

In [15]:
pd.Series.is_unique?

[0;31mType:[0m        property
[0;31mString form:[0m <property object at 0x12465c6d0>
[0;31mDocstring:[0m  
Return boolean if values in the object are unique.

Returns
-------
bool

Examples
--------
>>> s = pd.Series([1, 2, 3])
>>> s.is_unique
True

>>> s = pd.Series([1, 2, 3, 1])
>>> s.is_unique
False

In [16]:
# 7. Checking That a Specific Column Contains Only Unique Values
# Write a Pandas program to check that a specific column contains only unique values.


def is_unique(column_name, dataframe):
    if dataframe[column_name].is_unique:
        print("The column {:<20} contains only unique values.".format(column_name))
    else:
        print("The column {:<20} has more than one unique value - has duplicates.".format(column_name))

for col in df.columns:
    is_unique(col, df)

The column date                 has more than one unique value - has duplicates.
The column customer_id          has more than one unique value - has duplicates.
The column product_id           has more than one unique value - has duplicates.
The column quantity             has more than one unique value - has duplicates.
The column price                has more than one unique value - has duplicates.
The column discount             has more than one unique value - has duplicates.
The column revenue              has more than one unique value - has duplicates.
The column store_id             has more than one unique value - has duplicates.
The column region               has more than one unique value - has duplicates.
The column payment_method       has more than one unique value - has duplicates.
The column is_member            has more than one unique value - has duplicates.
The column delivery_days        has more than one unique value - has duplicates.
The column customer_age     

<div class="alert alert-warning">

**8. Detecting Outliers in a DataFrame**

**Write a Pandas program to detect outliers in a DataFrame.**

</div>

In [17]:
# Create date range
date_range = pd.date_range(start='2023-01-01', periods=35, freq='D')

# Create a dictionary of data
data = {
    'date': date_range,
    'customer_id': np.random.randint(1000, 2000, size=35),
    'product_id': np.random.randint(100, 200, size=35),
    'quantity': np.random.randint(1, 10, size=35),
    'price': np.round(np.random.uniform(10.0, 100.0, size=35), 2),
    'discount': np.round(np.random.uniform(0, 0.3, size=35), 2),
    'revenue': lambda df: df['quantity'] * df['price'] * (1 - df['discount']),
    'store_id': np.random.randint(1, 10, size=35),
    'region': np.random.choice(['North', 'South', 'East', 'West'], size=35),
    'payment_method': np.random.choice(['Credit Card', 'Cash', 'Online'], size=35),
    'is_member': np.random.choice([True, False], size=35),
    'delivery_days': np.random.randint(1, 7, size=35),
    'customer_age': np.random.randint(18, 70, size=35),
    'gender': np.random.choice(['Male', 'Female', 'Other'], size=35),
    'product_category': np.random.choice(['Electronics', 'Clothing', 'Books', 'Food'], size=35),
    'rating': np.round(np.random.uniform(1.0, 5.0, size=35), 1),
    'return_status': np.random.choice(['Returned', 'Not Returned'], size=35),
    'feedback': np.random.choice(['Positive', 'Neutral', 'Negative'], size=35),
}

# Create DataFrame
df = pd.DataFrame(data)

# Compute revenue using the lambda function above
df['revenue'] = df['quantity'] * df['price'] * (1 - df['discount'])

# Add 'shipping_cost' with some missing values
df['shipping_cost'] = np.where(np.random.rand(len(df)) < 0.2, np.nan, np.round(np.random.uniform(5.0, 20.0, size=len(df)), 2))

# Add 'coupon_code' with some missing values
coupon_options = ['SAVE10', 'WELCOME5', 'FREESHIP', np.nan]
df['coupon_code'] = np.random.choice(coupon_options, size=len(df), p=[0.2, 0.2, 0.2, 0.4])

# Add 'delivery_rating' with some missing values
df['delivery_rating'] = np.where(np.random.rand(len(df)) < 0.3, np.nan, np.round(np.random.uniform(1.0, 5.0, size=len(df)), 1))

# Add 3 duplicate rows
df = pd.concat([df, df.sample(n=3, random_state=42)], ignore_index=True)

# Add 3 more rows with negative price
df = pd.concat([df, pd.DataFrame({
    'date': pd.date_range(start='2023-02-05', periods=3, freq='D'),
    'customer_id': [1500, 1501, 1502],
    'product_id': [150, 151, 152],
    'quantity': [2, 3, 1],
    'price': [-20.00, -15.00, -30.00],  # Negative prices
    'discount': [0.1, 0.2, 0.15],
    'revenue': [0, 0, 0],  # Placeholder; will be recalculated
    'store_id': [5, 6, 7],
    'region': ['North', 'South', 'East'],
    'payment_method': ['Credit Card', 'Cash', 'Online'],
    'is_member': [True, False, True],
    'delivery_days': [3, 4, 2],
    'customer_age': [25, 34, 45],
    'gender': ['Male', 'Female', 'Other'],
    'product_category': ['Electronics', 'Clothing', 'Books'],
    'rating': [4.5, 3.0, 5.0],
    'return_status': ['Not Returned', 'Returned', 'Not Returned'],
    'feedback': ['Positive', 'Neutral', 'Negative'],
})], ignore_index=True)

# add column with unique value
df['unique_id'] = pd.Series(range(1, len(df) + 1))

# add outliers
df.loc[[0,3,5], ['price']] = [9999.99, 8888.88, 7777.77]

# Display the updated DataFrame
display(df)

Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,is_member,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating,unique_id
0,2023-01-01,1458,149,9,9999.99,0.24,90.2196,7,West,Credit Card,True,4,59,Other,Clothing,2.4,Returned,Neutral,16.83,WELCOME5,1.0,1
1,2023-01-02,1682,179,5,68.11,0.21,269.0345,5,South,Credit Card,False,6,49,Other,Electronics,4.0,Returned,Neutral,16.24,,4.3,2
2,2023-01-03,1865,105,8,60.15,0.28,346.464,5,West,Online,False,6,46,Female,Books,1.2,Not Returned,Negative,,,4.6,3
3,2023-01-04,1711,154,4,8888.88,0.04,150.72,1,North,Credit Card,False,5,52,Male,Clothing,2.8,Not Returned,Neutral,11.88,SAVE10,3.5,4
4,2023-01-05,1975,183,4,77.01,0.09,280.3164,2,West,Online,False,1,22,Male,Electronics,4.6,Not Returned,Positive,18.45,WELCOME5,,5
5,2023-01-06,1867,171,2,7777.77,0.1,138.834,9,South,Cash,True,4,26,Other,Food,4.2,Returned,Positive,10.79,,,6
6,2023-01-07,1080,123,2,21.48,0.29,30.5016,1,East,Credit Card,False,6,66,Male,Food,4.4,Returned,Negative,19.86,SAVE10,,7
7,2023-01-08,1675,196,1,45.33,0.27,33.0909,7,East,Online,False,1,54,Other,Food,1.3,Returned,Positive,7.21,FREESHIP,,8
8,2023-01-09,1618,159,7,45.32,0.01,314.0676,6,South,Cash,False,5,35,Other,Books,3.8,Not Returned,Negative,12.71,FREESHIP,,9
9,2023-01-10,1091,138,9,12.84,0.26,85.5144,1,West,Credit Card,False,1,65,Male,Electronics,2.9,Not Returned,Positive,,WELCOME5,3.9,10


<div class="alert alert-success">

**Solution 08:**
</div>

In [18]:
# 8. Detecting Outliers in a DataFrame
# Write a Pandas program to detect outliers in a DataFrame.

def detect_outliers(column_name, dataframe):
    if dataframe[column_name].dtype not in [np.float64, np.int64]:
        print("Column {:>30} is not numeric and will be skipped.".format(column_name))
        return
    else:
        Q1 = dataframe[column_name].quantile(0.25)
        Q3 = dataframe[column_name].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        outliers = dataframe[(dataframe[column_name] < lower_bound) | (dataframe[column_name] > upper_bound)]
    if not outliers.empty:
        print("Column {:>30} OUTLIERS DETECTED.\n####### OUTLIERS #######".format(column_name))
        display(outliers)
    else:
        print("Column {:>30} has no outliers detected.".format(column_name))

for col in df.columns:
    detect_outliers(col, df)

Column                           date is not numeric and will be skipped.
Column                    customer_id has no outliers detected.
Column                     product_id has no outliers detected.
Column                       quantity has no outliers detected.
Column                          price OUTLIERS DETECTED.
####### OUTLIERS #######


Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,is_member,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating,unique_id
0,2023-01-01,1458,149,9,9999.99,0.24,90.2196,7,West,Credit Card,True,4,59,Other,Clothing,2.4,Returned,Neutral,16.83,WELCOME5,1.0,1
3,2023-01-04,1711,154,4,8888.88,0.04,150.72,1,North,Credit Card,False,5,52,Male,Clothing,2.8,Not Returned,Neutral,11.88,SAVE10,3.5,4
5,2023-01-06,1867,171,2,7777.77,0.1,138.834,9,South,Cash,True,4,26,Other,Food,4.2,Returned,Positive,10.79,,,6


Column                       discount has no outliers detected.
Column                        revenue OUTLIERS DETECTED.
####### OUTLIERS #######


Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,is_member,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating,unique_id
20,2023-01-21,1294,130,8,94.91,0.13,660.5736,1,West,Credit Card,True,3,34,Male,Electronics,3.7,Not Returned,Positive,5.55,WELCOME5,3.0,21


Column                       store_id has no outliers detected.
Column                         region is not numeric and will be skipped.
Column                 payment_method is not numeric and will be skipped.
Column                      is_member is not numeric and will be skipped.
Column                  delivery_days has no outliers detected.
Column                   customer_age has no outliers detected.
Column                         gender is not numeric and will be skipped.
Column               product_category is not numeric and will be skipped.
Column                         rating has no outliers detected.
Column                  return_status is not numeric and will be skipped.
Column                       feedback is not numeric and will be skipped.
Column                  shipping_cost has no outliers detected.
Column                    coupon_code is not numeric and will be skipped.
Column                delivery_rating has no outliers detected.
Column                  