[Pandas Data Validation](https://www.w3resource.com/python-exercises/pandas/pandas-data-validation.php)

In [1]:
import pandas as pd
import numpy as np

<div class="alert alert-warning">

**1. Checking Missing Values in a DataFrame**

**Write a Pandas program to check for missing values in a DataFrame.**

</div>

In [2]:
# Create date range
date_range = pd.date_range(start='2023-01-01', periods=35, freq='D')

# Create a dictionary of data
data = {
    'date': date_range,
    'customer_id': np.random.randint(1000, 2000, size=35),
    'product_id': np.random.randint(100, 200, size=35),
    'quantity': np.random.randint(1, 10, size=35),
    'price': np.round(np.random.uniform(10.0, 100.0, size=35), 2),
    'discount': np.round(np.random.uniform(0, 0.3, size=35), 2),
    'revenue': lambda df: df['quantity'] * df['price'] * (1 - df['discount']),
    'store_id': np.random.randint(1, 10, size=35),
    'region': np.random.choice(['North', 'South', 'East', 'West'], size=35),
    'payment_method': np.random.choice(['Credit Card', 'Cash', 'Online'], size=35),
    'is_member': np.random.choice([True, False], size=35),
    'delivery_days': np.random.randint(1, 7, size=35),
    'customer_age': np.random.randint(18, 70, size=35),
    'gender': np.random.choice(['Male', 'Female', 'Other'], size=35),
    'product_category': np.random.choice(['Electronics', 'Clothing', 'Books', 'Food'], size=35),
    'rating': np.round(np.random.uniform(1.0, 5.0, size=35), 1),
    'return_status': np.random.choice(['Returned', 'Not Returned'], size=35),
    'feedback': np.random.choice(['Positive', 'Neutral', 'Negative'], size=35),
}

# Create DataFrame
df = pd.DataFrame(data)

# Compute revenue using the lambda function above
df['revenue'] = df['quantity'] * df['price'] * (1 - df['discount'])

# Add 'shipping_cost' with some missing values
df['shipping_cost'] = np.where(np.random.rand(len(df)) < 0.2, np.nan, np.round(np.random.uniform(5.0, 20.0, size=len(df)), 2))

# Add 'coupon_code' with some missing values
coupon_options = ['SAVE10', 'WELCOME5', 'FREESHIP', np.nan]
df['coupon_code'] = np.random.choice(coupon_options, size=len(df), p=[0.2, 0.2, 0.2, 0.4])

# Add 'delivery_rating' with some missing values
df['delivery_rating'] = np.where(np.random.rand(len(df)) < 0.3, np.nan, np.round(np.random.uniform(1.0, 5.0, size=len(df)), 1))

# Display the updated DataFrame
display(df)

Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,...,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating
0,2023-01-01,1234,194,6,41.09,0.29,175.0434,8,East,Cash,...,4,30,Female,Books,2.9,Returned,Positive,6.02,SAVE10,1.0
1,2023-01-02,1236,117,3,32.59,0.06,91.9038,6,West,Cash,...,3,48,Other,Clothing,4.5,Not Returned,Positive,12.71,WELCOME5,2.7
2,2023-01-03,1035,103,5,87.29,0.27,318.6085,9,South,Cash,...,4,30,Male,Food,2.6,Not Returned,Negative,9.36,,3.0
3,2023-01-04,1090,111,2,51.45,0.04,98.784,2,West,Cash,...,2,53,Other,Clothing,1.3,Returned,Negative,18.52,WELCOME5,3.4
4,2023-01-05,1660,123,5,75.64,0.29,268.522,8,South,Cash,...,6,36,Other,Food,3.7,Not Returned,Neutral,7.23,SAVE10,2.3
5,2023-01-06,1407,169,3,57.8,0.15,147.39,5,South,Credit Card,...,2,25,Female,Books,2.2,Not Returned,Negative,19.1,,
6,2023-01-07,1267,188,7,93.16,0.21,515.1748,8,North,Cash,...,6,61,Male,Books,3.3,Not Returned,Neutral,,SAVE10,2.5
7,2023-01-08,1230,129,2,14.86,0.15,25.262,7,North,Cash,...,5,39,Female,Food,4.1,Not Returned,Positive,8.51,,3.5
8,2023-01-09,1142,176,5,40.82,0.24,155.116,5,North,Online,...,1,39,Other,Electronics,1.9,Returned,Negative,13.54,,
9,2023-01-10,1704,123,4,21.92,0.29,62.2528,3,West,Online,...,6,41,Other,Books,1.4,Returned,Positive,6.61,WELCOME5,


<div class="alert alert-success">

**Solution 01:**
</div>

In [3]:
# 1. Checking Missing Values in a DataFrame
# Write a Pandas program to check for missing values in a DataFrame.

# Check for missing values
missing_values = df.isnull().sum()
missing_values[missing_values > 0]

shipping_cost      3
delivery_rating    6
dtype: int64

<div class="alert alert-warning">

**2. Ensuring Consistent Data Types of All Columns**

**Write a Pandas program to check and ensure that the data types of all columns are consistent.**

</div>

In [4]:
# Create date range
date_range = pd.date_range(start='2023-01-01', periods=35, freq='D')

# Create a dictionary of data
data = {
    'date': date_range,
    'customer_id': np.random.randint(1000, 2000, size=35),
    'product_id': np.random.randint(100, 200, size=35),
    'quantity': np.random.randint(1, 10, size=35),
    'price': np.round(np.random.uniform(10.0, 100.0, size=35), 2),
    'discount': np.round(np.random.uniform(0, 0.3, size=35), 2),
    'revenue': lambda df: df['quantity'] * df['price'] * (1 - df['discount']),
    'store_id': np.random.randint(1, 10, size=35),
    'region': np.random.choice(['North', 'South', 'East', 'West'], size=35),
    'payment_method': np.random.choice(['Credit Card', 'Cash', 'Online'], size=35),
    'is_member': np.random.choice([True, False], size=35),
    'delivery_days': np.random.randint(1, 7, size=35),
    'customer_age': np.random.randint(18, 70, size=35),
    'gender': np.random.choice(['Male', 'Female', 'Other'], size=35),
    'product_category': np.random.choice(['Electronics', 'Clothing', 'Books', 'Food'], size=35),
    'rating': np.round(np.random.uniform(1.0, 5.0, size=35), 1),
    'return_status': np.random.choice(['Returned', 'Not Returned'], size=35),
    'feedback': np.random.choice(['Positive', 'Neutral', 'Negative'], size=35),
}

# Create DataFrame
df = pd.DataFrame(data)

# Compute revenue using the lambda function above
df['revenue'] = df['quantity'] * df['price'] * (1 - df['discount'])

# Add 'shipping_cost' with some missing values
df['shipping_cost'] = np.where(np.random.rand(len(df)) < 0.2, np.nan, np.round(np.random.uniform(5.0, 20.0, size=len(df)), 2))

# Add 'coupon_code' with some missing values
coupon_options = ['SAVE10', 'WELCOME5', 'FREESHIP', np.nan]
df['coupon_code'] = np.random.choice(coupon_options, size=len(df), p=[0.2, 0.2, 0.2, 0.4])

# Add 'delivery_rating' with some missing values
df['delivery_rating'] = np.where(np.random.rand(len(df)) < 0.3, np.nan, np.round(np.random.uniform(1.0, 5.0, size=len(df)), 1))

# Display the updated DataFrame
display(df)

Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,...,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating
0,2023-01-01,1412,103,3,92.64,0.29,197.3232,2,East,Cash,...,2,67,Other,Books,4.4,Returned,Positive,18.09,FREESHIP,
1,2023-01-02,1873,163,3,52.88,0.21,125.3256,1,West,Cash,...,3,52,Female,Electronics,4.5,Not Returned,Neutral,18.52,,1.5
2,2023-01-03,1554,166,9,40.48,0.21,287.8128,8,South,Cash,...,4,42,Female,Food,4.5,Returned,Negative,5.08,SAVE10,1.6
3,2023-01-04,1887,171,4,77.21,0.06,290.3096,3,South,Credit Card,...,5,20,Male,Clothing,3.8,Not Returned,Positive,15.95,WELCOME5,3.3
4,2023-01-05,1120,196,9,39.78,0.07,332.9586,2,North,Credit Card,...,2,56,Female,Food,1.9,Not Returned,Neutral,16.16,,1.8
5,2023-01-06,1734,178,1,36.4,0.15,30.94,5,South,Cash,...,6,26,Other,Clothing,4.9,Returned,Neutral,5.71,,
6,2023-01-07,1694,137,1,49.18,0.15,41.803,5,North,Online,...,3,38,Male,Electronics,3.1,Returned,Neutral,15.18,FREESHIP,4.2
7,2023-01-08,1492,183,2,22.68,0.14,39.0096,5,South,Credit Card,...,4,46,Female,Books,5.0,Returned,Negative,,SAVE10,
8,2023-01-09,1259,100,7,73.13,0.26,378.8134,1,South,Cash,...,2,41,Male,Food,2.9,Returned,Neutral,7.18,SAVE10,
9,2023-01-10,1373,135,7,34.1,0.19,193.347,4,West,Credit Card,...,6,19,Male,Electronics,1.2,Not Returned,Positive,16.09,WELCOME5,1.9


<div class="alert alert-success">

**Solution 02:**
</div>

In [5]:
# 2. Ensuring Consistent Data Types of All Columns
# Write a Pandas program to check and ensure that the data types of all columns are consistent.

# Check data types
data_types = df.dtypes
data_types

date                datetime64[ns]
customer_id                  int64
product_id                   int64
quantity                     int64
price                      float64
discount                   float64
revenue                    float64
store_id                     int64
region                      object
payment_method              object
is_member                     bool
delivery_days                int64
customer_age                 int64
gender                      object
product_category            object
rating                     float64
return_status               object
feedback                    object
shipping_cost              float64
coupon_code                 object
delivery_rating            float64
dtype: object

<div class="alert alert-warning">

**3. Validating the Data Type of a Specific Column**

**Write a Pandas program to validate the data type of a specific column in a DataFrame.**

</div>

In [6]:
# Create date range
date_range = pd.date_range(start='2023-01-01', periods=35, freq='D')

# Create a dictionary of data
data = {
    'date': date_range,
    'customer_id': np.random.randint(1000, 2000, size=35),
    'product_id': np.random.randint(100, 200, size=35),
    'quantity': np.random.randint(1, 10, size=35),
    'price': np.round(np.random.uniform(10.0, 100.0, size=35), 2),
    'discount': np.round(np.random.uniform(0, 0.3, size=35), 2),
    'revenue': lambda df: df['quantity'] * df['price'] * (1 - df['discount']),
    'store_id': np.random.randint(1, 10, size=35),
    'region': np.random.choice(['North', 'South', 'East', 'West'], size=35),
    'payment_method': np.random.choice(['Credit Card', 'Cash', 'Online'], size=35),
    'is_member': np.random.choice([True, False], size=35),
    'delivery_days': np.random.randint(1, 7, size=35),
    'customer_age': np.random.randint(18, 70, size=35),
    'gender': np.random.choice(['Male', 'Female', 'Other'], size=35),
    'product_category': np.random.choice(['Electronics', 'Clothing', 'Books', 'Food'], size=35),
    'rating': np.round(np.random.uniform(1.0, 5.0, size=35), 1),
    'return_status': np.random.choice(['Returned', 'Not Returned'], size=35),
    'feedback': np.random.choice(['Positive', 'Neutral', 'Negative'], size=35),
}

# Create DataFrame
df = pd.DataFrame(data)

# Compute revenue using the lambda function above
df['revenue'] = df['quantity'] * df['price'] * (1 - df['discount'])

# Add 'shipping_cost' with some missing values
df['shipping_cost'] = np.where(np.random.rand(len(df)) < 0.2, np.nan, np.round(np.random.uniform(5.0, 20.0, size=len(df)), 2))

# Add 'coupon_code' with some missing values
coupon_options = ['SAVE10', 'WELCOME5', 'FREESHIP', np.nan]
df['coupon_code'] = np.random.choice(coupon_options, size=len(df), p=[0.2, 0.2, 0.2, 0.4])

# Add 'delivery_rating' with some missing values
df['delivery_rating'] = np.where(np.random.rand(len(df)) < 0.3, np.nan, np.round(np.random.uniform(1.0, 5.0, size=len(df)), 1))

# Display the updated DataFrame
display(df)

Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,...,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating
0,2023-01-01,1165,108,7,64.09,0.01,444.1437,9,North,Online,...,2,67,Other,Electronics,4.5,Not Returned,Neutral,11.54,WELCOME5,2.1
1,2023-01-02,1736,178,2,77.24,0.09,140.5768,9,East,Cash,...,1,65,Female,Books,1.1,Returned,Neutral,14.08,,
2,2023-01-03,1759,190,4,34.3,0.27,100.156,2,East,Credit Card,...,3,23,Female,Electronics,1.5,Not Returned,Positive,13.63,FREESHIP,4.6
3,2023-01-04,1883,195,9,56.13,0.1,454.653,8,West,Cash,...,1,69,Female,Electronics,3.7,Not Returned,Positive,15.32,WELCOME5,
4,2023-01-05,1817,104,8,60.82,0.28,350.3232,6,West,Online,...,6,58,Other,Books,2.2,Not Returned,Negative,13.8,,1.9
5,2023-01-06,1381,118,8,28.24,0.02,221.4016,8,West,Online,...,2,69,Female,Clothing,2.5,Returned,Negative,11.85,FREESHIP,4.8
6,2023-01-07,1246,106,2,40.59,0.19,65.7558,3,West,Online,...,1,29,Male,Electronics,1.8,Not Returned,Neutral,,SAVE10,3.2
7,2023-01-08,1380,133,8,83.09,0.11,591.6008,8,North,Credit Card,...,3,63,Other,Books,3.6,Returned,Positive,,,
8,2023-01-09,1181,164,2,78.37,0.25,117.555,5,West,Credit Card,...,1,61,Other,Food,3.2,Returned,Positive,13.64,,1.8
9,2023-01-10,1386,178,8,27.52,0.14,189.3376,2,West,Cash,...,1,61,Male,Electronics,2.4,Returned,Negative,16.18,FREESHIP,2.6


<div class="alert alert-success">

**Solution 03:**
</div>

In [7]:
# 3. Validating the Data Type of a Specific Column
# Write a Pandas program to validate the data type of a specific column in a DataFrame.

# Check if 'price' column is of type float
is_of_type = 'float64'
is_price_float = df['price'].dtype == is_of_type
is_price_float

# Validate the data type of 'price' column
if is_price_float:
    print("The 'price' column is of type {}.".format(df['price'].dtype))
else:
    print("The 'price' column is not of type {}, it is {}.".format(is_of_type, df['price'].dtype))

The 'price' column is of type float64.


<div class="alert alert-warning">

**4. Checking Duplicate Rows in a DataFrame**

**Write a Pandas program to check duplicate rows in a DataFrame.**

</div>

In [8]:
# Create date range
date_range = pd.date_range(start='2023-01-01', periods=35, freq='D')

# Create a dictionary of data
data = {
    'date': date_range,
    'customer_id': np.random.randint(1000, 2000, size=35),
    'product_id': np.random.randint(100, 200, size=35),
    'quantity': np.random.randint(1, 10, size=35),
    'price': np.round(np.random.uniform(10.0, 100.0, size=35), 2),
    'discount': np.round(np.random.uniform(0, 0.3, size=35), 2),
    'revenue': lambda df: df['quantity'] * df['price'] * (1 - df['discount']),
    'store_id': np.random.randint(1, 10, size=35),
    'region': np.random.choice(['North', 'South', 'East', 'West'], size=35),
    'payment_method': np.random.choice(['Credit Card', 'Cash', 'Online'], size=35),
    'is_member': np.random.choice([True, False], size=35),
    'delivery_days': np.random.randint(1, 7, size=35),
    'customer_age': np.random.randint(18, 70, size=35),
    'gender': np.random.choice(['Male', 'Female', 'Other'], size=35),
    'product_category': np.random.choice(['Electronics', 'Clothing', 'Books', 'Food'], size=35),
    'rating': np.round(np.random.uniform(1.0, 5.0, size=35), 1),
    'return_status': np.random.choice(['Returned', 'Not Returned'], size=35),
    'feedback': np.random.choice(['Positive', 'Neutral', 'Negative'], size=35),
}

# Create DataFrame
df = pd.DataFrame(data)

# Compute revenue using the lambda function above
df['revenue'] = df['quantity'] * df['price'] * (1 - df['discount'])

# Add 'shipping_cost' with some missing values
df['shipping_cost'] = np.where(np.random.rand(len(df)) < 0.2, np.nan, np.round(np.random.uniform(5.0, 20.0, size=len(df)), 2))

# Add 'coupon_code' with some missing values
coupon_options = ['SAVE10', 'WELCOME5', 'FREESHIP', np.nan]
df['coupon_code'] = np.random.choice(coupon_options, size=len(df), p=[0.2, 0.2, 0.2, 0.4])

# Add 'delivery_rating' with some missing values
df['delivery_rating'] = np.where(np.random.rand(len(df)) < 0.3, np.nan, np.round(np.random.uniform(1.0, 5.0, size=len(df)), 1))

# Add 3 duplicate rows
df = pd.concat([df, df.sample(n=3, random_state=42)], ignore_index=True)

# Display the updated DataFrame
display(df)

Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,...,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating
0,2023-01-01,1596,164,3,80.56,0.07,224.7624,9,South,Cash,...,5,21,Male,Food,3.4,Not Returned,Neutral,14.14,SAVE10,2.1
1,2023-01-02,1047,176,9,85.03,0.1,688.743,3,South,Online,...,2,55,Male,Books,1.4,Not Returned,Neutral,10.95,SAVE10,4.9
2,2023-01-03,1001,119,8,74.93,0.3,419.608,1,South,Credit Card,...,3,19,Male,Clothing,4.0,Not Returned,Neutral,14.11,WELCOME5,1.4
3,2023-01-04,1669,147,5,54.39,0.16,228.438,6,North,Online,...,2,31,Female,Electronics,1.8,Not Returned,Positive,9.04,WELCOME5,3.7
4,2023-01-05,1658,124,7,27.26,0.29,135.4822,9,East,Online,...,3,25,Female,Food,3.4,Not Returned,Negative,,SAVE10,3.2
5,2023-01-06,1733,189,3,60.21,0.29,128.2473,4,West,Online,...,6,47,Female,Books,4.8,Returned,Negative,,,
6,2023-01-07,1873,117,5,60.45,0.26,223.665,2,West,Credit Card,...,6,69,Other,Clothing,3.2,Returned,Negative,,SAVE10,3.8
7,2023-01-08,1202,135,7,54.59,0.24,290.4188,8,West,Online,...,2,28,Male,Food,3.3,Not Returned,Neutral,15.54,FREESHIP,3.7
8,2023-01-09,1716,145,1,18.31,0.03,17.7607,9,East,Cash,...,1,66,Male,Electronics,2.9,Returned,Positive,16.27,,3.3
9,2023-01-10,1279,120,2,83.92,0.08,154.4128,8,East,Credit Card,...,1,59,Male,Electronics,3.0,Not Returned,Negative,15.71,SAVE10,4.8


<div class="alert alert-success">

**Solution 04:**
</div>

In [9]:
# 4. Checking Duplicate Rows in a DataFrame
# Write a Pandas program to check duplicate rows in a DataFrame.

# Check for duplicate rows
duplicate_rows = df[df.duplicated()]
duplicate_rows

Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,...,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating
35,2023-01-27,1634,194,4,40.83,0.07,151.8876,2,North,Online,...,5,63,Female,Food,2.4,Not Returned,Negative,8.97,SAVE10,4.9
36,2023-01-14,1142,116,8,95.33,0.12,671.1232,9,South,Credit Card,...,2,22,Female,Food,2.8,Not Returned,Negative,5.4,FREESHIP,
37,2023-01-25,1320,186,3,31.19,0.03,90.7629,6,West,Cash,...,1,67,Male,Electronics,1.7,Returned,Negative,13.8,FREESHIP,4.9
