[Pandas Data Validation](https://www.w3resource.com/python-exercises/pandas/pandas-data-validation.php)

In [1]:
import pandas as pd
import numpy as np

<div class="alert alert-warning">

**1. Checking Missing Values in a DataFrame**

**Write a Pandas program to check for missing values in a DataFrame.**

</div>

In [2]:
# Create date range
date_range = pd.date_range(start='2023-01-01', periods=35, freq='D')

# Create a dictionary of data
data = {
    'date': date_range,
    'customer_id': np.random.randint(1000, 2000, size=35),
    'product_id': np.random.randint(100, 200, size=35),
    'quantity': np.random.randint(1, 10, size=35),
    'price': np.round(np.random.uniform(10.0, 100.0, size=35), 2),
    'discount': np.round(np.random.uniform(0, 0.3, size=35), 2),
    'revenue': lambda df: df['quantity'] * df['price'] * (1 - df['discount']),
    'store_id': np.random.randint(1, 10, size=35),
    'region': np.random.choice(['North', 'South', 'East', 'West'], size=35),
    'payment_method': np.random.choice(['Credit Card', 'Cash', 'Online'], size=35),
    'is_member': np.random.choice([True, False], size=35),
    'delivery_days': np.random.randint(1, 7, size=35),
    'customer_age': np.random.randint(18, 70, size=35),
    'gender': np.random.choice(['Male', 'Female', 'Other'], size=35),
    'product_category': np.random.choice(['Electronics', 'Clothing', 'Books', 'Food'], size=35),
    'rating': np.round(np.random.uniform(1.0, 5.0, size=35), 1),
    'return_status': np.random.choice(['Returned', 'Not Returned'], size=35),
    'feedback': np.random.choice(['Positive', 'Neutral', 'Negative'], size=35),
}

# Create DataFrame
df = pd.DataFrame(data)

# Compute revenue using the lambda function above
df['revenue'] = df['quantity'] * df['price'] * (1 - df['discount'])

# Add 'shipping_cost' with some missing values
df['shipping_cost'] = np.where(np.random.rand(len(df)) < 0.2, np.nan, np.round(np.random.uniform(5.0, 20.0, size=len(df)), 2))

# Add 'coupon_code' with some missing values
coupon_options = ['SAVE10', 'WELCOME5', 'FREESHIP', np.nan]
df['coupon_code'] = np.random.choice(coupon_options, size=len(df), p=[0.2, 0.2, 0.2, 0.4])

# Add 'delivery_rating' with some missing values
df['delivery_rating'] = np.where(np.random.rand(len(df)) < 0.3, np.nan, np.round(np.random.uniform(1.0, 5.0, size=len(df)), 1))

# Display the updated DataFrame
display(df)

Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,...,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating
0,2023-01-01,1701,134,4,28.77,0.12,101.2704,7,East,Credit Card,...,1,56,Male,Clothing,4.1,Not Returned,Positive,9.28,,3.4
1,2023-01-02,1901,193,7,64.45,0.03,437.6155,3,East,Online,...,5,18,Male,Electronics,3.2,Returned,Negative,14.91,WELCOME5,1.8
2,2023-01-03,1324,136,8,77.78,0.23,479.1248,4,East,Credit Card,...,3,49,Female,Food,2.8,Returned,Neutral,5.4,WELCOME5,
3,2023-01-04,1726,102,6,15.16,0.22,70.9488,6,South,Cash,...,3,61,Other,Books,4.5,Not Returned,Neutral,13.28,FREESHIP,
4,2023-01-05,1572,168,8,60.76,0.2,388.864,8,South,Credit Card,...,5,26,Male,Electronics,4.3,Returned,Neutral,,,
5,2023-01-06,1491,100,9,39.84,0.23,276.0912,2,South,Credit Card,...,3,64,Other,Clothing,3.2,Not Returned,Positive,9.95,SAVE10,
6,2023-01-07,1981,102,2,86.59,0.11,154.1302,4,West,Cash,...,6,52,Male,Clothing,2.3,Not Returned,Negative,,,4.6
7,2023-01-08,1720,152,5,42.63,0.08,196.098,4,South,Credit Card,...,6,56,Other,Food,2.2,Returned,Positive,,SAVE10,2.6
8,2023-01-09,1320,190,6,22.35,0.22,104.598,7,North,Credit Card,...,3,47,Male,Food,3.1,Returned,Neutral,18.0,FREESHIP,1.1
9,2023-01-10,1144,189,5,98.38,0.21,388.601,7,South,Credit Card,...,6,69,Female,Food,4.9,Returned,Positive,,SAVE10,


<div class="alert alert-success">

**Solution 01:**
</div>

In [3]:
# 1. Checking Missing Values in a DataFrame
# Write a Pandas program to check for missing values in a DataFrame.

# Check for missing values
missing_values = df.isnull().sum()
missing_values[missing_values > 0]

shipping_cost       7
delivery_rating    13
dtype: int64

<div class="alert alert-warning">

**2. Ensuring Consistent Data Types of All Columns**

**Write a Pandas program to check and ensure that the data types of all columns are consistent.**

</div>

In [4]:
# Create date range
date_range = pd.date_range(start='2023-01-01', periods=35, freq='D')

# Create a dictionary of data
data = {
    'date': date_range,
    'customer_id': np.random.randint(1000, 2000, size=35),
    'product_id': np.random.randint(100, 200, size=35),
    'quantity': np.random.randint(1, 10, size=35),
    'price': np.round(np.random.uniform(10.0, 100.0, size=35), 2),
    'discount': np.round(np.random.uniform(0, 0.3, size=35), 2),
    'revenue': lambda df: df['quantity'] * df['price'] * (1 - df['discount']),
    'store_id': np.random.randint(1, 10, size=35),
    'region': np.random.choice(['North', 'South', 'East', 'West'], size=35),
    'payment_method': np.random.choice(['Credit Card', 'Cash', 'Online'], size=35),
    'is_member': np.random.choice([True, False], size=35),
    'delivery_days': np.random.randint(1, 7, size=35),
    'customer_age': np.random.randint(18, 70, size=35),
    'gender': np.random.choice(['Male', 'Female', 'Other'], size=35),
    'product_category': np.random.choice(['Electronics', 'Clothing', 'Books', 'Food'], size=35),
    'rating': np.round(np.random.uniform(1.0, 5.0, size=35), 1),
    'return_status': np.random.choice(['Returned', 'Not Returned'], size=35),
    'feedback': np.random.choice(['Positive', 'Neutral', 'Negative'], size=35),
}

# Create DataFrame
df = pd.DataFrame(data)

# Compute revenue using the lambda function above
df['revenue'] = df['quantity'] * df['price'] * (1 - df['discount'])

# Add 'shipping_cost' with some missing values
df['shipping_cost'] = np.where(np.random.rand(len(df)) < 0.2, np.nan, np.round(np.random.uniform(5.0, 20.0, size=len(df)), 2))

# Add 'coupon_code' with some missing values
coupon_options = ['SAVE10', 'WELCOME5', 'FREESHIP', np.nan]
df['coupon_code'] = np.random.choice(coupon_options, size=len(df), p=[0.2, 0.2, 0.2, 0.4])

# Add 'delivery_rating' with some missing values
df['delivery_rating'] = np.where(np.random.rand(len(df)) < 0.3, np.nan, np.round(np.random.uniform(1.0, 5.0, size=len(df)), 1))

# Display the updated DataFrame
display(df)

Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,...,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating
0,2023-01-01,1682,193,8,58.15,0.17,386.116,1,South,Cash,...,6,25,Male,Books,1.2,Returned,Negative,17.48,,3.3
1,2023-01-02,1692,190,6,19.43,0.22,90.9324,6,West,Credit Card,...,2,36,Other,Books,3.8,Not Returned,Negative,5.87,,1.6
2,2023-01-03,1974,172,5,50.32,0.16,211.344,7,South,Cash,...,6,38,Female,Food,2.2,Returned,Negative,8.27,FREESHIP,1.1
3,2023-01-04,1458,134,3,62.13,0.19,150.9759,9,North,Credit Card,...,6,33,Male,Books,2.0,Returned,Negative,17.48,WELCOME5,
4,2023-01-05,1390,198,3,98.34,0.26,218.3148,9,South,Credit Card,...,6,21,Male,Food,3.0,Returned,Negative,9.32,WELCOME5,1.3
5,2023-01-06,1131,117,9,78.59,0.16,594.1404,7,North,Online,...,6,57,Other,Electronics,2.2,Not Returned,Positive,,FREESHIP,4.9
6,2023-01-07,1499,143,6,86.45,0.09,472.017,2,North,Credit Card,...,4,56,Male,Books,5.0,Returned,Positive,6.31,,
7,2023-01-08,1626,130,6,20.23,0.22,94.6764,4,West,Online,...,3,24,Female,Electronics,3.6,Not Returned,Negative,10.11,SAVE10,3.0
8,2023-01-09,1129,134,6,39.85,0.17,198.453,6,North,Credit Card,...,2,19,Female,Books,3.1,Returned,Neutral,16.63,FREESHIP,
9,2023-01-10,1314,139,3,54.57,0.03,158.7987,7,South,Cash,...,3,65,Other,Electronics,5.0,Returned,Positive,17.69,WELCOME5,2.9


<div class="alert alert-success">

**Solution 02:**
</div>

In [5]:
# 2. Ensuring Consistent Data Types of All Columns
# Write a Pandas program to check and ensure that the data types of all columns are consistent.

# Check data types
data_types = df.dtypes
data_types

date                datetime64[ns]
customer_id                  int64
product_id                   int64
quantity                     int64
price                      float64
discount                   float64
revenue                    float64
store_id                     int64
region                      object
payment_method              object
is_member                     bool
delivery_days                int64
customer_age                 int64
gender                      object
product_category            object
rating                     float64
return_status               object
feedback                    object
shipping_cost              float64
coupon_code                 object
delivery_rating            float64
dtype: object