[Pandas Data Validation](https://www.w3resource.com/python-exercises/pandas/pandas-data-validation.php)

In [1]:
import pandas as pd
import numpy as np

pd.set_option('display.max_columns', None)

<div class="alert alert-warning">

**1. Checking Missing Values in a DataFrame**

**Write a Pandas program to check for missing values in a DataFrame.**

</div>

In [2]:
# Create date range
date_range = pd.date_range(start='2023-01-01', periods=35, freq='D')

# Create a dictionary of data
data = {
    'date': date_range,
    'customer_id': np.random.randint(1000, 2000, size=35),
    'product_id': np.random.randint(100, 200, size=35),
    'quantity': np.random.randint(1, 10, size=35),
    'price': np.round(np.random.uniform(10.0, 100.0, size=35), 2),
    'discount': np.round(np.random.uniform(0, 0.3, size=35), 2),
    'revenue': lambda df: df['quantity'] * df['price'] * (1 - df['discount']),
    'store_id': np.random.randint(1, 10, size=35),
    'region': np.random.choice(['North', 'South', 'East', 'West'], size=35),
    'payment_method': np.random.choice(['Credit Card', 'Cash', 'Online'], size=35),
    'is_member': np.random.choice([True, False], size=35),
    'delivery_days': np.random.randint(1, 7, size=35),
    'customer_age': np.random.randint(18, 70, size=35),
    'gender': np.random.choice(['Male', 'Female', 'Other'], size=35),
    'product_category': np.random.choice(['Electronics', 'Clothing', 'Books', 'Food'], size=35),
    'rating': np.round(np.random.uniform(1.0, 5.0, size=35), 1),
    'return_status': np.random.choice(['Returned', 'Not Returned'], size=35),
    'feedback': np.random.choice(['Positive', 'Neutral', 'Negative'], size=35),
}

# Create DataFrame
df = pd.DataFrame(data)

# Compute revenue using the lambda function above
df['revenue'] = df['quantity'] * df['price'] * (1 - df['discount'])

# Add 'shipping_cost' with some missing values
df['shipping_cost'] = np.where(np.random.rand(len(df)) < 0.2, np.nan, np.round(np.random.uniform(5.0, 20.0, size=len(df)), 2))

# Add 'coupon_code' with some missing values
coupon_options = ['SAVE10', 'WELCOME5', 'FREESHIP', np.nan]
df['coupon_code'] = np.random.choice(coupon_options, size=len(df), p=[0.2, 0.2, 0.2, 0.4])

# Add 'delivery_rating' with some missing values
df['delivery_rating'] = np.where(np.random.rand(len(df)) < 0.3, np.nan, np.round(np.random.uniform(1.0, 5.0, size=len(df)), 1))

# Display the updated DataFrame
display(df)

Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,is_member,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating
0,2023-01-01,1672,150,2,20.95,0.23,32.263,6,North,Credit Card,False,1,22,Other,Electronics,2.8,Not Returned,Negative,,,
1,2023-01-02,1852,100,9,20.23,0.13,158.4009,2,East,Cash,False,1,56,Other,Food,3.2,Returned,Positive,17.02,FREESHIP,3.4
2,2023-01-03,1480,141,8,88.7,0.11,631.544,1,South,Credit Card,True,5,43,Female,Books,3.1,Not Returned,Negative,7.14,,2.5
3,2023-01-04,1167,179,5,70.86,0.16,297.612,9,North,Cash,True,2,65,Male,Books,1.5,Returned,Neutral,10.93,WELCOME5,1.5
4,2023-01-05,1654,198,2,34.17,0.16,57.4056,8,East,Cash,True,3,54,Other,Books,5.0,Returned,Neutral,,SAVE10,
5,2023-01-06,1667,199,4,51.38,0.22,160.3056,6,South,Cash,True,6,49,Other,Books,3.7,Returned,Positive,,,
6,2023-01-07,1460,160,2,20.39,0.17,33.8474,3,North,Cash,True,1,32,Male,Electronics,1.2,Returned,Positive,6.42,FREESHIP,
7,2023-01-08,1623,139,5,53.02,0.19,214.731,6,East,Online,True,1,35,Other,Books,2.3,Returned,Positive,5.29,,4.9
8,2023-01-09,1661,147,3,35.32,0.26,78.4104,6,South,Credit Card,True,6,63,Female,Books,4.3,Not Returned,Neutral,,WELCOME5,2.6
9,2023-01-10,1617,167,9,68.99,0.22,484.3098,3,North,Cash,True,1,55,Other,Food,1.1,Not Returned,Negative,13.6,SAVE10,


<div class="alert alert-success">

**Solution 01:**
</div>

In [3]:
# 1. Checking Missing Values in a DataFrame
# Write a Pandas program to check for missing values in a DataFrame.

# Check for missing values
missing_values = df.isnull().sum()
missing_values[missing_values > 0]

shipping_cost      10
delivery_rating    12
dtype: int64

<div class="alert alert-warning">

**2. Ensuring Consistent Data Types of All Columns**

**Write a Pandas program to check and ensure that the data types of all columns are consistent.**

</div>

In [4]:
# Create date range
date_range = pd.date_range(start='2023-01-01', periods=35, freq='D')

# Create a dictionary of data
data = {
    'date': date_range,
    'customer_id': np.random.randint(1000, 2000, size=35),
    'product_id': np.random.randint(100, 200, size=35),
    'quantity': np.random.randint(1, 10, size=35),
    'price': np.round(np.random.uniform(10.0, 100.0, size=35), 2),
    'discount': np.round(np.random.uniform(0, 0.3, size=35), 2),
    'revenue': lambda df: df['quantity'] * df['price'] * (1 - df['discount']),
    'store_id': np.random.randint(1, 10, size=35),
    'region': np.random.choice(['North', 'South', 'East', 'West'], size=35),
    'payment_method': np.random.choice(['Credit Card', 'Cash', 'Online'], size=35),
    'is_member': np.random.choice([True, False], size=35),
    'delivery_days': np.random.randint(1, 7, size=35),
    'customer_age': np.random.randint(18, 70, size=35),
    'gender': np.random.choice(['Male', 'Female', 'Other'], size=35),
    'product_category': np.random.choice(['Electronics', 'Clothing', 'Books', 'Food'], size=35),
    'rating': np.round(np.random.uniform(1.0, 5.0, size=35), 1),
    'return_status': np.random.choice(['Returned', 'Not Returned'], size=35),
    'feedback': np.random.choice(['Positive', 'Neutral', 'Negative'], size=35),
}

# Create DataFrame
df = pd.DataFrame(data)

# Compute revenue using the lambda function above
df['revenue'] = df['quantity'] * df['price'] * (1 - df['discount'])

# Add 'shipping_cost' with some missing values
df['shipping_cost'] = np.where(np.random.rand(len(df)) < 0.2, np.nan, np.round(np.random.uniform(5.0, 20.0, size=len(df)), 2))

# Add 'coupon_code' with some missing values
coupon_options = ['SAVE10', 'WELCOME5', 'FREESHIP', np.nan]
df['coupon_code'] = np.random.choice(coupon_options, size=len(df), p=[0.2, 0.2, 0.2, 0.4])

# Add 'delivery_rating' with some missing values
df['delivery_rating'] = np.where(np.random.rand(len(df)) < 0.3, np.nan, np.round(np.random.uniform(1.0, 5.0, size=len(df)), 1))

# Display the updated DataFrame
display(df)

Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,is_member,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating
0,2023-01-01,1937,110,6,58.92,0.24,268.6752,4,South,Online,False,3,41,Female,Clothing,2.6,Returned,Neutral,5.92,FREESHIP,4.1
1,2023-01-02,1998,159,9,85.09,0.13,666.2547,5,South,Credit Card,False,1,34,Female,Food,3.1,Not Returned,Neutral,8.5,WELCOME5,2.0
2,2023-01-03,1375,120,4,56.37,0.26,166.8552,5,South,Online,True,4,41,Other,Electronics,3.2,Returned,Positive,13.82,,
3,2023-01-04,1883,159,9,17.85,0.24,122.094,2,North,Cash,False,5,55,Male,Books,1.5,Not Returned,Negative,,FREESHIP,
4,2023-01-05,1707,103,1,62.62,0.16,52.6008,8,West,Credit Card,False,3,58,Other,Food,2.9,Returned,Neutral,16.75,,
5,2023-01-06,1125,155,2,17.24,0.1,31.032,3,West,Credit Card,False,4,64,Female,Food,3.5,Returned,Negative,13.47,,
6,2023-01-07,1779,128,6,45.94,0.27,201.2172,6,West,Cash,False,4,43,Female,Clothing,2.5,Returned,Negative,15.79,SAVE10,
7,2023-01-08,1050,119,5,41.83,0.21,165.2285,1,East,Credit Card,False,6,35,Other,Clothing,3.8,Not Returned,Negative,12.99,SAVE10,5.0
8,2023-01-09,1380,154,7,67.33,0.24,358.1956,9,East,Cash,False,3,40,Female,Electronics,3.2,Returned,Negative,9.1,,3.1
9,2023-01-10,1817,134,8,39.57,0.14,272.2416,3,West,Online,False,4,57,Female,Electronics,4.0,Returned,Negative,,FREESHIP,


<div class="alert alert-success">

**Solution 02:**
</div>

In [5]:
# 2. Ensuring Consistent Data Types of All Columns
# Write a Pandas program to check and ensure that the data types of all columns are consistent.

# Check data types
data_types = df.dtypes
data_types

date                datetime64[ns]
customer_id                  int64
product_id                   int64
quantity                     int64
price                      float64
discount                   float64
revenue                    float64
store_id                     int64
region                      object
payment_method              object
is_member                     bool
delivery_days                int64
customer_age                 int64
gender                      object
product_category            object
rating                     float64
return_status               object
feedback                    object
shipping_cost              float64
coupon_code                 object
delivery_rating            float64
dtype: object

<div class="alert alert-warning">

**3. Validating the Data Type of a Specific Column**

**Write a Pandas program to validate the data type of a specific column in a DataFrame.**

</div>

In [6]:
# Create date range
date_range = pd.date_range(start='2023-01-01', periods=35, freq='D')

# Create a dictionary of data
data = {
    'date': date_range,
    'customer_id': np.random.randint(1000, 2000, size=35),
    'product_id': np.random.randint(100, 200, size=35),
    'quantity': np.random.randint(1, 10, size=35),
    'price': np.round(np.random.uniform(10.0, 100.0, size=35), 2),
    'discount': np.round(np.random.uniform(0, 0.3, size=35), 2),
    'revenue': lambda df: df['quantity'] * df['price'] * (1 - df['discount']),
    'store_id': np.random.randint(1, 10, size=35),
    'region': np.random.choice(['North', 'South', 'East', 'West'], size=35),
    'payment_method': np.random.choice(['Credit Card', 'Cash', 'Online'], size=35),
    'is_member': np.random.choice([True, False], size=35),
    'delivery_days': np.random.randint(1, 7, size=35),
    'customer_age': np.random.randint(18, 70, size=35),
    'gender': np.random.choice(['Male', 'Female', 'Other'], size=35),
    'product_category': np.random.choice(['Electronics', 'Clothing', 'Books', 'Food'], size=35),
    'rating': np.round(np.random.uniform(1.0, 5.0, size=35), 1),
    'return_status': np.random.choice(['Returned', 'Not Returned'], size=35),
    'feedback': np.random.choice(['Positive', 'Neutral', 'Negative'], size=35),
}

# Create DataFrame
df = pd.DataFrame(data)

# Compute revenue using the lambda function above
df['revenue'] = df['quantity'] * df['price'] * (1 - df['discount'])

# Add 'shipping_cost' with some missing values
df['shipping_cost'] = np.where(np.random.rand(len(df)) < 0.2, np.nan, np.round(np.random.uniform(5.0, 20.0, size=len(df)), 2))

# Add 'coupon_code' with some missing values
coupon_options = ['SAVE10', 'WELCOME5', 'FREESHIP', np.nan]
df['coupon_code'] = np.random.choice(coupon_options, size=len(df), p=[0.2, 0.2, 0.2, 0.4])

# Add 'delivery_rating' with some missing values
df['delivery_rating'] = np.where(np.random.rand(len(df)) < 0.3, np.nan, np.round(np.random.uniform(1.0, 5.0, size=len(df)), 1))

# Display the updated DataFrame
display(df)

Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,is_member,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating
0,2023-01-01,1440,197,6,87.42,0.04,503.5392,8,West,Cash,False,2,35,Other,Clothing,3.9,Not Returned,Negative,,SAVE10,2.3
1,2023-01-02,1677,140,2,47.33,0.27,69.1018,7,East,Cash,True,5,52,Female,Electronics,4.8,Returned,Neutral,15.4,,
2,2023-01-03,1704,150,7,85.58,0.2,479.248,7,West,Credit Card,True,3,63,Female,Food,2.6,Not Returned,Negative,17.84,,2.9
3,2023-01-04,1546,120,5,49.04,0.11,218.228,6,South,Credit Card,False,4,37,Female,Food,2.0,Not Returned,Positive,,,1.3
4,2023-01-05,1270,128,5,14.04,0.23,54.054,5,West,Cash,False,1,57,Other,Food,1.5,Returned,Positive,,FREESHIP,3.9
5,2023-01-06,1806,187,9,22.47,0.22,157.7394,8,East,Cash,False,1,49,Male,Electronics,1.3,Not Returned,Positive,,WELCOME5,
6,2023-01-07,1527,102,9,67.47,0.18,497.9286,5,South,Online,False,1,44,Female,Clothing,3.3,Returned,Positive,19.53,,4.1
7,2023-01-08,1352,112,8,68.5,0.18,449.36,1,East,Online,False,6,34,Male,Food,2.6,Not Returned,Neutral,18.21,FREESHIP,4.6
8,2023-01-09,1030,129,3,99.58,0.11,265.8786,4,East,Online,True,2,49,Female,Books,4.6,Returned,Positive,18.27,FREESHIP,
9,2023-01-10,1565,171,2,50.64,0.22,78.9984,1,West,Credit Card,False,5,29,Male,Electronics,2.5,Not Returned,Positive,9.3,WELCOME5,


<div class="alert alert-success">

**Solution 03:**
</div>

In [7]:
# 3. Validating the Data Type of a Specific Column
# Write a Pandas program to validate the data type of a specific column in a DataFrame.

# Check if 'price' column is of type float
is_of_type = 'float64'
is_price_float = df['price'].dtype == is_of_type
is_price_float

# Validate the data type of 'price' column
if is_price_float:
    print("The 'price' column is of type {}.".format(df['price'].dtype))
else:
    print("The 'price' column is not of type {}, it is {}.".format(is_of_type, df['price'].dtype))

The 'price' column is of type float64.


<div class="alert alert-warning">

**4. Checking Duplicate Rows in a DataFrame**

**Write a Pandas program to check duplicate rows in a DataFrame.**

</div>

In [8]:
# Create date range
date_range = pd.date_range(start='2023-01-01', periods=35, freq='D')

# Create a dictionary of data
data = {
    'date': date_range,
    'customer_id': np.random.randint(1000, 2000, size=35),
    'product_id': np.random.randint(100, 200, size=35),
    'quantity': np.random.randint(1, 10, size=35),
    'price': np.round(np.random.uniform(10.0, 100.0, size=35), 2),
    'discount': np.round(np.random.uniform(0, 0.3, size=35), 2),
    'revenue': lambda df: df['quantity'] * df['price'] * (1 - df['discount']),
    'store_id': np.random.randint(1, 10, size=35),
    'region': np.random.choice(['North', 'South', 'East', 'West'], size=35),
    'payment_method': np.random.choice(['Credit Card', 'Cash', 'Online'], size=35),
    'is_member': np.random.choice([True, False], size=35),
    'delivery_days': np.random.randint(1, 7, size=35),
    'customer_age': np.random.randint(18, 70, size=35),
    'gender': np.random.choice(['Male', 'Female', 'Other'], size=35),
    'product_category': np.random.choice(['Electronics', 'Clothing', 'Books', 'Food'], size=35),
    'rating': np.round(np.random.uniform(1.0, 5.0, size=35), 1),
    'return_status': np.random.choice(['Returned', 'Not Returned'], size=35),
    'feedback': np.random.choice(['Positive', 'Neutral', 'Negative'], size=35),
}

# Create DataFrame
df = pd.DataFrame(data)

# Compute revenue using the lambda function above
df['revenue'] = df['quantity'] * df['price'] * (1 - df['discount'])

# Add 'shipping_cost' with some missing values
df['shipping_cost'] = np.where(np.random.rand(len(df)) < 0.2, np.nan, np.round(np.random.uniform(5.0, 20.0, size=len(df)), 2))

# Add 'coupon_code' with some missing values
coupon_options = ['SAVE10', 'WELCOME5', 'FREESHIP', np.nan]
df['coupon_code'] = np.random.choice(coupon_options, size=len(df), p=[0.2, 0.2, 0.2, 0.4])

# Add 'delivery_rating' with some missing values
df['delivery_rating'] = np.where(np.random.rand(len(df)) < 0.3, np.nan, np.round(np.random.uniform(1.0, 5.0, size=len(df)), 1))

# Add 3 duplicate rows
df = pd.concat([df, df.sample(n=3, random_state=42)], ignore_index=True)

# Display the updated DataFrame
display(df)

Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,is_member,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating
0,2023-01-01,1482,110,1,48.76,0.03,47.2972,6,East,Cash,True,1,24,Other,Books,2.3,Not Returned,Positive,9.75,,4.0
1,2023-01-02,1586,192,6,47.19,0.06,266.1516,9,East,Cash,True,5,64,Other,Electronics,3.2,Not Returned,Negative,17.69,,4.2
2,2023-01-03,1717,185,9,21.36,0.14,165.3264,7,North,Online,False,6,27,Other,Clothing,2.2,Not Returned,Negative,12.11,SAVE10,4.9
3,2023-01-04,1116,110,2,88.25,0.2,141.2,3,North,Cash,False,1,26,Other,Books,3.1,Returned,Negative,17.88,WELCOME5,3.4
4,2023-01-05,1900,122,1,67.99,0.01,67.3101,2,South,Cash,False,6,30,Male,Food,1.3,Returned,Positive,13.93,,3.3
5,2023-01-06,1563,154,4,62.78,0.12,220.9856,8,East,Credit Card,False,3,45,Female,Books,1.6,Not Returned,Negative,13.08,WELCOME5,
6,2023-01-07,1224,127,1,38.45,0.26,28.453,7,East,Cash,False,1,35,Other,Books,2.5,Returned,Neutral,9.03,,
7,2023-01-08,1042,151,3,45.61,0.25,102.6225,1,East,Cash,False,2,46,Other,Food,1.2,Returned,Positive,6.5,,1.1
8,2023-01-09,1251,103,1,79.77,0.26,59.0298,5,West,Cash,False,3,43,Female,Books,2.1,Returned,Positive,18.59,,1.3
9,2023-01-10,1336,136,2,87.43,0.11,155.6254,9,East,Cash,False,2,21,Male,Food,4.6,Returned,Negative,,,1.2


<div class="alert alert-success">

**Solution 04:**
</div>

In [9]:
# 4. Checking Duplicate Rows in a DataFrame
# Write a Pandas program to check duplicate rows in a DataFrame.

# Check for duplicate rows
duplicate_rows = df[df.duplicated()]
duplicate_rows

Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,is_member,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating
35,2023-01-27,1464,176,3,47.4,0.07,132.246,5,East,Cash,False,6,52,Other,Clothing,1.8,Returned,Positive,8.95,SAVE10,4.7
36,2023-01-14,1321,155,1,30.75,0.1,27.675,3,South,Online,False,2,50,Other,Electronics,4.4,Not Returned,Neutral,19.64,,
37,2023-01-25,1947,104,2,17.13,0.14,29.4636,8,West,Credit Card,True,1,51,Other,Clothing,4.2,Not Returned,Positive,12.32,,


<div class="alert alert-warning">

**5. Removing Duplicate Rows from a DataFrame**

**Write a Pandas program to remove duplicate rows from a DataFrame.**

</div>

In [10]:
# Create date range
date_range = pd.date_range(start='2023-01-01', periods=35, freq='D')

# Create a dictionary of data
data = {
    'date': date_range,
    'customer_id': np.random.randint(1000, 2000, size=35),
    'product_id': np.random.randint(100, 200, size=35),
    'quantity': np.random.randint(1, 10, size=35),
    'price': np.round(np.random.uniform(10.0, 100.0, size=35), 2),
    'discount': np.round(np.random.uniform(0, 0.3, size=35), 2),
    'revenue': lambda df: df['quantity'] * df['price'] * (1 - df['discount']),
    'store_id': np.random.randint(1, 10, size=35),
    'region': np.random.choice(['North', 'South', 'East', 'West'], size=35),
    'payment_method': np.random.choice(['Credit Card', 'Cash', 'Online'], size=35),
    'is_member': np.random.choice([True, False], size=35),
    'delivery_days': np.random.randint(1, 7, size=35),
    'customer_age': np.random.randint(18, 70, size=35),
    'gender': np.random.choice(['Male', 'Female', 'Other'], size=35),
    'product_category': np.random.choice(['Electronics', 'Clothing', 'Books', 'Food'], size=35),
    'rating': np.round(np.random.uniform(1.0, 5.0, size=35), 1),
    'return_status': np.random.choice(['Returned', 'Not Returned'], size=35),
    'feedback': np.random.choice(['Positive', 'Neutral', 'Negative'], size=35),
}

# Create DataFrame
df = pd.DataFrame(data)

# Compute revenue using the lambda function above
df['revenue'] = df['quantity'] * df['price'] * (1 - df['discount'])

# Add 'shipping_cost' with some missing values
df['shipping_cost'] = np.where(np.random.rand(len(df)) < 0.2, np.nan, np.round(np.random.uniform(5.0, 20.0, size=len(df)), 2))

# Add 'coupon_code' with some missing values
coupon_options = ['SAVE10', 'WELCOME5', 'FREESHIP', np.nan]
df['coupon_code'] = np.random.choice(coupon_options, size=len(df), p=[0.2, 0.2, 0.2, 0.4])

# Add 'delivery_rating' with some missing values
df['delivery_rating'] = np.where(np.random.rand(len(df)) < 0.3, np.nan, np.round(np.random.uniform(1.0, 5.0, size=len(df)), 1))

# Add 3 duplicate rows
df = pd.concat([df, df.sample(n=3, random_state=42)], ignore_index=True)

# Display the updated DataFrame
display(df)

Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,is_member,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating
0,2023-01-01,1930,103,2,41.86,0.22,65.3016,1,West,Online,True,3,22,Other,Electronics,2.0,Not Returned,Negative,17.27,FREESHIP,
1,2023-01-02,1895,138,3,36.57,0.11,97.6419,7,South,Cash,True,4,54,Female,Clothing,2.9,Not Returned,Positive,15.03,WELCOME5,2.4
2,2023-01-03,1728,185,2,31.09,0.09,56.5838,8,South,Online,True,6,39,Female,Books,4.5,Returned,Neutral,8.27,,1.7
3,2023-01-04,1297,180,9,34.63,0.06,292.9698,2,North,Credit Card,True,2,35,Male,Books,1.9,Returned,Neutral,13.59,FREESHIP,4.2
4,2023-01-05,1863,119,7,62.39,0.24,331.9148,6,North,Cash,True,2,43,Other,Books,4.2,Not Returned,Negative,,FREESHIP,2.4
5,2023-01-06,1813,139,1,72.02,0.06,67.6988,7,East,Cash,False,4,42,Other,Electronics,1.3,Returned,Neutral,,FREESHIP,
6,2023-01-07,1903,194,6,74.09,0.07,413.4222,9,West,Online,False,2,26,Male,Books,4.9,Not Returned,Negative,9.46,,3.5
7,2023-01-08,1544,141,5,46.36,0.12,203.984,6,West,Cash,True,4,22,Other,Food,3.2,Not Returned,Positive,17.02,,2.1
8,2023-01-09,1213,146,5,11.27,0.09,51.2785,3,West,Cash,False,2,64,Female,Food,3.4,Returned,Neutral,7.76,SAVE10,4.9
9,2023-01-10,1535,116,4,64.25,0.15,218.45,2,West,Cash,False,6,26,Other,Clothing,1.1,Returned,Neutral,,,


<div class="alert alert-success">

**Solution 05:**
</div>

In [11]:
# 5. Removing Duplicate Rows from a DataFrame
# Write a Pandas program to remove duplicate rows from a DataFrame.

print("Shape of DataFrame before removing duplicates:\n", df.shape)
# Remove duplicate rows
df = df.drop_duplicates()
print("Shape of DataFrame after removing duplicates:\n", df.shape)

Shape of DataFrame before removing duplicates:
 (38, 21)
Shape of DataFrame after removing duplicates:
 (35, 21)


<div class="alert alert-warning">

**6. Validating Data Based on Custom Conditions**

**Write a Pandas program to validate data based on custom conditions.**

</div>

In [12]:
# Create date range
date_range = pd.date_range(start='2023-01-01', periods=35, freq='D')

# Create a dictionary of data
data = {
    'date': date_range,
    'customer_id': np.random.randint(1000, 2000, size=35),
    'product_id': np.random.randint(100, 200, size=35),
    'quantity': np.random.randint(1, 10, size=35),
    'price': np.round(np.random.uniform(10.0, 100.0, size=35), 2),
    'discount': np.round(np.random.uniform(0, 0.3, size=35), 2),
    'revenue': lambda df: df['quantity'] * df['price'] * (1 - df['discount']),
    'store_id': np.random.randint(1, 10, size=35),
    'region': np.random.choice(['North', 'South', 'East', 'West'], size=35),
    'payment_method': np.random.choice(['Credit Card', 'Cash', 'Online'], size=35),
    'is_member': np.random.choice([True, False], size=35),
    'delivery_days': np.random.randint(1, 7, size=35),
    'customer_age': np.random.randint(18, 70, size=35),
    'gender': np.random.choice(['Male', 'Female', 'Other'], size=35),
    'product_category': np.random.choice(['Electronics', 'Clothing', 'Books', 'Food'], size=35),
    'rating': np.round(np.random.uniform(1.0, 5.0, size=35), 1),
    'return_status': np.random.choice(['Returned', 'Not Returned'], size=35),
    'feedback': np.random.choice(['Positive', 'Neutral', 'Negative'], size=35),
}

# Create DataFrame
df = pd.DataFrame(data)

# Compute revenue using the lambda function above
df['revenue'] = df['quantity'] * df['price'] * (1 - df['discount'])

# Add 'shipping_cost' with some missing values
df['shipping_cost'] = np.where(np.random.rand(len(df)) < 0.2, np.nan, np.round(np.random.uniform(5.0, 20.0, size=len(df)), 2))

# Add 'coupon_code' with some missing values
coupon_options = ['SAVE10', 'WELCOME5', 'FREESHIP', np.nan]
df['coupon_code'] = np.random.choice(coupon_options, size=len(df), p=[0.2, 0.2, 0.2, 0.4])

# Add 'delivery_rating' with some missing values
df['delivery_rating'] = np.where(np.random.rand(len(df)) < 0.3, np.nan, np.round(np.random.uniform(1.0, 5.0, size=len(df)), 1))

# Add 3 duplicate rows
df = pd.concat([df, df.sample(n=3, random_state=42)], ignore_index=True)

# Add 3 more rows with negative price
df = pd.concat([df, pd.DataFrame({
    'date': pd.date_range(start='2023-02-05', periods=3, freq='D'),
    'customer_id': [1500, 1501, 1502],
    'product_id': [150, 151, 152],
    'quantity': [2, 3, 1],
    'price': [-20.00, -15.00, -30.00],  # Negative prices
    'discount': [0.1, 0.2, 0.15],
    'revenue': [0, 0, 0],  # Placeholder; will be recalculated
    'store_id': [5, 6, 7],
    'region': ['North', 'South', 'East'],
    'payment_method': ['Credit Card', 'Cash', 'Online'],
    'is_member': [True, False, True],
    'delivery_days': [3, 4, 2],
    'customer_age': [25, 34, 45],
    'gender': ['Male', 'Female', 'Other'],
    'product_category': ['Electronics', 'Clothing', 'Books'],
    'rating': [4.5, 3.0, 5.0],
    'return_status': ['Not Returned', 'Returned', 'Not Returned'],
    'feedback': ['Positive', 'Neutral', 'Negative'],
})], ignore_index=True)

# Display the updated DataFrame
display(df)

Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,is_member,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating
0,2023-01-01,1819,173,6,64.9,0.19,315.414,7,East,Online,False,5,43,Female,Food,1.1,Not Returned,Positive,7.32,SAVE10,4.7
1,2023-01-02,1977,116,1,52.17,0.27,38.0841,7,East,Cash,False,5,64,Female,Clothing,1.1,Not Returned,Neutral,18.0,,2.0
2,2023-01-03,1763,168,8,12.34,0.03,95.7584,1,East,Cash,False,1,27,Female,Electronics,3.0,Not Returned,Negative,19.15,,
3,2023-01-04,1888,141,2,47.76,0.2,76.416,1,West,Cash,True,3,69,Male,Clothing,3.0,Returned,Negative,10.47,WELCOME5,
4,2023-01-05,1632,176,6,51.66,0.06,291.3624,5,South,Online,False,2,41,Other,Clothing,2.2,Not Returned,Positive,,FREESHIP,3.4
5,2023-01-06,1110,176,5,55.07,0.07,256.0755,1,West,Cash,False,6,54,Other,Books,4.4,Not Returned,Neutral,9.02,,3.7
6,2023-01-07,1207,189,1,41.1,0.01,40.689,1,North,Credit Card,True,5,52,Male,Electronics,3.6,Not Returned,Neutral,5.79,FREESHIP,2.6
7,2023-01-08,1284,116,4,28.12,0.05,106.856,4,South,Online,True,5,49,Female,Books,4.0,Returned,Negative,12.32,FREESHIP,
8,2023-01-09,1429,143,8,12.06,0.08,88.7616,3,West,Credit Card,False,2,61,Other,Electronics,4.7,Returned,Positive,19.83,,3.9
9,2023-01-10,1029,142,1,18.46,0.15,15.691,9,East,Cash,True,4,52,Female,Electronics,1.9,Not Returned,Positive,11.18,SAVE10,


<div class="alert alert-success">

**Solution 06:**
</div>

In [13]:
# 6. Validating Data Based on Custom Conditions
# Write a Pandas program to validate data based on custom conditions.

# Check if 'price' is greater than 0
valid_price = df['price'] > 0
if valid_price.all():
    print("All prices are valid.")
else:
    print("Some prices are invalid:")
    display(df[~valid_price])

Some prices are invalid:


Unnamed: 0,date,customer_id,product_id,quantity,price,discount,revenue,store_id,region,payment_method,is_member,delivery_days,customer_age,gender,product_category,rating,return_status,feedback,shipping_cost,coupon_code,delivery_rating
38,2023-02-05,1500,150,2,-20.0,0.1,0.0,5,North,Credit Card,True,3,25,Male,Electronics,4.5,Not Returned,Positive,,,
39,2023-02-06,1501,151,3,-15.0,0.2,0.0,6,South,Cash,False,4,34,Female,Clothing,3.0,Returned,Neutral,,,
40,2023-02-07,1502,152,1,-30.0,0.15,0.0,7,East,Online,True,2,45,Other,Books,5.0,Not Returned,Negative,,,
