In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

# Build a synthetic, deliberately messy customer-orders DataFrame (`df`).
# Every value below is drawn from NumPy's global RNG, so the generated data
# depends on the seed AND on the order in which the draws are made.
np.random.seed(42)

# Number of base rows generated before blank/duplicate rows are injected
n_rows = 1000

# Random order dates: start_date plus a whole-day offset drawn from
# [0, (end_date - start_date).days) = [0, 425), i.e. 2024-01-01 .. 2025-02-28
start_date = datetime(2024, 1, 1)
end_date = datetime(2025, 3, 1)
dates = [start_date + timedelta(days=np.random.randint(0, (end_date - start_date).days)) for _ in range(n_rows)]

# Customer IDs: a random number in [1000, 9999) rendered with one of five
# inconsistent formats (e.g. 'CUS-1234', 'C1234', 'Customer 1234', bare '1234')
customer_formats = ['CUS-{}', 'C{}', 'CUST-{}', 'Customer {}', '{}']
customer_ids = [np.random.choice(customer_formats).format(np.random.randint(1000, 9999)) for _ in range(n_rows)]

# Email addresses of the form user<NNN>@<domain>, with injected defects
email_domains = ['gmail.com', 'yahoo.com', 'outlook.com', 'hotmail.com', 'company.com']
emails = []
for i in range(n_rows):
    username = f"user{np.random.randint(100, 999)}"
    domain = np.random.choice(email_domains)
    # At most one defect per address. Each branch draws a fresh random number,
    # so a later branch is only tried when the earlier draws were >= 0.05.
    if np.random.random() < 0.05:  # Missing @ symbol
        emails.append(f"{username}{domain}")
    elif np.random.random() < 0.05:  # Leading and trailing space
        emails.append(f" {username}@{domain} ")
    elif np.random.random() < 0.05:  # Typo: 'com' in the domain becomes 'cm'
        emails.append(f"{username}@{domain.replace('com', 'cm')}")
    else:
        emails.append(f"{username}@{domain}")

# Product IDs 'PROD-<NNN>'; a draw <= 0.03 yields NaN instead (missing value)
product_ids = [f"PROD-{np.random.randint(100, 999)}" if np.random.random() > 0.03 else np.nan for _ in range(n_rows)]

# Quantities in [1, 10); a draw <= 0.02 yields an outlier in [100, 1000) instead
quantities = [np.random.randint(1, 10) if np.random.random() > 0.02 else np.random.randint(100, 1000) for _ in range(n_rows)]

# Prices drawn uniformly from [9.99, 199.99), stored with mixed Python types
prices = []
for _ in range(n_rows):
    price = np.random.uniform(9.99, 199.99)
    if np.random.random() < 0.02:  # Negative price
        price = -price
    # The sign flip above is independent of the format chosen below, so a
    # negative price can also end up as a '$-12.34' string or a negative int.
    if np.random.random() < 0.1:  # String format with a '$' prefix
        prices.append(f"${price:.2f}")
    elif np.random.random() < 0.1:  # Integer format (int() truncates the cents)
        prices.append(int(price))
    else:
        prices.append(price)

# Shipping status: four logical states spelled with inconsistent case and
# punctuation ('In Transit' / 'in transit' / 'In-Transit', ...)
status_options = ['Shipped', 'shipped', 'SHIPPED', 'In Transit', 'in transit', 'In-Transit', 'Delivered', 'delivered', 'DELIVERED', 'Pending', 'pending']
shipping_status = [np.random.choice(status_options) for _ in range(n_rows)]

# Assemble the columns into the raw DataFrame
df = pd.DataFrame({
    'order_date': dates,
    'customer_id': customer_ids,
    'email': emails,
    'product_id': product_ids,
    'quantity': quantities,
    'price': prices,
    'shipping_status': shipping_status
})

# Blank out 5 distinct rows entirely (every column set to NaN).
# Writing NaN into 'quantity' is why that column prints as float (5.0, 9.0, ...).
blank_indices = np.random.choice(range(n_rows), size=5, replace=False)
for idx in blank_indices:
    df.loc[idx, :] = np.nan

# Append copies of 10 distinct existing rows, giving n_rows + 10 rows in total.
# The sample is taken over all rows, so a blanked row may be among the copies.
dup_indices = np.random.choice(range(n_rows), size=10, replace=False)
df = pd.concat([df, df.loc[dup_indices]], ignore_index=True)

# Print the first few rows to see the data
print(df.head())

  order_date    customer_id                  email product_id  quantity  \
0 2024-04-12           7376    user208@hotmail.com   PROD-642       5.0   
1 2024-12-14  Customer 3393     user349company.com   PROD-626       9.0   
2 2024-09-27          C8240   user958@company.com    PROD-645     874.0   
3 2024-04-16          C7190      user951@yahoo.com   PROD-942       1.0   
4 2024-03-12       CUS-7493      user519@yahoo.com   PROD-115       7.0   

        price shipping_status  
0   27.347234       DELIVERED  
1   99.343948         Shipped  
2   77.172318      In Transit  
3  147.403597         Shipped  
4         159       delivered  


In [2]:
# Drop rows in which every column is missing (the fully blank rows).
# .copy() makes df_clean an independent frame: the cells that follow assign
# columns into it, and doing that on an object derived from `df` can raise
# SettingWithCopyWarning and leaves it unclear which frame was modified.
df_clean = df.dropna(how='all').copy()

In [3]:
# Normalise shipping_status text: lowercase it, then trim surrounding
# whitespace, so 'Shipped' / 'SHIPPED' / 'shipped' become one spelling.
status_lowered = df_clean['shipping_status'].str.lower()
status_trimmed = status_lowered.str.strip()
df_clean.loc[:, 'shipping_status'] = status_trimmed

In [4]:
# Display the normalised status column (the notebook renders the cell's last expression).
df_clean.loc[:, 'shipping_status']

Unnamed: 0,shipping_status
0,delivered
1,shipped
2,in transit
3,shipped
4,delivered
...,...
1005,delivered
1006,shipped
1007,in transit
1008,in transit


In [5]:
# Standardise every customer ID to the form 'CUS-<digits>'.
# expand=False makes str.extract return a Series for the single capture group.
# With the default (expand=True) it returns a one-column DataFrame labelled 0;
# assigning that through .loc aligns on column labels, finds no 'customer_id'
# column in the value, and writes NaN into the whole column.
# IDs that contain no digits at all fall back to '0000'.
customer_digits = df_clean['customer_id'].str.extract(r'(\d+)', expand=False).fillna('0000')
df_clean.loc[:, 'customer_id'] = 'CUS-' + customer_digits

In [6]:
# Convert the mixed-type price column (floats, ints, '$12.34' strings) to numbers.
# - regex=False: '$' is a regex end-of-string anchor, so it must be removed as
#   a literal character; the default for `regex` differs between pandas versions.
# - errors='coerce': anything that still cannot be parsed becomes NaN.
# - assign() replaces the column, so the result has a numeric dtype. Writing via
#   `df_clean.loc[:, 'price'] = ...` sets values into the existing object-dtype
#   column in place on recent pandas and would leave the dtype as object.
# NOTE(review): negative prices are converted as-is; decide separately whether
# they should be dropped, flagged, or made positive.
price_text = df_clean['price'].astype(str).str.replace('$', '', regex=False)
df_clean = df_clean.assign(price=pd.to_numeric(price_text, errors='coerce'))

In [7]:
# Clean e-mail addresses: trim surrounding whitespace, then restore a missing '@'.
# The pattern is anchored at both ends and its first group cannot contain '@',
# so it only matches addresses that have no '@' at all and end in one of the
# known provider domains; addresses that already contain '@' are left untouched.
# The lazy first group takes the shortest username that leaves a whole domain,
# e.g. 'user349company.com' -> 'user349@company.com'.
# regex=True is explicit because str.replace treats the pattern as a literal
# string by default in pandas >= 2.0.
# NOTE(review): domain typos such as '.cm' are not corrected here.
missing_at_pattern = r'^([^@\s]+?)((?:gmail|yahoo|outlook|hotmail|company)\.com)$'
df_clean.loc[:, 'email'] = (
    df_clean['email']
    .str.strip()
    .str.replace(missing_at_pattern, r'\1@\2', regex=True)
)

In [8]:
# Cap quantity outliers at the upper IQR fence: Q3 + 1.5 * (Q3 - Q1).
# Values above the fence are replaced by the fence; nothing is clipped from below.
quantity_q1 = df_clean['quantity'].quantile(0.25)
quantity_q3 = df_clean['quantity'].quantile(0.75)
upper_fence = quantity_q3 + 1.5 * (quantity_q3 - quantity_q1)
df_clean.loc[:, 'quantity'] = df_clean['quantity'].clip(upper=upper_fence)

In [9]:
# Map each status spelling onto its canonical label. Only the two
# 'in transit' variants actually change; the other entries map to themselves.
canonical_status = {
    'in transit': 'in_transit',
    'in-transit': 'in_transit',
    'shipped': 'shipped',
    'delivered': 'delivered',
    'pending': 'pending',
}
df_clean.loc[:, 'shipping_status'] = df_clean['shipping_status'].replace(canonical_status)

In [10]:
# Treat rows sharing customer, order date and product as the same order
# and keep only the first occurrence of each.
order_key_columns = ['customer_id', 'order_date', 'product_id']
df_clean = df_clean.drop_duplicates(subset=order_key_columns, keep='first')

In [11]:
# Flag addresses that look like <local>@<domain>.<tld>, where the TLD is at
# least two letters. The pattern is anchored, so the whole value must match.
email_shape = r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$'
email_looks_valid = df_clean['email'].str.contains(email_shape, regex=True)
df_clean['is_valid_email'] = email_looks_valid


In [12]:
# Forward-fill missing values within each customer's orders, in date order.
# GroupBy.ffill() returns only the non-key columns, so taking its result
# directly would drop 'customer_id' from df_clean. The key column is
# re-attached (aligned on the row index) and the original column order restored.
orders_by_date = df_clean.sort_values('order_date')
filled_values = orders_by_date.groupby('customer_id').ffill()
df_clean = filled_values.assign(customer_id=orders_by_date['customer_id'])[list(orders_by_date.columns)]