## Compare Data Completeness Over Time

**Description**: Analyze the trend of missing data in `"sales_data.csv"` over several months stored in a "date" column. Visualize missing data rates by month.

In [1]:
# Write your code from here
import pandas as pd
import numpy as np

# Create a sample DataFrame
data = {
    'date': ['2023-01-01', '2023-01-02', '2023-01-03', '2023-02-01', '2023-02-02', '2023-03-01'],
    'product_id': [101, 102, 103, 104, 105, 106],
    'quantity': [10, 5, np.nan, 20, 15, 12],  # Some missing values in 'quantity'
    'price': [500, 300, 200, 400, np.nan, 450]  # Some missing values in 'price'
}

# Create DataFrame
df = pd.DataFrame(data)

# Save to CSV
df.to_csv('sales_data.csv', index=False)

print("sales_data.csv file created successfully!")

sales_data.csv file created successfully!


In [2]:
import pandas as pd
import matplotlib.pyplot as plt

def load_sales_data(file_path):
    """Load sales data from CSV and check for necessary columns."""
    try:
        df = pd.read_csv(file_path)
        required_columns = ['date', 'quantity', 'price']
        
        # Check if all required columns exist
        if not all(col in df.columns for col in required_columns):
            raise ValueError(f"Missing required columns. Expected columns: {required_columns}")
        
        # Validate data types for 'quantity' and 'price'
        if not pd.api.types.is_numeric_dtype(df['quantity']) or not pd.api.types.is_numeric_dtype(df['price']):
            raise ValueError("'quantity' and 'price' columns must contain numeric values.")
        
        return df
    except FileNotFoundError:
        print(f"Error: The file {file_path} was not found.")
    except ValueError as e:
        print(f"Error: {e}")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")

def analyze_missing_data(df):
    """Analyze and visualize missing data trends over time."""
    # Convert 'date' column to datetime (optimized method)
    df['date'] = pd.to_datetime(df['date'], errors='coerce')

    # Check if any conversion failed
    if df['date'].isnull().any():
        print("Warning: Some dates could not be parsed.")
    
    # Group by month and count missing data
    df['month'] = df['date'].dt.to_period('M')
    missing_data = df.isnull().sum(axis=1)
    
    # Group by month and calculate missing data rate
    missing_data_by_month = df.groupby('month')[missing_data].mean()

    # Plotting the missing data rates
    plt.figure(figsize=(10, 6))
    missing_data_by_month.plot(kind='bar', color='orange')
    plt.title('Missing Data Rates by Month')
    plt.xlabel('Month')
    plt.ylabel('Missing Data Rate')
    plt.xticks(rotation=45)
    plt.show()

# Main flow
file_path = 'sales_data.csv'
df = load_sales_data(file_path)
if df is not None:
    analyze_missing_data(df)

KeyError: 'Columns not found: 0, 1'