# In [2]:
# Common Data Errors Examples

# 1. Missing Data:
# Task 1: Review a dataset where some customer emails are missing. Identify how
# many records are incomplete.
# Task 2: Examine a sales dataset with missing transaction dates and determine the
# percentage of missing data.
# Task 3: Identify missing department information in an employee registry.


# Task 1: number of records that have no customer email.
email_is_missing = df["email"].isna()
missing_emails = email_is_missing.sum()
print(f"Missing email records: {missing_emails}")

# Task 2: share of rows lacking a transaction date, as a percentage.
# The mean of a boolean mask is the fraction of True values.
date_is_missing = df["transaction_date"].isna()
missing_dates = date_is_missing.mean() * 100
print(f"Percentage of missing transaction dates: {missing_dates:.2f}%")

# Task 3: number of rows with no department recorded.
department_is_missing = df["department"].isna()
missing_departments = department_is_missing.sum()
print(f"Missing department entries: {missing_departments}")



# 2. Duplicate Data:
# Task 1: Analyze a customer dataset with duplicate entries and count the number of
# duplicates.
# Task 2: Review supplier data and identify any repeated supplier names.
# Task 3: Examine a product inventory list for duplicates in product IDs.

# Task 1: rows that repeat an earlier row in every column. `duplicated()`
# marks each occurrence after the first, so the sum is the surplus row count.
repeated_rows = df.duplicated()
duplicate_customers = repeated_rows.sum()
print(f"Duplicate records in customer dataset: {duplicate_customers}")

# Task 2: supplier names that already appeared earlier in the column.
repeated_suppliers = df["supplier_name"].duplicated()
duplicate_supplier_names = repeated_suppliers.sum()
print(f"Duplicate supplier names: {duplicate_supplier_names}")

# Task 3: product IDs that already appeared earlier in the column.
repeated_product_ids = df["product_id"].duplicated()
duplicate_product_ids = repeated_product_ids.sum()
print(f"Duplicate product IDs: {duplicate_product_ids}")





# 3. Inconsistent Formatting:
# Task 1: Spot inconsistencies in date formats (e.g., DD/MM/YYYY vs. MM/DD/YYYY)
# in a dataset.
# Task 2: Identify phone numbers with varying formats in a contact list.
# Task 3: Review address data for discrepancies in state abbreviations (e.g., CA vs.
# Calif.).

# Task 1: parse the raw dates; values that cannot be parsed become NaT.
# Rows whose original date is missing are left out of the count: they are a
# missing-data problem, not a formatting one, and would otherwise inflate it.
df['parsed_date'] = pd.to_datetime(df['date'], errors='coerce')
unparseable_dates = df['parsed_date'].isna() & df['date'].notna()
inconsistent_dates = unparseable_dates.sum()
print(f"Inconsistent date formats: {inconsistent_dates}")

# Task 2: strip every non-digit character, then flag numbers that do not have
# exactly 10 digits. A missing phone stays NaN after cleaning; its length is
# NaN, and `NaN != 10` is True, so missing rows must be masked out explicitly.
df['phone_cleaned'] = df['phone'].str.replace(r'\D', '', regex=True)
phone_present = df['phone_cleaned'].notna()
wrong_digit_count = df['phone_cleaned'].str.len() != 10
inconsistent_phones = df[phone_present & wrong_digit_count]
print(f"Inconsistent phone number formats:\n{inconsistent_phones}")

# Task 3: frequency table of lower-cased state values, so spelling variants
# (e.g. "CA" vs. "Calif.") show up as separate entries to review.
state_variants = df['state'].str.lower().value_counts()
print(f"State name variations:\n{state_variants}")



# 4. Data Drift:
# Task 1: Compare monthly revenues over six months to identify data drift.
# Task 2: Analyze user engagement metrics from a web application over different
# quarters.
# Task 3: Review a stock price dataset to detect any anomalies over a year.

# Task 1: total revenue per month, for comparing month-over-month levels.
monthly_revenue = df.groupby('month')['revenue'].sum()
print("Monthly revenue:")
print(monthly_revenue)

# Parse the date column once; the quarter label and the datetime index below
# are both derived from this single conversion.
df['date'] = pd.to_datetime(df['date'])

# Task 2: mean engagement metric per calendar quarter.
df['quarter'] = df['date'].dt.to_period('Q')
engagement_by_quarter = df.groupby('quarter')['engagement_metric'].mean()
print("Engagement by quarter:")
print(engagement_by_quarter)

# Task 3: plot a rolling average of the stock price against the date index.
# NOTE: set_index(..., inplace=True) moves 'date' out of the columns, so any
# later access to df['date'] (including re-running this cell) will fail unless
# the column is restored first.
df.set_index('date', inplace=True)
# NOTE(review): window=30 counts rows, not calendar days; the "30-Day" title is
# accurate only if there is one row per day in date order -- confirm.
rolling_average = df['stock_price'].rolling(window=30).mean()
# `stock_anomalies` holds the Axes object returned by .plot(), not a list of
# detected anomalies.
stock_anomalies = rolling_average.plot(title="30-Day Rolling Avg of Stock Price")
plt.show()




# Output when run: NameError: name 'df' is not defined
# (no DataFrame named `df` had been created before this cell was executed)