# In [None]:  (notebook cell marker left over from the export)
# Data Quality and Automation Project Example

# Import necessary libraries
import pandas as pd
import numpy as np

# Load sample dataset
# Running this cell opens Colab's upload dialog; select the CSV file to analyse.
from google.colab import files
uploaded = files.upload()

# `uploaded` maps each uploaded file name to its contents. If nothing was
# selected it is empty, so fail with a clear message instead of the bare
# StopIteration that next(iter(...)) would otherwise raise.
if not uploaded:
    raise ValueError("No file was uploaded; re-run this cell and select a CSV file.")

# Read the first uploaded CSV file into a pandas DataFrame
# (Colab also stores the upload in the working directory, so the name is a valid path).
df = pd.read_csv(next(iter(uploaded)))

# Preview the dataset
# A bare `df.head()` is only rendered when it is the last expression of a
# notebook cell, so print it explicitly to make the preview actually appear.
print("Initial Dataset:")
print(df.head())

# Step 1: Data Quality Checks
# Read-only diagnostics: nothing in this section modifies `df`.
print("\n--- Data Quality Checks ---")

# Count missing entries per column and report only the columns that have any
missing_values = df.isna().sum()
columns_with_gaps = missing_values.loc[missing_values.gt(0)]
print("\nMissing Values:")
print(columns_with_gaps)

# Count rows that exactly repeat an earlier row (the first occurrence is not counted)
duplicates = df.duplicated(keep="first").sum()
print(f"\nNumber of duplicate rows: {duplicates}")

# List the dtype pandas inferred for every column so unexpected ones can be
# spotted by eye (e.g. a numeric column that was read as `object`)
print("\nData Types:")
print(df.dtypes)

# Step 2: Data Automation (Cleaning)

# Handle missing values (you can choose different strategies here)
print("\n--- Handling Missing Values ---")
# Option 1: Fill missing values with mean for numerical columns.
# numeric_only=True restricts the computed means to numeric columns; without
# it pandas >= 2.0 raises TypeError as soon as the frame contains a text column.
# Missing values in non-numeric columns are left untouched by this step.
df.fillna(df.mean(numeric_only=True), inplace=True)

# Option 2: Drop rows with missing values
# df.dropna(inplace=True)

# Remove duplicate records (the first occurrence of each row is kept)
print("\n--- Removing Duplicates ---")
df.drop_duplicates(inplace=True)

# Step 3: Data Standardization (Optional)
# Example: Converting text columns to lowercase for consistency.
# List the columns to normalise here. 'text_column' is the name used by the
# original example and appears to be a placeholder; replace it with the
# columns of your own dataset.
TEXT_COLUMNS = ['text_column']
for column in TEXT_COLUMNS:
    if column not in df.columns:
        # Skip a missing column instead of aborting the whole run with KeyError.
        print(f"\nSkipping lowercase conversion: column '{column}' not found")
        continue
    df[column] = df[column].str.lower()

# Preview cleaned dataset
# Printed explicitly: a bare `df.head()` that is not the last expression of a
# notebook cell produces no output.
print("\nCleaned Dataset:")
print(df.head())

# Step 4: Write the cleaned frame to a CSV file in the working directory.
# index=False leaves the DataFrame's row index out of the file.
cleaned_path = 'cleaned_data.csv'
df.to_csv(cleaned_path, index=False)
print("\nCleaned dataset saved as 'cleaned_data.csv'")
