In [39]:
# Task 1: Load a CSV Dataset
# Description: Load a CSV file into a Pandas DataFrame and print the first five rows to understand the structure of the dataset.


import pandas as pd
import os

# Load dataset once for all tasks
def load_data(file_path):
    """Read a CSV file into a DataFrame, reporting problems instead of raising.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        The loaded DataFrame, or None when the path does not exist, the
        parsed frame is empty, or reading fails for any other reason. In
        every failure case the error message is printed.
    """
    try:
        if os.path.exists(file_path):
            frame = pd.read_csv(file_path)
        else:
            raise FileNotFoundError(f"Error: The file '{file_path}' does not exist.")

        # A frame without any rows/columns is treated as a load failure.
        if frame.empty:
            raise ValueError("Error: The dataset is empty.")
    except Exception as err:
        # Best-effort loader: surface the message and signal failure with None.
        print(err)
        return None
    else:
        return frame

# Task 1: Display first five rows
def preview_data(df):
    """Print a heading followed by the first five rows of ``df``.

    Args:
        df: DataFrame to preview.
    """
    leading_rows = df.head()
    print("\nDataset Preview:", leading_rows, sep="\n")

In [40]:
# Task 2: Check for missing values
def check_missing_values(df):
    """Print the per-column null counts for columns that have at least one null.

    Args:
        df: DataFrame to inspect.
    """
    null_counts = df.isnull().sum()
    columns_with_gaps = null_counts.loc[null_counts.gt(0)]
    print("\nColumns with missing values:", columns_with_gaps, sep="\n")

In [41]:
# Task 3: Visualize missing data
def visualize_missing_data(df):
    """Show a heatmap of the null mask of ``df``.

    Args:
        df: DataFrame whose null positions are plotted.
    """
    import matplotlib.pyplot as plt
    import seaborn as sns

    null_mask = df.isnull()

    # Explicit figure/axes pair instead of the implicit pyplot state machine.
    _, ax = plt.subplots(figsize=(10, 6))
    sns.heatmap(null_mask, cmap="coolwarm", cbar=False, ax=ax)
    ax.set_title("Missing Data Heatmap")
    plt.show()

In [42]:
# Task 4: Remove columns with >50% missing values
def remove_high_missing_columns(df, max_missing_ratio=0.5):
    """Drop columns whose share of missing values exceeds a cutoff.

    Args:
        df: DataFrame to filter. It is not modified.
        max_missing_ratio: Largest tolerated fraction of missing values per
            column, between 0 and 1 inclusive. Columns with a strictly
            larger fraction are dropped. Defaults to 0.5.

    Returns:
        A new DataFrame containing only the retained columns. The retained
        column index is also printed.

    Raises:
        ValueError: If ``max_missing_ratio`` is outside the range [0, 1].
    """
    if not 0 <= max_missing_ratio <= 1:
        raise ValueError("max_missing_ratio must be between 0 and 1.")

    missing_share = df.isnull().mean()

    # Negating "greater than" keeps columns whose share is NaN (frame with no
    # rows), so nothing is dropped when there is no data to judge by.
    keep_mask = ~(missing_share > max_missing_ratio)

    # Positional boolean mask avoids label alignment on the column index.
    df_cleaned = df.loc[:, keep_mask.to_numpy()]

    print("\nRemaining Columns After Removal:")
    print(df_cleaned.columns)
    return df_cleaned

In [43]:
# Task 5: Identify duplicate rows
def find_duplicates(df):
    """Print the rows of ``df`` flagged by ``duplicated()``, or a notice if none.

    Args:
        df: DataFrame to inspect.
    """
    repeated_rows = df.loc[df.duplicated()]

    if repeated_rows.empty:
        print("\nNo duplicate rows found.")
        return

    print("\nDuplicate rows found:")
    print(repeated_rows)

In [44]:
# Task 6: Remove duplicate rows
def remove_duplicates(df):
    """Return ``df`` without duplicate rows and print how many duplicates remain.

    Args:
        df: DataFrame to de-duplicate.

    Returns:
        The result of ``df.drop_duplicates()``.
    """
    deduplicated = df.drop_duplicates()

    # Re-check the result so the printed count confirms the cleanup.
    leftover_count = deduplicated.duplicated().sum()
    print("\nDuplicate rows remaining after cleanup: {}".format(leftover_count))

    return deduplicated

In [45]:
# Task 7: Check data inconsistencies
def check_inconsistencies(df):
    """Print object-column values that are not stripped or not lower-case.

    For each object-dtype column, the non-null values are compared (as
    strings) with their stripped and lower-cased forms; any value that
    differs from either form is printed.

    The input frame is left untouched. Casting the column to ``str`` in
    place would turn missing values into literal strings and hide them from
    later null checks, so the cast is applied to a detached Series only.

    Args:
        df: DataFrame to inspect. It is not modified.
    """
    categorical_cols = df.select_dtypes(include=['object']).columns
    for col in categorical_cols:
        # Missing entries are skipped: they are not text inconsistencies.
        values = df[col].dropna().astype(str)

        has_outer_whitespace = values != values.str.strip()
        has_uppercase = values != values.str.lower()
        flagged = values[has_outer_whitespace | has_uppercase]

        if not flagged.empty:
            print(f"\nInconsistencies in '{col}':")
            print(flagged.to_frame())

In [46]:
# Task 8: Get summary of data quality
def data_quality_summary(df):
    """Print a short data quality summary for ``df``.

    The summary is a dict with the record count, the number of duplicate
    rows, and the null count of every column that has at least one null.

    Args:
        df: DataFrame to summarise.
    """
    missing_counts = df.isnull().sum()
    summary = {
        "Total Records": len(df),
        # Cast to a plain int so the printed dict does not show a NumPy
        # scalar repr such as ``np.int64(1)``.
        "Duplicate Rows": int(df.duplicated().sum()),
        "Missing Values": missing_counts[missing_counts > 0].to_dict()
    }
    print("\nData Quality Summary:")
    print(summary)

In [47]:
# Task 9: Generate a data quality report
def data_quality_report(df):
    """Pretty-print a data quality report for ``df``.

    The report is a dict holding the record count, the number of duplicate
    rows, null counts for columns that have nulls, the output of
    ``df.describe()``, and the value counts of every object-dtype column.

    Args:
        df: DataFrame to report on.
    """
    import pprint

    missing_counts = df.isnull().sum()
    object_columns = df.select_dtypes(include=['object']).columns

    report = {
        "Total Records": len(df),
        # Cast to a plain int so the printed report does not show a NumPy
        # scalar repr such as ``np.int64(1)``.
        "Duplicate Rows": int(df.duplicated().sum()),
        "Missing Values": missing_counts[missing_counts > 0].to_dict(),
        "Numerical Summary": df.describe().to_dict(),
        "Categorical Distributions": {
            col: df[col].value_counts().to_dict() for col in object_columns
        }
    }
    pprint.pprint(report)

In [48]:
# Task 10: Advanced Data Imputation
def impute_missing_data(df):
    """Fill missing values: column mean for numeric, first mode for object columns.

    The work is done on a copy, so the frame passed in is not modified.
    Columns for which no fill value exists (no non-null values, so there is
    no mode, or the mean is NaN) are left as they are; the verification
    message then reports that missing values remain.

    Args:
        df: DataFrame to impute. It is not modified.

    Returns:
        A new DataFrame with the imputed values.
    """
    imputed = df.copy()

    for col in imputed.select_dtypes(include=['number']).columns:
        if imputed[col].isnull().any():
            # A NaN mean (all values missing) makes this fill a no-op.
            imputed[col] = imputed[col].fillna(imputed[col].mean())

    for col in imputed.select_dtypes(include=['object']).columns:
        if imputed[col].isnull().any():
            modes = imputed[col].mode()
            # mode() is empty when the column has no non-null values.
            if not modes.empty:
                imputed[col] = imputed[col].fillna(modes.iloc[0])

    # Verify imputation success
    if imputed.isnull().sum().sum() == 0:
        print("\nImputation successful: No missing values remain.")
    else:
        print("\nWarning: Some missing values still exist.")

    return imputed

# Main execution: load the dataset, then run every task in order.
file_path = "your_dataset.csv"
df = load_data(file_path)

if df is None:
    # load_data has already printed the specific reason for the failure.
    print("Data loading failed. Process terminated.")
else:
    # Inspect the raw data.
    preview_data(df)
    check_missing_values(df)
    visualize_missing_data(df)

    # Column-level and row-level cleanup.
    df = remove_high_missing_columns(df)
    find_duplicates(df)
    df = remove_duplicates(df)

    # Quality checks on the cleaned data, then imputation.
    check_inconsistencies(df)
    data_quality_summary(df)
    data_quality_report(df)
    df = impute_missing_data(df)

Error: The file 'your_dataset.csv' does not exist.
Data loading failed. Process terminated.
