Error Handling after initial cleaning

In [None]:
import os
import subprocess
import tempfile

import pandas as pd


# Shell-based outlier removal function
def remove_outliers(input_file: str, output_file: str):
    """Filter a comma-separated file through awk, keeping only in-range rows.

    The first line is always kept. Every other line is kept only when
    field 1 is within 15-60, field 4 within 0-4, and fields 5, 6 and 7 are
    each within 0-5. Fields are addressed by position, not by column name.

    Args:
        input_file: Path of the CSV file to filter.
        output_file: Path the filtered rows are written to (overwritten).

    Raises:
        FileNotFoundError: If ``input_file`` does not exist.
        RuntimeError: If awk cannot be started, exits with a non-zero
            status, or either file cannot be opened.
    """
    if not os.path.exists(input_file):
        raise FileNotFoundError(f"Error: File '{input_file}' does not exist.")

    # The awk program is passed as a single argv element, so no shell is
    # involved and nothing in the file paths can be interpreted as shell
    # syntax (spaces, quotes, ';', '$', ...).
    awk_program = (
        "NR==1 || ($1 >= 15 && $1 <= 60 && $4 >= 0 && $4 <= 4 && "
        "$5 >= 0 && $5 <= 5 && $6 >= 0 && $6 <= 5 && $7 >= 0 && $7 <= 5)"
    )

    try:
        # Feeding the data through stdin/stdout keeps the paths out of awk's
        # argument list entirely, so awk never parses them as operands.
        with open(input_file, "rb") as source, open(output_file, "wb") as sink:
            subprocess.run(
                ["awk", "-F,", awk_program],
                stdin=source,
                stdout=sink,
                check=True,
            )
    except (subprocess.CalledProcessError, OSError) as e:
        # OSError covers a missing awk binary and unopenable files, which the
        # shell-based version also surfaced as a failed command.
        raise RuntimeError(f"Error while running awk command: {e}") from e

    print(f"Outliers removed. Output saved to {output_file}")


# Load dataset function
def load_dataset(file_path: str) -> pd.DataFrame:
    """Read the CSV file at ``file_path`` into a DataFrame.

    Args:
        file_path: Location of the CSV file.

    Returns:
        The parsed contents of the file.

    Raises:
        FileNotFoundError: If nothing exists at ``file_path``.
        ValueError: If pandas fails to read the file, whatever the cause.
    """
    file_is_present = os.path.exists(file_path)
    if not file_is_present:
        raise FileNotFoundError(f"Error: File '{file_path}' does not exist.")

    try:
        return pd.read_csv(file_path)
    except Exception as err:
        # Any read failure is reported to the caller as a ValueError.
        raise ValueError(f"Error reading file '{file_path}': {err}")


# Data cleaning function
def clean_data(data: pd.DataFrame) -> pd.DataFrame:
    """Fill missing values and drop duplicate rows.

    Missing values in ``float64``/``int64`` columns are replaced with the
    column mean; missing values in ``object`` columns are replaced with the
    column's first mode. Exact duplicate rows are then removed.

    The frame passed in is modified in place and the same object is
    returned, so callers that need the original should pass a copy.

    Args:
        data: The DataFrame to clean.

    Returns:
        ``data`` itself, after cleaning.

    Raises:
        ValueError: If ``data`` is empty.
    """
    if data.empty:
        raise ValueError("The dataset is empty and cannot be cleaned.")

    # Assign the filled column back explicitly. Calling
    # ``data[col].fillna(..., inplace=True)`` operates on an intermediate
    # object and, per the pandas deprecation warning, stops updating the
    # frame in pandas 3.0.
    numeric_columns = data.select_dtypes(include=["float64", "int64"]).columns
    for col in numeric_columns:
        data[col] = data[col].fillna(data[col].mean())

    categorical_columns = data.select_dtypes(include=["object"]).columns
    for col in categorical_columns:
        modes = data[col].mode()
        if modes.empty:
            # Every value is missing, so there is no mode to fill with;
            # leave the column untouched instead of failing on modes[0].
            continue
        data[col] = data[col].fillna(modes.iloc[0])

    data.drop_duplicates(inplace=True)

    return data


# Save dataset function
def save_dataset(data: pd.DataFrame, output_path: str):
    """Write ``data`` to ``output_path`` as CSV without the index column.

    Args:
        data: The DataFrame to persist.
        output_path: Destination file path.

    Raises:
        OSError: If writing fails for any reason (``IOError`` is the same
            class, so existing ``except IOError`` handlers still match).
    """
    try:
        data.to_csv(output_path, index=False)
        print(f"Data saved to {output_path}")
    except Exception as err:
        raise OSError(f"Error saving file '{output_path}': {err}")


# Main script workflow
if __name__ == "__main__":
    # File names for each stage: the input, the awk-filtered intermediate,
    # and the final cleaned result.
    INPUT_FILE = "cleaned_students_mental_health_survey.csv"
    OUTPUT_FILE = "outliers_removed.csv"
    FINAL_OUTPUT_FILE = "final_cleaned_data.csv"

    try:
        remove_outliers(INPUT_FILE, OUTPUT_FILE)  # 1. drop out-of-range rows
        dataset = load_dataset(OUTPUT_FILE)  # 2. read the filtered CSV
        cleaned_data = clean_data(dataset)  # 3. fill gaps, drop duplicates
        save_dataset(cleaned_data, FINAL_OUTPUT_FILE)  # 4. write the result
    except Exception as err:
        # A failure in any step is reported as one message, not a traceback.
        print(f"An error occurred: {err}")


An error occurred: Error: File 'cleaned_students_mental_health_survey.csv' does not exist.


Unit tests for edge cases


In [None]:
import unittest

class TestDataCleaning(unittest.TestCase):
    """Edge-case tests for the loading, cleaning and outlier-removal helpers."""

    def setUp(self):
        """Build the in-memory sample frame and the on-disk CSV fixtures."""
        # Sample dataset with missing values, stray whitespace, a decimal
        # comma and an extreme score.
        self.sample_data = pd.DataFrame({
            "Stress_Level": [5, 8, None, 7, 0],
            "Depression_Score": [10, None, 15, 20, 500],
            "CGPA": ["3.5", " 4.0 ", "3,8", None, "4.0"],
            "Substance_Use": [None, "None", "Alcohol", "Tobacco", "None"]
        })
        # NOTE(review): validate_and_clean_data is not defined in this cell;
        # presumably it comes from an earlier notebook cell - confirm.
        self.cleaned_data = validate_and_clean_data(self.sample_data.copy())

        # Scratch directory for file-based tests, removed after each test.
        self._tmp_dir = tempfile.TemporaryDirectory()
        self.addCleanup(self._tmp_dir.cleanup)

        # Fixture for test_remove_outliers. remove_outliers filters on
        # fields 1, 4, 5, 6 and 7 by position; only the Age values (field 1)
        # of rows two and four fall outside the accepted 15-60 range.
        self.input_file = os.path.join(self._tmp_dir.name, "outlier_input.csv")
        self.output_file = os.path.join(self._tmp_dir.name, "outlier_output.csv")
        self.outlier_test_data = pd.DataFrame({
            "Age": [20, 65, 30, 70],
            "Course": ["Law", "Law", "Medical", "Medical"],
            "Gender": ["F", "M", "F", "M"],
            "CGPA": [3.5, 3.0, 3.8, 2.9],
            "Stress_Level": [1, 2, 3, 4],
            "Depression_Score": [2, 2, 1, 0],
            "Anxiety_Score": [3, 1, 0, 5],
        })
        self.outlier_test_data.to_csv(self.input_file, index=False)

    def test_file_existence(self):
        """Test if file existence check works properly."""
        self.assertRaises(FileNotFoundError, load_dataset, "nonexistent_file.csv")

    def test_numeric_cleaning(self):
        """Test numeric column cleaning."""
        # Check if numeric columns are properly converted
        self.assertTrue(pd.api.types.is_numeric_dtype(self.cleaned_data["Stress_Level"]))
        self.assertTrue(pd.api.types.is_numeric_dtype(self.cleaned_data["CGPA"]))
        # Check if missing numeric values are filled
        self.assertFalse(self.cleaned_data["Stress_Level"].isnull().any())
        self.assertFalse(self.cleaned_data["CGPA"].isnull().any())

    def test_categorical_cleaning(self):
        """Test categorical column cleaning."""
        # Check if missing categorical values are filled
        self.assertFalse(self.cleaned_data["Substance_Use"].isnull().any())
        # Check if duplicate rows were removed
        self.assertEqual(self.cleaned_data.duplicated().sum(), 0)

    def test_empty_dataset(self):
        """Test handling of an empty dataset."""
        empty_data = pd.DataFrame()
        with self.assertRaises(ValueError):
            validate_and_clean_data(empty_data)

    def test_invalid_file_format(self):
        """Test that an existing but unparseable file raises ValueError."""
        # The file must actually exist: a missing path raises
        # FileNotFoundError before any parsing is attempted. An empty file
        # has no columns for pandas to parse, so reading it fails.
        invalid_path = os.path.join(self._tmp_dir.name, "invalid_file.txt")
        with open(invalid_path, "w"):
            pass
        with self.assertRaises(ValueError):
            load_dataset(invalid_path)

    def test_remove_outliers(self):
        """Test that rows with an out-of-range Age are dropped."""
        remove_outliers(self.input_file, self.output_file)
        # Load the output file and check for outliers
        output_data = pd.read_csv(self.output_file)
        # Expect only rows where Age is within 15-60 (removing 65 and 70)
        self.assertTrue((output_data["Age"] >= 15).all() and (output_data["Age"] <= 60).all())
        self.assertTrue(len(output_data) < len(self.outlier_test_data), "Outliers were not removed correctly.")


# Load every test method of TestDataCleaning into a suite and run it with the
# text runner; the runner's result object is the value of this expression.
unittest.TextTestRunner().run(unittest.TestLoader().loadTestsFromTestCase(TestDataCleaning))


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[col].fillna(data[col].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[col].fillna(data[col].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting

<unittest.runner.TextTestResult run=5 errors=0 failures=0>