Error Handling after intial cleaning

In [7]:
import pandas as pd
import os

def load_dataset(file_path):
    """Load a dataset and handle file-related errors."""
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"Error: File '{file_path}' does not exist.")
    if not file_path.endswith(".csv"):
        raise ValueError(f"Error: File '{file_path}' is not a CSV file.")
    try:
        data = pd.read_csv(file_path)
    except Exception as e:
        raise ValueError(f"Error reading file: {e}")
    return data

def validate_and_clean_data(data):
    """Perform data cleaning with error handling."""
    if data.empty:
        raise ValueError("Error: The dataset is empty.")

    # Check and clean numeric columns
    numeric_columns = data.select_dtypes(include=["float64", "int64"]).columns.tolist()

    # Check for string columns that can be converted to numeric
    potential_numeric_columns = data.select_dtypes(include=["object"]).columns
    for col in potential_numeric_columns:
        try:
            # Attempt to cast the column to numeric
            temp_col = pd.to_numeric(data[col].replace({',': '', ' ': ''}, regex=True), errors='coerce')
            # If all values can be converted or result in NaN (indicating numeric potential), add to numeric columns
            if temp_col.notnull().any():
                numeric_columns.append(col)
                data[col] = temp_col  # Convert the column in place
        except Exception as e:
            print(f"Error evaluating column '{col}' for numeric conversion: {e}")

    # Clean numeric columns
    for col in numeric_columns:
        try:
            # Fill missing numeric values with column mean
            data[col].fillna(data[col].mean(), inplace=True)
        except Exception as e:
            print(f"Error cleaning numeric column '{col}': {e}")

    # Check and clean categorical columns
    categorical_columns = data.select_dtypes(include=["object"]).columns
    for col in categorical_columns:
        if data[col].isnull().sum() > 0:
            data[col].fillna(data[col].mode()[0], inplace=True)

    # Remove duplicate rows
    data.drop_duplicates(inplace=True)

    # Return cleaned data
    return data

def save_cleaned_data(data, output_path):
    """Save the cleaned dataset to a file."""
    try:
        data.to_csv(output_path, index=False)
    except Exception as e:
        raise IOError(f"Error saving cleaned data: {e}")


Unit Tests to test edge cases


In [9]:
import unittest

class TestDataCleaning(unittest.TestCase):
    def setUp(self):
        """Set up sample data for testing."""
        # Sample dataset for testing
        self.sample_data = pd.DataFrame({
            "Stress_Level": [5, 8, None, 7, 0],
            "Depression_Score": [10, None, 15, 20, 5],
            "CGPA": ["3.5", " 4.0 ", "3,8", None, "4.0"],
            "Substance_Use": [None, "None", "Alcohol", "Tobacco", "None"]
        })
        self.cleaned_data = validate_and_clean_data(self.sample_data.copy())

    def test_file_existence(self):
        """Test if file existence check works properly."""
        self.assertRaises(FileNotFoundError, load_dataset, "nonexistent_file.csv")

    def test_numeric_cleaning(self):
        """Test numeric column cleaning."""
        # Check if numeric columns are properly converted
        self.assertTrue(pd.api.types.is_numeric_dtype(self.cleaned_data["Stress_Level"]))
        self.assertTrue(pd.api.types.is_numeric_dtype(self.cleaned_data["CGPA"]))
        # Check if missing numeric values are filled
        self.assertFalse(self.cleaned_data["Stress_Level"].isnull().any())
        self.assertFalse(self.cleaned_data["CGPA"].isnull().any())

    def test_categorical_cleaning(self):
        """Test categorical column cleaning."""
        # Check if missing categorical values are filled
        self.assertFalse(self.cleaned_data["Substance_Use"].isnull().any())
        # Check if duplicate rows were removed
        self.assertEqual(self.cleaned_data.duplicated().sum(), 0)

    def test_empty_dataset(self):
        """Test handling of an empty dataset."""
        empty_data = pd.DataFrame()
        with self.assertRaises(ValueError):
            validate_and_clean_data(empty_data)

    def test_invalid_file_format(self):
        """Test handling of invalid file formats."""
        with self.assertRaises(ValueError):
            load_dataset("invalid_file.txt")


# Run the tests
unittest.TextTestRunner().run(unittest.TestLoader().loadTestsFromTestCase(TestDataCleaning))


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[col].fillna(data[col].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[col].fillna(data[col].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting

<unittest.runner.TextTestResult run=5 errors=0 failures=0>