In [1]:
import pandas as pd
import numpy as np
from scipy.stats import zscore

# Step 1: Load the dataset
file_path = "data_cleaning_demo.csv"  # Replace with your filepath if needed
df = pd.read_csv(file_path)

print("Original Data:")
print(df)

# Step 2: Handle missing values
df['Age'].fillna(df['Age'].mean(), inplace=True)  # Fill missing Age with mean
df['Email'].fillna("unknown@example.com", inplace=True)  # Fill missing Email with default value
df.dropna(subset=['Name'], inplace=True)  # Drop rows with missing Name

# Step 3: Remove duplicates
df.drop_duplicates(inplace=True)

# Step 4: Convert data types
df['Joining_Date'] = pd.to_datetime(df['Joining_Date'], errors='coerce')  # Convert to datetime

# Step 5: Rename columns
df.rename(columns={'Salary': 'Annual_Salary'}, inplace=True)

# Step 6: Filter data (e.g., keep rows with Age > 20)
df = df[df['Age'] > 20]

# Step 7: Export cleaned data
output_file = "cleaned_data.csv"
df.to_csv(output_file, index=False)

print("\nCleaned Data:")
print(df)
print(f"\nCleaned data saved to {output_file}")

# Step 8: Handling outliers using Z-score
# Calculate Z-scores
z_scores = np.abs(zscore(df.select_dtypes(include=[np.number])))  # Calculate Z-scores for numeric columns
threshold = 3  # Define a threshold for outliers

# Identify and remove outliers (rows where any Z-score is above the threshold)
outliers = (z_scores > threshold).any(axis=1)
df_outliers = df[~outliers]  # Remove rows with outliers

print("\nTask 7: Handling Outliers using Z-score")
print("Rows with outliers removed:")
print(df_outliers)


Original Data:
   ID     Name   Age              Email Joining_Date  Salary
0   1    Alice  25.0  alice@example.com   10-05-2021   50000
1   2      Bob   NaN    bob@example.com   19-08-2020   45000
2   3  Charlie  30.0                NaN   15-01-2022   60000
3   4      NaN  28.0  diana@example.com   20-03-2023   48000
4   5      Eve  22.0    eve@example.com   30-11-2019   51000
5   6    Alice  25.0  alice@example.com   10-05-2021   50000

Cleaned Data:
   ID     Name   Age                Email Joining_Date  Annual_Salary
0   1    Alice  25.0    alice@example.com   2021-10-05          50000
1   2      Bob  26.0      bob@example.com          NaT          45000
2   3  Charlie  30.0  unknown@example.com          NaT          60000
4   5      Eve  22.0      eve@example.com          NaT          51000
5   6    Alice  25.0    alice@example.com   2021-10-05          50000

Cleaned data saved to cleaned_data.csv

Task 7: Handling Outliers using Z-score
Rows with outliers removed:
   ID     Name

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].mean(), inplace=True)  # Fill missing Age with mean
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Email'].fillna("unknown@example.com", inplace=True)  # Fill missing Email with default value
