In [2]:
import pandas as pd
import numpy as np

# Step 1: Load the dataset
file_path = "data_cleaning_demo.csv"  # Replace with your filepath if needed
df = pd.read_csv(file_path)

print("Original Data:")
print(df)

# Step 2: Handle missing data
# Fill missing 'Age' with the mean value of 'Age'
df['Age'].fillna(df['Age'].mean(), inplace=True)

# Drop rows with missing 'Name'
df.dropna(subset=['Name'], inplace=True)

# Fill missing 'Email' with a placeholder
df['Email'].fillna("noemail@example.com", inplace=True)

# Fill missing 'Joining_Date' with today's date (or another default date)
df['Joining_Date'].fillna(pd.Timestamp.today().strftime('%Y-%m-%d'), inplace=True)

# Step 3: String Manipulation
# Strip whitespace from 'Name' column
df['Name'] = df['Name'].str.strip()

# Convert 'Name' to title case
df['Name'] = df['Name'].str.title()

# Extract domain from 'Email'
df['Email_Domain'] = df['Email'].str.extract(r'@([a-zA-Z0-9.-]+)')

# Step 4: Export cleaned data
output_file = "cleaned_data_with_strings.csv"  # Output file path remains the same
df.to_csv(output_file, index=False)

print("\nCleaned Data:")
print(df)

print(f"\nCleaned data saved to {output_file}")


Original Data:
   ID     Name   Age              Email Joining_Date  Salary
0   1    Alice  25.0  alice@example.com   10-05-2021   50000
1   2      Bob   NaN    bob@example.com   19-08-2020   45000
2   3  Charlie  30.0                NaN   15-01-2022   60000
3   4      NaN  28.0  diana@example.com   20-03-2023   48000
4   5      Eve  22.0    eve@example.com   30-11-2019   51000
5   6    Alice  25.0  alice@example.com   10-05-2021   50000

Cleaned Data:
   ID     Name   Age                Email Joining_Date  Salary Email_Domain
0   1    Alice  25.0    alice@example.com   10-05-2021   50000  example.com
1   2      Bob  26.0      bob@example.com   19-08-2020   45000  example.com
2   3  Charlie  30.0  noemail@example.com   15-01-2022   60000  example.com
4   5      Eve  22.0      eve@example.com   30-11-2019   51000  example.com
5   6    Alice  25.0    alice@example.com   10-05-2021   50000  example.com

Cleaned data saved to cleaned_data_with_strings.csv


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Email'].fillna("noemail@example.com", inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are set