## Sample Data

In [1]:
data = [
	{"name": "alice smith", "age": 30, "email": "alice@example.com", "salary": 50000.00, "join_date": "2022-03-15"},
	{"name": "bob gray", "age": 17, "email": "bob@not-an-email", "salary": 60000.00, "join_date": "invalid-date"},
	{"name": "charlie brown", "age": None, "email": "charlie@example.com", "salary": -1500.00, "join_date": "2022-09-21"},
	{"name": "dave davis", "age": 45, "email": "dave@example.com", "salary": 70000.00, "join_date": "2021-07-01"},
	{"name": "eve green", "age": 25, "email": "eve@example.com", "salary": None, "join_date": "2023-12-31"},
]

## Data Cleaning One-Liners

In [2]:
# Capitalizing the names for consistency
data = [{**d, "name": d["name"].title()} for d in data]

In [3]:
# Converting age to an integer type, defaulting to 25 if conversion fails
data = [{**d, "age": int(d["age"]) if isinstance(d["age"], (int, float)) else 25} for d in data]

In [4]:
# Ensuring age is an integer within the range of 18 to 60; otherwise, set to 25
data = [{**d, "age": d["age"] if isinstance(d["age"], int) and 18 <= d["age"] <= 60 else 25} for d in data]

In [5]:
data = [{**d, "email": d["email"] if "@" in d["email"] and "." in d["email"] else "invalid@example.com"} for d in data]

In [6]:
data = [{**d, "salary": d["salary"] if d["salary"] is not None else 30000.00} for d in data]

In [7]:
from datetime import datetime
data = [{**d, "join_date": (lambda x: (datetime.strptime(x, '%Y-%m-%d').date() if '-' in x and len(x) == 10 else datetime.strptime(x, '%d-%m-%Y').date()) if x and 'invalid-date' not in x else '2023-01-01')(d['join_date'])} for d in data]


In [8]:
# Replacing negative salary values with zero to ensure all values are non-negative.
data = [{**d, "salary": max(d["salary"], 0)} for d in data]

In [9]:
# Keeping only unique entries based on the name field
data = {tuple(d.items()) for d in data}  # Using a set to remove duplicates
data = [dict(t) for t in data]  # Converting back to list of dictionaries

In [10]:
# Normalizing salary values to a percentage of the maximum salary
max_salary = max(d["salary"] for d in data)
data = [{**d, "salary": (d["salary"] / max_salary * 100) if max_salary > 0 else 0} for d in data]

In [11]:
# Trimming whitespace from names for cleaner data
data = [{**d, "name": d["name"].strip()} for d in data]

## Data After Cleaning Steps

In [12]:
data

[{'name': 'Bob Gray',
  'age': 25,
  'email': 'invalid@example.com',
  'salary': 85.71428571428571,
  'join_date': '2023-01-01'},
 {'name': 'Alice Smith',
  'age': 30,
  'email': 'alice@example.com',
  'salary': 71.42857142857143,
  'join_date': datetime.date(2022, 3, 15)},
 {'name': 'Charlie Brown',
  'age': 25,
  'email': 'charlie@example.com',
  'salary': 0.0,
  'join_date': datetime.date(2022, 9, 21)},
 {'name': 'Dave Davis',
  'age': 45,
  'email': 'dave@example.com',
  'salary': 100.0,
  'join_date': datetime.date(2021, 7, 1)},
 {'name': 'Eve Green',
  'age': 25,
  'email': 'eve@example.com',
  'salary': 42.857142857142854,
  'join_date': datetime.date(2023, 12, 31)}]