In [1]:
""" DATA CLEANING """

' DATA CLEANING '

In [2]:
import pandas as pd
import numpy as np

# Toy employee records used throughout the notebook. Name, Age, Salary and
# Join_Date each contain one missing entry (np.nan / None) so the cleaning
# steps below have something to work on; Department is complete.
names = ["Alice", "Bob", np.nan, "David", "Eve"]
ages = [25, 30, None, 40, 35]
salaries = [50000, 60000, 55000, None, 70000]
departments = ["HR", "Tech", "Tech", "Marketing", "HR"]
join_dates = ["2020-01-15", "2019-03-22", "2021-12-01", None, "2022-06-10"]

# Column order in the dict fixes the column order of the resulting frame.
data = dict(
    Name=names,
    Age=ages,
    Salary=salaries,
    Department=departments,
    Join_Date=join_dates,
)

df = pd.DataFrame(data)
print(df)

    Name   Age   Salary Department   Join_Date
0  Alice  25.0  50000.0         HR  2020-01-15
1    Bob  30.0  60000.0       Tech  2019-03-22
2    NaN   NaN  55000.0       Tech  2021-12-01
3  David  40.0      NaN  Marketing        None
4    Eve  35.0  70000.0         HR  2022-06-10


In [4]:
""" Handle Missing Data"""

# Find Missing Values
# Boolean frame with the same shape as df: True wherever a cell is missing.
missing_mask = df.isna()
print(missing_mask)
# Summing the booleans column-wise gives the number of missing cells per column.
print(missing_mask.sum())

    Name    Age  Salary  Department  Join_Date
0  False  False   False       False      False
1  False  False   False       False      False
2   True   True   False       False      False
3  False  False    True       False       True
4  False  False   False       False      False
Name          1
Age           1
Salary        1
Department    0
Join_Date     1
dtype: int64


In [6]:
# Solution 1: Remove Missing Values

# Keep only the rows that have no missing value in any column
df_cleaned = df.dropna(axis="index", how="any")

# Remove a row only when every one of its columns is missing
# (demonstration only: the returned frame is not stored)
df.dropna(axis="index", how="all")

# Remove every column that contains at least one missing value.
# As the last expression of the cell, this frame is what gets displayed.
df.dropna(axis="columns")

Unnamed: 0,Department
0,HR
1,Tech
2,Tech
3,Marketing
4,HR


In [45]:
# Solution 2: Fill Missing Values
# This cell runs before any column is renamed, so the date column is still
# called "Join_Date" here ("Hire_Date" only exists after the rename cell).

# Fill with specific value
df["Salary"] = df["Salary"].fillna(65000)

# Fill with mean (Series.mean() skips NaN by default)
df["Age"] = df["Age"].fillna(df["Age"].mean())

# Forward fill (use previous value)
df["Join_Date"] = df["Join_Date"].ffill()

In [8]:
""" Fixing Data Types"""

' Fixing Data Types'

In [10]:
# Check Current Types
# One dtype per column, as currently stored in df (before any conversion below).
column_dtypes = df.dtypes
print(column_dtypes)

Name           object
Age           float64
Salary        float64
Department     object
Join_Date      object
dtype: object


In [14]:
# Convert Types

# Convert to integer.
# NumPy "int64" cannot hold NaN (astype raises) and silently truncates
# fractional values such as a mean-filled age, so round first and use the
# nullable pandas "Int64" dtype, which stores missing values as <NA>.
df["Age"] = df["Age"].round().astype("Int64")

# Convert to datetime (missing entries become NaT)
df["Join_Date"] = pd.to_datetime(df["Join_Date"])

# Convert to category (saves memory)
df["Department"] = df["Department"].astype("category")

In [None]:
""" REMOVING DUPLICATES"""

In [19]:
# Find duplicate rows (all columns): True marks a row that repeats an earlier one
print(df.duplicated())

# Remove duplicates (keep first occurrence).
# Reassign instead of inplace=True so the step reads as "new frame from old".
df = df.drop_duplicates()

# Keep only the rows whose Name occurs exactly once: keep=False drops every
# member of a duplicated group rather than keeping one representative.
# The result is displayed as the cell output; df itself is not modified.
df.drop_duplicates(subset=["Name"], keep=False)

0    False
1    False
2    False
3    False
4    False
dtype: bool


Unnamed: 0,Name,Age,Salary,Department,Join_Date
0,Alice,25,50000.0,HR,2020-01-15
1,Bob,30,60000.0,Tech,2019-03-22
2,,32,55000.0,Tech,2021-12-01
3,David,40,65000.0,Marketing,2021-12-01
4,Eve,35,70000.0,HR,2022-06-10


In [None]:
"""STRING CLEANING"""

In [37]:
# NOTE: this cell runs before the renaming cell, so the columns still carry
# their original names ("Name", "Department").

# Strip whitespace (NaN entries are left as NaN by the .str accessor)
df["Name"] = df["Name"].str.strip()

# Replace values.
# Match whole cell values instead of substrings: "Technology" itself contains
# "Tech", so a substring replace keeps growing the text every time the cell
# is re-run. Whole-value replacement makes the cell safe to execute repeatedly.
department_labels = {"HR": "Human Resources", "Tech": "Technology"}
df["Department"] = df["Department"].astype("object").replace(department_labels)

# Handle NaN in strings
df["Name"] = df["Name"].fillna("Unknown")

In [None]:
""" RENAMING COLUMNS """

In [43]:
# Rename specific columns: only the labels listed in the mapping change;
# labels that are not present in the frame are ignored.
column_renames = {"Join_Date": "Hire_Date"}
df.rename(column_renames, axis="columns", inplace=True)

# Rename all columns: names are assigned by position, so the list must have
# exactly one entry per existing column, in the current column order.
new_column_names = ["Full_Name", "Age", "Salary", "Dept", "Hire_Date"]
df.columns = new_column_names

print(df)

  Full_Name  Age   Salary             Dept  Hire_Date
0     Alice   25  50000.0  Human Resources 2020-01-15
1       Bob   30  60000.0       Technology 2019-03-22
2   Unknown   32  55000.0       Technology 2021-12-01
3     David   40  65000.0        Marketing 2021-12-01
4       Eve   35  70000.0  Human Resources 2022-06-10
