In [None]:
import re


import pandas as pd

# Load the messy data
url = 'https://www.dropbox.com/scl/fi/qxjv1jjosw1eueouv7v1w/messy_data.csv?rlkey=9qu8bsjd4wgi6rz0br4hrudjd&st=9ngzdwui&dl=1'
df = pd.read_csv(url)

# Inspect the raw data before cleaning.
# Every result is printed explicitly: when these statements run together as
# one cell or script, a bare expression such as `df.head()` is evaluated and
# then discarded, so nothing would be shown for it.

# Preview the first rows
print(df.head())

# Column dtypes and non-null counts (info() prints its own report)
df.info()

# Missing values per column
print(df.isnull().sum())

# Number of fully duplicated rows
print(df.duplicated().sum())

# Column names
print(df.columns)

# Fill missing 'Age' values with the column median
df['Age'] = df['Age'].fillna(df['Age'].median())

# Drop exact duplicate rows
df = df.drop_duplicates()

# Fix email format


def _is_professional_email(value):
    """Return True if *value* is a well-formed, non-free-mail address.

    The value is converted with ``str()`` first, so missing values (NaN/None)
    simply fail the format check and yield False.
    """
    text = str(value)
    # fullmatch, not match: the *whole* string must look like
    # local@domain.tld, so input with a second '@' is rejected.
    if not re.fullmatch(r"[^@]+@[^@]+\.[^@]+", text):
        return False
    # Look for free-mail providers in the domain only; searching the full
    # address would also reject e.g. "gmail.fan@company.com".
    domain = text.rpartition('@')[2]
    return re.search(r'gmail|yahoo|hotmail', domain, re.I) is None


# Keep only professional emails; anything else becomes None
df['Email'] = df['Email'].apply(lambda x: x if _is_professional_email(x) else None)

# Drop rows with invalid emails
df = df.dropna(subset=['Email'])

# Clean name field: keep letters and whitespace only, trim, Title Case.
# Raw string: '\s' in a normal string literal is an invalid escape sequence.
df['Name'] = df['Name'].str.replace(r'[^a-zA-Z\s]', '', regex=True).str.strip().str.title()

# Standardise join dates; values that cannot be parsed become NaT
df['Join Date'] = pd.to_datetime(df['Join Date'], errors='coerce')

# Standardise department names (lower-case, no surrounding whitespace)
df['Department'] = df['Department'].str.lower().str.strip()
# Lookup table: department prefix (lower-case) -> canonical department name
department_map = {
    'hr': 'HR',
    'engineering': 'Engineering',
    'eng': 'Engineering',
    'engg': 'Engineering',
    'marketing': 'Marketing',
    'market': 'Marketing',
    'sales': 'Sales',
    'sale': 'Sales',
    'support': 'Support',
    'supp': 'Support'
}


def clean_department(dept):
    """Map a raw department value to its canonical name.

    The value is stringified, lower-cased and stripped, then compared against
    the keys of ``department_map`` in insertion order; the first key that the
    text starts with decides the result. Anything without a matching prefix
    (including missing values, which stringify to 'nan'/'none') is returned
    as 'Other'.
    """
    normalized = str(dept).lower().strip()
    canonical = next(
        (
            name
            for prefix, name in department_map.items()
            if normalized.startswith(prefix)
        ),
        None,
    )
    # Unmatched departments are grouped under 'Other'
    return 'Other' if canonical is None else canonical

# Replace every department value with its canonical name
df['Department'] = df['Department'].apply(clean_department)


# Handle salary noise
# Remove currency symbols and commas from string values.
# Raw string: '\$' in a normal string literal is an invalid escape sequence.
salary_text = df['Salary'].replace(r'[\$,]', '', regex=True)
# Anything that still is not numeric becomes NaN instead of raising, in line
# with the errors='coerce' handling used for the join dates.
df['Salary'] = pd.to_numeric(salary_text, errors='coerce').astype(float)

# Remove salaries out of range. Comparisons with NaN are False, so rows
# with a missing or unparseable salary are dropped here as well.
df = df[(df['Salary'] >= 10000) & (df['Salary'] <= 500000)]

# Save
df.to_csv("cleaned_dataset.csv", index=False)


# Download: google.colab only exists inside Colab. Elsewhere the file has
# already been written to disk, so skip the download instead of crashing.
try:
    from google.colab import files
except ImportError:
    print("google.colab not available; skipping browser download.")
else:
    files.download("cleaned_dataset.csv")





<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11000 entries, 0 to 10999
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  11000 non-null  int64  
 1   ID          11000 non-null  object 
 2   Name        8667 non-null   object 
 3   Age         9253 non-null   float64
 4   Email       9731 non-null   object 
 5   Join Date   8808 non-null   object 
 6   Salary      8761 non-null   float64
 7   Department  8745 non-null   object 
dtypes: float64(2), int64(1), object(5)
memory usage: 687.6+ KB


  df['Join Date'] = pd.to_datetime(df['Join Date'], errors='coerce')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>