#### Goal - 3) Data Cleaning & Normalization 
- Fixing incorrect entries or typos 
- Standardizing categorical text values 
- Changing case of text 
- Removing leading/trailing spaces 
- Trimming/formatting string patterns

In [1]:
# 3) Data Cleaning & Normalization - Dataset

import numpy as np
import pandas as pd

# Deliberately messy customer records used as practice material for the
# cleaning steps: mixed casing, stray whitespace, misspellings and malformed
# email / phone values.
data = dict(
    customer_id=[1001, 1002, 1003, 1004, 1005, 1006, 1007],
    name=[
        'john doe', 'MARY SMITH', 'Robert B. Jr', '  Alice W',
        'MiKe thomas', '  nancy lee', 'jULia miller',
    ],
    gender=['Male', 'female', 'MALE', 'F', 'Male', 'FEMALE', 'Female'],
    city=[
        'new york', 'los angles', ' San fransisco', 'chicago',
        'HOUSTON', 'Boston', 'new york',
    ],
    email=[
        'john.doe@@email.com', 'marysmith @email.com', 'robert.b@email,com',
        'alice.w@emailcom', 'mike.t@email.com', 'nancy.lee @ email.com',
        'julia.miller@email..com',
    ],
    phone=[
        '(123)-456-7890', '123 456 7890', '123.456.7890', '+1 1234567890',
        '001-123-456-7890', '1234567890', '123*456*7890',
    ],
    profession=[
        'Softwre Engr', 'Data scienctst', 'Engineer', 'doctor',
        'Software Engr', '   Doctor', 'data scientist',
    ],
    membership_status=[
        'Gold', 'gold', 'Silver', 'SILVER', 'platinum', 'platnum', 'Platinam',
    ],
)

# One row per customer, one column per dictionary key.
df = pd.DataFrame(data=data)

# Last expression of the cell -> rendered as the cell output.
df

Unnamed: 0,customer_id,name,gender,city,email,phone,profession,membership_status
0,1001,john doe,Male,new york,john.doe@@email.com,(123)-456-7890,Softwre Engr,Gold
1,1002,MARY SMITH,female,los angles,marysmith @email.com,123 456 7890,Data scienctst,gold
2,1003,Robert B. Jr,MALE,San fransisco,"robert.b@email,com",123.456.7890,Engineer,Silver
3,1004,Alice W,F,chicago,alice.w@emailcom,+1 1234567890,doctor,SILVER
4,1005,MiKe thomas,Male,HOUSTON,mike.t@email.com,001-123-456-7890,Software Engr,platinum
5,1006,nancy lee,FEMALE,Boston,nancy.lee @ email.com,1234567890,Doctor,platnum
6,1007,jULia miller,Female,new york,julia.miller@email..com,123*456*7890,data scientist,Platinam


In [2]:
# Basic structural summary of the frame: index range, column names,
# non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   customer_id        7 non-null      int64 
 1   name               7 non-null      object
 2   gender             7 non-null      object
 3   city               7 non-null      object
 4   email              7 non-null      object
 5   phone              7 non-null      object
 6   profession         7 non-null      object
 7   membership_status  7 non-null      object
dtypes: int64(1), object(7)
memory usage: 580.0+ bytes


#### 3-1) Fixing incorrect entries or typos
- `replace()`, `map()`, `apply()` (with custom functions)
- Dictionary-based correction
- Manual inspection or domain rules

In [3]:
# --- gender: collapse every spelling variant onto 'Male' / 'Female' ---------
# A two-step alternative is str.title() followed by a manual patch of the lone
# 'F' entry (df.loc[3, 'gender'] = 'Female'); one lookup table is easier to audit.
# map() suits columns where every distinct value has a known target: any value
# missing from the dictionary becomes NaN, so unexpected labels surface as
# nulls instead of passing through unchanged.
# The canonical labels map to themselves, which keeps this cell safe to re-run.
correction = {
    'F': 'Female',
    'female': 'Female',
    'MALE': 'Male',
    'FEMALE': 'Female',
    'Male': 'Male',
    'Female': 'Female',
}
df['gender'] = df['gender'].map(correction)

# --- city: trim surrounding whitespace and title-case each word -------------
# The vectorised .str accessor replaces apply(lambda x: x.strip().title()):
# same result on strings, but missing values pass through as NaN instead of
# raising AttributeError inside the lambda.
df['city'] = df['city'].str.strip().str.title()

# --- membership_status: dictionary-based correction with replace() ----------
# replace() rewrites only the whole-cell values listed as keys and leaves
# everything else untouched (unlike map()).
# This dictionary covers the 'Platinum' misspellings only; case variants such
# as 'gold' / 'SILVER' are not keys here and therefore stay as they are.
corrections = {'platinum': 'Platinum', 'platnum': 'Platinum', 'Platinam': 'Platinum'}
df['membership_status'] = df['membership_status'].replace(corrections)

# --- manual inspection / domain rules ---------------------------------------
# Needed for rare errors or business-specific corrections (a typical real-world
# rule would be blanking ages above 120 as outliers).
# Demonstrated here by blanking every customer_id below 1002.
# mask() returns a new float64 column with NaN where the condition is True.
# Assigning np.nan into the int64 column via df.loc[...] = np.nan produces the
# same values, but relies on an implicit int64 -> float64 upcast that
# pandas >= 2.1 deprecates.
df['customer_id'] = df['customer_id'].mask(df['customer_id'] < 1002)
df

Unnamed: 0,customer_id,name,gender,city,email,phone,profession,membership_status
0,,john doe,Male,New York,john.doe@@email.com,(123)-456-7890,Softwre Engr,Gold
1,1002.0,MARY SMITH,Female,Los Angles,marysmith @email.com,123 456 7890,Data scienctst,gold
2,1003.0,Robert B. Jr,Male,San Fransisco,"robert.b@email,com",123.456.7890,Engineer,Silver
3,1004.0,Alice W,Female,Chicago,alice.w@emailcom,+1 1234567890,doctor,SILVER
4,1005.0,MiKe thomas,Male,Houston,mike.t@email.com,001-123-456-7890,Software Engr,Platinum
5,1006.0,nancy lee,Female,Boston,nancy.lee @ email.com,1234567890,Doctor,Platinum
6,1007.0,jULia miller,Female,New York,julia.miller@email..com,123*456*7890,data scientist,Platinum


#### 3-2) Standardizing categorical text values
- `str.lower()`, `str.strip()`, `str.replace()`, `replace()` with a dictionary, `map()`

In [4]:
# profession: fold to lower case (upper / title / capitalize are used the same
# way) and then trim surrounding blanks (lstrip / rstrip trim one side only).
df['profession'] = df['profession'].str.lower().str.strip()

# email: substring-level edits with str.replace(), applied in sequence.
# Series.replace() with a dictionary such as {'#%': '@', 'com': 'COM'} would not
# work for this, because it only matches a cell's entire value.
df['email'] = (
    df['email']
    .str.replace(' ', '')        # drop embedded spaces
    .str.replace('@', '#%')      # swap '@' for a placeholder token ...
    .str.replace('#%', '@')      # ... and swap the placeholder back to '@'
    .str.replace('com', 'COM')   # upper-case every 'com' substring
)

df

Unnamed: 0,customer_id,name,gender,city,email,phone,profession,membership_status
0,,john doe,Male,New York,john.doe@@email.COM,(123)-456-7890,softwre engr,Gold
1,1002.0,MARY SMITH,Female,Los Angles,marysmith@email.COM,123 456 7890,data scienctst,gold
2,1003.0,Robert B. Jr,Male,San Fransisco,"robert.b@email,COM",123.456.7890,engineer,Silver
3,1004.0,Alice W,Female,Chicago,alice.w@emailCOM,+1 1234567890,doctor,SILVER
4,1005.0,MiKe thomas,Male,Houston,mike.t@email.COM,001-123-456-7890,software engr,Platinum
5,1006.0,nancy lee,Female,Boston,nancy.lee@email.COM,1234567890,doctor,Platinum
6,1007.0,jULia miller,Female,New York,julia.miller@email..COM,123*456*7890,data scientist,Platinum


#### 3-3) Trimming/formatting string patterns
- `str.replace()`, `str.extract()`, `str.contains()` (with regex), regex functions, `str.translate()`

In [5]:
# str.replace() — the `regex` flag decides how the pattern is interpreted.
# regex=False (the default since pandas 2.0)
df['email'] = df['email'].str.replace('.', '=', regex=False)
    # --> Treats the pattern as a plain string, not a regular expression.
    # --> Replaces every literal dot '.' with '=' — no pattern matching.
    #     (Read as a regular expression, '.' would match any character.)

# regex=True (the default before pandas 2.0)
#df['column_name'].str.replace(r'\d', '', regex=True)
    # --> Treats the pattern as a regular expression (regex).
    # --> Removes all digits using regex \d

In [6]:
# Build a second frame, df1, straight from the `data` dictionary, so it holds
# the raw values rather than the edits applied to df.
df1 = pd.DataFrame(data)
# Display df (not df1): shows df after the '.' -> '=' replacement on its email column.
df

Unnamed: 0,customer_id,name,gender,city,email,phone,profession,membership_status
0,,john doe,Male,New York,john=doe@@email=COM,(123)-456-7890,softwre engr,Gold
1,1002.0,MARY SMITH,Female,Los Angles,marysmith@email=COM,123 456 7890,data scienctst,gold
2,1003.0,Robert B. Jr,Male,San Fransisco,"robert=b@email,COM",123.456.7890,engineer,Silver
3,1004.0,Alice W,Female,Chicago,alice=w@emailCOM,+1 1234567890,doctor,SILVER
4,1005.0,MiKe thomas,Male,Houston,mike=t@email=COM,001-123-456-7890,software engr,Platinum
5,1006.0,nancy lee,Female,Boston,nancy=lee@email=COM,1234567890,doctor,Platinum
6,1007.0,jULia miller,Female,New York,julia=miller@email==COM,123*456*7890,data scientist,Platinum


In [7]:
# str.extract() — pull the first capture group of a regex out of each value.
# Pattern: an '@', then one or more word characters (captured), then a literal
# dot, e.g. abc@gmail.com → "gmail". Rows where the pattern finds no match get NaN.
# expand=False hands the single capture group back as a Series, which becomes
# the new 'domain' column.
df1['domain'] = df1['email'].str.extract(r'@(\w+)\.', expand=False)
df1

Unnamed: 0,customer_id,name,gender,city,email,phone,profession,membership_status,domain
0,1001,john doe,Male,new york,john.doe@@email.com,(123)-456-7890,Softwre Engr,Gold,email
1,1002,MARY SMITH,female,los angles,marysmith @email.com,123 456 7890,Data scienctst,gold,email
2,1003,Robert B. Jr,MALE,San fransisco,"robert.b@email,com",123.456.7890,Engineer,Silver,
3,1004,Alice W,F,chicago,alice.w@emailcom,+1 1234567890,doctor,SILVER,
4,1005,MiKe thomas,Male,HOUSTON,mike.t@email.com,001-123-456-7890,Software Engr,platinum,email
5,1006,nancy lee,FEMALE,Boston,nancy.lee @ email.com,1234567890,Doctor,platnum,
6,1007,jULia miller,Female,new york,julia.miller@email..com,123*456*7890,data scientist,Platinam,email


In [8]:
# str.contains() with a regex: r'\.com$' is True only for emails ending in ".com".
# The resulting boolean mask is useful for filtering or other conditional logic.
ends_with_dot_com = df1['email'].str.contains(r'\.com$', regex=True)

# For the sample data this keeps only index 0, 1, 4, 5 and 6.
df1.loc[ends_with_dot_com]

Unnamed: 0,customer_id,name,gender,city,email,phone,profession,membership_status,domain
0,1001,john doe,Male,new york,john.doe@@email.com,(123)-456-7890,Softwre Engr,Gold,email
1,1002,MARY SMITH,female,los angles,marysmith @email.com,123 456 7890,Data scienctst,gold,email
4,1005,MiKe thomas,Male,HOUSTON,mike.t@email.com,001-123-456-7890,Software Engr,platinum,email
5,1006,nancy lee,FEMALE,Boston,nancy.lee @ email.com,1234567890,Doctor,platnum,
6,1007,jULia miller,Female,new york,julia.miller@email..com,123*456*7890,data scientist,Platinam,email


In [9]:
# str.contains() called without an explicit `regex` argument: the method's
# default (regex=True) still applies, but 'Male' has no regex metacharacters,
# so this is effectively a plain substring test.
# The match is case-sensitive, so only the exact 'Male' entries come back True.
df1['gender'].str.contains('Male')

0     True
1    False
2    False
3    False
4     True
5    False
6    False
Name: gender, dtype: bool

In [10]:
# str.translate() — character-level substitution driven by a translation table.
# Several single characters are handled at once and no regex is involved.
# str.maketrans() is the Python built-in that creates such a table for use with
# str.translate(); its third argument lists the characters to delete — here
# '@', '#' and '%'.
table = str.maketrans('', '', '@#%')
df1['clean_email'] = df1['email'].str.translate(table)
df1

Unnamed: 0,customer_id,name,gender,city,email,phone,profession,membership_status,domain,clean_email
0,1001,john doe,Male,new york,john.doe@@email.com,(123)-456-7890,Softwre Engr,Gold,email,john.doeemail.com
1,1002,MARY SMITH,female,los angles,marysmith @email.com,123 456 7890,Data scienctst,gold,email,marysmith email.com
2,1003,Robert B. Jr,MALE,San fransisco,"robert.b@email,com",123.456.7890,Engineer,Silver,,"robert.bemail,com"
3,1004,Alice W,F,chicago,alice.w@emailcom,+1 1234567890,doctor,SILVER,,alice.wemailcom
4,1005,MiKe thomas,Male,HOUSTON,mike.t@email.com,001-123-456-7890,Software Engr,platinum,email,mike.temail.com
5,1006,nancy lee,FEMALE,Boston,nancy.lee @ email.com,1234567890,Doctor,platnum,,nancy.lee email.com
6,1007,jULia miller,Female,new york,julia.miller@email..com,123*456*7890,data scientist,Platinam,email,julia.milleremail..com
