In [14]:
!pip install Faker



## Generate Synthetic Data

In [25]:
import pandas as pd
from faker import Faker
import random

# Initialize Faker to generate synthetic data
fake = Faker()

# Set seed for reproducibility
Faker.seed(42)

# Generate synthetic data
data = []
for _ in range(1000):
    data.append({
        'Name': fake.name(),
        'Age': random.randint(18, 70),
        'Email': fake.email(),
        'Phone': fake.phone_number(),
        'Address': fake.address(),
        'Salary': random.randint(20000, 150000),
        'Join_Date': fake.date_this_decade(),
        'Employment_Status': random.choice(['Full-Time', 'Part-Time', 'Contract']),
        'Department': random.choice(['IT', 'Engineering','Finance', 'HR', 'Marketing'])
    })

# Let's tweak the records a bit!
# Introduce missing values
for i in random.sample(range(len(data)), 50):
    data[i]['Email'] = None

# Introduce duplicate records
data.extend(random.sample(data, 100))

# Introduce outliers
for i in random.sample(range(len(data)), 20):
    data[i]['Salary'] = random.randint(200000, 500000)

# Create dataframe
df = pd.DataFrame(data)

## Step 1: Understanding the Data

In [26]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1100 entries, 0 to 1099
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Name               1100 non-null   object
 1   Age                1100 non-null   int64 
 2   Email              1047 non-null   object
 3   Phone              1100 non-null   object
 4   Address            1100 non-null   object
 5   Salary             1100 non-null   int64 
 6   Join_Date          1100 non-null   object
 7   Employment_Status  1100 non-null   object
 8   Department         1100 non-null   object
dtypes: int64(2), object(7)
memory usage: 77.5+ KB


In [28]:
df.head()

Unnamed: 0,Name,Age,Email,Phone,Address,Salary,Join_Date,Employment_Status,Department
0,Allison Hill,43,donaldgarcia@example.net,+1-219-560-0133,"79402 Peterson Drives Apt. 511\nDavisstad, PA ...",59590,2023-07-12,Contract,Finance
1,Kimberly Dudley,44,smiller@example.net,+1-659-931-0341x316,"55341 Amanda Gardens Apt. 764\nLake Mark, WI 0...",116362,2020-12-31,Full-Time,Finance
2,Ethan Adams,32,robinbradley@example.net,837-767-2423x88496,"710 Eric Estate\nCarlsonfurt, MS 78605",94306,2024-05-09,Contract,Marketing
3,Tricia Valencia,26,frazierdanny@example.net,001-645-514-6270x48281,"809 Burns Creek\nNatashaport, IA 08093",91173,2021-01-19,Full-Time,HR
4,Angela Dennis,20,spenceamanda@example.org,(578)624-8963,"8713 Caleb Brooks Apt. 930\nLake Crystalbury, ...",68415,2023-10-04,Contract,IT


## Step 2: Handling Duplicate Records

In [18]:
# Check for duplicate rows
duplicates = df.duplicated().sum()
print("Number of duplicate rows:", duplicates)

# Removing duplicate rows
df.drop_duplicates(inplace=True)

Number of duplicate rows: 100


## Step 3: Handling Missing Data

In [19]:
# Check for missing values
missing_values = df.isna().sum()
print("Missing Values:")
print(missing_values)

Missing Values:
Name                  0
Age                   0
Email                50
Phone                 0
Address               0
Salary                0
Join_Date             0
Employment_Status     0
Department            0
dtype: int64


In [20]:
# Handling missing values by filling with a placeholder
df['Email'].fillna('unknown@example.com', inplace=True)

##  Step 4: Transforming Data

In [21]:
# Convert 'Join_Date' to datetime
df['Join_Date'] = pd.to_datetime(df['Join_Date'])
print("Join_Date after conversion:")
print(df['Join_Date'].head())

Join_Date after conversion:
0   2023-07-12
1   2020-12-31
2   2024-05-09
3   2021-01-19
4   2023-10-04
Name: Join_Date, dtype: datetime64[ns]


In [22]:
# Creating a new feature 'Years_Employed' based on 'Join_Date'
df['Years_Employed'] = pd.Timestamp.now().year - df['Join_Date'].dt.year
print("New feature 'Years_Employed':")
print(df[['Join_Date', 'Years_Employed']].head())

New feature 'Years_Employed':
   Join_Date  Years_Employed
0 2023-07-12               1
1 2020-12-31               4
2 2024-05-09               0
3 2021-01-19               3
4 2023-10-04               1


## Step 5: Cleaning Text Data

In [23]:
# Clean address strings
df['Address'] = df['Address'].str.replace('\n', ' ', regex=False)
print("Address after text cleaning:")
print(df['Address'].head())

Address after text cleaning:
0    79402 Peterson Drives Apt. 511 Davisstad, PA 35172
1     55341 Amanda Gardens Apt. 764 Lake Mark, WI 07832
2                 710 Eric Estate Carlsonfurt, MS 78605
3                 809 Burns Creek Natashaport, IA 08093
4    8713 Caleb Brooks Apt. 930 Lake Crystalbury, CA...
Name: Address, dtype: object


## Step 6: Handling Outliers

In [24]:
# Detecting outliers using z-score
z_scores = (df['Salary'] - df['Salary'].mean()) / df['Salary'].std()
outliers = df[abs(z_scores) > 3]
print("Outliers based on Salary:")
print(outliers[['Name', 'Salary']].head())

Outliers based on Salary:
                Name  Salary
16    Michael Powell  414854
131    Holly Jimenez  258727
240  Daniel Williams  371500
328    Walter Bishop  332554
352     Ashley Munoz  278539
