# Case 1

## Generate Data

In [196]:
# first_name, last_name, age, country
from faker import Faker
import pandas as pd
from random import randint

faker = Faker()
countries = [
    "US", "UK", "ID", "SG", "MA",
    "TO", "GH", "IS", "ER", "BB"
]

number_of_data = 500000
dummy_data = []

for i in range(number_of_data):
    name = faker.name().split()
    first_name, last_name = name[0], name[-1]
    age = randint(10, 60)
    country = countries[randint(0, 9)]

    dummy_data.append({
        "first_name": first_name,
        "last_name": last_name,
        "age": str(age),
        "country": country
    })

df = pd.DataFrame(dummy_data)

df.head()


Unnamed: 0,first_name,last_name,age,country
0,Craig,Wilkinson,21,BB
1,Carmen,Barnes,55,SG
2,John,Blake,34,GH
3,Carrie,Moore,60,US
4,Jeremy,Lopez,44,UK


In [None]:
import numpy as np
from random import random

messy_df = df.copy()

# Set the probability of a None value
missing_rate = 0.2 

# Function to randomly insert None
def insert_missing_values(df, column, rate, add_unknown_and_random_str=False):
    mask = np.random.rand(len(df)) < rate
    if add_unknown_and_random_str:
        prob = random()
        if prob < 0.5:
            df.loc[mask, column] = "Unknown"
        else:
            df.loc[mask, column] = "cfasew"
    else:
        df.loc[mask, column] = None

# Apply to desired columns
for col in ['first_name', 'last_name', 'age', 'country']:
    insert_missing_values(messy_df, col, missing_rate)

for col in ['age', 'country']:
    insert_missing_values(messy_df, col, missing_rate, add_unknown_and_random_str=True)

messy_df.head(10)

Unnamed: 0,first_name,last_name,age,country
0,Craig,,,BB
1,Carmen,,,
2,John,Blake,34.0,
3,Carrie,Moore,,US
4,Jeremy,Lopez,,UK
5,Gregory,Palmer,33.0,BB
6,Stephen,Barajas,56.0,ER
7,James,Nunez,51.0,GH
8,Mary,Brown,,
9,Mr.,Perez,59.0,UK


## Original Script

In [199]:
import pandas as pd

def clean_customer_data(df):
    known_countries = ["US", "UK", "ID", "SG"]
    full_names = []
    cleaned_ages = []
    is_adults = []
    standardized_countries = []

    for i in range(len(df)):
        row = df.iloc[i]

        # String concat with NaN checking
        first = str(row['first_name']) if pd.notna(row['first_name']) else "Unknown"
        last = str(row['last_name']) if pd.notna(row['last_name']) else "Unknown"
        full_name = first + " " + last
        full_names.append(full_name)

        # Data type conversion & filtering
        age_raw = row['age']
        if pd.isna(age_raw) or str(age_raw).strip().lower() == 'unknown':
            age = -1
        else:
            try:
                age = int(age_raw)
            except:
                age = -1
        cleaned_ages.append(age)

        # NA + conditional logic
        is_adults.append(True if age >= 18 else False)

        # String cleaning & filtering
        country = str(row['country']).strip().upper() if pd.notna(row['country']) else "XX"
        if country not in known_countries:
            country = "XX"
        standardized_countries.append(country)

    # Add results back to DataFrame
    df['full_name'] = full_names
    df['cleaned_age'] = cleaned_ages
    df['is_adult'] = is_adults
    df['country_code'] = standardized_countries

    return df

In [200]:
import timeit

df1 = messy_df.copy()
time_taken = timeit.timeit(lambda: clean_customer_data(df1), number=1)
print(f"Time taken: {time_taken:.4f} seconds")

Time taken: 5.9515 seconds


## Optimized Script

In [216]:
known_countries = ["US", "UK", "ID", "SG"]

def clean_customer_name(name):
    if pd.isna(name):
        return "Unknown"
    
    return name

def clean_customer_age(age):
    if pd.isna(age) or str(age).strip().lower() == "unknown":
        return -1
    
    try:
        return int(age)
    except:
        return -1

def is_customer_adult(age):
    return age >= 18

def clean_customer_country(country):
    if country not in known_countries or country is None:
        return  "XX"

    return country

def optimized_clean_customer_data(df):
    df["full_name"] = df["first_name"].apply(clean_customer_name) + " " + df["last_name"].apply(clean_customer_name)
    df["cleaned_age"] = df["age"].apply(clean_customer_age)
    df["is_adult"] = df["cleaned_age"].apply(is_customer_adult)
    df["country_code"] = df["country"].apply(clean_customer_country)

    return df

In [217]:
df2 = messy_df.copy()

time_taken = timeit.timeit(lambda: optimized_clean_customer_data(df2), number=1)
print(f"Time taken: {time_taken:.4f} seconds")

Time taken: 0.4712 seconds


In [218]:
df1.equals(df2)

True

# Case 2

## Generate Data

In [42]:
# entry_time, exit_time, break_duration, depratment
from random import randint
import pandas as pd

possible_departments = [
    "Sales", "Logistics", "Data", "Marketing", "Accounting",
    "Finance", "IT", "Product" 
]

number_of_data = 1000000
dummy_data = []
for i in range(number_of_data):
    dummy_data.append({
        "entry_time": f"{randint(8, 10):02}:{randint(0, 59):02}:{randint(0, 59):02}",
        "exit_time": f"{randint(16, 18):02}:{randint(0, 59):02}:{randint(0, 59):02}",
        "break_duration": f"{randint(30, 90)}",
        "department": possible_departments[randint(0, 7)]
    })

df = pd.DataFrame(dummy_data)
df.head()

Unnamed: 0,entry_time,exit_time,break_duration,department
0,09:44:42,17:42:40,89,Logistics
1,08:56:34,16:43:40,78,Marketing
2,09:14:36,17:32:47,39,Logistics
3,09:12:48,17:13:11,41,Sales
4,08:50:08,16:30:19,63,IT


## Original Script

In [43]:
from datetime import datetime, timedelta

def preprocess_time_entries(df):
    work_durations = []
    work_day_labels = []
    departments = []
    
    for i in range(len(df)):
        row = df.iloc[i]
        
        # Clean entry_time and exit_time, convert to datetime
        try:
            entry_time = datetime.strptime(str(row['entry_time']), '%H:%M:%S')
        except ValueError:
            entry_time = None
        
        try:
            exit_time = datetime.strptime(str(row['exit_time']), '%H:%M:%S')
        except ValueError:
            exit_time = None
        
        # Handle break_duration, converting to minutes or setting to 0 if invalid
        try:
            break_duration = int(row['break_duration'])
        except (ValueError, TypeError):
            break_duration = 0
        
        # Calculate work duration in seconds, then convert to HH:MM:SS
        if entry_time and exit_time:
            total_seconds = (exit_time - entry_time).seconds - (break_duration * 60)
            work_duration = str(timedelta(seconds=total_seconds))
            work_durations.append(work_duration)
        else:
            work_durations.append('Invalid Time')
        
        # Assign work day labels based on work duration
        if entry_time and exit_time:
            total_hours = (exit_time - entry_time).seconds / 3600 - break_duration / 60
            if total_hours < 5:
                work_day_labels.append('Short Day')
            elif total_hours >= 8:
                work_day_labels.append('Long Day')
            else:
                work_day_labels.append('Regular Day')
        else:
            work_day_labels.append('Invalid Entry')
        
        # Clean department column
        department = row['department'] if pd.notna(row['department']) and row['department'].strip() != "" else "Unknown"
        departments.append(department)
    
    # Add results back to the DataFrame
    df['work_duration'] = work_durations
    df['work_day_label'] = work_day_labels
    df['department'] = departments
    
    return df

In [44]:
df1 = df.copy()

time_taken = timeit.timeit(lambda: preprocess_time_entries(df1), number=1)
print(f"Time taken: {time_taken:.4f} seconds")

Time taken: 17.8385 seconds


In [62]:
def clean_time(time):
    try:
        return datetime.strptime(time, "%H:%M:%S")
    except:
        return None
    
def calculate_work_duration(entry_time, exit_time, break_duration):
    entry_time = clean_time(entry_time)
    exit_time = clean_time(exit_time)
    try:
        break_duration = int(break_duration)
    except:
        break_duration = 0

    if entry_time and exit_time:
        total_seconds = (exit_time - entry_time).seconds - (break_duration * 60)
        work_duration = str(timedelta(seconds=total_seconds))

        return work_duration
    
    else:
        return "Invalid Time"
    
def label_work_duration(work_duration):
    if work_duration != "Invalid Time":
        total_hours = (work_duration - datetime.strptime("00:00:00", "%H:%M:%S")).seconds

        if total_hours < 5:
            return 'Short Day'
        elif total_hours >= 8:
            return 'Long Day'
        else:
            return 'Regular Day'
    
    else:
        return "Invalid Entry"

def clean_deparment(department):
    if pd.notna(department) and department.strip() != "":
        return department
    else:
        return "Unknown"
    
def optimized_preprocess_time_entries(df):
    df["work_duration"] = df.apply(lambda x: calculate_work_duration(x.entry_time, x.exit_time, x.break_duration), axis=1)
    df["work_day_label"] = df["work_duration"].apply(label_work_duration)
    df["department"] = df["department"].apply(clean_deparment)

In [63]:
df2 = df.copy()

time_taken = timeit.timeit(lambda: optimized_preprocess_time_entries(df2), number=1)
print(f"Time taken: {time_taken:.4f} seconds")

TypeError: unsupported operand type(s) for -: 'str' and 'datetime.datetime'