# Email cleaning script
This notebook identifies non-existent email addresses based on the `failure_message` column of the `deliveries.csv` dataset.
Script created on Monday April 14th, 2025 by Andrea Ross Orozco
Kezko Data & Analytics

In [31]:
# Library imports
import pandas as pd # Data manipulation

Loads the dataset from a CSV file using the specified delimiter. Returns a pandas DataFrame with the contents of the file.

**Parameters:**

*   filepath: Path to the CSV file.
*   delimiter: Character used to separate values in the file (defaults to `;`).


**Returns**: A DataFrame containing the raw data.

In [32]:
# Load CSV file into pandas DataFrame
def load_data(filepath: str, delimiter: str = ';') -> pd.DataFrame:
    """Read a delimited text file into a pandas DataFrame.

    Parameters
    ----------
    filepath : str
        Path to the CSV file.
    delimiter : str, default ';'
        Character separating the values in the file.

    Returns
    -------
    pd.DataFrame
        The parsed contents of the file.

    Raises
    ------
    FileNotFoundError
        If no file exists at ``filepath``.
    Exception
        For any other failure while reading the file. The underlying error
        is attached as ``__cause__`` so the root cause stays visible.
    """
    try:
        return pd.read_csv(filepath, delimiter=delimiter)
    except FileNotFoundError as err:
        # Re-raise with a friendlier message; `from err` keeps the original
        # error as the explicit cause instead of an incidental context.
        raise FileNotFoundError(f"The file at {filepath} was not found.") from err
    except Exception as err:
        # Parser/encoding/permission problems all end up here.
        raise Exception(f"An error occurred while loading the file: {err}") from err

Selects only the columns relevant to the email cleaning task: id, email, and failure_message.

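**Parameters:**
*   df: The original DataFrame.
*   columns: List of column names to keep. A `ValueError` is raised if any of them is missing from df.

**Returns**: A filtered DataFrame with only the necessary columns.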

In [33]:
# Filter DataFrame to include only relevant columns
def filter_relevant_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
    """Return only the requested columns of ``df``, in the order given.

    Parameters
    ----------
    df : pd.DataFrame
        The original DataFrame.
    columns : list
        Names of the columns to keep. Any iterable of names (list, tuple,
        generator) is accepted.

    Returns
    -------
    pd.DataFrame
        A DataFrame holding just the requested columns.

    Raises
    ------
    ValueError
        If one or more requested columns are absent from ``df``.
    """
    # Materialise once: a tuple passed straight to df[...] would be looked up
    # as a single label, and a generator would be exhausted by the check below.
    wanted = list(columns)
    missing_cols = [col for col in wanted if col not in df.columns]
    if missing_cols:
        raise ValueError(f" The following required columns are missing: {missing_cols}")
    return df[wanted]

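Filters the DataFrame to include only rows where the failure_message contains the phrase "does not exist" (matched case-insensitively; rows without a failure message are ignored), keeping a single row per email address.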

**Parameters:**
*   df: Filtered DataFrame
*   keyword: The phrase to search within failure_message

**Returns**: A new DataFrame with only non-existent emails

In [34]:
# Extract rows with failure messages indicating non-existent emails
def extract_nonexistent_emails(df: pd.DataFrame, keyword: str = 'does not exist') -> pd.DataFrame:
    """Keep the rows whose failure message reports a non-existent mailbox.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with at least the ``failure_message`` and ``email`` columns.
    keyword : str, default 'does not exist'
        Phrase searched for inside ``failure_message``. It is matched
        literally (not as a regular expression) and case-insensitively.

    Returns
    -------
    pd.DataFrame
        The matching rows, with only the first row kept for each distinct
        ``email`` value.

    Notes
    -----
    Prints the number of rows found.
    """
    # Cast to the nullable string dtype first: when no delivery failed, the
    # column is entirely NaN (float64) and the .str accessor would raise
    # AttributeError instead of simply matching nothing.
    messages = df['failure_message'].astype('string')
    # na=False makes missing messages count as "no match"; regex=False keeps
    # characters such as '(' or '.' in the keyword from being interpreted.
    mask = messages.str.contains(keyword, case=False, na=False, regex=False).astype(bool)
    nonexistent_df = df[mask].drop_duplicates(subset='email')  # one row per address
    print(f"Found {nonexistent_df.shape[0]} non-existent emails.")
    return nonexistent_df

Exports the given DataFrame to a CSV file using a comma as the delimiter.

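**Parameters:**
*   df: DataFrame to be saved.
*   output_path: Desired path for the resulting CSV file.
*   delimiter: Character used to separate values in the output file (defaults to `,`).

The row index is not written to the file.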


In [35]:
# Save the DataFrame to a CSV file
def save_to_csv(df: pd.DataFrame, output_path: str, delimiter: str = ','):
    """Write ``df`` to ``output_path`` as delimited text and confirm on stdout.

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame to be saved.
    output_path : str
        Destination path of the resulting file.
    delimiter : str, default ','
        Field separator used in the output file.
    """
    # The row index is deliberately left out of the file.
    df.to_csv(output_path, sep=delimiter, index=False)
    confirmation = f" Non-existent emails saved to {output_path}"
    print(confirmation)

In [36]:
# Main function to execute the email cleaning pipeline
def main(input_file: str = 'deliveries.csv', output_file: str = 'non_existent_emails.csv'):
    """Run the email cleaning pipeline end to end.

    Loads the deliveries file, keeps the ``id``, ``email`` and
    ``failure_message`` columns, extracts the rows flagged as non-existent
    emails and writes them to a CSV file.

    Parameters
    ----------
    input_file : str, default 'deliveries.csv'
        Path of the deliveries file to read.
    output_file : str, default 'non_existent_emails.csv'
        Path of the CSV file to write.
    """
    deliveries_raw = load_data(input_file)  # Load the data
    deliveries_relevant = filter_relevant_columns(
        deliveries_raw, ['id', 'email', 'failure_message']
    )  # Keep only the columns the task needs
    emails_nonexistent = extract_nonexistent_emails(deliveries_relevant)  # Extract non-existent emails
    save_to_csv(emails_nonexistent, output_file)  # Save to CSV

In [37]:
# Run the main function
if __name__ == '__main__':
    # Entry point: runs the whole pipeline with main()'s default settings.
    main()

Found 20 non-existent emails.
 Non-existent emails saved to non_existent_emails.csv
