In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
import os

In [12]:
# Path of the source CSV; a relative path, so it is resolved against the
# notebook's current working directory.
input_file = 'advcy_data_final 2.csv'


def load_dataset(file_path):
    """Read a CSV file into a DataFrame after confirming the path exists.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        The DataFrame produced by ``pd.read_csv(file_path)``.

    Raises:
        FileNotFoundError: If ``os.path.exists(file_path)`` is false.
    """
    # Guard clause: fail early with a descriptive message when the path is missing.
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"The file '{file_path}' does not exist. Please check the file path.")

    dataset = pd.read_csv(file_path)
    print(f"Dataset '{file_path}' loaded successfully.\n")
    return dataset

# Load the dataset into `df`; load_dataset raises FileNotFoundError when
# `input_file` does not exist.
df = load_dataset(input_file)


Dataset 'advcy_data_final 2.csv' loaded successfully.



In [13]:
# Preview the first five rows of the freshly loaded dataset.
# `display` is the rich-output helper that IPython/Jupyter injects into the
# notebook namespace; it is not imported explicitly in this notebook.
print("First five rows of the dataset:")
display(df.head())


First five rows of the dataset:


Unnamed: 0,sustainer,gender,client_state_rank,all_state_rank,client_zip_rank,all_zip_rank,age,account_id,gift_date,gift_min_t12m,...,drtv_t12m,other_t12m,no_channel_t12m,member_t12m,gift_min_lt,gift_max_lt,gift_avg_lt,gift_count_lt,promo_count_lt,cons_month_lt
0,0,0,36,29,3553,34364,,1030075C,2020-03-23,1.0,...,0,0,0,0,1.0,2.0,1.5,2,2,2
1,0,3,33,41,2,22870,,1067737C,2019-09-09,2.0,...,0,0,0,0,2.0,2.0,2.0,1,1,1
2,0,0,42,31,2594,35599,,1075779C,2020-03-23,2.0,...,0,0,0,0,2.0,2.0,2.0,1,1,1
3,0,0,42,31,1394,29773,,1090840C,2020-06-12,2.0,...,0,0,0,0,2.0,2.0,2.0,1,1,1
4,0,0,42,31,581,28865,,1155437C,2020-11-02,2.0,...,0,0,0,0,2.0,2.0,2.0,1,1,1


In [14]:

def handle_missing_values(df, strategy='fill', fill_value=0):
    """Return a new DataFrame with missing values either filled or dropped.

    Args:
        df: Input DataFrame; it is not modified.
        strategy: ``'fill'`` replaces missing values with ``fill_value``
            (via ``DataFrame.fillna``); ``'drop'`` removes every row that
            contains a missing value (via ``DataFrame.dropna``).
        fill_value: Replacement used when ``strategy == 'fill'``.

    Returns:
        The filled or row-reduced DataFrame.

    Raises:
        ValueError: If ``strategy`` is neither ``'fill'`` nor ``'drop'``.
    """
    # Reject unknown strategies up front so the happy paths stay flat.
    if strategy not in ('fill', 'drop'):
        raise ValueError("Unsupported strategy for handling missing values.")

    if strategy == 'drop':
        cleaned = df.dropna()
        print("Rows with missing values have been dropped.\n")
    else:
        cleaned = df.fillna(fill_value)
        print(f"Missing values filled with {fill_value}.\n")
    return cleaned

# Handle missing values by filling with 0 (rebinds `df` to the filled copy).
# NOTE(review): fillna(0) applies to every column, so missing `age` values
# become 0 and then take part in the Z-score / IQR statistics below -
# confirm that 0 is an acceptable stand-in for "unknown" in each column.
df = handle_missing_values(df, strategy='fill', fill_value=0)

Missing values filled with 0.



In [15]:
# Names of all numeric-dtype columns; the outlier cells below iterate over
# this Index.
# NOTE(review): select_dtypes picks columns purely by dtype, so flag-like
# integer columns (e.g. `sustainer`, which looks like 0/1 in the summary
# output) are treated as continuous measurements too - confirm this is intended.
numerical_columns = df.select_dtypes(include=[np.number]).columns
print("Numerical columns:", numerical_columns.tolist())


Numerical columns: ['sustainer', 'gender', 'client_state_rank', 'all_state_rank', 'client_zip_rank', 'all_zip_rank', 'age', 'gift_min_t12m', 'gift_max_t12m', 'gift_avg_t12m', 'gift_count_t12m', 'promo_count_t12m', 'cons_month_t12m', 'dm_t12m', 'tm_t12m', 'wm_t12m', 'digital_t12m', 'f2f_t12m', 'events_t12m', 'drtv_t12m', 'other_t12m', 'no_channel_t12m', 'member_t12m', 'gift_min_lt', 'gift_max_lt', 'gift_avg_lt', 'gift_count_lt', 'promo_count_lt', 'cons_month_lt']


## Detecting Outliers

We'll detect outliers using two methods:

1. Z-Score method
2. Interquartile Range (IQR) method

In [16]:
# Function to detect outliers using Z-Score
def detect_outliers_zscore(df, numerical_columns, threshold=3):
    """Flag outliers in each column by absolute Z-score.

    A value is an outlier when ``|z| > threshold``, where ``z`` is computed
    with ``scipy.stats.zscore`` over the column's non-missing values.

    Args:
        df: DataFrame holding the data.
        numerical_columns: Iterable of column names to analyse.
        threshold: Absolute Z-score above which a value is flagged.

    Returns:
        dict mapping each column name to a 1-D integer ``np.ndarray`` of
        zero-based row *positions* (as produced by ``np.where``). These are
        positions, not index labels, so use ``df.iloc`` to look them up;
        ``detect_outliers_iqr`` in this notebook returns index labels instead.
        Columns with at most one distinct non-missing value map to an empty
        integer array.
    """
    outliers = {}
    print("Detecting outliers using Z-Score method\n")
    for column in numerical_columns:
        series = df[column]
        # nunique() ignores NaN, so all-NaN and single-valued columns both
        # count as constant; their standard deviation would be 0 / undefined.
        if series.nunique() > 1:
            values = series.to_numpy(dtype=float, na_value=np.nan)
            # nan_policy='omit' derives mean/std from the non-missing values.
            # The default ('propagate') turns *every* z-score into NaN as soon
            # as a single value is missing, silently reporting zero outliers.
            z_scores = np.abs(stats.zscore(values, nan_policy='omit'))
            # Missing inputs keep a NaN z-score; NaN > threshold is False, so
            # they are never flagged.
            outliers_indices = np.where(z_scores > threshold)[0]
            outliers[column] = outliers_indices
            print(f"Z-Score Outliers in '{column}': {len(outliers_indices)} detected")
        else:
            # Same integer dtype as np.where() output so callers can index
            # with either branch's result.
            outliers[column] = np.array([], dtype=np.intp)
            print(f"Z-Score Outliers in '{column}': 0 detected (constant column)")
    print("\n")
    return outliers

# Detect outliers using Z-Score with the function's default threshold (3);
# the result maps each column name to an array of row positions.
outliers_zscore = detect_outliers_zscore(df, numerical_columns)


Detecting outliers using Z-Score method

Z-Score Outliers in 'sustainer': 0 detected
Z-Score Outliers in 'gender': 0 detected
Z-Score Outliers in 'client_state_rank': 223 detected
Z-Score Outliers in 'all_state_rank': 194 detected
Z-Score Outliers in 'client_zip_rank': 0 detected
Z-Score Outliers in 'all_zip_rank': 486 detected
Z-Score Outliers in 'age': 520 detected
Z-Score Outliers in 'gift_min_t12m': 100 detected
Z-Score Outliers in 'gift_max_t12m': 52 detected
Z-Score Outliers in 'gift_avg_t12m': 46 detected
Z-Score Outliers in 'gift_count_t12m': 200 detected
Z-Score Outliers in 'promo_count_t12m': 220 detected
Z-Score Outliers in 'cons_month_t12m': 525 detected
Z-Score Outliers in 'dm_t12m': 0 detected
Z-Score Outliers in 'tm_t12m': 671 detected
Z-Score Outliers in 'wm_t12m': 0 detected
Z-Score Outliers in 'digital_t12m': 0 detected
Z-Score Outliers in 'f2f_t12m': 5 detected
Z-Score Outliers in 'events_t12m': 5 detected
Z-Score Outliers in 'drtv_t12m': 0 detected (constant column)

In [17]:
# Function to detect outliers using IQR
def detect_outliers_iqr(df, numerical_columns, multiplier=1.5):
    """Flag outliers in each column using interquartile-range (IQR) fences.

    For every column the fences are ``Q1 - multiplier * IQR`` and
    ``Q3 + multiplier * IQR`` with ``IQR = Q3 - Q1``; values strictly outside
    the fences are reported.

    Note: when a column's IQR is 0 (Q1 == Q3) both fences collapse onto that
    single value, so every other value in the column is reported as an outlier.

    Args:
        df: DataFrame holding the data.
        numerical_columns: Iterable of column names to analyse.
        multiplier: Fence width in units of IQR. Defaults to 1.5, the value
            that was previously hard-coded.

    Returns:
        dict mapping each column name to a ``pandas.Index`` holding the index
        *labels* of the flagged rows.
    """
    outliers = {}
    print("Detecting outliers using IQR method...\n")
    for column in numerical_columns:
        Q1 = df[column].quantile(0.25)
        Q3 = df[column].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - multiplier * IQR
        upper_bound = Q3 + multiplier * IQR
        # Strict inequalities: values sitting exactly on a fence are kept.
        condition = (df[column] < lower_bound) | (df[column] > upper_bound)
        outliers_indices = df[condition].index
        outliers[column] = outliers_indices
        print(f"IQR Outliers in '{column}': {len(outliers_indices)} detected")
    print("\n")
    return outliers

# Detect outliers using IQR; the result maps each column name to the index
# labels of the rows flagged in that column.
outliers_iqr = detect_outliers_iqr(df, numerical_columns)

Detecting outliers using IQR method...

IQR Outliers in 'sustainer': 0 detected
IQR Outliers in 'gender': 4715 detected
IQR Outliers in 'client_state_rank': 603 detected
IQR Outliers in 'all_state_rank': 405 detected
IQR Outliers in 'client_zip_rank': 0 detected
IQR Outliers in 'all_zip_rank': 1419 detected
IQR Outliers in 'age': 2244 detected
IQR Outliers in 'gift_min_t12m': 858 detected
IQR Outliers in 'gift_max_t12m': 866 detected
IQR Outliers in 'gift_avg_t12m': 1598 detected
IQR Outliers in 'gift_count_t12m': 1968 detected
IQR Outliers in 'promo_count_t12m': 2282 detected
IQR Outliers in 'cons_month_t12m': 2509 detected
IQR Outliers in 'dm_t12m': 4486 detected
IQR Outliers in 'tm_t12m': 671 detected
IQR Outliers in 'wm_t12m': 3483 detected
IQR Outliers in 'digital_t12m': 0 detected
IQR Outliers in 'f2f_t12m': 5 detected
IQR Outliers in 'events_t12m': 5 detected
IQR Outliers in 'drtv_t12m': 0 detected
IQR Outliers in 'other_t12m': 28 detected
IQR Outliers in 'no_channel_t12m': 838 

## Removing Outliers Based on IQR

We'll remove the outliers detected by the IQR method to obtain a dataset without outliers.

In [18]:
def remove_outliers_iqr(df, numerical_columns, multiplier=1.5):
    """Drop every row that is an IQR outlier in at least one column.

    Fences are computed per column as ``Q1 - multiplier * IQR`` and
    ``Q3 + multiplier * IQR``. A row is removed as soon as *any* of the
    listed columns falls strictly outside its fences, so the number of
    removed rows grows with the number of columns checked.

    Note: a column whose IQR is 0 (Q1 == Q3) has both fences on that single
    value, so every row holding a different value in that column is removed.

    Args:
        df: DataFrame holding the data; it is not modified.
        numerical_columns: Column names (list or Index) to check.
        multiplier: Fence width in units of IQR. Defaults to 1.5, the value
            that was previously hard-coded.

    Returns:
        A new, independent DataFrame containing only the rows that lie within
        the fences for all listed columns. All original columns are kept.
    """
    print("Removing outliers based on IQR method...\n")
    Q1 = df[numerical_columns].quantile(0.25)
    Q3 = df[numerical_columns].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - multiplier * IQR
    upper_bound = Q3 + multiplier * IQR
    # The bound Series are indexed by column name, so the comparisons align
    # column-wise; .any(axis=1) marks rows with at least one violation.
    is_outlier_row = ((df[numerical_columns] < lower_bound) | (df[numerical_columns] > upper_bound)).any(axis=1)
    # Explicit copy: the result is a standalone frame, so later assignments on
    # it neither touch `df` nor raise SettingWithCopyWarning.
    df_no_outliers = df[~is_outlier_row].copy()
    print("Outliers removed.")
    print(f"Original dataset size: {df.shape[0]} rows")
    print(f"Dataset without outliers size: {df_no_outliers.shape[0]} rows\n")
    return df_no_outliers

# Remove outliers using IQR: keeps only the rows of `df` that are inside the
# IQR fences for every column in `numerical_columns`.
df_without_outliers = remove_outliers_iqr(df, numerical_columns)


Removing outliers based on IQR method...

Outliers removed.
Original dataset size: 19573 rows
Dataset without outliers size: 4283 rows



In [19]:

def display_dataset_without_outliers(df):
    """Show a preview, summary statistics, and the shape of a dataset.

    Output goes through ``print`` and the notebook's ``display`` helper;
    nothing is returned.

    Args:
        df: DataFrame to present.
    """
    print("Displaying the dataset without outliers:\n")

    # Preview: the first five rows.
    print("First five rows of the dataset without outliers:")
    preview = df.head()
    display(preview)

    # Descriptive statistics as computed by DataFrame.describe().
    print("\nSummary Statistics of Dataset Without Outliers:")
    summary = df.describe()
    display(summary)

    # Row and column counts.
    print(f"\nDataset without outliers contains {df.shape[0]} rows and {df.shape[1]} columns.\n")

# Show the preview, summary statistics and shape of the outlier-free dataset.
display_dataset_without_outliers(df_without_outliers)


Displaying the dataset without outliers:

First five rows of the dataset without outliers:


Unnamed: 0,sustainer,gender,client_state_rank,all_state_rank,client_zip_rank,all_zip_rank,age,account_id,gift_date,gift_min_t12m,...,drtv_t12m,other_t12m,no_channel_t12m,member_t12m,gift_min_lt,gift_max_lt,gift_avg_lt,gift_count_lt,promo_count_lt,cons_month_lt
0,0,0,36,29,3553,34364,0.0,1030075C,2020-03-23,1.0,...,0,0,0,0,1.0,2.0,1.5,2,2,2
2,0,0,42,31,2594,35599,0.0,1075779C,2020-03-23,2.0,...,0,0,0,0,2.0,2.0,2.0,1,1,1
3,0,0,42,31,1394,29773,0.0,1090840C,2020-06-12,2.0,...,0,0,0,0,2.0,2.0,2.0,1,1,1
4,0,0,42,31,581,28865,0.0,1155437C,2020-11-02,2.0,...,0,0,0,0,2.0,2.0,2.0,1,1,1
7,0,0,42,31,3862,36077,0.0,1009280C,2019-03-27,1.0,...,0,0,0,0,1.0,1.0,1.0,1,1,1



Summary Statistics of Dataset Without Outliers:


Unnamed: 0,sustainer,gender,client_state_rank,all_state_rank,client_zip_rank,all_zip_rank,age,gift_min_t12m,gift_max_t12m,gift_avg_t12m,...,drtv_t12m,other_t12m,no_channel_t12m,member_t12m,gift_min_lt,gift_max_lt,gift_avg_lt,gift_count_lt,promo_count_lt,cons_month_lt
count,4283.0,4283.0,4283.0,4283.0,4283.0,4283.0,4283.0,4283.0,4283.0,4283.0,...,4283.0,4283.0,4283.0,4283.0,4283.0,4283.0,4283.0,4283.0,4283.0,4283.0
mean,0.511791,0.167873,41.781929,40.130283,3004.435442,34602.411394,0.0,29.437579,39.943789,33.903967,...,0.0,0.0,0.0,0.0,19.583645,48.125001,30.321553,6.931823,6.409993,2.284847
std,0.499919,0.373797,8.974929,9.492116,1363.984256,2624.41941,0.0,20.88431,29.525962,22.526727,...,0.0,0.0,0.0,0.0,9.540737,35.841242,16.275309,6.851124,6.414094,2.247864
min,0.0,0.0,14.0,11.0,60.0,26356.0,0.0,0.5,1.0,1.0,...,0.0,0.0,0.0,0.0,0.5,1.0,1.0,1.0,1.0,1.0
25%,0.0,0.0,36.0,33.0,1889.0,33056.0,0.0,15.0,20.0,20.0,...,0.0,0.0,0.0,0.0,10.0,25.0,20.0,2.0,2.0,1.0
50%,1.0,0.0,44.0,43.0,3259.0,35425.0,0.0,25.0,35.0,25.0,...,0.0,0.0,0.0,0.0,20.0,35.0,26.43,5.0,4.0,1.0
75%,1.0,0.0,49.0,49.0,4258.0,36720.5,0.0,35.0,50.0,40.0,...,0.0,0.0,0.0,0.0,25.0,55.0,36.0,9.0,8.0,3.0
max,1.0,1.0,51.0,51.0,4790.0,37514.0,0.0,104.0,200.0,104.0,...,0.0,0.0,0.0,0.0,45.0,200.0,94.92,38.0,37.0,13.0



Dataset without outliers contains 4283 rows and 31 columns.



In [20]:
def save_dataset_without_outliers(df, output_path):
    """Write a DataFrame to CSV (without the index) and report the outcome.

    Any exception raised while writing is caught and printed rather than
    propagated, so the caller only learns about a failure from the message.

    Args:
        df: DataFrame to write.
        output_path: Destination passed to ``DataFrame.to_csv``.
    """
    try:
        df.to_csv(output_path, index=False)
    except Exception as write_error:
        # Best-effort save: report the problem and stop here.
        print(f"Error saving file: {write_error}\n")
        return
    print(f"Dataset without outliers saved to '{output_path}'.\n")


# Destination CSV for the outlier-free dataset; a relative path, so it is
# written to the notebook's current working directory.
output_file = 'advcydata_without_outliers.csv'

# Save the cleaned dataset. The helper prints either a success message or the
# error it caught; it does not raise on failure.
save_dataset_without_outliers(df_without_outliers, output_file)


Dataset without outliers saved to 'advcydata_without_outliers.csv'.

