# Customer Segmentation - Data Preprocessing

This notebook covers the data preprocessing steps for the customer segmentation project:
1. Loading the data
2. Exploring the raw data
3. Handling missing values
4. Detecting and addressing outliers
5. Feature scaling/normalization
6. Saving the preprocessed data

In [None]:
# Import necessary libraries
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler

# Set plotting style
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply the matplotlib style sheet first, then the seaborn colour palette.
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('viridis')

# Increase default figure size
# Set after plt.style.use().
# NOTE(review): presumably ordered this way so the style sheet cannot override the size — confirm.
plt.rcParams['figure.figsize'] = [12, 8]

## 1. Data Loading

In [None]:
# Read the raw customer behavior CSV into a DataFrame.
# NOTE(review): the file name is spelled "analytcis" — confirm it matches the file on disk.
file_path = "../data/customer_behavior_analytcis.csv"
df = pd.read_csv(filepath_or_buffer=file_path)

# Report the (rows, columns) shape, then preview the leading rows as the cell output.
print(f"Dataset shape: {df.shape}")
df.head(n=5)

## 2. Initial Data Exploration

In [None]:
# Check data types and basic information
# df.info() prints its summary directly, so no display/print wrapper is needed.
df.info()

In [None]:
# Statistical summary from describe(), transposed so each described column becomes a row
df.describe().transpose()

In [None]:
# Count missing entries per column and express them as a percentage of all rows
missing_values = df.isna().sum()
missing_percentage = missing_values.div(len(df)).mul(100)

# Combine the counts and the rounded percentages into one summary table
missing_info = pd.DataFrame({
    'Missing Values': missing_values,
    'Missing Percentage': missing_percentage.round(2),
})

# Show only the columns that actually contain missing entries
missing_info.loc[missing_info['Missing Values'].gt(0)]

## 3. Handling Missing Values

In [None]:
# Work on a copy so the raw `df` is left untouched
df_clean = df.copy()

# Rows in which more than one field is missing
rows_with_multiple_missing = df_clean.loc[df_clean.isna().sum(axis=1).gt(1)]
print(f"Number of rows with multiple missing values: {len(rows_with_multiple_missing)}")

# Only when such rows exist: keep the rows that have at most one missing field
# (`thresh` is the minimum number of non-missing values a row needs to survive)
if len(rows_with_multiple_missing):
    df_clean = df_clean.dropna(thresh=len(df_clean.columns) - 1)
    print(f"After dropping rows with multiple missing values: {df_clean.shape}")

In [None]:
# Fill remaining missing values with median/mode
# Each filled column is assigned back to df_clean. Calling
# fillna(..., inplace=True) on df_clean[column] is chained assignment, which
# pandas deprecates and which leaves df_clean unchanged under Copy-on-Write.
for column in df_clean.columns:
    if df_clean[column].isnull().sum() == 0:
        continue

    if df_clean[column].notna().sum() == 0:
        # No observed values at all: there is no median/mode to fill with
        # (mode() would be empty and indexing its first element would raise).
        print(f"Skipped '{column}': every value is missing")
        continue

    if pd.api.types.is_numeric_dtype(df_clean[column]):
        median_value = df_clean[column].median()
        df_clean[column] = df_clean[column].fillna(median_value)
        print(f"Filled missing values in '{column}' with median: {median_value}")
    else:
        # mode() can return several values on ties; use the first one.
        mode_value = df_clean[column].mode().iloc[0]
        df_clean[column] = df_clean[column].fillna(mode_value)
        print(f"Filled missing values in '{column}' with mode: {mode_value}")

# Verify no missing values remain
print(f"\nRemaining missing values: {df_clean.isnull().sum().sum()}")

## 4. Detecting and Addressing Outliers

In [None]:
# Select numeric columns excluding customer_id
numeric_columns = [col for col in df_clean.columns
                   if pd.api.types.is_numeric_dtype(df_clean[col]) and col != 'customer_id']

# Size the subplot grid from the number of columns: two plots per row and at
# least three rows. A fixed 3x2 grid makes plt.subplot raise ValueError as soon
# as there are more than six numeric columns.
n_plot_cols = 2
n_plot_rows = max(3, -(-len(numeric_columns) // n_plot_cols))  # ceiling division

# Create box plots to visualize potential outliers
# (figure height grows with the row count; 3 rows keeps the original 14x10)
plt.figure(figsize=(14, 10 * n_plot_rows / 3))

for i, column in enumerate(numeric_columns):
    plt.subplot(n_plot_rows, n_plot_cols, i+1)
    sns.boxplot(x=df_clean[column])
    plt.title(f'Box Plot of {column}')

plt.tight_layout()
plt.show()

In [None]:
# Function to detect outliers using IQR method
def detect_outliers(df: pd.DataFrame, column: str, multiplier: float = 1.5):
    """Find the rows of ``df`` whose ``column`` value lies outside the IQR fences.

    The fences are ``Q1 - multiplier * IQR`` and ``Q3 + multiplier * IQR``,
    where Q1/Q3 are the 25th/75th percentiles of ``df[column]`` and
    ``IQR = Q3 - Q1``. Values exactly on a fence are not treated as outliers.

    Args:
        df: DataFrame that contains ``column``.
        column: Name of the column to inspect.
        multiplier: Width of the fences in IQR units. Defaults to 1.5.

    Returns:
        A tuple ``(outliers, lower_bound, upper_bound)``: the rows of ``df``
        strictly outside the fences, followed by the two fence values.
    """
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1

    lower_bound = Q1 - multiplier * IQR
    upper_bound = Q3 + multiplier * IQR

    # Strict comparisons: only values beyond a fence are selected.
    outliers = df[(df[column] < lower_bound) | (df[column] > upper_bound)]
    return outliers, lower_bound, upper_bound

# Report the IQR outlier count, the bounds and the observed range for every numeric feature
for column in numeric_columns:
    outliers, lower_bound, upper_bound = detect_outliers(df_clean, column)
    n_outliers = outliers.shape[0]
    outlier_share = n_outliers / len(df_clean) * 100
    feature_values = df_clean[column]
    # One print call emits the three report lines plus the trailing blank line
    print(
        f"{column}: {n_outliers} outliers detected ({outlier_share:.2f}%)",
        f"    Bounds: ({lower_bound:.2f}, {upper_bound:.2f})",
        f"    Min: {feature_values.min()}, Max: {feature_values.max()}",
        "",
        sep="\n",
    )

In [None]:
# Decision: outliers stay in the dataset. For this segmentation problem they
# likely reflect valid customer behaviors rather than noise.
print(
    "Note: For customer segmentation, outliers often represent important customer behaviors.",
    "We'll keep outliers in the dataset as they may help identify distinct segments.",
    sep="\n",
)

## 5. Feature Scaling/Normalization

In [None]:
# Keep an unscaled snapshot; df_clean itself is overwritten with scaled values below
df_original = df_clean.copy()

# Numeric columns are scaled; the customer_id column (if present) is left out
columns_to_normalize = [
    name
    for name in df_clean.columns
    if name != 'customer_id' and pd.api.types.is_numeric_dtype(df_clean[name])
]

# Fit the StandardScaler on the selected columns, then write the transformed values back
scaler = StandardScaler().fit(df_clean[columns_to_normalize])
df_clean[columns_to_normalize] = scaler.transform(df_clean[columns_to_normalize])

# Summary statistics of the scaled columns, one row per column
print("Data after scaling:")
df_clean[columns_to_normalize].describe().transpose()

In [None]:
# Size the subplot grid from the number of scaled columns: two plots per row
# and at least three rows. A fixed 3x2 grid makes plt.subplot raise ValueError
# as soon as there are more than six columns.
n_plot_cols = 2
n_plot_rows = max(3, -(-len(columns_to_normalize) // n_plot_cols))  # ceiling division

# Visualize the distribution of scaled features
# (figure height grows with the row count; 3 rows keeps the original 14x10)
plt.figure(figsize=(14, 10 * n_plot_rows / 3))

for i, column in enumerate(columns_to_normalize):
    plt.subplot(n_plot_rows, n_plot_cols, i+1)
    sns.histplot(df_clean[column], kde=True)
    plt.title(f'Distribution of {column} (Scaled)')

plt.tight_layout()
plt.show()

## 6. Save Preprocessed Data

In [None]:
# Create an output directory if it doesn't exist
# (`os` comes from the import cell at the top of the notebook)
output_dir = '../output'
os.makedirs(output_dir, exist_ok=True)

# Define each output path once so the files written and the messages printed
# cannot drift apart.
preprocessed_path = f"{output_dir}/preprocessed_data.csv"
unscaled_path = f"{output_dir}/cleaned_data_unscaled.csv"

# Save the preprocessed (scaled) data and the cleaned, unscaled snapshot
df_clean.to_csv(preprocessed_path, index=False)
df_original.to_csv(unscaled_path, index=False)

print(f"Preprocessed data saved to '{preprocessed_path}'")
print(f"Cleaned unscaled data saved to '{unscaled_path}'")

## Summary

In this notebook, we've completed the following preprocessing steps:

1. Loaded the customer behavior data
2. Explored the dataset's basic properties
3. Identified and handled missing values
4. Detected outliers (but kept them for segmentation purposes)
5. Standardized the numeric features (zero mean, unit variance) using StandardScaler
6. Saved both the preprocessed data and the cleaned unscaled data

The preprocessed data is now ready for exploratory data analysis and customer segmentation.