# 1. Simulate a 10-Sample Marketing Dataset





In [None]:
import numpy as np
import pandas as pd

# Set a random seed for reproducibility: every np.random call below draws from
# NumPy's global RNG, so a fresh run always produces the same dataset.
SEED = 6101
np.random.seed(SEED)

# Tunable knobs for the simulation (previously repeated inline as literals).
N_SAMPLES = 10          # number of simulated customers
MISSING_FRACTION = 0.1  # share of rows whose 'Income' is replaced by NaN

# Simulate some data for marketing analytics.
# NOTE: the columns are drawn in dict order from the shared global RNG, so
# reordering or inserting keys changes the values of every later column.
data = {
    'Age': np.random.randint(20, 65, size=N_SAMPLES),  # Customer age, integers in [20, 64]
    'Income': np.random.normal(50000, 15000, size=N_SAMPLES),  # Customer income ~ Normal(mean=50000, sd=15000)
    'Marital_Status': np.random.choice(['Single', 'Married', 'Divorced'], size=N_SAMPLES),  # Marital status
    'Total_Purchases': np.random.poisson(5, size=N_SAMPLES),  # Total number of purchases ~ Poisson(5)
    'Campaign_Response': np.random.choice([0, 1], size=N_SAMPLES, p=[0.85, 0.15])  # Response to the last campaign (1 with p=0.15)
}

# Convert to DataFrame
df = pd.DataFrame(data)

# Introduce some missing values: pick int(len(df) * MISSING_FRACTION) distinct
# row labels (1 row with the defaults above) and blank out their 'Income'.
nan_indices = np.random.choice(df.index, size=int(len(df) * MISSING_FRACTION), replace=False)
df.loc[nan_indices, 'Income'] = np.nan

In [None]:
# Display the simulated dataset, including the row(s) whose 'Income' was set to NaN.
df

# 2. Data Cleaning


## 2.1 Data Cleaning: Handling Missing Values


### Example 1: Filter and Drop

In [None]:
# Step 1: Select the rows whose 'Income' is missing.
# The boolean mask is True exactly where 'Income' is NaN.
nan_income_samples = df.loc[df['Income'].isnull()]

In [None]:
# Display the rows that have a missing 'Income'.
nan_income_samples

In [None]:
# Step 2: Keep only the rows that have an 'Income' value.
# NOTE: this rebinds `df`, so every later cell sees the frame without the
# missing-'Income' rows.
df = df[df['Income'].notna()]

In [None]:
# Display df after the rows with a missing 'Income' were removed.
df

### Example 2: Replace the missing value with the mean

In [None]:
from sklearn.impute import SimpleImputer

# Impute the missing values in 'Income' with the column mean.
# NOTE: when the notebook is run top-to-bottom, the "Filter and Drop" example
# has already removed the rows with a missing 'Income' from `df`, so there may
# be nothing left to fill at this point.
imputer = SimpleImputer(strategy='mean')
income_imputed = imputer.fit_transform(df[['Income']])  # 2-D result: one column, one row per sample

# Build a new frame with assign() instead of writing into `df` in place.
# `df` may be the result of an earlier row filter, and an in-place column
# write on such a frame raises pandas' SettingWithCopyWarning.
df = df.assign(Income=income_imputed[:, 0])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Income'] = imputer.fit_transform(df[['Income']])


In [None]:
# Display df after the mean-imputation step.
df

## 2.2 Data Cleaning: Outlier Detection and Removal

### Example 1: Filter and Drop

In [None]:
# Step 1: Quartiles and interquartile range of 'Income'
income = df['Income']
Q1, Q3 = income.quantile([0.25, 0.75])
IQR = Q3 - Q1

# Step 2: Outlier fences, 1.5 * IQR beyond each quartile
fence_width = 1.5 * IQR
lower_bound = Q1 - fence_width
upper_bound = Q3 + fence_width

# Rows whose 'Income' lies strictly outside the fences
is_outlier = income.lt(lower_bound) | income.gt(upper_bound)
outliers = df[is_outlier]

In [None]:
# Show the lower and upper outlier fences.
print(lower_bound, upper_bound)

In [None]:
# Display the rows flagged as 'Income' outliers by the IQR rule.
outliers

In [None]:
# Step 3: Row labels of the samples flagged as outliers
outlier_indices = outliers.index

# Step 4: Drop those rows; `df` itself is left untouched
df_clean = df.drop(index=outlier_indices)

In [None]:
# Display the data with the IQR-flagged outliers removed.
df_clean

### Example 2: Detect the outliers directly

In [None]:
from sklearn.ensemble import IsolationForest

# Detect and remove outliers in the 'Income' feature using Isolation Forest.
# contamination=0.25 sets the expected share of outliers in the data.
# random_state is pinned so that re-running this cell flags the same rows;
# without it the forest draws from NumPy's global RNG and the result depends
# on how many random draws happened before this cell ran.
iso_forest = IsolationForest(contamination=0.25, random_state=6101)

# Any remaining NaN in 'Income' is replaced by the column mean before fitting.
# NOTE: this rebinds `outliers` to an array of labels, one per row of df.
outliers = iso_forest.fit_predict(df[['Income']].fillna(df['Income'].mean()))

# fit_predict labels outliers as -1 and inliers as 1; keep the inliers.
# .copy() makes df_clean an independent frame, so later column assignments
# on it do not raise SettingWithCopyWarning.
df_clean = df[outliers != -1].copy()

df_clean

# 3. Data Transformation

## 3.1 Data Normalization

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Normalize 'Age' and 'Income' using MinMaxScaler (default output range is [0, 1]).
scaler = MinMaxScaler()

# Work on an explicit copy: df_clean was produced by filtering rows out of `df`,
# and assigning columns on such a frame raises pandas' SettingWithCopyWarning.
df_clean = df_clean.copy()
df_clean[['Age', 'Income']] = scaler.fit_transform(df_clean[['Age', 'Income']])
df_clean

## 3.2 Encoding Categorical Data

In [None]:
# Expand 'Marital_Status' into one indicator column per category;
# all other columns are carried over unchanged.
df_encoded = pd.get_dummies(
    data=df_clean,
    columns=['Marital_Status'],
)

In [None]:
# Display the one-hot encoded frame.
df_encoded

# 4. Data Reduction

## 4.1 Feature Reduction

In [None]:
from sklearn.feature_selection import SelectKBest, chi2

# Split the encoded frame into predictors and the campaign-response target
feature_frame = df_encoded.drop(columns='Campaign_Response')
response = df_encoded['Campaign_Response']

# Keep the 3 predictors with the highest chi-squared score against the target
selector = SelectKBest(score_func=chi2, k=3)
X_new = selector.fit_transform(feature_frame, response)

# get_support() is a boolean mask over the predictor columns; use it to
# recover the names of the selected features
selected_features = feature_frame.columns[selector.get_support()]

print(selected_features)

## 4.2 Sample Reduction

In [None]:
# Draw a random 80% subset of the encoded rows; the fixed random_state
# makes the same rows come back on every run.
df_sampled = df_encoded.sample(
    random_state=0,
    frac=0.8,
)

df_sampled