In [4]:
# ==========================================================
# üìä MARKETING CAMPAIGN DATA CLEANING SCRIPT
# ==========================================================
# Tujuan: Membersihkan dan menyiapkan dataset untuk Power BI
# Output: marketing_campaign_cleaned.csv
# ==========================================================

# 1Ô∏è‚É£ Import Library
import pandas as pd
import numpy as np

# 2Ô∏è‚É£ Load Dataset
df = pd.read_csv("../Data/marketing_campaign.csv")

# 3Ô∏è‚É£ Cek Struktur Awal
print("Data shape:", df.shape)
print("Missing values per column:\n", df.isnull().sum())

# 4Ô∏è‚É£ Hapus baris kosong / duplikat jika ada
df.drop_duplicates(inplace=True)
df.dropna(subset=["Age", "Income"], inplace=True)

# 5Ô∏è‚É£ Ganti nama kolom agar konsisten (jika ada spasi atau huruf besar)
df.columns = df.columns.str.strip().str.replace(" ", "_").str.lower()

# 6Ô∏è‚É£ Buat kolom baru: AgeGroup
bins = [18, 30, 40, 50, 60, 100]
labels = ["20s", "30s", "40s", "50s", "60+"]
df["age_group"] = pd.cut(df["age"], bins=bins, labels=labels, right=True)

# 7Ô∏è‚É£ Ubah tipe tanggal (jika ada)
date_cols = ["joindate", "lastpurchasedate"]
for col in date_cols:
    if col in df.columns:
        df[col] = pd.to_datetime(df[col], errors="coerce")

# 8Ô∏è‚É£ Isi nilai kosong (opsional)
df["income"].fillna(df["income"].median(), inplace=True)

# 9Ô∏è‚É£ Buat kolom tambahan analitik (opsional)
if "spent" in df.columns and "income" in df.columns:
    df["spending_ratio"] = (df["spent"] / df["income"]).round(2)

# üîü Simpan ke file CSV baru
output_path = "../Data/marketing_campaign_cleaned.csv"
df.to_csv(output_path, index=False)

print("‚úÖ Data berhasil disimpan ke:", output_path)
print("üìÑ Jumlah kolom akhir:", len(df.columns))
print("üìä Preview:")
print(df.head())


Data shape: (2000, 12)
Missing values per column:
 CustomerID          0
Gender              0
Age                 0
MaritalStatus       0
Education           0
Income              0
Spent               0
CampaignChannel     0
CampaignType        0
Response            0
JoinDate            0
LastPurchaseDate    0
dtype: int64
‚úÖ Data berhasil disimpan ke: ../Data/marketing_campaign_cleaned.csv
üìÑ Jumlah kolom akhir: 14
üìä Preview:
  customerid  gender  age maritalstatus    education  income  spent  \
0      C0001    Male   47       Married       Master   50100  12469   
1      C0002  Female   26       Married  High School   76232  18752   
2      C0003  Female   54      Divorced  High School   54650  12374   
3      C0004  Female   47       Married  High School   78593  12965   
4      C0005    Male   33       Married       Master   77507  10045   

  campaignchannel campaigntype  response   joindate lastpurchasedate  \
0           Email     Discount         0 2019-08-03       201

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["income"].fillna(df["income"].median(), inplace=True)
