In [38]:
import pandas as pd

# 1. Load the Dataset
df = pd.read_csv('/Users/ankita/Downloads/marketing_campaign.csv')  # Make a copy
df_cleaned = df.copy()

# 2. Print column names to check for potential issues
print("Column names before cleaning:\n", df.columns)

# 3. Clean column names (standardize naming)
df_cleaned.columns = df_cleaned.columns.str.strip().str.lower().str.replace(' ', '_', regex=True).str.replace('-', '_', regex=True)

# 4. Handle missing values (fill income with median)
if 'income' in df_cleaned.columns:
    median_income = df_cleaned['income'].median()
    df_cleaned['income'].fillna(median_income, inplace=True)
else:
    print("Column 'income' not found.")

# Handle missing values for other columns (fill categorical columns with 'Unknown')
categorical_columns = ['education', 'marital_status']
for col in categorical_columns:
    if col in df_cleaned.columns:
        df_cleaned[col].fillna('Unknown', inplace=True)
    else:
        print(f"Column '{col}' not found.")

# 5. Remove duplicates
df_cleaned.drop_duplicates(inplace=True)

# 6. Convert 'dt_customer' to datetime
if 'dt_customer' in df_cleaned.columns:
    df_cleaned['dt_customer'] = pd.to_datetime(df_cleaned['dt_customer'], errors='coerce', dayfirst=True)

# 7. Standardize text columns
if 'education' in df_cleaned.columns:
    df_cleaned['education'] = df_cleaned['education'].str.strip().str.title()
if 'marital_status' in df_cleaned.columns:
    df_cleaned['marital_status'] = df_cleaned['marital_status'].str.strip().str.title()

# 8. Ensure correct data types for numeric columns
int_columns = [
    'year_birth', 'kidhome', 'teenhome', 'recency',
    'mntwines', 'mntfruits', 'mntmeatproducts', 'mntfishproducts',
    'mntsweetproducts', 'mntgoldprods', 'numdealspurchases', 'numwebpurchases',
    'numcatalogpurchases', 'numstorepurchases', 'numwebvisitsmonth',
    'acceptedcmp1', 'acceptedcmp2', 'acceptedcmp3', 'acceptedcmp4', 'acceptedcmp5',
    'complain', 'z_costcontact', 'z_revenue', 'response'
]
for col in int_columns:
    if col in df_cleaned.columns:
        df_cleaned[col] = df_cleaned[col].astype(int)
    else:
        print(f"Column '{col}' not found.")

# 9. Check if there are any errors in conversion or nulls after the process
print("Missing values after cleaning:\n", df_cleaned.isnull().sum())
print("\nData types after cleaning:\n", df_cleaned.dtypes)

# 10. Save cleaned dataset
df_cleaned.to_csv("marketing_campaign_cleaned.csv", index=False)

# 11. Optional: print a short summary
print("Cleaning complete. Saved to 'marketing_campaign_cleaned.csv'.")

Column names before cleaning:
 Index(['ID\tYear_Birth\tEducation\tMarital_Status\tIncome\tKidhome\tTeenhome\tDt_Customer\tRecency\tMntWines\tMntFruits\tMntMeatProducts\tMntFishProducts\tMntSweetProducts\tMntGoldProds\tNumDealsPurchases\tNumWebPurchases\tNumCatalogPurchases\tNumStorePurchases\tNumWebVisitsMonth\tAcceptedCmp3\tAcceptedCmp4\tAcceptedCmp5\tAcceptedCmp1\tAcceptedCmp2\tComplain\tZ_CostContact\tZ_Revenue\tResponse'], dtype='object')
Column 'income' not found.
Column 'education' not found.
Column 'marital_status' not found.
Column 'year_birth' not found.
Column 'kidhome' not found.
Column 'teenhome' not found.
Column 'recency' not found.
Column 'mntwines' not found.
Column 'mntfruits' not found.
Column 'mntmeatproducts' not found.
Column 'mntfishproducts' not found.
Column 'mntsweetproducts' not found.
Column 'mntgoldprods' not found.
Column 'numdealspurchases' not found.
Column 'numwebpurchases' not found.
Column 'numcatalogpurchases' not found.
Column 'numstorepurchases' not