In [13]:
# Assignment 2 - Adventure Works Cycles Customer Data Analysis

# ================================
# Part 0: Setup
# ================================
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder, KBinsDiscretizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import jaccard
from scipy.stats import pearsonr
import warnings
warnings.filterwarnings("ignore")

# ================================
# Part I: Load & Inspect Data
# ================================
df = pd.read_csv("AWCustomers.csv")  

print("Original Shape:", df.shape)
print("Original Columns:\n", df.columns.tolist())

# ---- Clean column names ----
df.columns = (
    df.columns.str.strip()
              .str.replace(" ", "_")
              .str.replace("-", "_")
              .str.replace(r"[()]", "", regex=True)
)

print("\nCleaned Columns:\n", df.columns.tolist())

# ================================
# Part I: Feature Selection (Auto-detect)
# ================================
candidate_features = [
    "Age", 
    "Yearly_Income", "YearlyIncome", 
    "Commute_Distance", "CommuteDistance", 
    "Education", 
    "Occupation", 
    "Gender", 
    "Marital_Status", "MaritalStatus", 
    "Bike_Buyer", "BikeBuyer"
]

selected_features = [c for c in candidate_features if c in df.columns]
print("\nSelected Features (auto-detected):", selected_features)

df_selected = df[selected_features]
print("Selected DataFrame Shape:", df_selected.shape)
df_selected.head()

# ================================
# Part II: Preprocessing
# ================================
df_clean = df_selected.dropna()

# Normalize numeric columns if present
num_cols = [c for c in ["Age", "Yearly_Income", "YearlyIncome"] if c in df_clean.columns]
if num_cols:
    scaler = MinMaxScaler()
    df_clean[num_cols] = scaler.fit_transform(df_clean[num_cols])

# Discretize Age if present
if "Age" in df_clean.columns:
    binning = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='uniform')
    df_clean["Age_binned"] = binning.fit_transform(df_clean[["Age"]])

# Standardize Yearly Income
if "Yearly_Income" in df_clean.columns:
    std_scaler = StandardScaler()
    df_clean["YearlyIncome_std"] = std_scaler.fit_transform(df_clean[["Yearly_Income"]])
elif "YearlyIncome" in df_clean.columns:
    std_scaler = StandardScaler()
    df_clean["YearlyIncome_std"] = std_scaler.fit_transform(df_clean[["YearlyIncome"]])

# One Hot Encoding categorical attributes
categorical_cols = [c for c in ["Commute_Distance", "CommuteDistance", 
                                "Education", "Occupation", 
                                "Gender", "Marital_Status", "MaritalStatus"] if c in df_clean.columns]

if categorical_cols:
    # Fix for sklearn version: use sparse_output instead of sparse
    encoder = OneHotEncoder(sparse_output=False, drop='first')
    encoded_data = encoder.fit_transform(df_clean[categorical_cols])
    encoded_df = pd.DataFrame(encoded_data, columns=encoder.get_feature_names_out(categorical_cols))
    df_final = pd.concat([df_clean.reset_index(drop=True), encoded_df], axis=1)
else:
    df_final = df_clean.copy()

print("\nFinal Preprocessed Data Shape:", df_final.shape)
df_final.head()

# ================================
# Part III: Similarity & Correlation
# ================================
if df_final.shape[0] >= 2:
    obj1 = df_final.iloc[0].values.reshape(1, -1)
    obj2 = df_final.iloc[1].values.reshape(1, -1)

    smc = np.sum(obj1 == obj2) / len(obj1[0])   # Simple Matching
    binary_obj1 = (obj1 > 0).astype(int).flatten()
    binary_obj2 = (obj2 > 0).astype(int).flatten()
    jaccard_sim = 1 - jaccard(binary_obj1, binary_obj2)  # Jaccard
    cos_sim = cosine_similarity(obj1, obj2)[0][0]        # Cosine

    print("\nSimilarity Measures Between Object 1 and 2:")
    print("Simple Matching Coefficient:", smc)
    print("Jaccard Similarity:", jaccard_sim)
    print("Cosine Similarity:", cos_sim)

# Correlation: Commute Distance vs Yearly Income
commute_map = {
    "0-1 Miles": 1, "1-2 Miles": 2, "2-5 Miles": 3, 
    "5-10 Miles": 4, "10+ Miles": 5
}

df_corr = df_clean.copy()
if "Commute_Distance" in df_corr.columns:
    df_corr["CommuteDistance_num"] = df_corr["Commute_Distance"].map(commute_map)
elif "CommuteDistance" in df_corr.columns:
    df_corr["CommuteDistance_num"] = df_corr["CommuteDistance"].map(commute_map)

if "YearlyIncome_std" in df_corr.columns and "CommuteDistance_num" in df_corr.columns:
    corr, p_value = pearsonr(df_corr["CommuteDistance_num"], df_corr["YearlyIncome_std"])
    print("\nCorrelation Analysis:")
    print("Correlation between Commute Distance & Yearly Income:", corr)
    print("p-value:", p_value)


Original Shape: (18361, 24)
Original Columns:
 ['CustomerID', 'Title', 'FirstName', 'MiddleName', 'LastName', 'Suffix', 'AddressLine1', 'AddressLine2', 'City', 'StateProvinceName', 'CountryRegionName', 'PostalCode', 'PhoneNumber', 'BirthDate', 'Education', 'Occupation', 'Gender', 'MaritalStatus', 'HomeOwnerFlag', 'NumberCarsOwned', 'NumberChildrenAtHome', 'TotalChildren', 'YearlyIncome', 'LastUpdated']

Cleaned Columns:
 ['CustomerID', 'Title', 'FirstName', 'MiddleName', 'LastName', 'Suffix', 'AddressLine1', 'AddressLine2', 'City', 'StateProvinceName', 'CountryRegionName', 'PostalCode', 'PhoneNumber', 'BirthDate', 'Education', 'Occupation', 'Gender', 'MaritalStatus', 'HomeOwnerFlag', 'NumberCarsOwned', 'NumberChildrenAtHome', 'TotalChildren', 'YearlyIncome', 'LastUpdated']

Selected Features (auto-detected): ['YearlyIncome', 'Education', 'Occupation', 'Gender', 'MaritalStatus']
Selected DataFrame Shape: (18361, 5)

Final Preprocessed Data Shape: (18361, 16)


<class 'TypeError'>: '>' not supported between instances of 'str' and 'int'