In [16]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import jaccard
from scipy.stats import pearsonr

df = pd.read_csv("AWCustomers.csv")
df.columns = (
    df.columns.str.strip()
              .str.replace(" ", "_")
              .str.replace("-", "_")
)

print("Cleaned Columns:", df.columns.tolist())


features = [c for c in [
    "Age","Yearly_Income","Commute_Distance",
    "Education","Occupation","Gender","Marital_Status","Bike_Buyer"
] if c in df.columns]

df = df[features].dropna()
print("\nSelected features:", df.columns.tolist())


num_cols = [c for c in ["Age","Yearly_Income"] if c in df.columns]
if num_cols:
    df[num_cols] = MinMaxScaler().fit_transform(df[num_cols])


cat_cols = [c for c in df.columns if df[c].dtype=="object"]
if cat_cols:
    df = pd.get_dummies(df, columns=cat_cols, drop_first=True)

print("\nFinal shape:", df.shape)
print(df.head())


obj1, obj2 = df.iloc[0].values.reshape(1,-1), df.iloc[1].values.reshape(1,-1)


smc = np.mean(obj1 == obj2)

binary_obj1, binary_obj2 = obj1.astype(int).flatten(), obj2.astype(int).flatten()
jaccard_sim = 1 - jaccard(binary_obj1, binary_obj2)


cos_sim = cosine_similarity(obj1, obj2)[0][0]

print("\nSimilarity Between Row 1 and 2:")
print("Simple Matching Coefficient:", smc)
print("Jaccard Similarity:", jaccard_sim)
print("Cosine Similarity:", cos_sim)


if "Commute_Distance" in df.columns:
    commute_map = {
        "0-1 Miles": 1, "1-2 Miles": 2, "2-5 Miles": 3,
        "5-10 Miles": 4, "10+ Miles": 5
    }
    df["CommuteDistance_num"] = df["Commute_Distance"].map(commute_map)

if "Yearly_Income" in df.columns and "CommuteDistance_num" in df.columns:
    corr, p = pearsonr(df["CommuteDistance_num"], df["Yearly_Income"])
    print("\nCorrelation Between Commute Distance & Yearly Income:", corr, "(p-value:", p, ")")


Cleaned Columns: ['CustomerID', 'Title', 'FirstName', 'MiddleName', 'LastName', 'Suffix', 'AddressLine1', 'AddressLine2', 'City', 'StateProvinceName', 'CountryRegionName', 'PostalCode', 'PhoneNumber', 'BirthDate', 'Education', 'Occupation', 'Gender', 'MaritalStatus', 'HomeOwnerFlag', 'NumberCarsOwned', 'NumberChildrenAtHome', 'TotalChildren', 'YearlyIncome', 'LastUpdated']

Selected features: ['Education', 'Occupation', 'Gender']

Final shape: (18361, 9)
   Education_Graduate Degree  Education_High School  \
0                      False                  False   
1                      False                  False   
2                      False                  False   
3                      False                  False   
4                      False                  False   

   Education_Partial College  Education_Partial High School  \
0                      False                          False   
1                       True                          False   
2                    