In [7]:
import pandas as pd

# Loadingggggg
customers = pd.read_csv("AWCustomers.csv")
sales = pd.read_csv("AWSales.csv")

# Merginggg
customers = customers.merge(sales[['CustomerID','BikeBuyer','AvgMonthSpend']], on='CustomerID', how='left')
customers['BikeBuyer'] = customers['BikeBuyer'].fillna(0).astype(int)
customers['AvgMonthSpend'] = customers['AvgMonthSpend'].fillna(0)

# Selected features
selected = customers[[
    'BirthDate', 'Education', 'Occupation', 'Gender', 'MaritalStatus',
    'HomeOwnerFlag', 'NumberCarsOwned', 'NumberChildrenAtHome',
    'TotalChildren', 'YearlyIncome', 'BikeBuyer', 'AvgMonthSpend'
]].copy()

#age nikalna from birthdate
selected['BirthDate'] = pd.to_datetime(selected['BirthDate'], errors='coerce')
selected['Age'] = 2025 - selected['BirthDate'].dt.year  # or current year
selected.drop(columns='BirthDate', inplace=True)


In [8]:


data_types = {
    'Age': 'Continuous - Ratio',
    'Education': 'Categorical - Ordinal',
    'Occupation': 'Categorical - Nominal',
    'Gender': 'Categorical - Nominal',
    'MaritalStatus': 'Categorical - Nominal',
    'HomeOwnerFlag': 'Categorical - Nominal (binary)',
    'NumberCarsOwned': 'Discrete - Ratio',
    'NumberChildrenAtHome': 'Discrete - Ratio',
    'TotalChildren': 'Discrete - Ratio',
    'YearlyIncome': 'Continuous - Ratio',
    'AvgMonthSpend': 'Continuous - Ratio',
    'BikeBuyer': 'Categorical - Nominal (binary)'
}

for k,v in data_types.items():
    print(f"{k}: {v}")


Age: Continuous - Ratio
Education: Categorical - Ordinal
Occupation: Categorical - Nominal
Gender: Categorical - Nominal
MaritalStatus: Categorical - Nominal
HomeOwnerFlag: Categorical - Nominal (binary)
NumberCarsOwned: Discrete - Ratio
NumberChildrenAtHome: Discrete - Ratio
TotalChildren: Discrete - Ratio
YearlyIncome: Continuous - Ratio
AvgMonthSpend: Continuous - Ratio
BikeBuyer: Categorical - Nominal (binary)


In [9]:
selected.fillna(selected.median(numeric_only=True), inplace=True)
selected.fillna('Unknown', inplace=True)


In [10]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
num_cols = ['Age','NumberCarsOwned','NumberChildrenAtHome','TotalChildren','YearlyIncome','AvgMonthSpend']
for col in num_cols:
    selected[col+'_norm'] = scaler.fit_transform(selected[[col]])


In [11]:
from sklearn.preprocessing import KBinsDiscretizer

kbins = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='uniform')
selected['Age_bin'] = kbins.fit_transform(selected[['Age']])


In [12]:
from sklearn.preprocessing import StandardScaler

std_scaler = StandardScaler()
selected['YearlyIncome_std'] = std_scaler.fit_transform(selected[['YearlyIncome']])


In [13]:
from sklearn.preprocessing import OneHotEncoder

categorical_cols = ['Education','Occupation','Gender','MaritalStatus','HomeOwnerFlag']
encoder = OneHotEncoder(sparse_output=False, drop='first')
encoded = encoder.fit_transform(selected[categorical_cols])
encoded_df = pd.DataFrame(encoded, columns=encoder.get_feature_names_out(categorical_cols))
df_final = pd.concat([selected.drop(columns=categorical_cols), encoded_df], axis=1)


In [14]:
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import jaccard
import numpy as np

#  two customers
obj1 = df_final.iloc[0].values.reshape(1,-1)
obj2 = df_final.iloc[1].values.reshape(1,-1)

# Onehot binary similarity
bin1 = encoded_df.iloc[0].values
bin2 = encoded_df.iloc[1].values

# Simple Matching Coefficient
smc = np.sum(bin1 == bin2) / len(bin1)

# Jaccard Similarity
jaccard_sim = 1 - jaccard(bin1, bin2)

# Cosine Similarity
cos_sim = cosine_similarity(obj1,obj2)[0][0]

print("Simple Matching Coefficient:", smc)
print("Jaccard Similarity:", jaccard_sim)
print("Cosine Similarity:", cos_sim)


Simple Matching Coefficient: 0.9090909090909091
Jaccard Similarity: 0.6666666666666667
Cosine Similarity: 0.9999999808154699


In [15]:
from scipy.stats import pearsonr

corr, pval = pearsonr(selected['YearlyIncome'], selected['AvgMonthSpend'])
print("Correlation between YearlyIncome and AvgMonthSpend:", corr, "(p-value:", pval,")")


Correlation between YearlyIncome and AvgMonthSpend: 0.5301257155563446 (p-value: 0.0 )
