In [9]:
from google.colab import files
uploaded = files.upload()

import pandas as pd
import numpy as np
from datetime import datetime

customers = pd.read_csv("AWCustomers.csv")
sales = pd.read_csv("AWSales.csv")

customers["BirthDate"] = pd.to_datetime(customers["BirthDate"])
today = datetime.today()
customers["Age"] = customers["BirthDate"].apply(lambda x: today.year - x.year - ((today.month, today.day) < (x.month, x.day)))

data = pd.merge(customers, sales[["CustomerID","BikeBuyer"]], on="CustomerID", how="inner")

selected_features = [
    "Gender", "Age", "MaritalStatus", "Education", "Occupation",
    "HomeOwnerFlag", "NumberCarsOwned", "NumberChildrenAtHome",
    "TotalChildren", "YearlyIncome", "CountryRegionName", "BikeBuyer"
]
df = data[selected_features]

print("Shape:", df.shape)
df.head()

Saving AWCustomers.csv to AWCustomers (2).csv
Saving AWSales.csv to AWSales (1).csv
Shape: (18361, 12)


Unnamed: 0,Gender,Age,MaritalStatus,Education,Occupation,HomeOwnerFlag,NumberCarsOwned,NumberChildrenAtHome,TotalChildren,YearlyIncome,CountryRegionName,BikeBuyer
0,M,37,M,Bachelors,Clerical,1,3,0,1,81916,Australia,1
1,M,53,M,Partial College,Clerical,1,2,1,2,81076,Canada,1
2,F,39,S,Bachelors,Clerical,0,3,0,0,86387,United States,1
3,M,47,M,Partial College,Skilled Manual,1,2,1,2,61481,United Kingdom,1
4,M,50,S,Partial College,Skilled Manual,1,1,0,0,51804,Germany,1


In [10]:
data_types = {
    "Gender": "Discrete - Nominal",
    "Age": "Continuous - Ratio",
    "MaritalStatus": "Discrete - Nominal",
    "Education": "Discrete - Ordinal",
    "Occupation": "Discrete - Nominal",
    "HomeOwnerFlag": "Discrete - Binary",
    "NumberCarsOwned": "Discrete - Ratio",
    "NumberChildrenAtHome": "Discrete - Ratio",
    "TotalChildren": "Discrete - Ratio",
    "YearlyIncome": "Continuous - Ratio",
    "CountryRegionName": "Discrete - Nominal",
    "BikeBuyer": "Discrete - Binary (Target)"
}
print(data_types)

{'Gender': 'Discrete - Nominal', 'Age': 'Continuous - Ratio', 'MaritalStatus': 'Discrete - Nominal', 'Education': 'Discrete - Ordinal', 'Occupation': 'Discrete - Nominal', 'HomeOwnerFlag': 'Discrete - Binary', 'NumberCarsOwned': 'Discrete - Ratio', 'NumberChildrenAtHome': 'Discrete - Ratio', 'TotalChildren': 'Discrete - Ratio', 'YearlyIncome': 'Continuous - Ratio', 'CountryRegionName': 'Discrete - Nominal', 'BikeBuyer': 'Discrete - Binary (Target)'}


In [11]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

df = df.dropna()

minmax = MinMaxScaler()
df[["Age", "YearlyIncome"]] = minmax.fit_transform(df[["Age", "YearlyIncome"]])

df["Age_Binned"] = pd.cut(df["Age"], bins=[0,0.25,0.5,0.75,1], labels=["Young","Adult","Middle","Senior"])
df["Income_Binned"] = pd.qcut(df["YearlyIncome"], q=4, labels=["Low","Medium","High","Very High"])

scaler = StandardScaler()
df[["Age", "YearlyIncome"]] = scaler.fit_transform(df[["Age", "YearlyIncome"]])

df_enc = pd.get_dummies(df, columns=[
    "Gender","MaritalStatus","Education","Occupation","CountryRegionName",
    "Age_Binned","Income_Binned"
])
df_enc.head()

Unnamed: 0,Age,HomeOwnerFlag,NumberCarsOwned,NumberChildrenAtHome,TotalChildren,YearlyIncome,BikeBuyer,Gender_F,Gender_M,MaritalStatus_M,...,CountryRegionName_United Kingdom,CountryRegionName_United States,Age_Binned_Young,Age_Binned_Adult,Age_Binned_Middle,Age_Binned_Senior,Income_Binned_Low,Income_Binned_Medium,Income_Binned_High,Income_Binned_Very High
0,-0.539455,1,3,0,1,0.298555,1,False,True,True,...,False,False,True,False,False,False,False,False,True,False
1,0.881282,1,2,1,2,0.27118,1,False,True,True,...,False,False,False,True,False,False,False,False,True,False
2,-0.361863,0,3,0,0,0.444261,1,True,False,False,...,False,True,True,False,False,False,False,False,True,False
3,0.348506,1,2,1,2,-0.367401,1,False,True,True,...,True,False,False,True,False,False,False,True,False,False
4,0.614894,1,1,0,0,-0.682765,1,False,True,False,...,False,False,False,True,False,False,True,False,False,False


In [12]:
from sklearn.metrics import jaccard_score
from sklearn.metrics.pairwise import cosine_similarity
import scipy.stats as stats

cust1 = df_enc.iloc[0].drop("BikeBuyer").values.reshape(1,-1)
cust2 = df_enc.iloc[1].drop("BikeBuyer").values.reshape(1,-1)


cos_sim = cosine_similarity(cust1, cust2)[0][0]

jac_sim = jaccard_score((cust1>0).astype(int)[0], (cust2>0).astype(int)[0])

smc = np.mean(cust1 == cust2)

print("Cosine Similarity:", cos_sim)
print("Jaccard Similarity:", jac_sim)
print("Simple Matching Coefficient:", smc)

if "CommuteDistance" in sales.columns:
    merged = pd.merge(customers, sales[["CustomerID","CommuteDistance","BikeBuyer","YearlyIncome"]],
                      on="CustomerID", how="inner")
    commute_codes = merged["CommuteDistance"].astype("category").cat.codes
    corr, p_value = stats.pearsonr(commute_codes, merged["YearlyIncome"])
    print("Correlation (CommuteDistance vs YearlyIncome):", corr, "p-value:", p_value)
else:
    print("CommuteDistance not available in this dataset.")

Cosine Similarity: 0.6959304149614838
Jaccard Similarity: 0.5
Simple Matching Coefficient: 0.6764705882352942
CommuteDistance not available in this dataset.
