                                                           ML LAB ASSIGNMENT 2
                                                   SUBMITTED BY : ABHAYJEET(102303761)

Part I: Based on Feature Selection, Cleaning, and Preprocessing to Construct an Input from Data 
Source 

In [11]:
import pandas as pd

df = pd.read_csv("AWCustomers.csv", encoding="utf-8", low_memory=False)

print("Shape:", df.shape)
print(df.columns.tolist()[:50])  


selected_features = [
    "CustomerID",
    "YearlyIncome",
    "CommuteDistance",
    "NumberChildrenAtHome",
    "TotalChildren",
    "Education",
    "Occupation",
    "MaritalStatus",
    "Gender",
    "HomeOwnerFlag",
    "BikeBuyer"   
]


selected_features = [c for c in selected_features if c in df.columns]
data = df[selected_features].copy()
print("Selected Features:", selected_features)
data.head()


Shape: (18361, 24)
['CustomerID', 'Title', 'FirstName', 'MiddleName', 'LastName', 'Suffix', 'AddressLine1', 'AddressLine2', 'City', 'StateProvinceName', 'CountryRegionName', 'PostalCode', 'PhoneNumber', 'BirthDate', 'Education', 'Occupation', 'Gender', 'MaritalStatus', 'HomeOwnerFlag', 'NumberCarsOwned', 'NumberChildrenAtHome', 'TotalChildren', 'YearlyIncome', 'LastUpdated']
Selected Features: ['CustomerID', 'YearlyIncome', 'NumberChildrenAtHome', 'TotalChildren', 'Education', 'Occupation', 'MaritalStatus', 'Gender', 'HomeOwnerFlag']


Unnamed: 0,CustomerID,YearlyIncome,NumberChildrenAtHome,TotalChildren,Education,Occupation,MaritalStatus,Gender,HomeOwnerFlag
0,21173,81916,0,1,Bachelors,Clerical,M,M,1
1,13249,81076,1,2,Partial College,Clerical,M,M,1
2,29350,86387,0,0,Bachelors,Clerical,S,F,0
3,13503,61481,1,2,Partial College,Skilled Manual,M,M,1
4,22803,51804,0,0,Partial College,Skilled Manual,S,M,1


In [12]:
new_df = data.copy()
print(new_df.head())


   CustomerID  YearlyIncome  NumberChildrenAtHome  TotalChildren  \
0       21173         81916                     0              1   
1       13249         81076                     1              2   
2       29350         86387                     0              0   
3       13503         61481                     1              2   
4       22803         51804                     0              0   

         Education      Occupation MaritalStatus Gender  HomeOwnerFlag  
0        Bachelors        Clerical             M      M              1  
1  Partial College        Clerical             M      M              1  
2        Bachelors        Clerical             S      F              0  
3  Partial College  Skilled Manual             M      M              1  
4  Partial College  Skilled Manual             S      M              1  


In [13]:
def check_type(series):
    if pd.api.types.is_numeric_dtype(series):
        if series.nunique() < 20:
            return "Discrete, Ratio"
        else:
            return "Continuous, Ratio"
    else:
        if series.name in ["Education", "CommuteDistance"]:
            return "Categorical, Ordinal/Nominal"
        else:
            return "Categorical, Nominal"

for col in new_df.columns:
    print(f"{col} -> {check_type(new_df[col])}")


CustomerID -> Continuous, Ratio
YearlyIncome -> Continuous, Ratio
NumberChildrenAtHome -> Discrete, Ratio
TotalChildren -> Discrete, Ratio
Education -> Categorical, Ordinal/Nominal
Occupation -> Categorical, Nominal
MaritalStatus -> Categorical, Nominal
Gender -> Categorical, Nominal
HomeOwnerFlag -> Discrete, Ratio


Part II: Data Preprocessing and Transformation 

In [14]:
from sklearn.impute import SimpleImputer
import numpy as np

num_cols = new_df.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = [c for c in new_df.columns if c not in num_cols]

num_imputer = SimpleImputer(strategy="median")
new_df[num_cols] = num_imputer.fit_transform(new_df[num_cols])

for c in cat_cols:
    new_df[c] = new_df[c].fillna("Missing")

print(new_df.isnull().sum())


CustomerID              0
YearlyIncome            0
NumberChildrenAtHome    0
TotalChildren           0
Education               0
Occupation              0
MaritalStatus           0
Gender                  0
HomeOwnerFlag           0
dtype: int64


In [15]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
normalized = pd.DataFrame(scaler.fit_transform(new_df[num_cols]),
                          columns=num_cols)
normalized.head()


Unnamed: 0,CustomerID,YearlyIncome,NumberChildrenAtHome,TotalChildren,HomeOwnerFlag
0,0.550398,0.496842,0.0,0.333333,1.0
1,0.121679,0.489453,0.333333,0.666667,1.0
2,0.992804,0.536172,0.0,0.0,0.0
3,0.135422,0.317083,0.333333,0.666667,1.0
4,0.638587,0.231958,0.0,0.0,1.0


In [40]:
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder


discretizer = KBinsDiscretizer(n_bins=5, encode="ordinal", strategy="quantile")
if "YearlyIncome" in new_df.columns:
    new_df["YearlyIncome_binned"] = discretizer.fit_transform(new_df[["YearlyIncome"]]).astype(int)

print(new_df[["YearlyIncome", "YearlyIncome_binned"]].head(10))

scaler_std = StandardScaler()
standardized = pd.DataFrame(scaler_std.fit_transform(new_df[num_cols]),
                            columns=[c+"_std" for c in num_cols])
standardized.head()



ohe = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
cat_cols = [c for c in new_df.columns if c not in num_cols and c not in ["CustomerID", "BikeBuyer"]]

ohe_arr = ohe.fit_transform(new_df[cat_cols])
ohe_df = pd.DataFrame(ohe_arr, columns=ohe.get_feature_names_out(cat_cols))

ohe_df.head()


   YearlyIncome  YearlyIncome_binned
0       81916.0                    3
1       81076.0                    3
2       86387.0                    3
3       61481.0                    2
4       51804.0                    1
5       61944.0                    2
6       34919.0                    0
7       61832.0                    2
8       83834.0                    3
9       26880.0                    0




Unnamed: 0,Education_Bachelors,Education_Graduate Degree,Education_High School,Education_Partial College,Education_Partial High School,Occupation_Clerical,Occupation_Management,Occupation_Manual,Occupation_Professional,Occupation_Skilled Manual,MaritalStatus_M,MaritalStatus_S,Gender_F,Gender_M,YearlyIncome_binned_0,YearlyIncome_binned_1,YearlyIncome_binned_2,YearlyIncome_binned_3,YearlyIncome_binned_4
0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0


Part III: Calculating Proximity /Correlation Analysis of two features 

In [39]:
from sklearn.metrics.pairwise import cosine_similarity

# pick first two rows
i1, i2 = 0, 1

# binary features
bin_features = ohe_df.values
v1_bin, v2_bin = bin_features[i1], bin_features[i2]

# numeric standardized
num_features = standardized.values
v1_num, v2_num = num_features[i1], num_features[i2]

# Simple Matching
sm = (v1_bin == v2_bin).sum() / len(v1_bin)

# Jaccard
intersection = ((v1_bin==1) & (v2_bin==1)).sum()
union = ((v1_bin==1) | (v2_bin==1)).sum()
jaccard = intersection / union if union > 0 else 0

# Cosine
cos = cosine_similarity([v1_num], [v2_num])[0][0]

print("Simple Matching:", sm)
print("Jaccard:", jaccard)
print("Cosine:", cos)



Simple Matching: 0.8947368421052632
Jaccard: 0.6666666666666666
Cosine: -0.0005324909151599176
