# Assignment 2 - Machine Learning

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import jaccard
from scipy.stats import pearsonr

df = pd.read_csv('AWCustomers.csv')
df.head()

Unnamed: 0,CustomerID,Title,FirstName,MiddleName,LastName,Suffix,AddressLine1,AddressLine2,City,StateProvinceName,...,Education,Occupation,Gender,MaritalStatus,HomeOwnerFlag,NumberCarsOwned,NumberChildrenAtHome,TotalChildren,YearlyIncome,LastUpdated
0,21173,,Chad,C,Yuan,,7090 C. Mount Hood,,Wollongong,New South Wales,...,Bachelors,Clerical,M,M,1,3,0,1,81916,2017-03-06
1,13249,,Ryan,,Perry,,3651 Willow Lake Rd,,Shawnee,British Columbia,...,Partial College,Clerical,M,M,1,2,1,2,81076,2017-03-06
2,29350,,Julia,,Thompson,,1774 Tice Valley Blvd.,,West Covina,California,...,Bachelors,Clerical,F,S,0,3,0,0,86387,2017-03-06
3,13503,,Theodore,,Gomez,,2103 Baldwin Dr,,Liverpool,England,...,Partial College,Skilled Manual,M,M,1,2,1,2,61481,2017-03-06
4,22803,,Marshall,J,Shan,,Am Gallberg 234,,Werne,Nordrhein-Westfalen,...,Partial College,Skilled Manual,M,S,1,1,0,0,51804,2017-03-06


## Part I: Feature Selection & Data Type Identification

In [2]:
selected_features = ['YearlyIncome', 'NumberCarsOwned', 'NumberChildrenAtHome', 'TotalChildren', 'Gender', 'MaritalStatus', 'Occupation', 'Education']
df_selected = df[selected_features].copy()
df_selected

Unnamed: 0,YearlyIncome,NumberCarsOwned,NumberChildrenAtHome,TotalChildren,Gender,MaritalStatus,Occupation,Education
0,81916,3,0,1,M,M,Clerical,Bachelors
1,81076,2,1,2,M,M,Clerical,Partial College
2,86387,3,0,0,F,S,Clerical,Bachelors
3,61481,2,1,2,M,M,Skilled Manual,Partial College
4,51804,1,0,0,M,S,Skilled Manual,Partial College
...,...,...,...,...,...,...,...,...
18356,52953,1,0,2,F,M,Skilled Manual,Graduate Degree
18357,60992,2,0,0,F,S,Skilled Manual,Bachelors
18358,51859,2,0,0,F,S,Skilled Manual,Partial College
18359,87177,0,0,0,F,S,Clerical,High School


## Part II: Data Preprocessing & Transformation

In [3]:
df_selected = df_selected.dropna()
scaler = MinMaxScaler()
df_selected[['YearlyIncome', 'NumberCarsOwned', 'NumberChildrenAtHome', 'TotalChildren']] = scaler.fit_transform(df_selected[['YearlyIncome', 'NumberCarsOwned', 'NumberChildrenAtHome', 'TotalChildren']])
df_selected['YearlyIncome_std'] = StandardScaler().fit_transform(df_selected[['YearlyIncome']])
ohe = OneHotEncoder(sparse_output=False)
encoded = ohe.fit_transform(df_selected[['Gender', 'MaritalStatus', 'Occupation', 'Education']])
encoded_df = pd.DataFrame(encoded, columns=ohe.get_feature_names_out(['Gender', 'MaritalStatus', 'Occupation', 'Education']))
df_final = pd.concat([df_selected.reset_index(drop=True), encoded_df], axis=1)
df_final

Unnamed: 0,YearlyIncome,NumberCarsOwned,NumberChildrenAtHome,TotalChildren,Gender,MaritalStatus,Occupation,Education,YearlyIncome_std,Gender_F,...,Occupation_Clerical,Occupation_Management,Occupation_Manual,Occupation_Professional,Occupation_Skilled Manual,Education_Bachelors,Education_Graduate Degree,Education_High School,Education_Partial College,Education_Partial High School
0,0.496842,0.6,0.000000,0.333333,M,M,Clerical,Bachelors,0.298555,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,0.489453,0.4,0.333333,0.666667,M,M,Clerical,Partial College,0.271180,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.536172,0.6,0.000000,0.000000,F,S,Clerical,Bachelors,0.444261,1.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,0.317083,0.4,0.333333,0.666667,M,M,Skilled Manual,Partial College,-0.367401,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
4,0.231958,0.2,0.000000,0.000000,M,S,Skilled Manual,Partial College,-0.682765,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18356,0.242065,0.2,0.000000,0.666667,F,M,Skilled Manual,Graduate Degree,-0.645321,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
18357,0.312781,0.4,0.000000,0.000000,F,S,Skilled Manual,Bachelors,-0.383337,1.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
18358,0.232442,0.4,0.000000,0.000000,F,S,Skilled Manual,Partial College,-0.680973,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
18359,0.543121,0.0,0.000000,0.000000,F,S,Clerical,High School,0.470006,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


## Part III: Similarity & Correlation

In [4]:
# Keep only numeric columns
df_numeric = df_final.select_dtypes(include=[np.number])

obj1 = df_numeric.iloc[0].values.reshape(1, -1)
obj2 = df_numeric.iloc[1].values.reshape(1, -1)

simple_matching = np.mean(obj1 == obj2)
jaccard_sim = 1 - jaccard(obj1.flatten(), obj2.flatten())
cosine_sim = cosine_similarity(obj1, obj2)[0][0]

simple_matching, jaccard_sim, cosine_sim

(np.float64(0.631578947368421),
 np.float64(0.7),
 np.float64(0.7701138573381161))

In [5]:
corr = pearsonr(df_selected['NumberCarsOwned'], df_selected['YearlyIncome'])[0]
corr

np.float64(0.47730015236317)