In [16]:
import pandas as pd

### LoadDatSet

In [17]:
df = pd.read_csv('D:\Courses\Machine Learning\Class Assignment\AWCustomers.csv')

In [18]:
df.head()

Unnamed: 0,CustomerID,Title,FirstName,MiddleName,LastName,Suffix,AddressLine1,AddressLine2,City,StateProvinceName,...,Education,Occupation,Gender,MaritalStatus,HomeOwnerFlag,NumberCarsOwned,NumberChildrenAtHome,TotalChildren,YearlyIncome,LastUpdated
0,21173,,Chad,C,Yuan,,7090 C. Mount Hood,,Wollongong,New South Wales,...,Bachelors,Clerical,M,M,1,3,0,1,81916,2017-03-06
1,13249,,Ryan,,Perry,,3651 Willow Lake Rd,,Shawnee,British Columbia,...,Partial College,Clerical,M,M,1,2,1,2,81076,2017-03-06
2,29350,,Julia,,Thompson,,1774 Tice Valley Blvd.,,West Covina,California,...,Bachelors,Clerical,F,S,0,3,0,0,86387,2017-03-06
3,13503,,Theodore,,Gomez,,2103 Baldwin Dr,,Liverpool,England,...,Partial College,Skilled Manual,M,M,1,2,1,2,61481,2017-03-06
4,22803,,Marshall,J,Shan,,Am Gallberg 234,,Werne,Nordrhein-Westfalen,...,Partial College,Skilled Manual,M,S,1,1,0,0,51804,2017-03-06


### Part 1 : (a) Select Features

In [29]:
# Convert BirthDate column to datetime
df["BirthDate"] = pd.to_datetime(df["BirthDate"], errors="coerce")

# Today's date
today = pd.to_datetime("today")

# Function to calculate age
def calculate_age(birthdate):
    if pd.isnull(birthdate):
        return None
    return today.year - birthdate.year - ((today.month, today.day) < (birthdate.month, birthdate.day))

# Apply function to create Age column
df["Age"] = df["BirthDate"].apply(calculate_age)

In [34]:
slected_features =[
    'Age','Education','Occupation','Gender',
    'MaritalStatus','HomeOwnerFlag','NumberCarsOwned',
    'YearlyIncome'
]

In [35]:
df_selected = df[slected_features]
print("Selected Feature:")
df_selected.head()

Selected Feature:


Unnamed: 0,Age,Education,Occupation,Gender,MaritalStatus,HomeOwnerFlag,NumberCarsOwned,YearlyIncome
0,37,Bachelors,Clerical,M,M,1,3,81916
1,53,Partial College,Clerical,M,M,1,2,81076
2,39,Bachelors,Clerical,F,S,0,3,86387
3,47,Partial College,Skilled Manual,M,M,1,2,61481
4,50,Partial College,Skilled Manual,M,S,1,1,51804


### (b) New DataFrame with selected features

In [36]:
input_df = df_selected.copy()
print("New DataFrame shape:", input_df.shape)
input_df.head()

New DataFrame shape: (18361, 8)


Unnamed: 0,Age,Education,Occupation,Gender,MaritalStatus,HomeOwnerFlag,NumberCarsOwned,YearlyIncome
0,37,Bachelors,Clerical,M,M,1,3,81916
1,53,Partial College,Clerical,M,M,1,2,81076
2,39,Bachelors,Clerical,F,S,0,3,86387
3,47,Partial College,Skilled Manual,M,M,1,2,61481
4,50,Partial College,Skilled Manual,M,S,1,1,51804


### (c) Determine data type of each attribute

In [37]:
data_types = {
    "Age": "Continuous (Ratio)",
    "Gender": "Nominal",
    "MaritalStatus": "Nominal",
    "Education": "Ordinal",
    "Occupation": "Nominal",
    "HomeOwnerFlag": "Nominal (Binary)",
    "NumberCarsOwned": "Discrete (Ratio)",
    "YearlyIncome": "Continuous (Ratio)"
}

for feature, dtype in data_types.items():
    print(f"{feature} → {dtype}")



Age → Continuous (Ratio)
Gender → Nominal
MaritalStatus → Nominal
Education → Ordinal
Occupation → Nominal
HomeOwnerFlag → Nominal (Binary)
NumberCarsOwned → Discrete (Ratio)
YearlyIncome → Continuous (Ratio)


### Part 2 : (a) Handling NullValues

In [39]:
from sklearn.impute import SimpleImputer

# Copy data for preprocessing
df_preprocessed = df_selected.copy()

# Separate categorical and numerical columns
categorical_cols = ["Gender", "MaritalStatus", "Education", "Occupation", "HomeOwnerFlag"]
numerical_cols = ["Age", "NumberCarsOwned", "YearlyIncome"]

# Imputer for categorical (use most frequent / mode)
imputer_cat = SimpleImputer(strategy="most_frequent")
df_preprocessed[categorical_cols] = imputer_cat.fit_transform(df_preprocessed[categorical_cols])

# Imputer for numerical (use mean)
imputer_num = SimpleImputer(strategy="mean")
df_preprocessed[numerical_cols] = imputer_num.fit_transform(df_preprocessed[numerical_cols])

print("Null values handled")
print(df_preprocessed.isnull().sum())


Null values handled
Age                0
Education          0
Occupation         0
Gender             0
MaritalStatus      0
HomeOwnerFlag      0
NumberCarsOwned    0
YearlyIncome       0
dtype: int64


### (b) Normalization (Min-Max Scaling)

In [41]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
df_preprocessed[["Age", "NumberCarsOwned", "YearlyIncome"]] = scaler.fit_transform(
    df_preprocessed[["Age", "NumberCarsOwned", "YearlyIncome"]]
)

print("After Min-Max Normalization:\n")
df_preprocessed.head()


After Min-Max Normalization:



Unnamed: 0,Age,Education,Occupation,Gender,MaritalStatus,HomeOwnerFlag,NumberCarsOwned,YearlyIncome
0,0.183099,Bachelors,Clerical,M,M,1,0.6,0.496842
1,0.408451,Partial College,Clerical,M,M,1,0.4,0.489453
2,0.211268,Bachelors,Clerical,F,S,0,0.6,0.536172
3,0.323944,Partial College,Skilled Manual,M,M,1,0.4,0.317083
4,0.366197,Partial College,Skilled Manual,M,S,1,0.2,0.231958


### (c) Discretization (Binning)

In [43]:
# Age → Young, Middle, Old
df_preprocessed["Age_Bin"] = pd.cut(df_preprocessed["Age"], bins=3, labels=["Young", "Middle", "Old"])

# Income → Low, Medium, High
df_preprocessed["Income_Bin"] = pd.cut(df_preprocessed["YearlyIncome"], bins=3, labels=["Low", "Medium", "High"])

print("After Discretization:\n")
df_preprocessed[["Age", "Age_Bin", "YearlyIncome", "Income_Bin"]].head()


After Discretization:



Unnamed: 0,Age,Age_Bin,YearlyIncome,Income_Bin
0,0.183099,Young,0.496842,Medium
1,0.408451,Middle,0.489453,Medium
2,0.211268,Young,0.536172,Medium
3,0.323944,Young,0.317083,Low
4,0.366197,Middle,0.231958,Low


### (d) Standardization (Z-Score)

In [44]:
from sklearn.preprocessing import StandardScaler

std_scaler = StandardScaler()
df_preprocessed[["Age", "NumberCarsOwned", "YearlyIncome"]] = std_scaler.fit_transform(
    df_preprocessed[["Age", "NumberCarsOwned", "YearlyIncome"]]
)

print("After Standardization:")
df_preprocessed[["Age", "NumberCarsOwned", "YearlyIncome"]].head()

After Standardization:


Unnamed: 0,Age,NumberCarsOwned,YearlyIncome
0,-0.543335,1.892524,0.298555
1,0.877531,0.798389,0.27118
2,-0.365726,1.892524,0.444261
3,0.344707,0.798389,-0.367401
4,0.611119,-0.295746,-0.682765


### (e) One-Hot Encoding (Binarization)

In [45]:
df_encoded = pd.get_dummies(
    df_preprocessed,
    columns=["Gender", "MaritalStatus", "Education", "Occupation", "HomeOwnerFlag", "Age_Bin", "Income_Bin"],
    drop_first=True
)

print("After One-Hot Encoding:")
, df_encoded.head()
print("Final shape:", df_encoded.shape)


After One-Hot Encoding:
Final shape: (18361, 18)


### Part III (a) Similarity Measures

In [46]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import jaccard

# Pick two objects
obj1 = df_encoded.iloc[0].values.reshape(1, -1)
obj2 = df_encoded.iloc[1].values.reshape(1, -1)

# ----- Simple Matching Coefficient -----
smc = np.sum(obj1 == obj2) / len(obj1[0])

# ----- Jaccard Similarity (only binary attributes) -----
binary_obj1 = (obj1 > 0).astype(int)
binary_obj2 = (obj2 > 0).astype(int)
jaccard_sim = 1 - jaccard(binary_obj1[0], binary_obj2[0])

# ----- Cosine Similarity -----
cosine_sim = cosine_similarity(obj1, obj2)[0][0]

print("Similarity Measures:")
print("Simple Matching Coefficient:", smc)
print("Jaccard Similarity:", jaccard_sim)
print("Cosine Similarity:", cosine_sim)


Similarity Measures:
Simple Matching Coefficient: 0.7222222222222222
Jaccard Similarity: 0.625
Cosine Similarity: 0.6124506812192715


### (b) Correlation Between Features

In [48]:
from scipy.stats import pearsonr

# Pearson correlation between Age and YearlyIncome
corr, p_value = pearsonr(df_selected["Age"].dropna(), df_selected["YearlyIncome"].dropna())

print("\nCorrelation between Age and YearlyIncome:")
print("Pearson Correlation:", corr)
print("P-value:", p_value)



Correlation between Age and YearlyIncome:
Pearson Correlation: 0.02639292606532578
P-value: 0.0003479670425956854
