In [1]:
import pandas as pd
df=pd.read_csv("/content/AWCustomers.csv")
print(df)

       CustomerID Title FirstName MiddleName  LastName Suffix  \
0           21173   NaN      Chad          C      Yuan    NaN   
1           13249   NaN      Ryan        NaN     Perry    NaN   
2           29350   NaN     Julia        NaN  Thompson    NaN   
3           13503   NaN  Theodore        NaN     Gomez    NaN   
4           22803   NaN  Marshall          J      Shan    NaN   
...           ...   ...       ...        ...       ...    ...   
18356       25414   NaN     Grace          C    Bailey    NaN   
18357       11459   NaN     Tasha        NaN      Deng    NaN   
18358       12160   NaN    Jaclyn        NaN     Zhang    NaN   
18359       14353   NaN      Erin          I      Reed    NaN   
18360       16676   NaN    Amanda        NaN     Perry    NaN   

                 AddressLine1 AddressLine2            City  \
0          7090 C. Mount Hood          NaN      Wollongong   
1         3651 Willow Lake Rd          NaN         Shawnee   
2      1774 Tice Valley Blvd.    

In [2]:
newdf=df.iloc[:,[0,2,11,13,14,15,16,17,18,19,20,21,22]]
print(newdf)

       CustomerID FirstName PostalCode   BirthDate        Education  \
0           21173      Chad       2500  1987-11-13        Bachelors   
1           13249      Ryan    V9B 2C3  1972-07-21  Partial College   
2           29350     Julia      91791  1985-11-09        Bachelors   
3           13503  Theodore     L4 4HB  1977-10-18  Partial College   
4           22803  Marshall      59368  1975-02-05  Partial College   
...           ...       ...        ...         ...              ...   
18356       25414     Grace      92118  1990-11-11  Graduate Degree   
18357       11459     Tasha       2444  1992-10-13        Bachelors   
18358       12160    Jaclyn      97005  1983-11-24  Partial College   
18359       14353      Erin    V7L 4J4  1995-06-15      High School   
18360       16676    Amanda      91941  1986-03-26  Graduate Degree   

           Occupation Gender MaritalStatus  HomeOwnerFlag  NumberCarsOwned  \
0            Clerical      M             M              1            

In [3]:
types = {
    "CustomerID": "Discrete and Nominal",
    "FirstName": "Discrete and Nominal",
    "PostalCode": "Discrete and Nominal",
    "BirthDate": "Continuous and Interval",
    "Education": "Discrete and Ordinal",
    "Occupation": "Discrete and Nominal",
    "Gender": "Discrete and Nominal",
    "MaritalStatus": "Discrete and Nominal",
    "HomeOwnerFlag": "Discrete and Nominal",
    "NumberCarsOwned": "Discrete and Ratio",
    "NumberChildrenAtHome": "Discrete and Ratio",
    "TotalChildren": "Discrete and Ratio",
    "YearlyIncome": "Continuous and Ratio"
}

In [4]:
print(newdf.isnull())

       CustomerID  FirstName  PostalCode  BirthDate  Education  Occupation  \
0           False      False       False      False      False       False   
1           False      False       False      False      False       False   
2           False      False       False      False      False       False   
3           False      False       False      False      False       False   
4           False      False       False      False      False       False   
...           ...        ...         ...        ...        ...         ...   
18356       False      False       False      False      False       False   
18357       False      False       False      False      False       False   
18358       False      False       False      False      False       False   
18359       False      False       False      False      False       False   
18360       False      False       False      False      False       False   

       Gender  MaritalStatus  HomeOwnerFlag  NumberCarsOwned  \

In [5]:
# Handling Nulls
newdf = newdf.copy()

for col in newdf.columns:
    if newdf[col].dtype in ['float64', 'int64']:
        newdf[col] = newdf[col].fillna(newdf[col].median())
    else:
        newdf[col] = newdf[col].fillna(newdf[col].mode()[0])

#Normalization

from sklearn.preprocessing import MinMaxScaler
import numpy as np

scaler = MinMaxScaler()
numeric_cols = newdf.select_dtypes(include=[np.number]).columns

newdf[numeric_cols] = scaler.fit_transform(newdf[numeric_cols])


#Discretization(Binning)
newdf['Income_bin'] = pd.cut(newdf['YearlyIncome'], bins=3, labels=['Low','Medium','High'])



#Standardization
from sklearn.preprocessing import StandardScaler

scaler_std = StandardScaler()
newdf[numeric_cols] = scaler_std.fit_transform(newdf[numeric_cols])

#Binarization
newdf = pd.get_dummies(newdf, drop_first=True)
print(newdf)

       CustomerID  HomeOwnerFlag  NumberCarsOwned  NumberChildrenAtHome  \
0        0.174472       0.798603         1.892524             -0.594371   
1       -1.310484       0.798603         0.798389              1.163279   
2        1.706839      -1.252187         1.892524             -0.594371   
3       -1.262884       0.798603         0.798389              1.163279   
4        0.479933       0.798603        -0.295746             -0.594371   
...           ...            ...              ...                   ...   
18356    0.969234      -1.252187        -0.295746             -0.594371   
18357   -1.645929      -1.252187         0.798389             -0.594371   
18358   -1.514562      -1.252187         0.798389             -0.594371   
18359   -1.103595      -1.252187        -1.389881             -0.594371   
18360   -0.668265       0.798603         0.798389             -0.594371   

       TotalChildren  YearlyIncome  FirstName_Abby  FirstName_Abhijit  \
0           0.161342      

In [12]:
#Calculation proximity
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import jaccard

obj1 = newdf.iloc[0].values.reshape(1,-1)
obj2 = newdf.iloc[1].values.reshape(1,-1)

# Cosine Similarity
cos_sim = cosine_similarity(obj1, obj2)[0][0]

# Jaccard Similarity (binary columns only)
jac_sim = 1 - jaccard(newdf.iloc[0].values, newdf.iloc[1].values)

# Simple Matching Coefficient (SMC)
smc = (newdf.iloc[0] == newdf.iloc[1]).sum() / len(newdf.columns)

print("Cosine:", cos_sim)
print("Jaccard:", jac_sim)
print("SMC:", smc)


#Correlation analysis - numeric
corr = newdf['TotalChildren'].corr(newdf['YearlyIncome'])
print("Correlation:", corr)

Cosine: 0.32559043220453193
Jaccard: 0.5333333333333333
SMC: 0.9987010175362633
Correlation: 0.022013822892024183
