In [1]:
# Important libraries are imported
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.feature_selection import SelectKBest, chi2
import tensorflow as tf

In [2]:
# Load the dataset
df = pd.read_csv("car_purchasingNew.csv", encoding='ISO-8859-1')

In [3]:
# Rename columns
df.rename(columns={'customer name': 'customer_name', 
                   'customer e-mail': 'customer_email',
                   'annual Salary': 'annual_Salary',
                   'credit card debt': 'credit_card_debt', 
                   'net worth': 'net_worth', 
                   'car purchase amount': 'car_purchase_amount'}, inplace=True)

In [4]:
# Define function to classify price ranges
def classify_price_range(car_purchase_amount):
    if car_purchase_amount < 37000:
        return 1
    elif 37000 <= car_purchase_amount < 50000:
        return 2
    elif 50000 <= car_purchase_amount < 70000:
        return 3
    elif 70000 <= car_purchase_amount < 80000:
        return 4

# Apply the function to 'car_purchase_amount' column
df['price_range'] = df['car_purchase_amount'].apply(classify_price_range)

In [5]:
# Drop unneeded columns
df.drop(["customer_name", "JobTitle", "customer_email", "country", "Benefits"], axis=1, inplace=True)

# Replace non-numeric and 0 values with NaN
df['price_range'] = pd.to_numeric(df['price_range'], errors='coerce')
df['price_range'].replace(0, np.nan, inplace=True)

# Fill NaN values with 1 and convert to integers
df['price_range'] = df['price_range'].fillna(1).astype(int)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['price_range'].replace(0, np.nan, inplace=True)


In [6]:
# Replace encoded values with category labels
df['price_range'] = df['price_range'].replace({1: "1: Cheap", 2: "2: Avg-", 3: "3: Avg+", 4: "4: Expensive"})

In [7]:
# Encode the target variable
le = LabelEncoder()
y = le.fit_transform(df['price_range'])
y = tf.keras.utils.to_categorical(y)

# Select best features using chi-squared test
X = df.drop("price_range", axis=1)
X_cat = X.astype(int).clip(lower=0)
chi2_features = SelectKBest(chi2, k=len(X_cat.columns))
best_features = chi2_features.fit(X_cat, y.astype(int))

# Display feature scores
df_features = pd.DataFrame(best_features.scores_, index=X.columns, columns=['Score'])
df_features.sort_values(by='Score', ascending=False, inplace=True)
print(df_features)

                            Score
net_worth            6.211209e+06
car_purchase_amount  1.020839e+06
OvertimePay          5.933901e+04
OtherPay             4.333394e+04
BasePay              9.356526e+03
TotalPay             7.529052e+03
TotalPayBenefits     7.529052e+03
credit_card_debt     2.603966e+03
age                  2.242998e+02
gender               1.043063e+00


In [8]:
# Split the dataset
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.30)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5)

In [9]:
# Display categories
categories = list(np.unique(df['price_range']))
print(categories)

['1: Cheap', '2: Avg-', '3: Avg+', '4: Expensive']
