<h4><b>Feature Engineering- Variable Combination</b></h4>

In [2]:
# installing the featuretools
# importing the necessary modules
import pandas as pd
import numpy as np

# pip install featuretools
import featuretools as ft

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFE, mutual_info_regression
from sklearn.model_selection import train_test_split

In [20]:
# Loading the dataset
df = pd.read_csv("car_purchasingNew.csv", encoding='ISO-8859-1')

In [None]:
# df.columns

Index(['customer name', 'JobTitle', 'customer e-mail', 'country', 'gender',
       'age', 'BasePay', 'OvertimePay', 'OtherPay', 'Benefits', 'TotalPay',
       'TotalPayBenefits', 'credit card debt', 'net worth',
       'car purchase amount'],
      dtype='object')

In [21]:
df

Unnamed: 0,customer name,JobTitle,customer e-mail,country,gender,age,BasePay,OvertimePay,OtherPay,Benefits,TotalPay,TotalPayBenefits,credit card debt,net worth,car purchase amount
0,Martina Avila,GENERAL MANAGER-METROPOLITAN TRANSIT AUTHORITY,cubilia.Curae.Phasellus@quisaccumsanconvallis.edu,Bulgaria,0,42,167411.18,0.00,400184.25,,567595.43,567595.43,11609.380910,238961.2505,35321.45877
1,Harlan Barnes,CAPTAIN III (POLICE DEPARTMENT),eu.dolor@diam.co.uk,Belize,0,41,155966.02,245131.88,137811.38,,538909.28,538909.28,9572.957136,530973.9078,45115.52566
2,Naomi Rodriquez,CAPTAIN III (POLICE DEPARTMENT),vulputate.mauris.sagittis@ametconsectetueradip...,Algeria,1,43,212739.13,106088.18,16452.60,,335279.91,335279.91,11160.355060,638467.1773,42925.70921
3,Jade Cunningham,WIRE ROPE CABLE MAINTENANCE MECHANIC,malesuada@dignissim.com,Cook Islands,1,58,77916.00,56120.71,198306.90,,332343.61,332343.61,14426.164850,548599.0524,67422.36313
4,Cedric Leach,"DEPUTY CHIEF OF DEPARTMENT,(FIRE DEPARTMENT)",felis.ullamcorper.viverra@egetmollislectus.net,Brazil,1,57,134401.60,9737.00,182234.59,,326373.19,326373.19,5358.712177,560304.0671,55915.46248
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,Walter,TRANSIT SUPERVISOR,ligula@Cumsociis.ca,Nepal,0,41,87384.60,97729.16,837.79,,185951.55,185951.55,6995.902524,541670.1016,48901.44342
496,Vanna,"LIEUTENANT, FIRE DEPARTMENT",Cum.sociis.natoque@Sedmolestie.edu,Zimbabwe,1,38,123105.00,38790.92,23865.00,,185760.92,185760.92,12301.456790,360419.0988,31491.41457
497,Pearl,DEPUTY DIRECTOR V,penatibus.et@massanonante.com,Philippines,1,54,185724.50,0.00,0.00,,185724.50,185724.50,10611.606860,764531.3203,64147.28888
498,Nell,MANAGER VIII,Quisque.varius@arcuVivamussit.net,Botswana,1,59,185724.50,0.00,0.00,,185724.50,185724.50,14013.034510,337826.6382,45442.15353


In [22]:
# the columns name were named
df.rename(columns={'customer name': 'customer_name', 
                   'customer e-mail': 'customer_email',
                     'net worth': 'net_worth',
                       'credit card debt':'credit_card_debt', 'car purchase amount':'car_purchase_amount'}, inplace=True)


# here I did drop some of these variables
df = df.drop(['customer_name', 'JobTitle', 'customer_email', 'country', 'gender', 'Benefits'], axis=1)

In [23]:
df.columns

Index(['age', 'BasePay', 'OvertimePay', 'OtherPay', 'TotalPay',
       'TotalPayBenefits', 'credit_card_debt', 'net_worth',
       'car_purchase_amount'],
      dtype='object')

<h4>Deep Feature Synthesis (DFS)- generating freshly new variables using Brute Force</h4>

In [24]:
# Here, the Featuretools used to automatically create new features
# -derived columns based on the existing ones in the dataset
# define target variable
target = "car_purchase_amount"

# save for later use
y = df[target]

# create an entity set with the red wine data
es = ft.EntitySet(id="car_purchase_price")
es = es.add_dataframe(dataframe_name="car_price_data", dataframe=df.drop(target, axis=1), index="index", make_index=True)

# Use DFS to generate new variables (features), aggregate BASED on target variable: quality
feature_matrix, feature_defs = ft.dfs(
    entityset=es,
    target_dataframe_name="car_price_data",
    
    # agg_primitives=[],
    # if you use agg primitives, define also the calculations:
    # groupby_trans_primitives=["cum_mean"],
    # trans_primitives=["multiply_numeric"],
    trans_primitives=["divide_numeric"],
    # The variable car_purchase_amount is the target (dependent variable)
    # and is removed from the dataset before generating new features.
    # increase depth if you want to capture more complex
    # relationships in the data
    max_depth=1 
)


# division a bit problematic, since often times
# values we divide with something very small (but not 0)
# this results in many rows and columns to go into infinity
# we have to take care of this

# THIS PART ONLY REQUIRED IF YOU DO DIVISION
# IT REMOVES ALL COLUMNS WITH INFINITE VALUES
feature_matrix.replace([np.inf, -np.inf], np.nan, inplace=True)
feature_matrix.dropna(axis=1, inplace=True)

# It is later re-attached to the feature_matrix.
# re-attach y back to feature_matrix
feature_matrix[target] = y

feature_matrix

Unnamed: 0_level_0,age,BasePay,OvertimePay,OtherPay,TotalPay,TotalPayBenefits,credit_card_debt,net_worth,BasePay / TotalPay,BasePay / TotalPayBenefits,...,credit_card_debt / TotalPay,credit_card_debt / TotalPayBenefits,credit_card_debt / age,credit_card_debt / net_worth,net_worth / BasePay,net_worth / TotalPay,net_worth / TotalPayBenefits,net_worth / age,net_worth / credit_card_debt,car_purchase_amount
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,42,167411.18,0.00,400184.25,567595.43,567595.43,11609.380910,238961.2505,0.294948,0.294948,...,0.020454,0.020454,276.413831,0.048583,1.427391,0.421006,0.421006,5689.553583,20.583462,35321.45877
1,41,155966.02,245131.88,137811.38,538909.28,538909.28,9572.957136,530973.9078,0.289411,0.289411,...,0.017764,0.017764,233.486759,0.018029,3.404420,0.985275,0.985275,12950.583117,55.466028,45115.52566
2,43,212739.13,106088.18,16452.60,335279.91,335279.91,11160.355060,638467.1773,0.634512,0.634512,...,0.033287,0.033287,259.543141,0.017480,3.001174,1.904281,1.904281,14848.073891,57.208500,42925.70921
3,58,77916.00,56120.71,198306.90,332343.61,332343.61,14426.164850,548599.0524,0.234444,0.234444,...,0.043407,0.043407,248.726980,0.026296,7.040904,1.650698,1.650698,9458.604352,38.028059,67422.36313
4,57,134401.60,9737.00,182234.59,326373.19,326373.19,5358.712177,560304.0671,0.411803,0.411803,...,0.016419,0.016419,94.012494,0.009564,4.168879,1.716759,1.716759,9829.895914,104.559463,55915.46248
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,41,87384.60,97729.16,837.79,185951.55,185951.55,6995.902524,541670.1016,0.469932,0.469932,...,0.037622,0.037622,170.631769,0.012915,6.198691,2.912964,2.912964,13211.465893,77.426765,48901.44342
496,38,123105.00,38790.92,23865.00,185760.92,185760.92,12301.456790,360419.0988,0.662707,0.662707,...,0.066222,0.066222,323.722547,0.034131,2.927737,1.940231,1.940231,9484.713126,29.298896,31491.41457
497,54,185724.50,0.00,0.00,185724.50,185724.50,10611.606860,764531.3203,1.000000,1.000000,...,0.057136,0.057136,196.511238,0.013880,4.116481,4.116481,4.116481,14157.987413,72.046706,64147.28888
498,59,185724.50,0.00,0.00,185724.50,185724.50,14013.034510,337826.6382,1.000000,1.000000,...,0.075451,0.075451,237.509059,0.041480,1.818966,1.818966,1.818966,5725.875224,24.108029,45442.15353


<h4>It can be seen that the output is a new DataFrame-df (feature_matrix) that contains both the original columns and the newly derived features.
The matrix has 500 rows (which are same as the original dataset) and 51 columns, indicating that 42 new features were created from the original 9 features.

I think the features can uncover patterns or relationships that can improve the predictive performance of models.</h4>

In [25]:
# typical X/y -split
X = feature_matrix.drop("car_purchase_amount", axis=1)
y = feature_matrix['car_purchase_amount']

# define model (linear regression in this example)
# technically you can use pretty much any classic ML algorithm
model = LinearRegression()
# model = RandomForestRegressor()

# create RFE, place the model and choose amount of optimal variables
rfe = RFE(estimator=model, n_features_to_select=12)

# fit the RFE model with our data
rfe.fit(X, y)

# get rankings and the results
rankings = rfe.ranking_
support = rfe.support_

results_df = pd.DataFrame({
    "Feature": X.columns,
    "Ranking": rankings,
    "Selected": support
}).sort_values(by="Ranking")

# if having lots of new variables, use Data Wrangler or
# something else to see all results
results_df

Unnamed: 0,Feature,Ranking,Selected
13,OtherPay / BasePay,1,True
15,OtherPay / TotalPayBenefits,1,True
19,OvertimePay / BasePay,1,True
39,age / net_worth,1,True
36,age / TotalPay,1,True
37,age / TotalPayBenefits,1,True
38,age / credit_card_debt,1,True
44,credit_card_debt / net_worth,1,True
40,credit_card_debt / BasePay,1,True
41,credit_card_debt / TotalPay,1,True


<h4><b>Using Ration Features</b></h4>

In [None]:
import pandas as pd
from itertools import combinations
from tabulate import tabulate

# Load your dataset
df = pd.read_csv('car_purchasing.csv', encoding='ISO-8859-1')

# I drop any target column 
# preferrably 'car_purchase_amount'
# and then keep numerical features for ratio feature generation
numerical_features = df.drop("car_purchase_amount", axis=1).columns

# I initialize a DataFrame to hold new ratio features
ratio_features = pd.DataFrame()

# Then generate ratio features using all possible combinations of numerical features
for feature1, feature2 in combinations(numerical_features, 2):
    # Also, create a new column as the ratio of two features
    new_feature_name = f"{feature1} / {feature2}"
    ratio_features[new_feature_name] = df[feature1] / (df[feature2] + 1e-6)  # Avoid division by zero

# Combine original dataset with generated ratio features
df_with_ratios = pd.concat([df, ratio_features], axis=1)



# Inspect the updated dataset
# print(df_with_ratios.head())
print(tabulate(df_with_ratios, headers="keys", tablefmt="psql"))


+-----+-------+-----------+---------------+------------+------------+--------------------+--------------------+-------------+-----------------------+-----------------+---------------------+------------------+------------------+--------------------------+--------------------------+-------------------+-------------------------+----------------------+----------------------+------------------------------+------------------------------+-----------------------+--------------------------+--------------------------+----------------------------------+----------------------------------+---------------------------+-----------------------+-------------------------------+-------------------------------+------------------------+-------------------------------+-------------------------------+------------------------+---------------------------------------+--------------------------------+--------------------------------+
|     |   age |   BasePay |   OvertimePay |   OtherPay |   TotalPay |   TotalPay

<h4>I will give definitions to the variables for understanding purposes-

age: The age of the individual in years.

BasePay: The individual's base salary.

OvertimePay: Compensation for overtime work.

OtherPay: Additional pay (bonuses, allowances, etc.).

TotalPay: Sum of BasePay, OvertimePay, and OtherPay.
Formula: TotalPay = BasePay + OvertimePay + OtherPay

TotalPayBenefits: TotalPay including additional benefits.

credit_card_debt: The individual's credit card debt.

net_worth: The individual's net worth (total assets minus liabilities).

car_purchase_amount: The amount the individual spent on a car purchase.
</h4>

<h4><b>Here, training the model with 3 types of model and then compare</b></h4>

In [38]:
# Import required libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

import xgboost as xgb
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, explained_variance_score, mean_absolute_error
from sklearn import metrics

# Assuming your DataFrame is named 'df'
# Remove the target variable from the features
X = df.drop(columns=['car_purchase_amount'])  # Features
y = df['car_purchase_amount']  # Target variable

# Step 1: Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 2: Initialize and train the Random Forest Regressor
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Step 3: Make predictions on the test set
test_predictions = rf_model.predict(X_test)

# MAE - Mean average error
print("MAE")
print(round(metrics.mean_absolute_error(y_test, test_predictions), 2), "%")

# MSE - Mean square error
print("\nMSE")
print(round(metrics.mean_squared_error(y_test, test_predictions), 2), "%^2")

# RMSE - Root mean square error
print('\nRMSE:')
print(round(np.sqrt(metrics.mean_squared_error(y_test, test_predictions)), 2), "%")

# R-squared. 0 = the model descibes the dataset poorly
# 1 = model describes the dataset perfectly
print('\nR-squared:')
print(round(metrics.r2_score(y_test, test_predictions), 2))

# Explained Variance Score => 0 = the model descibes the dataset poorly
# 1 = model describes the dataset perfectly
# high variance score = model is a good fit for the data 
# low variance score = model is not a good fit for the data
# the higher the score, the model is more able to explain the variation in the data
# if score is low, we might need more and better data
print("\nExplained variance score:")
print(round(metrics.explained_variance_score(y_test, test_predictions), 2))



MAE
5335.4 %

MSE
42037542.83 %^2

RMSE:
6483.64 %

R-squared:
0.61

Explained variance score:
0.62


<h4>Above shows the best model as it has the lowest error and at least higher Explained variance score (62%) and R-squared 61%</h4>

In [None]:
from sklearn.model_selection import GridSearchCV
import xgboost as xgb
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# Assuming your DataFrame is named 'df'
# Remove the target variable from the features
X = df.drop(columns=['car_purchase_amount'])  # Features
y = df['car_purchase_amount']  # Target variable

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the model
xg_reg = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100)


# Define the parameters grid
param_grid = {
    'max_depth': [3],
    'learning_rate': [0.1],
    'subsample': [1.0],
    'colsample_bytree': [0.5],
    'n_estimators': [50]
}

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=xg_reg, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)

# Fit grid search
grid_search.fit(X_train, y_train)

# Get best parameters and estimator
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# Predict using the best model
y_pred_best = best_model.predict(X_test)

# Evaluate the model
mae_best = mean_absolute_error(y_test, y_pred_best)
mse_best = mean_squared_error(y_test, y_pred_best)
rmse_best = mean_squared_error(y_test, y_pred_best, squared=False)
r2_best = r2_score(y_test, y_pred_best)

# Print the evaluation metrics
print(f"Best Model Parameters: {best_params}")
print(f"Mean Absolute Error (MAE): {mae_best}")
print(f"Mean Squared Error (MSE): {mse_best}")
print(f"Root Mean Squared Error (RMSE): {rmse_best}")
print(f"R^2 Score: {r2_best}")


Fitting 3 folds for each of 1 candidates, totalling 3 fits
Best Model Parameters: {'colsample_bytree': 0.5, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 50, 'subsample': 1.0}
Mean Absolute Error (MAE): 5648.008485943749
Mean Squared Error (MSE): 46435885.14304955
Root Mean Squared Error (RMSE): 6814.3880974779795
R^2 Score: 0.5699312185378609




In [None]:
# Import required libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

import xgboost as xgb
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# Assuming your DataFrame is named 'df'
# Remove the target variable from the features
X = df.drop(columns=['car_purchase_amount'])  # Features
y = df['car_purchase_amount']  # Target variable

# Step 1: Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize XGBoost Regressor model
xgb_model = xgb.XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=6, random_state=42)

# Train the model
xgb_model.fit(X_train, y_train)

# Make predictions
y_pred = xgb_model.predict(X_test)


# Step 4: Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)
r2 = r2_score(y_test, y_pred)

# Print evaluation metrics
print(f"Mean Absolute Error (MAE): {mae}")
print(f"Mean Squared Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"R^2 Score: {r2}")


Mean Absolute Error (MAE): 6973.0031267875
Mean Squared Error (MSE): 76335437.3235097
Root Mean Squared Error (RMSE): 8737.015355572503
R^2 Score: 0.29301469303390315


