In [55]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

from sklearn.metrics import mean_squared_error


In [47]:
# Read the retail sales CSV into a DataFrame and preview its first rows.
csv_path = 'datasets/retail_sales_dataset.csv'
data = pd.read_csv(csv_path)
data.head()

Unnamed: 0,Transaction ID,Date,Customer ID,Gender,Age,Product Category,Quantity,Price per Unit,Total Amount
0,1,2023-11-24,CUST001,Male,34,Beauty,3,50,150
1,2,2023-02-27,CUST002,Female,26,Clothing,2,500,1000
2,3,2023-01-13,CUST003,Male,50,Electronics,1,30,30
3,4,2023-05-21,CUST004,Male,37,Clothing,1,500,500
4,5,2023-05-06,CUST005,Male,30,Beauty,2,50,100


Transaction ID, Date, and Customer ID carry no useful signal for this analysis, so we drop them. Total Amount is always Quantity * Price per Unit, so any one of these three columns is redundant given the other two.

In [48]:
# Narrow the frame to the demographic, product and sales columns;
# every other column is left behind.
kept_columns = [
    "Gender",
    "Age",
    "Product Category",
    "Quantity",
    "Price per Unit",
    "Total Amount",
]
data = data[kept_columns]
data.head()

Unnamed: 0,Gender,Age,Product Category,Quantity,Price per Unit,Total Amount
0,Male,34,Beauty,3,50,150
1,Female,26,Clothing,2,500,1000
2,Male,50,Electronics,1,30,30
3,Male,37,Clothing,1,500,500
4,Male,30,Beauty,2,50,100


We one-hot encode Gender and Product Category, because both are nominal categories with no natural ordering.

In [49]:
# One-hot encode the nominal columns and swap them for their indicator columns.
categorical_cols = ["Gender", "Product Category"]

# dtype=int pins the indicators to 0/1; pandas >= 2.0 would otherwise emit
# booleans, which changes how the table below is displayed.
dummies = pd.get_dummies(data[categorical_cols], dtype=int)

# Build a new frame rather than dropping in place: `inplace=True` mutates the
# object that `data` already points to, which is easy to trip over when cells
# are re-run or when that object is a selection taken from another frame.
data = pd.concat([data.drop(columns=categorical_cols), dummies], axis=1)
data.head()

Unnamed: 0,Age,Quantity,Price per Unit,Total Amount,Gender_Female,Gender_Male,Product Category_Beauty,Product Category_Clothing,Product Category_Electronics
0,34,3,50,150,0,1,1,0,0
1,26,2,500,1000,1,0,0,1,0
2,50,1,30,30,0,1,0,0,1
3,37,1,500,500,0,1,0,1,0
4,30,2,50,100,0,1,1,0,0


We will try to predict Total Amount from every feature except Price per Unit. Including it would make the task trivial, since Total Amount is exactly Quantity * Price per Unit.

In [50]:
# Predictors: age, quantity and every one-hot indicator column.
feature_columns = [
    "Age",
    "Quantity",
    "Gender_Female",
    "Gender_Male",
    "Product Category_Beauty",
    "Product Category_Clothing",
    "Product Category_Electronics",
]
# Regression target.
target_column = "Total Amount"

X = data[feature_columns]
y = data[target_column]

In [51]:
# Hold out 20% of the rows for evaluation. A fixed seed makes the shuffle
# deterministic, so the split (and every metric computed from it) is the same
# on each Restart & Run All instead of changing from run to run.
SEED = 42
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SEED
)

In [53]:
# Fit a linear regression on the training split, then predict the held-out rows.
regressor = LinearRegression()
regressor.fit(X_train, y_train)
pred_Y = regressor.predict(X_test)

In [56]:
# Score the held-out predictions with mean squared error; the value is in
# squared units of the target, so it is not directly comparable to the target.
mse = mean_squared_error(y_true=y_test, y_pred=pred_Y)
print("MSE: ", mse)

MSE:  260380.42247420442
