# Imports

In [1]:
import pandas as pd
import numpy as np
import warnings
# NOTE(review): with no category/module filter this hides every warning for the
# rest of the session, including any library deprecation or convergence warnings.
warnings.filterwarnings("ignore")
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor,AdaBoostRegressor,ExtraTreesRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.metrics import r2_score,mean_absolute_error

# Importing dataset

In [2]:
# Load the raw transactions; the CSV path is resolved relative to the working directory.
df = pd.read_csv(filepath_or_buffer='customer_shopping_data.csv')

# Basic information

In [3]:
# Peek at five random rows. The fixed seed keeps the displayed sample stable
# across kernel restarts, so the saved output can be reproduced.
df.sample(n=5, random_state=2)

Unnamed: 0,invoice_no,customer_id,gender,age,category,quantity,price,payment_method,invoice_date,shopping_mall
49157,I292776,C255528,Female,45,Clothing,1,300.08,Debit Card,28/09/2021,Kanyon
3855,I259277,C355612,Female,34,Toys,3,107.52,Credit Card,11/8/2021,Metrocity
23091,I990513,C207707,Male,27,Food & Beverage,3,15.69,Cash,5/1/2021,Kanyon
24354,I134526,C177194,Male,67,Clothing,1,300.08,Credit Card,16/08/2021,Viaport Outlet
59440,I418127,C152862,Female,64,Technology,1,1050.0,Cash,18/06/2022,Metrocity


In [4]:
# List the column labels of the raw frame.
df.columns

Index(['invoice_no', 'customer_id', 'gender', 'age', 'category', 'quantity',
       'price', 'payment_method', 'invoice_date', 'shopping_mall'],
      dtype='object')

In [5]:
# Print per-column dtypes and non-null counts plus the frame's memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99457 entries, 0 to 99456
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   invoice_no      99457 non-null  object 
 1   customer_id     99457 non-null  object 
 2   gender          99457 non-null  object 
 3   age             99457 non-null  int64  
 4   category        99457 non-null  object 
 5   quantity        99457 non-null  int64  
 6   price           99457 non-null  float64
 7   payment_method  99457 non-null  object 
 8   invoice_date    99457 non-null  object 
 9   shopping_mall   99457 non-null  object 
dtypes: float64(1), int64(2), object(7)
memory usage: 7.6+ MB


In [6]:
# Count rows that are exact duplicates of an earlier row.
duplicate_mask = df.duplicated()
duplicate_mask.sum()

0

In [7]:
# Summary statistics of the numeric columns, one row per column.
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,99457.0,43.427089,14.990054,18.0,30.0,43.0,56.0,69.0
quantity,99457.0,3.003429,1.413025,1.0,2.0,3.0,4.0,5.0
price,99457.0,689.256321,941.184567,5.23,45.45,203.3,1200.32,5250.0


In [8]:
# Shape statistics (skewness and kurtosis) for the three numeric columns,
# one row per column.
numeric_columns = ['age', 'quantity', 'price']
shape_stats = ['skew', 'kurtosis']
df[numeric_columns].agg(shape_stats).transpose()

Unnamed: 0,skew,kurtosis
age,0.009021,-1.198354
quantity,-0.001296,-1.295893
price,2.247433,6.16835


# Transformations

In [9]:
# One-hot encode the categorical columns. Every level is kept (no drop_first),
# so the indicators of each original column sum to 1 on every row.
# dtype is pinned so the indicators are 0/1 integers regardless of the pandas
# version (newer releases default to bool instead of uint8).
df = pd.get_dummies(
    data=df,
    columns=['gender', 'category', 'payment_method', 'shopping_mall'],
    dtype='uint8',
)

In [10]:
# Disabled: convert invoice_date to a datetime column.
# NOTE(review): the sampled dates look day-first (e.g. 28/09/2021, 16/08/2021), so a
# plain astype may fail or misparse them; an explicit day-first format is probably
# needed if this is re-enabled — confirm against the full column.
# df['invoice_date']=df['invoice_date'].astype('datetime64[ns]')

In [11]:
# Drop the identifier columns and the raw date string; none of them are used as features.
# Reassigning (instead of inplace=True) together with errors='ignore' lets this
# cell be re-run without raising KeyError once the columns are already gone.
df = df.drop(columns=['invoice_no', 'customer_id', 'invoice_date'], errors='ignore')

In [12]:
# Show the first two rows of the encoded frame.
df.iloc[:2]

Unnamed: 0,age,quantity,price,gender_Female,gender_Male,category_Books,category_Clothing,category_Cosmetics,category_Food & Beverage,category_Shoes,...,shopping_mall_Cevahir AVM,shopping_mall_Emaar Square Mall,shopping_mall_Forum Istanbul,shopping_mall_Istinye Park,shopping_mall_Kanyon,shopping_mall_Mall of Istanbul,shopping_mall_Metrocity,shopping_mall_Metropol AVM,shopping_mall_Viaport Outlet,shopping_mall_Zorlu Center
0,28,5,1500.4,1,0,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,21,3,1800.51,0,1,0,0,0,0,1,...,0,0,1,0,0,0,0,0,0,0


In [13]:
# Correlation of every column with the target column `price`.
correlation_matrix = df.corr()
correlation_matrix.loc[:, 'price']

age                                0.001694
quantity                           0.344880
price                              1.000000
gender_Female                     -0.001450
gender_Male                        0.001450
category_Books                    -0.157036
category_Clothing                  0.163976
category_Cosmetics                -0.254765
category_Food & Beverage          -0.298954
category_Shoes                     0.397954
category_Souvenir                 -0.159944
category_Technology                0.602977
category_Toys                     -0.207577
payment_method_Cash                0.001497
payment_method_Credit Card        -0.000558
payment_method_Debit Card         -0.001190
shopping_mall_Cevahir AVM         -0.000313
shopping_mall_Emaar Square Mall    0.003704
shopping_mall_Forum Istanbul      -0.003620
shopping_mall_Istinye Park        -0.000880
shopping_mall_Kanyon               0.001274
shopping_mall_Mall of Istanbul     0.002826
shopping_mall_Metrocity         

# Train/test split

In [14]:
# Target is `price`; every remaining column is used as a feature.
# NOTE(review): in the rows displayed earlier, price looks like quantity times a fixed
# per-category unit price (Clothing: 300.08 at quantity 1, 1500.4 at quantity 5).
# If that holds for the whole file, the target is fully determined by the
# features, which would explain near-perfect tree-model scores — confirm.
y = df.loc[:, 'price']
X = df.drop('price', axis=1)

In [15]:
# Hold out 30% of the rows for evaluation; the fixed seed makes the split repeatable.
split_parts = train_test_split(X, y, test_size=0.3, random_state=2)
X_train, X_test, y_train, y_test = split_parts

# Models

## 1. Linear Regression

In [16]:
# Ordinary least squares baseline: fit on the training split, predict the hold-out.
lm = LinearRegression().fit(X_train, y_train)
lm_pred = lm.predict(X_test)

In [17]:
# Report R2 and mean absolute error of the linear model on the hold-out split.
for label, scorer in (('R2 score', r2_score), ('MAE', mean_absolute_error)):
    print(label, scorer(y_test, lm_pred))

R2 score 0.8411827242358375
MAE 248.506676869221


## 2. Ridge Regression

In [18]:
# L2-regularised linear model (alpha=10): fit on the training split, predict the hold-out.
rr = Ridge(alpha=10).fit(X_train, y_train)
rr_pred = rr.predict(X_test)

In [19]:
# Report R2 and mean absolute error of the ridge model on the hold-out split.
for label, scorer in (('R2 score', r2_score), ('MAE', mean_absolute_error)):
    print(label, scorer(y_test, rr_pred))

R2 score 0.8412064919597729
MAE 248.4496579030048


## 3. Lasso Regression

In [20]:
# L1-regularised linear model with a very small penalty (alpha=1e-4).
# Note that `lr` holds the Lasso estimator here, not a plain LinearRegression.
lr = Lasso(alpha=1e-4).fit(X_train, y_train)
lr_pred = lr.predict(X_test)

In [21]:
# Report R2 and mean absolute error of the lasso model on the hold-out split.
for label, scorer in (('R2 score', r2_score), ('MAE', mean_absolute_error)):
    print(label, scorer(y_test, lr_pred))

R2 score 0.8411827398660014
MAE 248.50654065885675


## 4. KNN Regressor

In [22]:
# Disabled cell: 10-nearest-neighbours regressor fit and hold-out prediction.
# No output is saved for this model. Re-enable together with the metrics cell below.
# knn=KNeighborsRegressor(n_neighbors=10)
# knn.fit(X_train, y_train)
# knn_pred=knn.predict(X_test)

In [23]:
# Disabled cell: needs knn_pred, which is only produced by the (also disabled) KNN fit cell.
# print('R2 score',r2_score(y_test,knn_pred))
# print('MAE',mean_absolute_error(y_test,knn_pred))

## 5. Decision Tree Regressor

In [24]:
# Depth-limited regression tree: fit on the training split, predict the hold-out.
# random_state is fixed because ties between equally good splits are otherwise
# broken with an unseeded generator, so the fitted tree could differ between runs.
dtr = DecisionTreeRegressor(max_depth=8, random_state=2)
dtr.fit(X_train, y_train)
dtr_pred = dtr.predict(X_test)

In [25]:
# Report R2 and mean absolute error of the decision tree on the hold-out split.
for label, scorer in (('R2 score', r2_score), ('MAE', mean_absolute_error)):
    print(label, scorer(y_test, dtr_pred))

R2 score 0.9999573177211839
MAE 2.60347393551593


## 6. Support Vector Regressor

In [26]:
# Disabled cell: RBF-kernel support vector regressor fit and hold-out prediction.
# No output is saved for this model. Re-enable together with the metrics cell below.
# svm=SVR(kernel='rbf',C=10000,epsilon=0.1)
# svm.fit(X_train, y_train)
# svm_pred=svm.predict(X_test)

In [27]:
# Disabled cell: needs svm_pred, which is only produced by the (also disabled) SVR fit cell.
# print('R2 score',r2_score(y_test,svm_pred))
# print('MAE',mean_absolute_error(y_test,svm_pred))

## 7. Random Forest Regressor

In [28]:
# Random forest of 100 depth-limited trees. max_samples and max_features are
# given as fractions (0.5 of the training rows, 0.75 of the features).
rfr_params = {
    'n_estimators': 100,
    'random_state': 3,
    'max_samples': 0.5,
    'max_features': 0.75,
    'max_depth': 15,
}
rfr = RandomForestRegressor(**rfr_params)
rfr.fit(X_train, y_train)
rfr_pred = rfr.predict(X_test)

In [29]:
# Report R2 and mean absolute error of the random forest on the hold-out split.
for label, scorer in (('R2 score', r2_score), ('MAE', mean_absolute_error)):
    print(label, scorer(y_test, rfr_pred))

R2 score 0.9999980975287465
MAE 0.1355615207917331


## 8. Extra Trees Regressor

In [30]:
# Disabled cell: Extra Trees regressor fit and hold-out prediction.
# NOTE(review): the last line originally predicted with `rfr` (the Random Forest)
# instead of `etr`; it is corrected below so that re-enabling the cell evaluates
# the Extra Trees model rather than repeating the Random Forest predictions.
# etr=ExtraTreesRegressor(n_estimators=200,
#                               random_state=4,
#                               max_features=0.75,
#                               max_depth=20,
#                               max_samples=None,
#                               )
# etr.fit(X_train, y_train)
# etr_pred=etr.predict(X_test)

In [31]:
# Disabled cell: needs etr_pred from the (also disabled) Extra Trees fit cell.
# NOTE(review): the output saved under this cell is identical to the Random Forest
# scores, so it does not describe an Extra Trees model; regenerate or clear it.
# print('R2 score',r2_score(y_test,etr_pred))
# print('MAE',mean_absolute_error(y_test,etr_pred))

R2 score 0.9999980975287465
MAE 0.1355615207917331


## 9. Gradient Boosting Regressor

In [32]:
# Gradient boosting with 500 stages: fit on the training split, predict the hold-out.
# random_state is fixed so the per-tree randomness (feature permutation when
# searching for splits) is seeded and the reported scores can be reproduced.
gbr = GradientBoostingRegressor(n_estimators=500, random_state=2)
gbr.fit(X_train, y_train)
gbr_pred = gbr.predict(X_test)

In [33]:
# Report R2 and mean absolute error of gradient boosting on the hold-out split.
for label, scorer in (('R2 score', r2_score), ('MAE', mean_absolute_error)):
    print(label, scorer(y_test, gbr_pred))

R2 score 0.9999999835253796
MAE 0.0938527696563289


## 10. CatBoost Regressor

In [34]:
# CatBoost with library defaults and training logs silenced (verbose=0).
cbr = CatBoostRegressor(verbose=0)
cbr.fit(X=X_train, y=y_train)
cbr_pred = cbr.predict(X_test)

In [35]:
# Report R2 and mean absolute error of CatBoost on the hold-out split.
for label, scorer in (('R2 score', r2_score), ('MAE', mean_absolute_error)):
    print(label, scorer(y_test, cbr_pred))

R2 score 0.999999999999946
MAE 0.00011447290533843364


## 11. XGBoost Regressor

In [36]:
# Disabled cell: XGBoost regressor fit and hold-out prediction.
# Re-enable together with the metrics cell below.
# xgb=XGBRegressor(n_estimators=100, learning_rate=0.05)
# xgb.fit(X_train, y_train)
# xgb_pred=xgb.predict(X_test)

In [37]:
# Disabled cell: needs xgb_pred from the (also disabled) XGBoost fit cell.
# NOTE(review): the scores saved under this cell come from an earlier run, made
# before the code was commented out; they will not be reproduced on Run All.
# print('R2 score',r2_score(y_test,xgb_pred))
# print('MAE',mean_absolute_error(y_test,xgb_pred))

R2 score 0.999944411745229
MAE 4.467575770055355


## 12. LGBM Regressor

In [38]:
# LightGBM with 1500 boosting rounds and trees capped at depth 10.
lgbm = LGBMRegressor(max_depth=10, n_estimators=1500)
lgbm.fit(X=X_train, y=y_train)
lgbm_pred = lgbm.predict(X_test)

In [39]:
# Report R2 and mean absolute error of LightGBM on the hold-out split.
for label, scorer in (('R2 score', r2_score), ('MAE', mean_absolute_error)):
    print(label, scorer(y_test, lgbm_pred))

R2 score 0.9999999999999993
MAE 1.4110167356025985e-05
