In [382]:
# Import the libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.decomposition import PCA

In [383]:
# Load the store-level sales data from a CSV in the working directory
sales = pd.read_csv('sales_data.csv')
# Preview the first five rows to check that the columns parsed as expected
sales.head()

Unnamed: 0,Store_ID,Revenue,Transactions,Customer_Satisfaction
0,101,50000,1200,8.5
1,102,75000,1800,9.0
2,103,42000,1000,7.8
3,104,89000,2200,9.3
4,105,66000,1500,8.2


In [384]:
# Show the whole frame (ten rows) so the raw, unscaled values are visible
sales

Unnamed: 0,Store_ID,Revenue,Transactions,Customer_Satisfaction
0,101,50000,1200,8.5
1,102,75000,1800,9.0
2,103,42000,1000,7.8
3,104,89000,2200,9.3
4,105,66000,1500,8.2
5,106,102000,2500,9.5
6,107,57000,1300,8.0
7,108,98000,2700,9.1
8,109,76000,1900,8.7
9,110,85000,2100,9.2


In [385]:
# Min-max scaling: rescale each selected column into the range [0, 1]

# Columns to be scaled (Store_ID is not listed, so it is left untouched)
features = ['Revenue', 'Transactions', 'Customer_Satisfaction']

# Scaler that maps each column's minimum to 0 and its maximum to 1
scaler = MinMaxScaler(feature_range=(0, 1))

# Learn the per-column min/max, then overwrite the columns with scaled values
scaler.fit(sales[features])
sales[features] = scaler.transform(sales[features])


In [386]:
# Confirm the scaling: Revenue, Transactions and Customer_Satisfaction now run from 0 to 1
sales

Unnamed: 0,Store_ID,Revenue,Transactions,Customer_Satisfaction
0,101,0.133333,0.117647,0.411765
1,102,0.55,0.470588,0.705882
2,103,0.0,0.0,0.0
3,104,0.783333,0.705882,0.882353
4,105,0.4,0.294118,0.235294
5,106,1.0,0.882353,1.0
6,107,0.25,0.176471,0.117647
7,108,0.933333,1.0,0.764706
8,109,0.566667,0.529412,0.529412
9,110,0.716667,0.647059,0.823529


In [387]:
# Feature clipping
# Load the customer survey data used by the clipping and log-scaling cells
survey = pd.read_csv('survey_data.csv')
# Display it before clipping; note the extreme values (e.g. Age 120, Purchase_amount 30000)
survey

Unnamed: 0,Customer_ID,Purchase_amount,Age,Review_score
0,1,150,25,8
1,2,2300,45,9
2,3,75,120,7
3,4,9100,35,6
4,5,20,22,10
5,6,4500,40,10
6,7,30000,28,2
7,8,600,85,5


In [388]:
# Clip outliers: force each column into a fixed [lower, upper] range.
# Values below the lower bound are raised to it; values above the upper
# bound are lowered to it.
survey['Purchase_amount'] = survey['Purchase_amount'].clip(lower=20, upper=10000)
# NOTE(review): an upper bound of 40 also caps the ages 45 and 85 (not just
# the implausible 120) -- confirm this cut-off is intended.
survey['Age'] = survey['Age'].clip(lower=1, upper=40)

In [389]:
# Check the clipped result: Purchase_amount is capped at 10000 and Age at 40
survey

Unnamed: 0,Customer_ID,Purchase_amount,Age,Review_score
0,1,150,25,8
1,2,2300,40,9
2,3,75,40,7
3,4,9100,35,6
4,5,20,22,10
5,6,4500,40,10
6,7,10000,28,2
7,8,600,40,5


In [390]:
# Log scaling
# Apply log(1 + x) to compress large values. np.log1p is NumPy's dedicated
# routine for this and stays accurate when x is close to zero.
# NOTE: the columns are overwritten in place, so re-running this cell applies
# the transform a second time -- reload `survey` before re-running.
log_features = ['Purchase_amount', 'Age', 'Review_score']
survey[log_features] = np.log1p(survey[log_features])



In [391]:
# Inspect the log-transformed columns
survey

Unnamed: 0,Customer_ID,Purchase_amount,Age,Review_score
0,1,5.01728,3.258097,2.197225
1,2,7.741099,3.713572,2.302585
2,3,4.330733,3.713572,2.079442
3,4,9.11614,3.583519,1.94591
4,5,3.044522,3.135494,2.397895
5,6,8.412055,3.713572,2.397895
6,7,9.21044,3.367296,1.098612
7,8,6.398595,3.713572,1.791759


In [392]:
# Re-display sales before standardising: it still holds the min-max scaled values
sales

Unnamed: 0,Store_ID,Revenue,Transactions,Customer_Satisfaction
0,101,0.133333,0.117647,0.411765
1,102,0.55,0.470588,0.705882
2,103,0.0,0.0,0.0
3,104,0.783333,0.705882,0.882353
4,105,0.4,0.294118,0.235294
5,106,1.0,0.882353,1.0
6,107,0.25,0.176471,0.117647
7,108,0.933333,1.0,0.764706
8,109,0.566667,0.529412,0.529412
9,110,0.716667,0.647059,0.823529


In [393]:
# Z-score scaling (StandardScaler): centre each column on 0 with unit variance

# Columns to standardise. The capitalised names keep this cell's objects
# separate from `scaler` / `features` used in the min-max cell.
Features = ['Revenue', 'Transactions', 'Customer_Satisfaction']
Scaler = StandardScaler()

# `sales` already holds min-max scaled values at this point; z-scores are
# unaffected by that earlier linear rescaling.
# Learn each column's mean and standard deviation, then overwrite the columns.
Scaler.fit(sales[Features])
sales[Features] = Scaler.transform(sales[Features])

In [394]:
# Bucketing with equally spaced boundaries
# Nine sample values (15 to 100) used by the bucketing cells
number = dict(value=[15, 22, 27, 35, 45, 52, 65, 85, 100])

# Single-column frame holding those values
number_df = pd.DataFrame(data=number)

In [395]:
# Display the nine sample values before bucketing
number_df

Unnamed: 0,value
0,15
1,22
2,27
3,35
4,45
5,52
6,65
7,85
8,100


In [396]:
# Equal-width bucketing: split the value range into 4 intervals of the same width.
# labels=False yields the integer bucket index (0-3) instead of an interval label.
equal_width_codes = pd.cut(x=number_df['value'], bins=4, labels=False)
number_df['bucket'] = equal_width_codes
number_df

Unnamed: 0,value,bucket
0,15,0
1,22,0
2,27,0
3,35,0
4,45,1
5,52,1
6,65,2
7,85,3
8,100,3


In [397]:
# Quantile bucketing: choose the 4 bin edges at the quartiles, so each bucket
# holds roughly the same number of rows (unlike the equal-width buckets above).
quantile_codes = pd.qcut(x=number_df['value'], q=4, labels=False)
number_df['bucket_quant'] = quantile_codes
number_df

Unnamed: 0,value,bucket,bucket_quant
0,15,0,0
1,22,0,0
2,27,0,0
3,35,0,1
4,45,1,1
5,52,1,2
6,65,2,2
7,85,3,3
8,100,3,3


In [398]:
# Categorical variables
# One-hot encoding
# Load the color dataset (an 'ID' column and a categorical 'Color' column)
color = pd.read_csv('color.csv')
# Display all rows to see which categories are present
color


Unnamed: 0,ID,Color
0,1,Red
1,2,Blue
2,3,Green
3,4,Red
4,5,Blue
5,6,Yellow
6,7,Green
7,8,Blue
8,9,Red
9,10,Yellow


In [399]:
# One-hot encode the Color column
# sparse_output=False makes the encoder return a dense NumPy array
encoder = OneHotEncoder(sparse_output=False)

# Learn the categories and build one 0/1 indicator column per category.
# Double brackets pass a 2-D (single-column) frame, as the encoder expects.
encoded_color = encoder.fit_transform(color[['Color']])

# Wrap the array in a DataFrame whose columns are named after the categories.
# Reuse color's index: pd.concat(axis=1) aligns rows by index label, so a
# fresh 0..n-1 index would misalign rows (and introduce NaNs) if `color`
# ever carried a non-default index, e.g. after filtering.
encoded_df = pd.DataFrame(
    encoded_color,
    columns=encoder.categories_[0],
    index=color.index,
)

# Place the indicator columns alongside the original columns
new_color = pd.concat([color, encoded_df], axis=1)
new_color

Unnamed: 0,ID,Color,Blue,Green,Red,Yellow
0,1,Red,0.0,0.0,1.0,0.0
1,2,Blue,1.0,0.0,0.0,0.0
2,3,Green,0.0,1.0,0.0,0.0
3,4,Red,0.0,0.0,1.0,0.0
4,5,Blue,1.0,0.0,0.0,0.0
5,6,Yellow,0.0,0.0,0.0,1.0
6,7,Green,0.0,1.0,0.0,0.0
7,8,Blue,1.0,0.0,0.0,0.0
8,9,Red,0.0,0.0,1.0,0.0
9,10,Yellow,0.0,0.0,0.0,1.0


In [400]:
# Label encoder
# Map each distinct color string to an integer code
label_enc = LabelEncoder()
# In the output the codes follow sorted category order: Blue=0, Green=1, Red=2, Yellow=3
# NOTE(review): the column name 'color_lable' looks like a typo for 'color_label';
# left unchanged because the displayed output uses this name.
color['color_lable'] = label_enc.fit_transform(color['Color'])

In [401]:
# Display the frame with the new integer-coded column
color


Unnamed: 0,ID,Color,color_lable
0,1,Red,2
1,2,Blue,0
2,3,Green,1
3,4,Red,2
4,5,Blue,0
5,6,Yellow,3
6,7,Green,1
7,8,Blue,0
8,9,Red,2
9,10,Yellow,3


In [402]:
# Date-time features
# Two sample timestamps, stored as plain strings at this stage
date = pd.DataFrame(
    {
        'Timestamp': [
            '2024-01-02 15:30:45',
            '2024-05-07 17:35:55',
        ],
    }
)
date

Unnamed: 0,Timestamp
0,2024-01-02 15:30:45
1,2024-05-07 17:35:55


In [403]:
# Parse the timestamp strings into pandas datetimes so that the `.dt`
# accessor can split them into components in a later cell
date['Timestamp'] = pd.to_datetime(date['Timestamp'])


In [404]:
# Display the frame after conversion (the printed values look the same as the original strings)
date

Unnamed: 0,Timestamp
0,2024-01-02 15:30:45
1,2024-05-07 17:35:55


In [405]:
# Break the timestamp into calendar and clock components, one column each.
# Keys are the new column names; values are the matching `.dt` attributes.
# (`weekday` is an integer with Monday = 0.)
component_attrs = {
    'Year': 'year',
    'Month': 'month',
    'Day': 'day',
    'Hour': 'hour',
    'Minute': 'minute',
    'Second': 'second',
    'Weekday': 'weekday',
}
for column_name, dt_attr in component_attrs.items():
    date[column_name] = getattr(date['Timestamp'].dt, dt_attr)


In [406]:
# Show the original timestamp alongside its extracted components
date

Unnamed: 0,Timestamp,Year,Month,Day,Hour,Minute,Second,Weekday
0,2024-01-02 15:30:45,2024,1,2,15,30,45,1
1,2024-05-07 17:35:55,2024,5,7,17,35,55,1


In [407]:
# Load the wholesale customers dataset used for the standardise-then-PCA example
wholesale = pd.read_csv('wholesale_customers_data.csv')

In [408]:
# Standardise every column (zero mean, unit variance) ahead of PCA, so that
# no column dominates purely because of its scale.
# NOTE(review): in the rows printed below, column 0 shows only two distinct
# values -- if it is a categorical code, confirm that z-scoring it (and
# feeding it to PCA) is intended.
scale = StandardScaler()
scale.fit(wholesale)
scaled_data = scale.transform(wholesale)

In [409]:
# Display the standardised array (440 rows x 8 columns)
scaled_data

array([[ 1.44865163,  0.59066829,  0.05293319, ..., -0.58936716,
        -0.04356873, -0.06633906],
       [ 1.44865163,  0.59066829, -0.39130197, ..., -0.27013618,
         0.08640684,  0.08915105],
       [ 1.44865163,  0.59066829, -0.44702926, ..., -0.13753572,
         0.13323164,  2.24329255],
       ...,
       [ 1.44865163,  0.59066829,  0.20032554, ..., -0.54337975,
         2.51121768,  0.12145607],
       [-0.69029709,  0.59066829, -0.13538389, ..., -0.41944059,
        -0.56977032,  0.21304614],
       [-0.69029709,  0.59066829, -0.72930698, ..., -0.62009417,
        -0.50488752, -0.52286938]], shape=(440, 8))

In [410]:
# apply pca
pca = PCA(n_components=2)

# lets fit
pca_scaled = pca.fit_transform(scaled_data)

In [411]:
# Display the 2-D projection: one row per sample, one column per component
pca_scaled

array([[ 8.43938928e-01, -5.15350749e-01],
       [ 1.06267645e+00, -4.84601258e-01],
       [ 1.26914052e+00,  6.82054553e-01],
       [-1.05678239e+00,  6.10821496e-01],
       [ 6.34030484e-01,  9.74198836e-01],
       [ 5.30110898e-01, -5.85411382e-01],
       [ 3.99560245e-01, -8.46342878e-01],
       [ 7.91165650e-01, -4.61140175e-01],
       [-6.63831131e-01, -5.77465454e-01],
       [ 2.15068480e+00, -6.17913981e-01],
       [ 1.25599389e+00, -5.12452434e-01],
       [-1.68210612e-01, -6.88796117e-01],
       [ 1.43985588e+00,  6.08852947e-01],
       [ 1.40695611e+00, -1.60564663e-01],
       [ 1.37395999e+00,  6.85013841e-02],
       [-1.07181521e+00, -5.19859188e-01],
       [ 1.25801233e+00, -1.12165441e+00],
       [-6.18599013e-01,  2.82406711e-01],
       [ 8.67093429e-01,  2.20448419e-01],
       [-4.84666795e-01, -5.76463994e-01],
       [ 3.31514687e-01, -1.94171620e-01],
       [-1.25444108e+00, -3.05734448e-01],
       [-6.95754707e-01,  2.16418147e+00],
       [ 4.

In [412]:
# Variance ratio
# Share of the total variance captured by each kept component; the two values
# shown below (about 0.39 and 0.22) sum to roughly 0.61
pca.explained_variance_ratio_

array([0.38750123, 0.22374588])