In [167]:
# Import the libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder

In [168]:
# Read the per-store sales table from disk and preview its first five rows
sales = pd.read_csv(filepath_or_buffer='sales_data.csv')
sales.head()

Unnamed: 0,Store_ID,Revenue,Transactions,Customer_Satisfaction
0,101,50000,1200,8.5
1,102,75000,1800,9.0
2,103,42000,1000,7.8
3,104,89000,2200,9.3
4,105,66000,1500,8.2


In [169]:
# Display the whole sales table; the raw numeric columns sit on very different scales
sales

Unnamed: 0,Store_ID,Revenue,Transactions,Customer_Satisfaction
0,101,50000,1200,8.5
1,102,75000,1800,9.0
2,103,42000,1000,7.8
3,104,89000,2200,9.3
4,105,66000,1500,8.2
5,106,102000,2500,9.5
6,107,57000,1300,8.0
7,108,98000,2700,9.1
8,109,76000,1900,8.7
9,110,85000,2100,9.2


In [170]:
# Scaling to a range (min-max scaling)
# Each selected column is mapped linearly onto [0, 1]: its minimum becomes 0
# and its maximum becomes 1.
scaler = MinMaxScaler(feature_range=(0, 1))

# Columns to be scaled; Store_ID is not in the list and is left untouched
features = ['Revenue', 'Transactions', 'Customer_Satisfaction']

# Learn each column's min/max, then overwrite the columns in `sales`
# with their scaled values
scaler.fit(sales[features])
sales[features] = scaler.transform(sales[features])


In [171]:
# Inspect the result: the three scaled columns now lie in [0, 1], Store_ID is unchanged
sales

Unnamed: 0,Store_ID,Revenue,Transactions,Customer_Satisfaction
0,101,0.133333,0.117647,0.411765
1,102,0.55,0.470588,0.705882
2,103,0.0,0.0,0.0
3,104,0.783333,0.705882,0.882353
4,105,0.4,0.294118,0.235294
5,106,1.0,0.882353,1.0
6,107,0.25,0.176471,0.117647
7,108,0.933333,1.0,0.764706
8,109,0.566667,0.529412,0.529412
9,110,0.716667,0.647059,0.823529


In [172]:
# Feature clipping
# Read the customer survey table from disk and display it in full
survey = pd.read_csv(filepath_or_buffer='survey_data.csv')
survey

Unnamed: 0,Customer_ID,Purchase_amount,Age,Review_score
0,1,150,25,8
1,2,2300,45,9
2,3,75,120,7
3,4,9100,35,6
4,5,20,22,10
5,6,4500,40,10
6,7,30000,28,2
7,8,600,85,5


In [173]:
# Clip the data: values outside the bounds are replaced by the nearest bound.
# Purchase_amount is limited to [20, 10000] and Age to [1, 40]; both columns
# are overwritten in place.
# NOTE(review): an upper Age bound of 40 also caps the plausible ages 45 and 85
# shown in the table above, not only the outlier 120 — confirm this is intended.
survey['Purchase_amount'] = survey['Purchase_amount'].clip(lower=20, upper=10000)
survey['Age'] = survey['Age'].clip(lower=1, upper=40)

In [174]:
# Inspect the clipped table: Purchase_amount is capped at 10000 and Age at 40
survey

Unnamed: 0,Customer_ID,Purchase_amount,Age,Review_score
0,1,150,25,8
1,2,2300,40,9
2,3,75,40,7
3,4,9100,35,6
4,5,20,22,10
5,6,4500,40,10
6,7,10000,28,2
7,8,600,40,5


In [175]:
# Log scaling
# Replace each listed column with log(1 + x). np.log1p is NumPy's dedicated
# routine for this expression: it gives the same result as np.log(x + 1) but
# stays accurate when x is close to zero, and it handles all three columns in
# one vectorised call instead of three copy-pasted lines.
# NOTE: the columns are overwritten in place, so running this cell a second
# time applies the logarithm twice — reload `survey` before re-running.
log_columns = ['Purchase_amount', 'Age', 'Review_score']
survey[log_columns] = np.log1p(survey[log_columns])



In [176]:
# Inspect the log-scaled table: the three transformed columns are now floats
survey

Unnamed: 0,Customer_ID,Purchase_amount,Age,Review_score
0,1,5.01728,3.258097,2.197225
1,2,7.741099,3.713572,2.302585
2,3,4.330733,3.713572,2.079442
3,4,9.11614,3.583519,1.94591
4,5,3.044522,3.135494,2.397895
5,6,8.412055,3.713572,2.397895
6,7,9.21044,3.367296,1.098612
7,8,6.398595,3.713572,1.791759


In [177]:
# Re-display `sales` before standardising: the feature columns still hold the
# min-max scaled values written by the earlier scaling cell, not the raw figures
sales

Unnamed: 0,Store_ID,Revenue,Transactions,Customer_Satisfaction
0,101,0.133333,0.117647,0.411765
1,102,0.55,0.470588,0.705882
2,103,0.0,0.0,0.0
3,104,0.783333,0.705882,0.882353
4,105,0.4,0.294118,0.235294
5,106,1.0,0.882353,1.0
6,107,0.25,0.176471,0.117647
7,108,0.933333,1.0,0.764706
8,109,0.566667,0.529412,0.529412
9,110,0.716667,0.647059,0.823529


In [178]:
# Z-score / standard scaler
# Standardisation rescales each column to mean 0 and unit variance.
Scaler = StandardScaler()

# Columns to standardise
Features = ['Revenue', 'Transactions', 'Customer_Satisfaction']

# NOTE(review): `sales` was already min-max scaled in place by an earlier cell,
# so the scaler is fitted on those [0, 1] values rather than on the raw figures.
# Fit on the selected columns, then write the standardised values back in place
sales[Features] = Scaler.fit(sales[Features]).transform(sales[Features])

In [179]:
# Bucketing with equally spaced boundaries
# Build a small single-column example table to demonstrate the bucketing on
number = dict(value=[15, 22, 27, 35, 45, 52, 65, 85, 100])

number_df = pd.DataFrame(data=number)

In [180]:
# Display the example values before any bucketing is applied
number_df

Unnamed: 0,value
0,15
1,22
2,27
3,35
4,45
5,52
6,65
7,85
8,100


In [181]:
# Equal-width bucketing: pd.cut splits the range of `value` into 4 intervals
# of equal width; labels=False stores the integer interval number (0-3)
# instead of an interval label
number_df['bucket'] = pd.cut(x=number_df['value'], bins=4, labels=False)
number_df

Unnamed: 0,value,bucket
0,15,0
1,22,0
2,27,0
3,35,0
4,45,1
5,52,1
6,65,2
7,85,3
8,100,3


In [182]:
# Quantile bucketing: pd.qcut places the boundaries at the quartiles of `value`,
# so each of the 4 buckets holds roughly the same number of rows;
# labels=False stores the integer bucket number (0-3)
number_df['bucket_quant'] = pd.qcut(x=number_df['value'], q=4, labels=False)
number_df

Unnamed: 0,value,bucket,bucket_quant
0,15,0,0
1,22,0,0
2,27,0,0
3,35,0,1
4,45,1,1
5,52,1,2
6,65,2,2
7,85,3,3
8,100,3,3


In [183]:
# Categorical variables
# One-hot encoding
# Read the color dataset from disk and display it in full
color = pd.read_csv(filepath_or_buffer='color.csv')
color


Unnamed: 0,ID,Color
0,1,Red
1,2,Blue
2,3,Green
3,4,Red
4,5,Blue
5,6,Yellow
6,7,Green
7,8,Blue
8,9,Red
9,10,Yellow


In [184]:
# Initialise the one-hot encoder; sparse_output=False makes it return a dense
# NumPy array instead of a SciPy sparse matrix
encoder = OneHotEncoder(sparse_output=False)

# Fit on the Color column (passed as a one-column DataFrame) and encode it:
# the result has one 0/1 indicator column per distinct color
encoded_color = encoder.fit_transform(color[['Color']])

# Wrap the array in a DataFrame. categories_[0] holds the category labels of the
# single encoded column, in the same order as the indicator columns.
# Reusing color's index keeps the rows aligned in the concat below even when
# `color` does not carry a default 0..n-1 index (e.g. after filtering);
# without it, concat along axis=1 would match on index labels and leave NaN rows.
encoded_df = pd.DataFrame(
    encoded_color,
    columns=encoder.categories_[0],
    index=color.index,
)

# Append the indicator columns to the right of the original table
new_color = pd.concat([color, encoded_df], axis=1)
new_color

Unnamed: 0,ID,Color,"(Blue,)","(Green,)","(Red,)","(Yellow,)"
0,1,Red,0.0,0.0,1.0,0.0
1,2,Blue,1.0,0.0,0.0,0.0
2,3,Green,0.0,1.0,0.0,0.0
3,4,Red,0.0,0.0,1.0,0.0
4,5,Blue,1.0,0.0,0.0,0.0
5,6,Yellow,0.0,0.0,0.0,1.0
6,7,Green,0.0,1.0,0.0,0.0
7,8,Blue,1.0,0.0,0.0,0.0
8,9,Red,0.0,0.0,1.0,0.0
9,10,Yellow,0.0,0.0,0.0,1.0
