In [1]:
#Import necessary Libraries
import pandas as pd

In [17]:
# Read data
df = pd.read_csv(r"Income_Input.csv")

## Basic Sanity Checks

In [18]:
df.shape

(10000, 1)

In [19]:
df.head(5)

Unnamed: 0,Income($)
0,48969
1,96432
2,11400
3,59341
4,91562


In [21]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 1 columns):
 #   Column     Non-Null Count  Dtype
---  ------     --------------  -----
 0   Income($)  10000 non-null  int64
dtypes: int64(1)
memory usage: 78.2 KB


### Create a new column called "Customer Category" binning based on the Income

In [22]:
df['Customer_Category'] = pd.cut(df['Income($)'], bins=[100, 10000, 30000, 50000, 100000], \
                                 labels= ['Low Spend', 'Medium Spend', 'High Spend', 'High Net Worth Customer'])

In [23]:
df.head()

Unnamed: 0,Income($),Customer_Category
0,48969,High Spend
1,96432,High Net Worth Customer
2,11400,Medium Spend
3,59341,High Net Worth Customer
4,91562,High Net Worth Customer


In [24]:
df.groupby('Customer_Category').agg({'Income($)': ['max', 'min', 'mean']})

Unnamed: 0_level_0,Income($),Income($),Income($)
Unnamed: 0_level_1,max,min,mean
Customer_Category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
Low Spend,9995,103,5103.165503
Medium Spend,29989,10003,19936.146208
High Spend,49999,30007,40018.590864
High Net Worth Customer,99998,50003,74985.411219


### Binning based on the percentile of the data

In [25]:
quantile_incomerange = df['Income($)'].quantile([0, 0.2, 0.5, 0.95, 1])
quantile_incomerange

0.00      100.00
0.20    19769.80
0.50    49601.50
0.95    94882.05
1.00    99998.00
Name: Income($), dtype: float64

In [26]:
incomerange = list(quantile_incomerange.values)
incomerange

[100.0, 19769.800000000003, 49601.5, 94882.04999999999, 99998.0]

In [27]:
df['Percentile_based_category'] = pd.cut(df['Income($)'], bins=incomerange, 
                                         labels=['Lower Band', 'Medium Band', 'Upper Band', 'Top 5 Percentile'])

### View the df after the addition of new column

In [28]:
df.head()

Unnamed: 0,Income($),Customer_Category,Percentile_based_category
0,48969,High Spend,Medium Band
1,96432,High Net Worth Customer,Top 5 Percentile
2,11400,Medium Spend,Lower Band
3,59341,High Net Worth Customer,Upper Band
4,91562,High Net Worth Customer,Upper Band


### New Group By

In [29]:
df.groupby('Percentile_based_category').agg({'Income($)': ['min', 'max', 'mean']})

Unnamed: 0_level_0,Income($),Income($),Income($)
Unnamed: 0_level_1,min,max,mean
Percentile_based_category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
Lower Band,103,19765,9982.985493
Medium Band,19771,49601,34800.238667
Upper Band,49602,94882,72234.785333
Top 5 Percentile,94883,99998,97525.646


# Dummification of Data

In [30]:
df['Customer_Category'].unique()

['High Spend', 'High Net Worth Customer', 'Medium Spend', 'Low Spend', NaN]
Categories (4, object): ['Low Spend' < 'Medium Spend' < 'High Spend' < 'High Net Worth Customer']

In [31]:
df['Customer_Category'].value_counts()

High Net Worth Customer    4956
High Spend                 2036
Medium Spend               2004
Low Spend                  1003
Name: Customer_Category, dtype: int64

In [33]:
dummy = pd.get_dummies(df['Customer_Category'])

In [34]:
dummy

Unnamed: 0,Low Spend,Medium Spend,High Spend,High Net Worth Customer
0,0,0,1,0
1,0,0,0,1
2,0,1,0,0
3,0,0,0,1
4,0,0,0,1
...,...,...,...,...
9995,0,1,0,0
9996,0,0,0,1
9997,0,0,0,1
9998,0,1,0,0


In [35]:
df[['Low Spend', 'Medium Spend', 'High Spend', 'High Net Worth Customer']] = dummy

In [36]:
df.head()

Unnamed: 0,Income($),Customer_Category,Percentile_based_category,Low Spend,Medium Spend,High Spend,High Net Worth Customer
0,48969,High Spend,Medium Band,0,0,1,0
1,96432,High Net Worth Customer,Top 5 Percentile,0,0,0,1
2,11400,Medium Spend,Lower Band,0,1,0,0
3,59341,High Net Worth Customer,Upper Band,0,0,0,1
4,91562,High Net Worth Customer,Upper Band,0,0,0,1


## Logical checks post dummification

In [37]:
df['Customer_Category'].value_counts()

High Net Worth Customer    4956
High Spend                 2036
Medium Spend               2004
Low Spend                  1003
Name: Customer_Category, dtype: int64

In [38]:
print("Low Spend", df['Low Spend'].sum())
print("Medium Spend", df['Medium Spend'].sum())
print("High Spend", df['High Spend'].sum())
print("High Net Worth Customer", df['High Net Worth Customer'].sum())

Low Spend 1003
Medium Spend 2004
High Spend 2036
High Net Worth Customer 4956
