# Task 2: Data Insights

In [1]:
import pandas as pd
import numpy as np
import datetime as DT
from datetime import timedelta
import io
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

In [2]:
# First rows of the sheets are comments, not part of dataframes
df_transactions = pd.read_excel('KPMG_VI_New_raw_data_update_final.xlsx', sheet_name='Transactions', skiprows=1)
df_new_customer_list = pd.read_excel('KPMG_VI_New_raw_data_update_final.xlsx', sheet_name='NewCustomerList', skiprows=1)
df_customer_demographic = pd.read_excel('KPMG_VI_New_raw_data_update_final.xlsx', sheet_name='CustomerDemographic', skiprows=1)
df_customer_address = pd.read_excel('KPMG_VI_New_raw_data_update_final.xlsx', sheet_name='CustomerAddress', skiprows=1)

In [3]:
# Make copies for fail safe.
df_transactions_copy = df_transactions
df_new_customer_list_copy = df_new_customer_list
df_customer_demographic_copy = df_customer_demographic
df_customer_address_copy = df_customer_address

In [4]:
df_transactions.sample()

Unnamed: 0,transaction_id,product_id,customer_id,transaction_date,online_order,order_status,brand,product_line,product_class,product_size,list_price,standard_cost,product_first_sold_date
8281,8282,22,2801,2017-02-04,0.0,Approved,WeareA2B,Standard,medium,medium,60.34,45.26,34165.0


In [5]:
df_new_customer_list.sample()

Unnamed: 0,first_name,last_name,gender,past_3_years_bike_related_purchases,DOB,job_title,job_industry_category,wealth_segment,deceased_indicator,owns_car,...,state,country,property_valuation,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Rank,Value
685,Maurizia,Ritmeyer,Female,95,1980-04-09,Teacher,Entertainment,Mass Customer,N,Yes,...,NSW,Australia,6,0.93,1.1625,1.453125,1.235156,684,684,0.7


In [6]:
df_customer_demographic.sample()

Unnamed: 0,customer_id,first_name,last_name,gender,past_3_years_bike_related_purchases,DOB,job_title,job_industry_category,wealth_segment,deceased_indicator,default,owns_car,tenure
355,356,Nichole,,Female,10,1975-03-30,Librarian,Entertainment,High Net Worth,N,-1,No,5.0


In [7]:
df_customer_address.sample()

Unnamed: 0,customer_id,address,postcode,state,country,property_valuation
991,996,8298 Texas Alley,2194,NSW,Australia,10


## Analyze and Clean New Customer List

In [8]:
df_new_customer_list.head()

Unnamed: 0,first_name,last_name,gender,past_3_years_bike_related_purchases,DOB,job_title,job_industry_category,wealth_segment,deceased_indicator,owns_car,...,state,country,property_valuation,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Rank,Value
0,Chickie,Brister,Male,86,1957-07-12,General Manager,Manufacturing,Mass Customer,N,Yes,...,QLD,Australia,6,0.56,0.7,0.875,0.74375,1,1,1.71875
1,Morly,Genery,Male,69,1970-03-22,Structural Engineer,Property,Mass Customer,N,No,...,NSW,Australia,11,0.89,0.89,1.1125,0.945625,1,1,1.71875
2,Ardelis,Forrester,Female,10,1974-08-28,Senior Cost Accountant,Financial Services,Affluent Customer,N,No,...,VIC,Australia,5,1.01,1.01,1.01,1.01,1,1,1.71875
3,Lucine,Stutt,Female,64,1979-01-28,Account Representative III,Manufacturing,Affluent Customer,N,Yes,...,QLD,Australia,1,0.87,1.0875,1.0875,1.0875,4,4,1.703125
4,Melinda,Hadlee,Female,34,1965-09-21,Financial Analyst,Financial Services,Affluent Customer,N,No,...,NSW,Australia,9,0.52,0.52,0.65,0.65,4,4,1.703125


In [9]:
df_new_customer_list.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 23 columns):
 #   Column                               Non-Null Count  Dtype         
---  ------                               --------------  -----         
 0   first_name                           1000 non-null   object        
 1   last_name                            971 non-null    object        
 2   gender                               1000 non-null   object        
 3   past_3_years_bike_related_purchases  1000 non-null   int64         
 4   DOB                                  983 non-null    datetime64[ns]
 5   job_title                            894 non-null    object        
 6   job_industry_category                835 non-null    object        
 7   wealth_segment                       1000 non-null   object        
 8   deceased_indicator                   1000 non-null   object        
 9   owns_car                             1000 non-null   object        
 10  tenure       

We have five unnamed columns. We are not sure what they represents, so it is better to consult with the client or drop them for our analysis.

In [10]:
df_new_customer_list.drop(['Unnamed: 16', 'Unnamed: 17', 'Unnamed: 18', 'Unnamed: 19', 'Unnamed: 20'], axis = 1, inplace=True)
df_new_customer_list.head()

Unnamed: 0,first_name,last_name,gender,past_3_years_bike_related_purchases,DOB,job_title,job_industry_category,wealth_segment,deceased_indicator,owns_car,tenure,address,postcode,state,country,property_valuation,Rank,Value
0,Chickie,Brister,Male,86,1957-07-12,General Manager,Manufacturing,Mass Customer,N,Yes,14,45 Shopko Center,4500,QLD,Australia,6,1,1.71875
1,Morly,Genery,Male,69,1970-03-22,Structural Engineer,Property,Mass Customer,N,No,16,14 Mccormick Park,2113,NSW,Australia,11,1,1.71875
2,Ardelis,Forrester,Female,10,1974-08-28,Senior Cost Accountant,Financial Services,Affluent Customer,N,No,10,5 Colorado Crossing,3505,VIC,Australia,5,1,1.71875
3,Lucine,Stutt,Female,64,1979-01-28,Account Representative III,Manufacturing,Affluent Customer,N,Yes,5,207 Annamark Plaza,4814,QLD,Australia,1,4,1.703125
4,Melinda,Hadlee,Female,34,1965-09-21,Financial Analyst,Financial Services,Affluent Customer,N,No,19,115 Montana Place,2093,NSW,Australia,9,4,1.703125


We should convert DOB to age.

In [11]:
df_transactions['transaction_date'].sort_values()

516     2017-01-01
5876    2017-01-01
3459    2017-01-01
12484   2017-01-01
19130   2017-01-01
           ...    
605     2017-12-30
15269   2017-12-30
19906   2017-12-30
15756   2017-12-30
12003   2017-12-30
Name: transaction_date, Length: 20000, dtype: datetime64[ns]

We see that the data is collected in 2017. We should convert age to 2017.

In [12]:
# https://stackoverflow.com/questions/26788854/pandas-get-the-age-from-a-date-example-date-of-birth
# https://stackoverflow.com/questions/58948809/why-do-i-get-valueerror-nattype-does-not-support-strftime-even-though-its-no

df_new_customer_list['DOB'] = pd.to_datetime(df_new_customer_list['DOB'], errors='coerce', format='%Y-%m-%d')

year_2017 = pd.Timestamp('2017-12-31')

df_new_customer_list['age'] = (year_2017 - df_new_customer_list['DOB']).astype('<m8[Y]')

# drop DOB
df_new_customer_list.drop(['DOB'], axis = 1, inplace=True)

df_new_customer_list.head()


Unnamed: 0,first_name,last_name,gender,past_3_years_bike_related_purchases,job_title,job_industry_category,wealth_segment,deceased_indicator,owns_car,tenure,address,postcode,state,country,property_valuation,Rank,Value,age
0,Chickie,Brister,Male,86,General Manager,Manufacturing,Mass Customer,N,Yes,14,45 Shopko Center,4500,QLD,Australia,6,1,1.71875,60.0
1,Morly,Genery,Male,69,Structural Engineer,Property,Mass Customer,N,No,16,14 Mccormick Park,2113,NSW,Australia,11,1,1.71875,47.0
2,Ardelis,Forrester,Female,10,Senior Cost Accountant,Financial Services,Affluent Customer,N,No,10,5 Colorado Crossing,3505,VIC,Australia,5,1,1.71875,43.0
3,Lucine,Stutt,Female,64,Account Representative III,Manufacturing,Affluent Customer,N,Yes,5,207 Annamark Plaza,4814,QLD,Australia,1,4,1.703125,38.0
4,Melinda,Hadlee,Female,34,Financial Analyst,Financial Services,Affluent Customer,N,No,19,115 Montana Place,2093,NSW,Australia,9,4,1.703125,52.0


## RFM

In [13]:
df_transactions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 13 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   transaction_id           20000 non-null  int64         
 1   product_id               20000 non-null  int64         
 2   customer_id              20000 non-null  int64         
 3   transaction_date         20000 non-null  datetime64[ns]
 4   online_order             19640 non-null  float64       
 5   order_status             20000 non-null  object        
 6   brand                    19803 non-null  object        
 7   product_line             19803 non-null  object        
 8   product_class            19803 non-null  object        
 9   product_size             19803 non-null  object        
 10  list_price               20000 non-null  float64       
 11  standard_cost            19803 non-null  float64       
 12  product_first_sold_date  19803 n

In [14]:
df_transactions.isnull().sum()

transaction_id               0
product_id                   0
customer_id                  0
transaction_date             0
online_order               360
order_status                 0
brand                      197
product_line               197
product_class              197
product_size               197
list_price                   0
standard_cost              197
product_first_sold_date    197
dtype: int64

In [15]:
df_transactions[df_transactions['standard_cost'].isnull()].head()

Unnamed: 0,transaction_id,product_id,customer_id,transaction_date,online_order,order_status,brand,product_line,product_class,product_size,list_price,standard_cost,product_first_sold_date
136,137,0,431,2017-09-23,0.0,Approved,,,,,1942.61,,
159,160,0,3300,2017-08-27,0.0,Approved,,,,,1656.86,,
366,367,0,1614,2017-03-10,0.0,Approved,,,,,850.89,,
406,407,0,2559,2017-06-14,1.0,Approved,,,,,710.59,,
676,677,0,2609,2017-07-02,0.0,Approved,,,,,1972.01,,


Let's see if the 197 entries are the same product_id.

In [16]:
df_transactions[df_transactions['standard_cost'].isnull()]['product_id'].value_counts()

0    197
Name: product_id, dtype: int64

The 197 entries will increase noise to the training data. These entries should be excluded from our analysis.

In [17]:
df_transactions.drop(df_transactions[df_transactions['standard_cost'].isnull()].index, inplace=True)
df_transactions.isnull().sum()

transaction_id               0
product_id                   0
customer_id                  0
transaction_date             0
online_order               358
order_status                 0
brand                        0
product_line                 0
product_class                0
product_size                 0
list_price                   0
standard_cost                0
product_first_sold_date      0
dtype: int64

In [18]:
df_transactions['profit'] = df_transactions['list_price'] - df_transactions['standard_cost']
df_transactions.head()

Unnamed: 0,transaction_id,product_id,customer_id,transaction_date,online_order,order_status,brand,product_line,product_class,product_size,list_price,standard_cost,product_first_sold_date,profit
0,1,2,2950,2017-02-25,0.0,Approved,Solex,Standard,medium,medium,71.49,53.62,41245.0,17.87
1,2,3,3120,2017-05-21,1.0,Approved,Trek Bicycles,Standard,medium,large,2091.47,388.92,41701.0,1702.55
2,3,37,402,2017-10-16,0.0,Approved,OHM Cycles,Standard,low,medium,1793.43,248.82,36361.0,1544.61
3,4,88,3135,2017-08-31,0.0,Approved,Norco Bicycles,Standard,medium,medium,1198.46,381.1,36145.0,817.36
4,5,78,787,2017-10-01,1.0,Approved,Giant Bicycles,Standard,medium,large,1765.3,709.48,42226.0,1055.82


In [19]:
# https://towardsdatascience.com/recency-frequency-monetary-model-with-python-and-how-sephora-uses-it-to-optimize-their-google-d6a0707c5f17

snapshot_date = df_transactions['transaction_date'].max() + timedelta(days=1)
print(snapshot_date)


data_process = df_transactions.groupby(['customer_id']).agg({
        'transaction_date': lambda x: (snapshot_date - x.max()).days,
        'transaction_id': 'count',
        'profit': 'mean'})

data_process.rename(columns={'transaction_date': 'Recency',
                         'transaction_id': 'Frequency',
                         'profit': 'MonetaryValue'}, inplace=True)

data_process.head()


2017-12-31 00:00:00


Unnamed: 0_level_0,Recency,Frequency,MonetaryValue
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,8,11,274.371818
2,129,3,742.086667
3,103,8,420.35125
4,196,2,110.285
5,17,6,399.156667


In [20]:
# --Calculate R and F groups--
# Create labels for Recency and Frequency
r_labels = range(4, 0, -1); f_labels = range(1, 5)
# Assign these labels to 4 equal percentile groups 
r_groups = pd.qcut(data_process['Recency'], q=4, labels=r_labels)
# Assign these labels to 4 equal percentile groups 
f_groups = pd.qcut(data_process['Frequency'], q=4, labels=f_labels)
# Create new columns R and F 
data_process = data_process.assign(R = r_groups.values, F = f_groups.values)
data_process.head()

Unnamed: 0_level_0,Recency,Frequency,MonetaryValue,R,F
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,8,11,274.371818,4,4
2,129,3,742.086667,1,1
3,103,8,420.35125,1,4
4,196,2,110.285,1,1
5,17,6,399.156667,4,2


In [21]:
# Create labels for MonetaryValue
m_labels = range(1, 5)
# Assign these labels to three equal percentile groups 
m_groups = pd.qcut(data_process['MonetaryValue'], q=4, labels=m_labels)
# Create new column M
data_process = data_process.assign(M = m_groups.values)

In [22]:
# Calculate RFM_Score
# data_process['RFM_Score'] = data_process[['R','F','M']].sum(axis=1)
data_process['RFM_Score'] = data_process.apply(lambda x: 125 * x['R'] + 100 * x['F'] + 130 * x['M'], axis=1)
data_process

Unnamed: 0_level_0,Recency,Frequency,MonetaryValue,R,F,M,RFM_Score
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,8,11,274.371818,4,4,1,1030
2,129,3,742.086667,1,1,4,745
3,103,8,420.351250,1,4,2,785
4,196,2,110.285000,1,1,1,355
5,17,6,399.156667,4,2,2,960
...,...,...,...,...,...,...,...
3497,53,3,549.440000,2,1,3,740
3498,128,6,524.555000,1,2,2,585
3499,52,7,707.892857,2,3,4,1070
3500,145,6,297.643333,1,2,1,455


In [23]:
data_process.describe()

Unnamed: 0,Recency,Frequency,MonetaryValue,RFM_Score
count,3494.0,3494.0,3494.0,3494.0
mean,62.383515,5.667716,550.34453,860.124499
std,58.382418,2.311129,236.909344,252.252532
min,1.0,1.0,15.08,355.0
25%,18.0,4.0,385.707273,705.0
50%,45.0,6.0,532.0655,860.0
75%,87.0,7.0,695.792571,1040.0
max,354.0,14.0,1702.55,1420.0


In [24]:
data_process.reset_index(level=0, inplace=True)
rfm = data_process[['customer_id', 'RFM_Score']]
rfm.head()

Unnamed: 0,customer_id,RFM_Score
0,1,1030
1,2,745
2,3,785
3,4,355
4,5,960


In [25]:
rfm.describe()

Unnamed: 0,customer_id,RFM_Score
count,3494.0,3494.0
mean,1750.856039,860.124499
std,1011.902531,252.252532
min,1.0,355.0
25%,876.25,705.0
50%,1750.5,860.0
75%,2624.75,1040.0
max,5034.0,1420.0


## Create Old Customer List as Training Data

In [26]:
df_old_customer_list = df_customer_demographic
df_old_customer_list = df_old_customer_list.merge(df_customer_address, how='left', on='customer_id')
df_old_customer_list.head()

Unnamed: 0,customer_id,first_name,last_name,gender,past_3_years_bike_related_purchases,DOB,job_title,job_industry_category,wealth_segment,deceased_indicator,default,owns_car,tenure,address,postcode,state,country,property_valuation
0,1,Laraine,Medendorp,F,93,1953-10-12,Executive Secretary,Health,Mass Customer,N,"""'",Yes,11.0,060 Morning Avenue,2016.0,New South Wales,Australia,10.0
1,2,Eli,Bockman,Male,81,1980-12-16,Administrative Officer,Financial Services,Mass Customer,N,<script>alert('hi')</script>,Yes,16.0,6 Meadow Vale Court,2153.0,New South Wales,Australia,10.0
2,3,Arlin,Dearle,Male,61,1954-01-20,Recruiting Manager,Property,Mass Customer,N,2018-02-01 00:00:00,Yes,15.0,,,,,
3,4,Talbot,,Male,33,1961-10-03,,IT,Mass Customer,N,() { _; } >_[$($())] { touch /tmp/blns.shellsh...,No,7.0,0 Holy Cross Court,4211.0,QLD,Australia,9.0
4,5,Sheila-kathryn,Calton,Female,56,1977-05-13,Senior Editor,,Affluent Customer,N,NIL,Yes,8.0,17979 Del Mar Point,2448.0,New South Wales,Australia,4.0


Merge number of transactions to the old customer list

In [27]:
df_old_customer_list = df_old_customer_list.merge(rfm, how='left', on='customer_id')
df_old_customer_list.head()

Unnamed: 0,customer_id,first_name,last_name,gender,past_3_years_bike_related_purchases,DOB,job_title,job_industry_category,wealth_segment,deceased_indicator,default,owns_car,tenure,address,postcode,state,country,property_valuation,RFM_Score
0,1,Laraine,Medendorp,F,93,1953-10-12,Executive Secretary,Health,Mass Customer,N,"""'",Yes,11.0,060 Morning Avenue,2016.0,New South Wales,Australia,10.0,1030.0
1,2,Eli,Bockman,Male,81,1980-12-16,Administrative Officer,Financial Services,Mass Customer,N,<script>alert('hi')</script>,Yes,16.0,6 Meadow Vale Court,2153.0,New South Wales,Australia,10.0,745.0
2,3,Arlin,Dearle,Male,61,1954-01-20,Recruiting Manager,Property,Mass Customer,N,2018-02-01 00:00:00,Yes,15.0,,,,,,785.0
3,4,Talbot,,Male,33,1961-10-03,,IT,Mass Customer,N,() { _; } >_[$($())] { touch /tmp/blns.shellsh...,No,7.0,0 Holy Cross Court,4211.0,QLD,Australia,9.0,355.0
4,5,Sheila-kathryn,Calton,Female,56,1977-05-13,Senior Editor,,Affluent Customer,N,NIL,Yes,8.0,17979 Del Mar Point,2448.0,New South Wales,Australia,4.0,960.0


In [28]:
df_old_customer_list.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4000 entries, 0 to 3999
Data columns (total 19 columns):
 #   Column                               Non-Null Count  Dtype         
---  ------                               --------------  -----         
 0   customer_id                          4000 non-null   int64         
 1   first_name                           4000 non-null   object        
 2   last_name                            3875 non-null   object        
 3   gender                               4000 non-null   object        
 4   past_3_years_bike_related_purchases  4000 non-null   int64         
 5   DOB                                  3913 non-null   datetime64[ns]
 6   job_title                            3494 non-null   object        
 7   job_industry_category                3344 non-null   object        
 8   wealth_segment                       4000 non-null   object        
 9   deceased_indicator                   4000 non-null   object        
 10  default     

Convert DOB to age with respect to year 2017

In [29]:
df_old_customer_list['DOB'] = pd.to_datetime(df_old_customer_list['DOB'], errors='coerce', format='%Y-%m-%d')

year_2017 = pd.Timestamp('2017-12-31')

df_old_customer_list['age'] = (year_2017 - df_old_customer_list['DOB']).astype('<m8[Y]')

# drop DOB
df_old_customer_list.drop(['DOB'], axis = 1, inplace=True)

df_old_customer_list.head()

Unnamed: 0,customer_id,first_name,last_name,gender,past_3_years_bike_related_purchases,job_title,job_industry_category,wealth_segment,deceased_indicator,default,owns_car,tenure,address,postcode,state,country,property_valuation,RFM_Score,age
0,1,Laraine,Medendorp,F,93,Executive Secretary,Health,Mass Customer,N,"""'",Yes,11.0,060 Morning Avenue,2016.0,New South Wales,Australia,10.0,1030.0,64.0
1,2,Eli,Bockman,Male,81,Administrative Officer,Financial Services,Mass Customer,N,<script>alert('hi')</script>,Yes,16.0,6 Meadow Vale Court,2153.0,New South Wales,Australia,10.0,745.0,37.0
2,3,Arlin,Dearle,Male,61,Recruiting Manager,Property,Mass Customer,N,2018-02-01 00:00:00,Yes,15.0,,,,,,785.0,63.0
3,4,Talbot,,Male,33,,IT,Mass Customer,N,() { _; } >_[$($())] { touch /tmp/blns.shellsh...,No,7.0,0 Holy Cross Court,4211.0,QLD,Australia,9.0,355.0,56.0
4,5,Sheila-kathryn,Calton,Female,56,Senior Editor,,Affluent Customer,N,NIL,Yes,8.0,17979 Del Mar Point,2448.0,New South Wales,Australia,4.0,960.0,40.0


## Data Cleaning for Both Old and New Customer List

### Drop Unecessary Columns

In [30]:
old_columns = df_old_customer_list.columns
new_columns = df_new_customer_list.columns

for col in old_columns:
    if col not in new_columns:
        print(col + ' in df_old_customer_list not exist in df_new_customer_list')
        
print()

for col in new_columns:
    if col not in old_columns:
        print(col + ' in df_new_customer_list not exist in df_old_customer_list')

customer_id in df_old_customer_list not exist in df_new_customer_list
default in df_old_customer_list not exist in df_new_customer_list
RFM_Score in df_old_customer_list not exist in df_new_customer_list

Rank in df_new_customer_list not exist in df_old_customer_list
Value in df_new_customer_list not exist in df_old_customer_list


We should drop columns except customer_id and RFM_Score. customer_id will be excluded from our analysis. RFM_Score will be the y value for machine learning models.

In [31]:
df_old_customer_list.drop('default', axis=1, inplace=True)
df_new_customer_list.drop(['Rank', 'Value'], axis=1, inplace=True)

In [32]:
df_old_customer_list.head(2)

Unnamed: 0,customer_id,first_name,last_name,gender,past_3_years_bike_related_purchases,job_title,job_industry_category,wealth_segment,deceased_indicator,owns_car,tenure,address,postcode,state,country,property_valuation,RFM_Score,age
0,1,Laraine,Medendorp,F,93,Executive Secretary,Health,Mass Customer,N,Yes,11.0,060 Morning Avenue,2016.0,New South Wales,Australia,10.0,1030.0,64.0
1,2,Eli,Bockman,Male,81,Administrative Officer,Financial Services,Mass Customer,N,Yes,16.0,6 Meadow Vale Court,2153.0,New South Wales,Australia,10.0,745.0,37.0


In [33]:
df_new_customer_list.head(2)

Unnamed: 0,first_name,last_name,gender,past_3_years_bike_related_purchases,job_title,job_industry_category,wealth_segment,deceased_indicator,owns_car,tenure,address,postcode,state,country,property_valuation,age
0,Chickie,Brister,Male,86,General Manager,Manufacturing,Mass Customer,N,Yes,14,45 Shopko Center,4500,QLD,Australia,6,60.0
1,Morly,Genery,Male,69,Structural Engineer,Property,Mass Customer,N,No,16,14 Mccormick Park,2113,NSW,Australia,11,47.0


Drop categorical variables with more than 15 values.

In [34]:
len(df_old_customer_list['job_title'].unique())

196

`job_title` for both df need to be dropped

In [35]:
len(df_old_customer_list['job_industry_category'].unique())

10

In [36]:
len(df_new_customer_list['job_industry_category'].unique())

10

In [37]:
len(df_old_customer_list['wealth_segment'].unique())

3

In [38]:
len(df_new_customer_list['wealth_segment'].unique())

3

In [39]:
len(df_old_customer_list['address'].unique())

3994

`address` for both df need to be dropped

`postcode` need to be dropped too since `state` is enough as a address variable

In [40]:
df_old_customer_list.drop(['job_title', 'address', 'postcode'], axis=1, inplace=True)
df_new_customer_list.drop(['job_title', 'address', 'postcode'], axis=1, inplace=True)

In [41]:
df_old_customer_list.sample(1)

Unnamed: 0,customer_id,first_name,last_name,gender,past_3_years_bike_related_purchases,job_industry_category,wealth_segment,deceased_indicator,owns_car,tenure,state,country,property_valuation,RFM_Score,age
3930,3931,Kylie,Epine,U,19,IT,High Net Worth,N,Yes,,NSW,Australia,9.0,,


In [42]:
df_new_customer_list.sample(1)

Unnamed: 0,first_name,last_name,gender,past_3_years_bike_related_purchases,job_industry_category,wealth_segment,deceased_indicator,owns_car,tenure,state,country,property_valuation,age
164,Emilie,Brody,Female,3,,Mass Customer,N,Yes,3,NSW,Australia,11,38.0


In [43]:
df_old_customer_list['country'].unique()

array(['Australia', nan], dtype=object)

In [44]:
df_new_customer_list['country'].unique()

array(['Australia'], dtype=object)

All data should be collected in Australia. We can remove the `country` column

In [45]:
df_old_customer_list.drop(['country'], axis=1, inplace=True)
df_new_customer_list.drop(['country'], axis=1, inplace=True)

For `deceased_indicator`, we should remove the customer if deceased, then drop column.

In [46]:
df_old_customer_list['deceased_indicator'].value_counts()

N    3998
Y       2
Name: deceased_indicator, dtype: int64

In [47]:
df_old_customer_list.drop(df_old_customer_list[df_old_customer_list['deceased_indicator'] == 'Y'].index, inplace=True)
df_old_customer_list['deceased_indicator'].value_counts()

N    3998
Name: deceased_indicator, dtype: int64

In [48]:
df_new_customer_list['deceased_indicator'].value_counts()

N    1000
Name: deceased_indicator, dtype: int64

In [49]:
df_old_customer_list.drop(['deceased_indicator'], axis=1, inplace=True)
df_new_customer_list.drop(['deceased_indicator'], axis=1, inplace=True)

Let's analyze job_industry_category again for null values

In [50]:
df_old_customer_list['job_industry_category'].value_counts()

Manufacturing         799
Financial Services    774
Health                601
Retail                357
Property              267
IT                    223
Entertainment         136
Argiculture           113
Telecommunications     72
Name: job_industry_category, dtype: int64

In [51]:
df_old_customer_list[df_old_customer_list['job_industry_category'].isnull()].count()

customer_id                            656
first_name                             656
last_name                              631
gender                                 656
past_3_years_bike_related_purchases    656
job_industry_category                    0
wealth_segment                         656
owns_car                               656
tenure                                 656
state                                  655
property_valuation                     655
RFM_Score                              561
age                                    656
dtype: int64

We have too many null values in this column. That will impact our analysis. We can simply drop the column.

In [52]:
df_old_customer_list.drop(['job_industry_category'], axis=1, inplace=True)
df_new_customer_list.drop(['job_industry_category'], axis=1, inplace=True)

### Feature Engineering

In [53]:
df_old_customer_list.isnull().sum()

customer_id                              0
first_name                               0
last_name                              125
gender                                   0
past_3_years_bike_related_purchases      0
wealth_segment                           0
owns_car                                 0
tenure                                  87
state                                    4
property_valuation                       4
RFM_Score                              506
age                                     87
dtype: int64

For the missing RFM_Score, that means 506 customers do not have transaction data in 2017. It is either caused by incomplete data, or they did not make any purchases. We should drop these customers.

In [54]:
df_old_customer_list.drop(df_old_customer_list[df_old_customer_list['RFM_Score'].isnull()].index, inplace=True)
df_old_customer_list.isnull().sum()

customer_id                              0
first_name                               0
last_name                              112
gender                                   0
past_3_years_bike_related_purchases      0
wealth_segment                           0
owns_car                                 0
tenure                                  76
state                                    4
property_valuation                       4
RFM_Score                                0
age                                     76
dtype: int64

In [55]:
df_old_customer_list.head()

Unnamed: 0,customer_id,first_name,last_name,gender,past_3_years_bike_related_purchases,wealth_segment,owns_car,tenure,state,property_valuation,RFM_Score,age
0,1,Laraine,Medendorp,F,93,Mass Customer,Yes,11.0,New South Wales,10.0,1030.0,64.0
1,2,Eli,Bockman,Male,81,Mass Customer,Yes,16.0,New South Wales,10.0,745.0,37.0
2,3,Arlin,Dearle,Male,61,Mass Customer,Yes,15.0,,,785.0,63.0
3,4,Talbot,,Male,33,Mass Customer,No,7.0,QLD,9.0,355.0,56.0
4,5,Sheila-kathryn,Calton,Female,56,Affluent Customer,Yes,8.0,New South Wales,4.0,960.0,40.0


In [56]:
df_new_customer_list.isnull().sum()

first_name                              0
last_name                              29
gender                                  0
past_3_years_bike_related_purchases     0
wealth_segment                          0
owns_car                                0
tenure                                  0
state                                   0
property_valuation                      0
age                                    17
dtype: int64

`tenure` and `age` missing values can be filled with mean values.

In [57]:
df_old_customer_list.fillna(df_old_customer_list.mean(), inplace=True)
df_old_customer_list.isnull().sum()

customer_id                              0
first_name                               0
last_name                              112
gender                                   0
past_3_years_bike_related_purchases      0
wealth_segment                           0
owns_car                                 0
tenure                                   0
state                                    4
property_valuation                       0
RFM_Score                                0
age                                      0
dtype: int64

In [58]:
df_new_customer_list.fillna(df_new_customer_list.mean(), inplace=True)
df_new_customer_list.isnull().sum()

first_name                              0
last_name                              29
gender                                  0
past_3_years_bike_related_purchases     0
wealth_segment                          0
owns_car                                0
tenure                                  0
state                                   0
property_valuation                      0
age                                     0
dtype: int64

Missing state

In [59]:
df_old_customer_list[df_old_customer_list['state'].isnull()]

Unnamed: 0,customer_id,first_name,last_name,gender,past_3_years_bike_related_purchases,wealth_segment,owns_car,tenure,state,property_valuation,RFM_Score,age
2,3,Arlin,Dearle,Male,61,Mass Customer,Yes,15.0,,7.517202,785.0,63.0
9,10,Fiorenze,Birdall,Female,49,Mass Customer,Yes,20.0,,7.517202,965.0,29.0
21,22,Deeanne,Durtnell,Female,79,Mass Customer,No,11.0,,7.517202,915.0,55.0
22,23,Olav,Polak,Male,43,High Net Worth,Yes,1.0,,7.517202,1220.0,22.0


In [60]:
df_old_customer_list['state'].value_counts()

NSW                1779
VIC                 798
QLD                 743
New South Wales      86
Victoria             82
Name: state, dtype: int64

Value of NSW is far more than VIC and QLD, even after consistency adjustment. we can replace nan with NSW.

In [61]:
values = {'state': 'NSW'}
df_old_customer_list.fillna(value=values, inplace=True)
df_old_customer_list.isnull().sum()

customer_id                              0
first_name                               0
last_name                              112
gender                                   0
past_3_years_bike_related_purchases      0
wealth_segment                           0
owns_car                                 0
tenure                                   0
state                                    0
property_valuation                       0
RFM_Score                                0
age                                      0
dtype: int64

In [62]:
df_new_customer_list.isnull().sum()

first_name                              0
last_name                              29
gender                                  0
past_3_years_bike_related_purchases     0
wealth_segment                          0
owns_car                                0
tenure                                  0
state                                   0
property_valuation                      0
age                                     0
dtype: int64

### Consistency Check

In [63]:
df_old_customer_list.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3492 entries, 0 to 3499
Data columns (total 12 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   customer_id                          3492 non-null   int64  
 1   first_name                           3492 non-null   object 
 2   last_name                            3380 non-null   object 
 3   gender                               3492 non-null   object 
 4   past_3_years_bike_related_purchases  3492 non-null   int64  
 5   wealth_segment                       3492 non-null   object 
 6   owns_car                             3492 non-null   object 
 7   tenure                               3492 non-null   float64
 8   state                                3492 non-null   object 
 9   property_valuation                   3492 non-null   float64
 10  RFM_Score                            3492 non-null   float64
 11  age                           

In [64]:
df_new_customer_list.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   first_name                           1000 non-null   object 
 1   last_name                            971 non-null    object 
 2   gender                               1000 non-null   object 
 3   past_3_years_bike_related_purchases  1000 non-null   int64  
 4   wealth_segment                       1000 non-null   object 
 5   owns_car                             1000 non-null   object 
 6   tenure                               1000 non-null   int64  
 7   state                                1000 non-null   object 
 8   property_valuation                   1000 non-null   int64  
 9   age                                  1000 non-null   float64
dtypes: float64(1), int64(3), object(6)
memory usage: 78.2+ KB


#### Gender

In [65]:
df_old_customer_list['gender'].unique()

array(['F', 'Male', 'Female', 'U', 'Femal', 'M'], dtype=object)

In [66]:
df_new_customer_list['gender'].unique()

array(['Male', 'Female', 'U'], dtype=object)

We should make gender consistent with 'F', 'M', and 'U'

In [67]:
values = {'Male': 'M', 'Female': 'F', 'Femal': 'F'}
df_old_customer_list.replace(to_replace=values, inplace=True)
df_old_customer_list['gender'].unique()

array(['F', 'M', 'U'], dtype=object)

In [68]:
df_new_customer_list.replace(to_replace=values, inplace=True)
df_new_customer_list['gender'].unique()

array(['M', 'F', 'U'], dtype=object)

#### Wealth Segment

In [69]:
df_old_customer_list['wealth_segment'].unique()

array(['Mass Customer', 'Affluent Customer', 'High Net Worth'],
      dtype=object)

In [70]:
df_new_customer_list['wealth_segment'].unique()

array(['Mass Customer', 'Affluent Customer', 'High Net Worth'],
      dtype=object)

#### Owns Car

In [71]:
df_old_customer_list['owns_car'].unique()

array(['Yes', 'No'], dtype=object)

In [72]:
df_new_customer_list['owns_car'].unique()

array(['Yes', 'No'], dtype=object)

#### State

In [73]:
df_old_customer_list['state'].unique()

array(['New South Wales', 'NSW', 'QLD', 'VIC', 'Victoria'], dtype=object)

In [74]:
df_new_customer_list['state'].unique()

array(['QLD', 'NSW', 'VIC'], dtype=object)

We should make state consistent with "NSW", "QLD", and "VIC"

In [75]:
values = {'New South Wales': 'NSW', 'Victoria': 'VIC'}
df_old_customer_list.replace(to_replace=values, inplace=True)
df_old_customer_list['state'].unique()

array(['NSW', 'QLD', 'VIC'], dtype=object)

## Data Modeling

### One-Hot Encoding

First we concat the two dfs.

In [76]:
df_old_customer_list.sample()

Unnamed: 0,customer_id,first_name,last_name,gender,past_3_years_bike_related_purchases,wealth_segment,owns_car,tenure,state,property_valuation,RFM_Score,age
2054,2055,Harmon,Bakster,M,46,High Net Worth,No,7.0,NSW,3.0,1290.0,37.0


In [77]:
df_new_customer_list.sample()

Unnamed: 0,first_name,last_name,gender,past_3_years_bike_related_purchases,wealth_segment,owns_car,tenure,state,property_valuation,age
595,Kyle,Michie,F,6,Mass Customer,Yes,10,VIC,8,59.0


In [78]:
df_old_customer_list['new_customer'] = 0
df_old_customer_list.sample()

Unnamed: 0,customer_id,first_name,last_name,gender,past_3_years_bike_related_purchases,wealth_segment,owns_car,tenure,state,property_valuation,RFM_Score,age,new_customer
3340,3341,Vitia,Crum,F,19,Affluent Customer,Yes,6.0,NSW,12.0,480.0,37.0,0


In [79]:
df_new_customer_list['new_customer'] = 1
df_new_customer_list.sample()

Unnamed: 0,first_name,last_name,gender,past_3_years_bike_related_purchases,wealth_segment,owns_car,tenure,state,property_valuation,age,new_customer
616,Mariette,,F,47,Affluent Customer,Yes,17,NSW,11,61.0,1


In [80]:
df_old_customer_list.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3492 entries, 0 to 3499
Data columns (total 13 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   customer_id                          3492 non-null   int64  
 1   first_name                           3492 non-null   object 
 2   last_name                            3380 non-null   object 
 3   gender                               3492 non-null   object 
 4   past_3_years_bike_related_purchases  3492 non-null   int64  
 5   wealth_segment                       3492 non-null   object 
 6   owns_car                             3492 non-null   object 
 7   tenure                               3492 non-null   float64
 8   state                                3492 non-null   object 
 9   property_valuation                   3492 non-null   float64
 10  RFM_Score                            3492 non-null   float64
 11  age                           

In [81]:
df_new_customer_list.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 11 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   first_name                           1000 non-null   object 
 1   last_name                            971 non-null    object 
 2   gender                               1000 non-null   object 
 3   past_3_years_bike_related_purchases  1000 non-null   int64  
 4   wealth_segment                       1000 non-null   object 
 5   owns_car                             1000 non-null   object 
 6   tenure                               1000 non-null   int64  
 7   state                                1000 non-null   object 
 8   property_valuation                   1000 non-null   int64  
 9   age                                  1000 non-null   float64
 10  new_customer                         1000 non-null   int64  
dtypes: float64(1), int64(4), object

In [82]:
df_combine = pd.concat([df_old_customer_list, df_new_customer_list])
df_combine.sample(5)

Unnamed: 0,customer_id,first_name,last_name,gender,past_3_years_bike_related_purchases,wealth_segment,owns_car,tenure,state,property_valuation,RFM_Score,age,new_customer
915,,Ilise,Clissold,F,58,High Net Worth,No,9.0,NSW,10.0,,30.0,1
2712,2713.0,Hadlee,Mackro,M,55,Affluent Customer,Yes,5.0,NSW,11.0,730.0,63.0,0
904,,Roth,Crum,U,0,Mass Customer,No,2.0,NSW,6.0,,46.207528,1
625,,Wheeler,Godsil,M,51,Affluent Customer,Yes,6.0,NSW,9.0,,24.0,1
2025,2026.0,Anabelle,Rogerson,F,62,Mass Customer,Yes,7.0,NSW,7.0,745.0,30.0,0


In [83]:
df_combine.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4492 entries, 0 to 999
Data columns (total 13 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   customer_id                          3492 non-null   float64
 1   first_name                           4492 non-null   object 
 2   last_name                            4351 non-null   object 
 3   gender                               4492 non-null   object 
 4   past_3_years_bike_related_purchases  4492 non-null   int64  
 5   wealth_segment                       4492 non-null   object 
 6   owns_car                             4492 non-null   object 
 7   tenure                               4492 non-null   float64
 8   state                                4492 non-null   object 
 9   property_valuation                   4492 non-null   float64
 10  RFM_Score                            3492 non-null   float64
 11  age                            

In [84]:
df_combine.isnull().sum()

customer_id                            1000
first_name                                0
last_name                               141
gender                                    0
past_3_years_bike_related_purchases       0
wealth_segment                            0
owns_car                                  0
tenure                                    0
state                                     0
property_valuation                        0
RFM_Score                              1000
age                                       0
new_customer                              0
dtype: int64

Then we do one-hot encoding on the combined df.

In [85]:
# Get list of categorical variables
s = (df_combine.dtypes == 'object')
object_cols = list(s[s].index)
object_cols.remove('first_name')
object_cols.remove('last_name')

print("Categorical variables:")
print(object_cols)

Categorical variables:
['gender', 'wealth_segment', 'owns_car', 'state']


In [86]:
# Apply one-hot encoder to each column with categorical data
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
df_OH_cols = pd.DataFrame(OH_encoder.fit_transform(df_combine[object_cols]))

# One-hot encoding removed index; put it back
df_OH_cols.index = df_combine.index

# Remove categorical columns (will replace with one-hot encoding)
num_df_combined = df_combine.drop(object_cols, axis=1)

# Add one-hot encoded columns to numerical features
df_OH_combined = pd.concat([num_df_combined, df_OH_cols], axis=1)

df_OH_combined.sample(10)

Unnamed: 0,customer_id,first_name,last_name,past_3_years_bike_related_purchases,tenure,property_valuation,RFM_Score,age,new_customer,0,1,2,3,4,5,6,7,8,9,10
69,,Vivienne,Crayden,82,6.0,7.0,,29.0,1,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
100,,Hanny,Treven,84,3.0,4.0,,26.0,1,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0
852,,Alick,Baise,62,1.0,8.0,,19.0,1,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0
524,,Perry,Whitehurst,79,8.0,7.0,,37.0,1,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
2611,2612.0,Bran,Sauven,33,18.0,9.0,615.0,48.0,0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
1487,1488.0,Donnamarie,Andrieu,82,17.0,3.0,605.0,31.0,0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0
3300,3301.0,Guntar,O'Halloran,5,6.0,8.0,835.0,30.0,0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0
1767,1768.0,Normand,Ganderton,36,13.0,12.0,745.0,43.0,0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
2457,2458.0,Leilah,Liddel,73,5.0,10.0,1320.0,53.0,0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0
591,592.0,Edik,Connichie,73,2.0,8.0,355.0,25.0,0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0


Seperate the combined df to old customer and new customer data.

In [87]:
df_old_customer_OH = df_OH_combined[df_OH_combined['new_customer'] == 0]
df_old_customer_OH.drop('new_customer', axis=1, inplace=True)
df_old_customer_OH

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


Unnamed: 0,customer_id,first_name,last_name,past_3_years_bike_related_purchases,tenure,property_valuation,RFM_Score,age,0,1,2,3,4,5,6,7,8,9,10
0,1.0,Laraine,Medendorp,93,11.0,10.000000,1030.0,64.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0
1,2.0,Eli,Bockman,81,16.0,10.000000,745.0,37.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0
2,3.0,Arlin,Dearle,61,15.0,7.517202,785.0,63.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0
3,4.0,Talbot,,33,7.0,9.000000,355.0,56.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0
4,5.0,Sheila-kathryn,Calton,56,8.0,4.000000,960.0,40.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3495,3496.0,Danya,Burnyeat,99,19.0,9.000000,485.0,31.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0
3496,3497.0,Thia,O'Day,73,18.0,5.000000,740.0,31.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
3497,3498.0,Lois,Abrahim,28,5.0,4.000000,585.0,22.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0
3498,3499.0,Shelton,Tewkesberrie,29,7.0,9.000000,1070.0,38.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0


In [88]:
df_new_customer_OH = df_OH_combined[df_OH_combined['new_customer'] == 1]
df_new_customer_OH.drop(['customer_id', 'RFM_Score', 'new_customer'], axis=1, inplace=True)
df_new_customer_OH

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


Unnamed: 0,first_name,last_name,past_3_years_bike_related_purchases,tenure,property_valuation,age,0,1,2,3,4,5,6,7,8,9,10
0,Chickie,Brister,86,14.0,6.0,60.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
1,Morly,Genery,69,16.0,11.0,47.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0
2,Ardelis,Forrester,10,10.0,5.0,43.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
3,Lucine,Stutt,64,5.0,1.0,38.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
4,Melinda,Hadlee,34,19.0,9.0,52.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,Ferdinand,Romanetti,60,9.0,7.0,58.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
996,Burk,Wortley,22,6.0,10.0,16.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0
997,Melloney,Temby,17,15.0,2.0,63.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
998,Dickie,Cubbini,30,19.0,2.0,65.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0


### Train Test Split

In [89]:
features = df_old_customer_OH.columns.tolist()
features.remove('first_name')
features.remove('last_name')
features.remove('customer_id')
features.remove('RFM_Score')
features.remove('tenure')

# features.remove('age')

features

['past_3_years_bike_related_purchases',
 'property_valuation',
 'age',
 0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10]

In [90]:
X = df_old_customer_OH[features]
y = df_old_customer_OH['RFM_Score']

pre_X = df_new_customer_OH[features]

train_X, val_X, train_y, val_y = train_test_split(X, y, random_state = 2)

### Random Forest Model

In [91]:
forest_model = RandomForestRegressor(n_estimators=100, max_depth=7, random_state=1)
forest_model.fit(train_X, train_y)
preds = forest_model.predict(val_X)
print("Mean Absolute Error: " + str(mean_absolute_error(val_y, preds)))

Mean Absolute Error: 202.03036115652407


In [92]:
mape = np.mean(np.abs((val_y - preds) / np.abs(val_y)))
print('Mean Absolute Percentage Error: ' + str(mape * 100) + '%')

Mean Absolute Percentage Error: 28.84210708039753%


### Predict New Customer

In [93]:
df_old_customer_OH['RFM_Score'].describe()

count    3492.000000
mean      860.073024
std       252.064695
min       355.000000
25%       705.000000
50%       860.000000
75%      1040.000000
max      1420.000000
Name: RFM_Score, dtype: float64

We can see that if the score is greater than 1040, the customer is in the first quantile. We rate them as high value customers.

In [94]:
pre_y = forest_model.predict(pre_X)
df_new_customer_OH['Predicted_RFM_Score'] = pre_y
df_new_customer_OH.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_new_customer_OH['Predicted_RFM_Score'] = pre_y


Unnamed: 0,first_name,last_name,past_3_years_bike_related_purchases,tenure,property_valuation,age,0,1,2,3,4,5,6,7,8,9,10,Predicted_RFM_Score
0,Chickie,Brister,86,14.0,6.0,60.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,853.271791
1,Morly,Genery,69,16.0,11.0,47.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,878.838662
2,Ardelis,Forrester,10,10.0,5.0,43.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,834.133936
3,Lucine,Stutt,64,5.0,1.0,38.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,883.52505
4,Melinda,Hadlee,34,19.0,9.0,52.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,844.899593


In [95]:
df_new_customer_OH.describe()

Unnamed: 0,past_3_years_bike_related_purchases,tenure,property_valuation,age,0,1,2,3,4,5,6,7,8,9,10,Predicted_RFM_Score
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,49.836,11.388,7.397,46.207528,0.513,0.47,0.017,0.241,0.251,0.508,0.507,0.493,0.506,0.228,0.266,857.588382
std,27.796686,5.037145,2.758804,16.929745,0.500081,0.499349,0.129336,0.427904,0.433805,0.500186,0.500201,0.500201,0.500214,0.419753,0.442085,33.653667
min,0.0,0.0,1.0,15.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,742.628553
25%,26.75,7.0,6.0,34.75,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,836.645158
50%,51.0,11.0,8.0,46.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,857.358444
75%,72.0,15.0,9.0,60.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,880.040161
max,99.0,22.0,12.0,79.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1033.638923


In [96]:
for index, row in df_new_customer_OH.iterrows():
    if row['Predicted_RFM_Score'] > 1040:
        df_new_customer_OH.at[index, 'Tier'] = 4
    elif row['Predicted_RFM_Score'] > 860:
        df_new_customer_OH.at[index, 'Tier'] = 3
    elif row['Predicted_RFM_Score'] > 705:
        df_new_customer_OH.at[index, 'Tier'] = 2
    else:
        df_new_customer_OH.at[index, 'Tier'] = 1
        
df_new_customer_OH[['first_name', 'last_name', 'Tier']].head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


Unnamed: 0,first_name,last_name,Tier
0,Chickie,Brister,2.0
1,Morly,Genery,3.0
2,Ardelis,Forrester,2.0
3,Lucine,Stutt,3.0
4,Melinda,Hadlee,2.0


In [97]:
df_new_customer_OH['Tier'].value_counts()

2.0    539
3.0    461
Name: Tier, dtype: int64

There are only Tier 2 and 3 customers. No high value or low value custoemers in the new customer list.