# Task 2: Data Insights

In [1]:
import pandas as pd
import numpy as np
import datetime as DT
from datetime import timedelta
import io
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

In [2]:
# First rows of the sheets are comments, not part of dataframes
df_transactions = pd.read_excel('KPMG_VI_New_raw_data_update_final.xlsx', sheet_name='Transactions', skiprows=1)
df_new_customer_list = pd.read_excel('KPMG_VI_New_raw_data_update_final.xlsx', sheet_name='NewCustomerList', skiprows=1)
df_customer_demographic = pd.read_excel('KPMG_VI_New_raw_data_update_final.xlsx', sheet_name='CustomerDemographic', skiprows=1)
df_customer_address = pd.read_excel('KPMG_VI_New_raw_data_update_final.xlsx', sheet_name='CustomerAddress', skiprows=1)

In [3]:
# Make copies for fail safe.
df_transactions_copy = df_transactions
df_new_customer_list_copy = df_new_customer_list
df_customer_demographic_copy = df_customer_demographic
df_customer_address_copy = df_customer_address

In [4]:
df_transactions.sample()

Unnamed: 0,transaction_id,product_id,customer_id,transaction_date,online_order,order_status,brand,product_line,product_class,product_size,list_price,standard_cost,product_first_sold_date
18062,18063,93,1251,2017-03-09,0.0,Approved,WeareA2B,Standard,medium,medium,1065.03,230.09,36833.0


In [5]:
df_new_customer_list.sample()

Unnamed: 0,first_name,last_name,gender,past_3_years_bike_related_purchases,DOB,job_title,job_industry_category,wealth_segment,deceased_indicator,owns_car,...,state,country,property_valuation,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Rank,Value
148,Agnella,Capener,Female,58,1969-05-21,Teacher,Health,High Net Worth,N,No,...,VIC,Australia,6,0.65,0.65,0.65,0.65,146,146,1.225


In [6]:
df_customer_demographic.sample()

Unnamed: 0,customer_id,first_name,last_name,gender,past_3_years_bike_related_purchases,DOB,job_title,job_industry_category,wealth_segment,deceased_indicator,default,owns_car,tenure
1592,1593,Tommy,Kupisz,Female,69,1979-10-11,Tax Accountant,IT,Mass Customer,N,â°â´âµâââ,No,17.0


In [7]:
df_customer_address.sample()

Unnamed: 0,customer_id,address,postcode,state,country,property_valuation
338,343,6860 Green Ridge Avenue,2126,NSW,Australia,11


## Analyze and Clean New Customer List

In [8]:
df_new_customer_list.head()

Unnamed: 0,first_name,last_name,gender,past_3_years_bike_related_purchases,DOB,job_title,job_industry_category,wealth_segment,deceased_indicator,owns_car,...,state,country,property_valuation,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Rank,Value
0,Chickie,Brister,Male,86,1957-07-12,General Manager,Manufacturing,Mass Customer,N,Yes,...,QLD,Australia,6,0.56,0.7,0.875,0.74375,1,1,1.71875
1,Morly,Genery,Male,69,1970-03-22,Structural Engineer,Property,Mass Customer,N,No,...,NSW,Australia,11,0.89,0.89,1.1125,0.945625,1,1,1.71875
2,Ardelis,Forrester,Female,10,1974-08-28,Senior Cost Accountant,Financial Services,Affluent Customer,N,No,...,VIC,Australia,5,1.01,1.01,1.01,1.01,1,1,1.71875
3,Lucine,Stutt,Female,64,1979-01-28,Account Representative III,Manufacturing,Affluent Customer,N,Yes,...,QLD,Australia,1,0.87,1.0875,1.0875,1.0875,4,4,1.703125
4,Melinda,Hadlee,Female,34,1965-09-21,Financial Analyst,Financial Services,Affluent Customer,N,No,...,NSW,Australia,9,0.52,0.52,0.65,0.65,4,4,1.703125


In [9]:
df_new_customer_list.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 23 columns):
 #   Column                               Non-Null Count  Dtype         
---  ------                               --------------  -----         
 0   first_name                           1000 non-null   object        
 1   last_name                            971 non-null    object        
 2   gender                               1000 non-null   object        
 3   past_3_years_bike_related_purchases  1000 non-null   int64         
 4   DOB                                  983 non-null    datetime64[ns]
 5   job_title                            894 non-null    object        
 6   job_industry_category                835 non-null    object        
 7   wealth_segment                       1000 non-null   object        
 8   deceased_indicator                   1000 non-null   object        
 9   owns_car                             1000 non-null   object        
 10  tenure       

We have five unnamed columns. We are not sure what they represents, so it is better to consult with the client or drop them for our analysis.

In [10]:
df_new_customer_list.drop(['Unnamed: 16', 'Unnamed: 17', 'Unnamed: 18', 'Unnamed: 19', 'Unnamed: 20'], axis = 1, inplace=True)
df_new_customer_list.head()

Unnamed: 0,first_name,last_name,gender,past_3_years_bike_related_purchases,DOB,job_title,job_industry_category,wealth_segment,deceased_indicator,owns_car,tenure,address,postcode,state,country,property_valuation,Rank,Value
0,Chickie,Brister,Male,86,1957-07-12,General Manager,Manufacturing,Mass Customer,N,Yes,14,45 Shopko Center,4500,QLD,Australia,6,1,1.71875
1,Morly,Genery,Male,69,1970-03-22,Structural Engineer,Property,Mass Customer,N,No,16,14 Mccormick Park,2113,NSW,Australia,11,1,1.71875
2,Ardelis,Forrester,Female,10,1974-08-28,Senior Cost Accountant,Financial Services,Affluent Customer,N,No,10,5 Colorado Crossing,3505,VIC,Australia,5,1,1.71875
3,Lucine,Stutt,Female,64,1979-01-28,Account Representative III,Manufacturing,Affluent Customer,N,Yes,5,207 Annamark Plaza,4814,QLD,Australia,1,4,1.703125
4,Melinda,Hadlee,Female,34,1965-09-21,Financial Analyst,Financial Services,Affluent Customer,N,No,19,115 Montana Place,2093,NSW,Australia,9,4,1.703125


We should convert DOB to age.

In [11]:
df_transactions['transaction_date'].sort_values()

516     2017-01-01
5876    2017-01-01
3459    2017-01-01
12484   2017-01-01
19130   2017-01-01
           ...    
605     2017-12-30
15269   2017-12-30
19906   2017-12-30
15756   2017-12-30
12003   2017-12-30
Name: transaction_date, Length: 20000, dtype: datetime64[ns]

We see that the data is collected in 2017. We should convert age to 2017.

In [12]:
# https://stackoverflow.com/questions/26788854/pandas-get-the-age-from-a-date-example-date-of-birth
# https://stackoverflow.com/questions/58948809/why-do-i-get-valueerror-nattype-does-not-support-strftime-even-though-its-no

df_new_customer_list['DOB'] = pd.to_datetime(df_new_customer_list['DOB'], errors='coerce', format='%Y-%m-%d')

year_2017 = pd.Timestamp('2017-12-31')

df_new_customer_list['age'] = (year_2017 - df_new_customer_list['DOB']).astype('<m8[Y]')

# drop DOB
df_new_customer_list.drop(['DOB'], axis = 1, inplace=True)

df_new_customer_list.head()


Unnamed: 0,first_name,last_name,gender,past_3_years_bike_related_purchases,job_title,job_industry_category,wealth_segment,deceased_indicator,owns_car,tenure,address,postcode,state,country,property_valuation,Rank,Value,age
0,Chickie,Brister,Male,86,General Manager,Manufacturing,Mass Customer,N,Yes,14,45 Shopko Center,4500,QLD,Australia,6,1,1.71875,60.0
1,Morly,Genery,Male,69,Structural Engineer,Property,Mass Customer,N,No,16,14 Mccormick Park,2113,NSW,Australia,11,1,1.71875,47.0
2,Ardelis,Forrester,Female,10,Senior Cost Accountant,Financial Services,Affluent Customer,N,No,10,5 Colorado Crossing,3505,VIC,Australia,5,1,1.71875,43.0
3,Lucine,Stutt,Female,64,Account Representative III,Manufacturing,Affluent Customer,N,Yes,5,207 Annamark Plaza,4814,QLD,Australia,1,4,1.703125,38.0
4,Melinda,Hadlee,Female,34,Financial Analyst,Financial Services,Affluent Customer,N,No,19,115 Montana Place,2093,NSW,Australia,9,4,1.703125,52.0


## RFM

In [13]:
df_transactions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 13 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   transaction_id           20000 non-null  int64         
 1   product_id               20000 non-null  int64         
 2   customer_id              20000 non-null  int64         
 3   transaction_date         20000 non-null  datetime64[ns]
 4   online_order             19640 non-null  float64       
 5   order_status             20000 non-null  object        
 6   brand                    19803 non-null  object        
 7   product_line             19803 non-null  object        
 8   product_class            19803 non-null  object        
 9   product_size             19803 non-null  object        
 10  list_price               20000 non-null  float64       
 11  standard_cost            19803 non-null  float64       
 12  product_first_sold_date  19803 n

In [14]:
df_transactions.isnull().sum()

transaction_id               0
product_id                   0
customer_id                  0
transaction_date             0
online_order               360
order_status                 0
brand                      197
product_line               197
product_class              197
product_size               197
list_price                   0
standard_cost              197
product_first_sold_date    197
dtype: int64

In [15]:
df_transactions[df_transactions['standard_cost'].isnull()].head()

Unnamed: 0,transaction_id,product_id,customer_id,transaction_date,online_order,order_status,brand,product_line,product_class,product_size,list_price,standard_cost,product_first_sold_date
136,137,0,431,2017-09-23,0.0,Approved,,,,,1942.61,,
159,160,0,3300,2017-08-27,0.0,Approved,,,,,1656.86,,
366,367,0,1614,2017-03-10,0.0,Approved,,,,,850.89,,
406,407,0,2559,2017-06-14,1.0,Approved,,,,,710.59,,
676,677,0,2609,2017-07-02,0.0,Approved,,,,,1972.01,,


Let's see if the 197 entries are the same product_id.

In [16]:
df_transactions[df_transactions['standard_cost'].isnull()]['product_id'].value_counts()

0    197
Name: product_id, dtype: int64

The 197 entries will increase noise to the training data. These entries should be excluded from our analysis.

In [17]:
df_transactions.drop(df_transactions[df_transactions['standard_cost'].isnull()].index, inplace=True)
df_transactions.isnull().sum()

transaction_id               0
product_id                   0
customer_id                  0
transaction_date             0
online_order               358
order_status                 0
brand                        0
product_line                 0
product_class                0
product_size                 0
list_price                   0
standard_cost                0
product_first_sold_date      0
dtype: int64

In [18]:
df_transactions.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19803 entries, 0 to 19999
Data columns (total 13 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   transaction_id           19803 non-null  int64         
 1   product_id               19803 non-null  int64         
 2   customer_id              19803 non-null  int64         
 3   transaction_date         19803 non-null  datetime64[ns]
 4   online_order             19445 non-null  float64       
 5   order_status             19803 non-null  object        
 6   brand                    19803 non-null  object        
 7   product_line             19803 non-null  object        
 8   product_class            19803 non-null  object        
 9   product_size             19803 non-null  object        
 10  list_price               19803 non-null  float64       
 11  standard_cost            19803 non-null  float64       
 12  product_first_sold_date  19803 n

In [19]:
df_transactions['profit'] = df_transactions['list_price'] - df_transactions['standard_cost']
df_transactions.head()

Unnamed: 0,transaction_id,product_id,customer_id,transaction_date,online_order,order_status,brand,product_line,product_class,product_size,list_price,standard_cost,product_first_sold_date,profit
0,1,2,2950,2017-02-25,0.0,Approved,Solex,Standard,medium,medium,71.49,53.62,41245.0,17.87
1,2,3,3120,2017-05-21,1.0,Approved,Trek Bicycles,Standard,medium,large,2091.47,388.92,41701.0,1702.55
2,3,37,402,2017-10-16,0.0,Approved,OHM Cycles,Standard,low,medium,1793.43,248.82,36361.0,1544.61
3,4,88,3135,2017-08-31,0.0,Approved,Norco Bicycles,Standard,medium,medium,1198.46,381.1,36145.0,817.36
4,5,78,787,2017-10-01,1.0,Approved,Giant Bicycles,Standard,medium,large,1765.3,709.48,42226.0,1055.82


In [20]:
# https://towardsdatascience.com/recency-frequency-monetary-model-with-python-and-how-sephora-uses-it-to-optimize-their-google-d6a0707c5f17

snapshot_date = df_transactions['transaction_date'].max() + timedelta(days=1)
print(snapshot_date)


data_process = df_transactions.groupby(['customer_id']).agg({
        'transaction_date': lambda x: (snapshot_date - x.max()).days,
        'transaction_id': 'count',
        'profit': 'sum'})

data_process.rename(columns={'transaction_date': 'Recency',
                         'transaction_id': 'Frequency',
                         'profit': 'MonetaryValue'}, inplace=True)

data_process.head()


2017-12-31 00:00:00


Unnamed: 0_level_0,Recency,Frequency,MonetaryValue
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,8,11,3018.09
2,129,3,2226.26
3,103,8,3362.81
4,196,2,220.57
5,17,6,2394.94


In [21]:
# --Calculate R and F groups--
# Create labels for Recency and Frequency
r_labels = range(4, 0, -1); f_labels = range(1, 5)
# Assign these labels to 4 equal percentile groups 
r_groups = pd.qcut(data_process['Recency'], q=4, labels=r_labels)
# Assign these labels to 4 equal percentile groups 
f_groups = pd.qcut(data_process['Frequency'], q=4, labels=f_labels)
# Create new columns R and F 
data_process = data_process.assign(R = r_groups.values, F = f_groups.values)
data_process.head()

Unnamed: 0_level_0,Recency,Frequency,MonetaryValue,R,F
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,8,11,3018.09,4,4
2,129,3,2226.26,1,1
3,103,8,3362.81,1,4
4,196,2,220.57,1,1
5,17,6,2394.94,4,2


In [22]:
# Create labels for MonetaryValue
m_labels = range(1, 5)
# Assign these labels to three equal percentile groups 
m_groups = pd.qcut(data_process['MonetaryValue'], q=4, labels=m_labels)
# Create new column M
data_process = data_process.assign(M = m_groups.values)

In [23]:
# Calculate RFM_Score
data_process['RFM_Score'] = data_process[['R','F','M']].sum(axis=1)
# data_process['RFM_Score'] = data_process[['Recency', 'Frequency', 'MonetaryValue']].sum(axis=1)
# data_process['RFM_Score'] = data_process[['R']].sum(axis=1)
# data_process['RFM_Score'] = data_process.apply(lambda x: 5 * x['R'] + 4 * x['F'] + 1 * x['M'], axis=1)
data_process

Unnamed: 0_level_0,Recency,Frequency,MonetaryValue,R,F,M,RFM_Score
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,8,11,3018.090000,4,4,3,11.0
2,129,3,2226.260000,1,1,2,4.0
3,103,8,3362.810000,1,4,3,8.0
4,196,2,220.570000,1,1,1,3.0
5,17,6,2394.940000,4,2,2,8.0
...,...,...,...,...,...,...,...
3497,53,3,1648.320000,2,1,1,4.0
3498,128,6,3147.330000,1,2,3,6.0
3499,52,7,4955.250000,2,3,4,9.0
3500,145,6,1785.860000,1,2,1,4.0


In [24]:
data_process.describe()

Unnamed: 0,Recency,Frequency,MonetaryValue,RFM_Score
count,3494.0,3494.0,3494.0,3494.0
mean,62.383515,5.667716,3128.301078,7.223812
std,58.382418,2.311129,1770.536034,2.563981
min,1.0,1.0,15.08,3.0
25%,18.0,4.0,1841.37,5.0
50%,45.0,6.0,2861.98,7.0
75%,87.0,7.0,4183.1075,9.0
max,354.0,14.0,11668.95,12.0


We can set RFM_Score higher than 9 to be Tier 4, the second quantile to be Tier 3, and so on. The higher the tier, the better value the customer.

In [25]:
# for index, row in data_process.iterrows():
#     if row['RFM_Score'] >= 9:
#         data_process.at[index, 'Tier'] = 4
#     elif row['RFM_Score'] >= 7:
#         data_process.at[index, 'Tier'] = 3
#     elif row['RFM_Score'] >= 5:
#         data_process.at[index, 'Tier'] = 2
#     else:
#         data_process.at[index, 'Tier'] = 1

# data_process['Tier'] = data_process['RFM_Score']

m_labels = range(1, 5)
# Assign these labels to three equal percentile groups 
m_groups = pd.qcut(data_process['RFM_Score'], q=4, labels=m_labels)
# Create new column Tier
data_process = data_process.assign(Tier = m_groups.values)
data_process['Tier'] = data_process.Tier.cat.codes + 1
        
data_process.head()

Unnamed: 0_level_0,Recency,Frequency,MonetaryValue,R,F,M,RFM_Score,Tier
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,8,11,3018.09,4,4,3,11.0,4
2,129,3,2226.26,1,1,2,4.0,1
3,103,8,3362.81,1,4,3,8.0,3
4,196,2,220.57,1,1,1,3.0,1
5,17,6,2394.94,4,2,2,8.0,3


In [26]:
data_process.reset_index(level=0, inplace=True)
rfm = data_process[['customer_id', 'Tier']]
rfm.head()

Unnamed: 0,customer_id,Tier
0,1,4
1,2,1
2,3,3
3,4,1
4,5,3


In [27]:
rfm.describe()

Unnamed: 0,customer_id,Tier
count,3494.0,3494.0
mean,1750.856039,2.38838
std,1011.902531,1.119789
min,1.0,1.0
25%,876.25,1.0
50%,1750.5,2.0
75%,2624.75,3.0
max,5034.0,4.0


## Create Old Customer List as Training Data

In [28]:
df_old_customer_list = df_customer_demographic
df_old_customer_list = df_old_customer_list.merge(df_customer_address, how='left', on='customer_id')
df_old_customer_list.head()

Unnamed: 0,customer_id,first_name,last_name,gender,past_3_years_bike_related_purchases,DOB,job_title,job_industry_category,wealth_segment,deceased_indicator,default,owns_car,tenure,address,postcode,state,country,property_valuation
0,1,Laraine,Medendorp,F,93,1953-10-12,Executive Secretary,Health,Mass Customer,N,"""'",Yes,11.0,060 Morning Avenue,2016.0,New South Wales,Australia,10.0
1,2,Eli,Bockman,Male,81,1980-12-16,Administrative Officer,Financial Services,Mass Customer,N,<script>alert('hi')</script>,Yes,16.0,6 Meadow Vale Court,2153.0,New South Wales,Australia,10.0
2,3,Arlin,Dearle,Male,61,1954-01-20,Recruiting Manager,Property,Mass Customer,N,2018-02-01 00:00:00,Yes,15.0,,,,,
3,4,Talbot,,Male,33,1961-10-03,,IT,Mass Customer,N,() { _; } >_[$($())] { touch /tmp/blns.shellsh...,No,7.0,0 Holy Cross Court,4211.0,QLD,Australia,9.0
4,5,Sheila-kathryn,Calton,Female,56,1977-05-13,Senior Editor,,Affluent Customer,N,NIL,Yes,8.0,17979 Del Mar Point,2448.0,New South Wales,Australia,4.0


Merge number of transactions to the old customer list

In [29]:
df_old_customer_list = df_old_customer_list.merge(rfm, how='left', on='customer_id')
df_old_customer_list.head()

Unnamed: 0,customer_id,first_name,last_name,gender,past_3_years_bike_related_purchases,DOB,job_title,job_industry_category,wealth_segment,deceased_indicator,default,owns_car,tenure,address,postcode,state,country,property_valuation,Tier
0,1,Laraine,Medendorp,F,93,1953-10-12,Executive Secretary,Health,Mass Customer,N,"""'",Yes,11.0,060 Morning Avenue,2016.0,New South Wales,Australia,10.0,4.0
1,2,Eli,Bockman,Male,81,1980-12-16,Administrative Officer,Financial Services,Mass Customer,N,<script>alert('hi')</script>,Yes,16.0,6 Meadow Vale Court,2153.0,New South Wales,Australia,10.0,1.0
2,3,Arlin,Dearle,Male,61,1954-01-20,Recruiting Manager,Property,Mass Customer,N,2018-02-01 00:00:00,Yes,15.0,,,,,,3.0
3,4,Talbot,,Male,33,1961-10-03,,IT,Mass Customer,N,() { _; } >_[$($())] { touch /tmp/blns.shellsh...,No,7.0,0 Holy Cross Court,4211.0,QLD,Australia,9.0,1.0
4,5,Sheila-kathryn,Calton,Female,56,1977-05-13,Senior Editor,,Affluent Customer,N,NIL,Yes,8.0,17979 Del Mar Point,2448.0,New South Wales,Australia,4.0,3.0


In [30]:
df_old_customer_list.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4000 entries, 0 to 3999
Data columns (total 19 columns):
 #   Column                               Non-Null Count  Dtype         
---  ------                               --------------  -----         
 0   customer_id                          4000 non-null   int64         
 1   first_name                           4000 non-null   object        
 2   last_name                            3875 non-null   object        
 3   gender                               4000 non-null   object        
 4   past_3_years_bike_related_purchases  4000 non-null   int64         
 5   DOB                                  3913 non-null   datetime64[ns]
 6   job_title                            3494 non-null   object        
 7   job_industry_category                3344 non-null   object        
 8   wealth_segment                       4000 non-null   object        
 9   deceased_indicator                   4000 non-null   object        
 10  default     

Convert DOB to age with respect to year 2017

In [31]:
df_old_customer_list['DOB'] = pd.to_datetime(df_old_customer_list['DOB'], errors='coerce', format='%Y-%m-%d')

year_2017 = pd.Timestamp('2017-12-31')

df_old_customer_list['age'] = (year_2017 - df_old_customer_list['DOB']).astype('<m8[Y]')

# drop DOB
df_old_customer_list.drop(['DOB'], axis = 1, inplace=True)

df_old_customer_list.head()

Unnamed: 0,customer_id,first_name,last_name,gender,past_3_years_bike_related_purchases,job_title,job_industry_category,wealth_segment,deceased_indicator,default,owns_car,tenure,address,postcode,state,country,property_valuation,Tier,age
0,1,Laraine,Medendorp,F,93,Executive Secretary,Health,Mass Customer,N,"""'",Yes,11.0,060 Morning Avenue,2016.0,New South Wales,Australia,10.0,4.0,64.0
1,2,Eli,Bockman,Male,81,Administrative Officer,Financial Services,Mass Customer,N,<script>alert('hi')</script>,Yes,16.0,6 Meadow Vale Court,2153.0,New South Wales,Australia,10.0,1.0,37.0
2,3,Arlin,Dearle,Male,61,Recruiting Manager,Property,Mass Customer,N,2018-02-01 00:00:00,Yes,15.0,,,,,,3.0,63.0
3,4,Talbot,,Male,33,,IT,Mass Customer,N,() { _; } >_[$($())] { touch /tmp/blns.shellsh...,No,7.0,0 Holy Cross Court,4211.0,QLD,Australia,9.0,1.0,56.0
4,5,Sheila-kathryn,Calton,Female,56,Senior Editor,,Affluent Customer,N,NIL,Yes,8.0,17979 Del Mar Point,2448.0,New South Wales,Australia,4.0,3.0,40.0


## Data Cleaning for Both Old and New Customer List

### Drop Unecessary Columns

In [32]:
old_columns = df_old_customer_list.columns
new_columns = df_new_customer_list.columns

for col in old_columns:
    if col not in new_columns:
        print(col + ' in df_old_customer_list not exist in df_new_customer_list')
        
print()

for col in new_columns:
    if col not in old_columns:
        print(col + ' in df_new_customer_list not exist in df_old_customer_list')

customer_id in df_old_customer_list not exist in df_new_customer_list
default in df_old_customer_list not exist in df_new_customer_list
Tier in df_old_customer_list not exist in df_new_customer_list

Rank in df_new_customer_list not exist in df_old_customer_list
Value in df_new_customer_list not exist in df_old_customer_list


We should drop columns except customer_id and Tier. customer_id will be excluded from our analysis. Tier will be the y value for machine learning models.

In [33]:
df_old_customer_list.drop('default', axis=1, inplace=True)
df_new_customer_list.drop(['Rank', 'Value'], axis=1, inplace=True)

In [34]:
df_old_customer_list.head(2)

Unnamed: 0,customer_id,first_name,last_name,gender,past_3_years_bike_related_purchases,job_title,job_industry_category,wealth_segment,deceased_indicator,owns_car,tenure,address,postcode,state,country,property_valuation,Tier,age
0,1,Laraine,Medendorp,F,93,Executive Secretary,Health,Mass Customer,N,Yes,11.0,060 Morning Avenue,2016.0,New South Wales,Australia,10.0,4.0,64.0
1,2,Eli,Bockman,Male,81,Administrative Officer,Financial Services,Mass Customer,N,Yes,16.0,6 Meadow Vale Court,2153.0,New South Wales,Australia,10.0,1.0,37.0


In [35]:
df_new_customer_list.head(2)

Unnamed: 0,first_name,last_name,gender,past_3_years_bike_related_purchases,job_title,job_industry_category,wealth_segment,deceased_indicator,owns_car,tenure,address,postcode,state,country,property_valuation,age
0,Chickie,Brister,Male,86,General Manager,Manufacturing,Mass Customer,N,Yes,14,45 Shopko Center,4500,QLD,Australia,6,60.0
1,Morly,Genery,Male,69,Structural Engineer,Property,Mass Customer,N,No,16,14 Mccormick Park,2113,NSW,Australia,11,47.0


Drop categorical variables with more than 15 values.

In [36]:
len(df_old_customer_list['job_title'].unique())

196

`job_title` for both df need to be dropped

In [37]:
len(df_old_customer_list['job_industry_category'].unique())

10

In [38]:
len(df_new_customer_list['job_industry_category'].unique())

10

In [39]:
len(df_old_customer_list['wealth_segment'].unique())

3

In [40]:
len(df_new_customer_list['wealth_segment'].unique())

3

In [41]:
len(df_old_customer_list['address'].unique())

3994

`address` for both df need to be dropped

`postcode` need to be dropped too since `state` is enough as a address variable

In [42]:
df_old_customer_list.drop(['job_title', 'address', 'postcode'], axis=1, inplace=True)
df_new_customer_list.drop(['job_title', 'address', 'postcode'], axis=1, inplace=True)

In [43]:
df_old_customer_list.sample(1)

Unnamed: 0,customer_id,first_name,last_name,gender,past_3_years_bike_related_purchases,job_industry_category,wealth_segment,deceased_indicator,owns_car,tenure,state,country,property_valuation,Tier,age
1834,1835,Rodrigo,Felce,Male,48,Financial Services,Affluent Customer,N,Yes,1.0,NSW,Australia,10.0,2.0,24.0


In [44]:
df_new_customer_list.sample(1)

Unnamed: 0,first_name,last_name,gender,past_3_years_bike_related_purchases,job_industry_category,wealth_segment,deceased_indicator,owns_car,tenure,state,country,property_valuation,age
566,Virginia,De Antoni,Female,17,Telecommunications,High Net Worth,N,Yes,12,NSW,Australia,10,53.0


In [45]:
df_old_customer_list['country'].unique()

array(['Australia', nan], dtype=object)

In [46]:
df_new_customer_list['country'].unique()

array(['Australia'], dtype=object)

All data should be collected in Australia. We can remove the `country` column

In [47]:
df_old_customer_list.drop(['country'], axis=1, inplace=True)
df_new_customer_list.drop(['country'], axis=1, inplace=True)

For `deceased_indicator`, we should remove the customer if deceased, then drop column.

In [48]:
df_old_customer_list['deceased_indicator'].value_counts()

N    3998
Y       2
Name: deceased_indicator, dtype: int64

In [49]:
df_old_customer_list.drop(df_old_customer_list[df_old_customer_list['deceased_indicator'] == 'Y'].index, inplace=True)
df_old_customer_list['deceased_indicator'].value_counts()

N    3998
Name: deceased_indicator, dtype: int64

In [50]:
df_new_customer_list['deceased_indicator'].value_counts()

N    1000
Name: deceased_indicator, dtype: int64

In [51]:
df_old_customer_list.drop(['deceased_indicator'], axis=1, inplace=True)
df_new_customer_list.drop(['deceased_indicator'], axis=1, inplace=True)

In [52]:
df_old_customer_list['job_industry_category'].value_counts()

Manufacturing         799
Financial Services    774
Health                601
Retail                357
Property              267
IT                    223
Entertainment         136
Argiculture           113
Telecommunications     72
Name: job_industry_category, dtype: int64

In [53]:
df_old_customer_list[df_old_customer_list['job_industry_category'].isnull()].count()

customer_id                            656
first_name                             656
last_name                              631
gender                                 656
past_3_years_bike_related_purchases    656
job_industry_category                    0
wealth_segment                         656
owns_car                               656
tenure                                 656
state                                  655
property_valuation                     655
Tier                                   561
age                                    656
dtype: int64

We have too many null values in this column. We can simply drop the column.

In [54]:
df_old_customer_list.drop(['job_industry_category'], axis=1, inplace=True)
df_new_customer_list.drop(['job_industry_category'], axis=1, inplace=True)

### Feature Engineering

In [55]:
df_old_customer_list.isnull().sum()

customer_id                              0
first_name                               0
last_name                              125
gender                                   0
past_3_years_bike_related_purchases      0
wealth_segment                           0
owns_car                                 0
tenure                                  87
state                                    4
property_valuation                       4
Tier                                   506
age                                     87
dtype: int64

For the missing Tier, that means 506 customers do not have transaction data in 2017. It is either caused by incomplete data, or they did not make any purchases. For now, we assume the transaction data is complete and make them Tier 0, for no purchase made.

In [56]:
# values = {'Tier': 0}
# df_old_customer_list.fillna(value=values, inplace=True)
# df_old_customer_list.isnull().sum()

df_old_customer_list.drop(df_old_customer_list[df_old_customer_list['Tier'].isnull()].index, inplace=True)
df_old_customer_list.isnull().sum()

customer_id                              0
first_name                               0
last_name                              112
gender                                   0
past_3_years_bike_related_purchases      0
wealth_segment                           0
owns_car                                 0
tenure                                  76
state                                    4
property_valuation                       4
Tier                                     0
age                                     76
dtype: int64

In [57]:
df_old_customer_list.head()

Unnamed: 0,customer_id,first_name,last_name,gender,past_3_years_bike_related_purchases,wealth_segment,owns_car,tenure,state,property_valuation,Tier,age
0,1,Laraine,Medendorp,F,93,Mass Customer,Yes,11.0,New South Wales,10.0,4.0,64.0
1,2,Eli,Bockman,Male,81,Mass Customer,Yes,16.0,New South Wales,10.0,1.0,37.0
2,3,Arlin,Dearle,Male,61,Mass Customer,Yes,15.0,,,3.0,63.0
3,4,Talbot,,Male,33,Mass Customer,No,7.0,QLD,9.0,1.0,56.0
4,5,Sheila-kathryn,Calton,Female,56,Affluent Customer,Yes,8.0,New South Wales,4.0,3.0,40.0


In [58]:
df_new_customer_list.isnull().sum()

first_name                              0
last_name                              29
gender                                  0
past_3_years_bike_related_purchases     0
wealth_segment                          0
owns_car                                0
tenure                                  0
state                                   0
property_valuation                      0
age                                    17
dtype: int64

`tenure` and `age` missing values can be filled with mean values.

In [59]:
df_old_customer_list.fillna(df_old_customer_list.mean(), inplace=True)
df_old_customer_list.isnull().sum()

customer_id                              0
first_name                               0
last_name                              112
gender                                   0
past_3_years_bike_related_purchases      0
wealth_segment                           0
owns_car                                 0
tenure                                   0
state                                    4
property_valuation                       0
Tier                                     0
age                                      0
dtype: int64

In [60]:
df_new_customer_list.fillna(df_new_customer_list.mean(), inplace=True)
df_new_customer_list.isnull().sum()

first_name                              0
last_name                              29
gender                                  0
past_3_years_bike_related_purchases     0
wealth_segment                          0
owns_car                                0
tenure                                  0
state                                   0
property_valuation                      0
age                                     0
dtype: int64

Missing state

In [61]:
df_old_customer_list[df_old_customer_list['state'].isnull()]

Unnamed: 0,customer_id,first_name,last_name,gender,past_3_years_bike_related_purchases,wealth_segment,owns_car,tenure,state,property_valuation,Tier,age
2,3,Arlin,Dearle,Male,61,Mass Customer,Yes,15.0,,7.517202,3.0,63.0
9,10,Fiorenze,Birdall,Female,49,Mass Customer,Yes,20.0,,7.517202,3.0,29.0
21,22,Deeanne,Durtnell,Female,79,Mass Customer,No,11.0,,7.517202,3.0,55.0
22,23,Olav,Polak,Male,43,High Net Worth,Yes,1.0,,7.517202,4.0,22.0


In [62]:
df_old_customer_list['state'].value_counts()

NSW                1779
VIC                 798
QLD                 743
New South Wales      86
Victoria             82
Name: state, dtype: int64

Value of NSW is far more than VIC and QLD, even after consistency adjustment. we can replace nan with NSW.

In [63]:
values = {'state': 'NSW'}
df_old_customer_list.fillna(value=values, inplace=True)
df_old_customer_list.isnull().sum()

customer_id                              0
first_name                               0
last_name                              112
gender                                   0
past_3_years_bike_related_purchases      0
wealth_segment                           0
owns_car                                 0
tenure                                   0
state                                    0
property_valuation                       0
Tier                                     0
age                                      0
dtype: int64

In [64]:
df_new_customer_list.isnull().sum()

first_name                              0
last_name                              29
gender                                  0
past_3_years_bike_related_purchases     0
wealth_segment                          0
owns_car                                0
tenure                                  0
state                                   0
property_valuation                      0
age                                     0
dtype: int64

### Consistency Check

In [65]:
df_old_customer_list.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3492 entries, 0 to 3499
Data columns (total 12 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   customer_id                          3492 non-null   int64  
 1   first_name                           3492 non-null   object 
 2   last_name                            3380 non-null   object 
 3   gender                               3492 non-null   object 
 4   past_3_years_bike_related_purchases  3492 non-null   int64  
 5   wealth_segment                       3492 non-null   object 
 6   owns_car                             3492 non-null   object 
 7   tenure                               3492 non-null   float64
 8   state                                3492 non-null   object 
 9   property_valuation                   3492 non-null   float64
 10  Tier                                 3492 non-null   float64
 11  age                           

In [66]:
df_new_customer_list.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   first_name                           1000 non-null   object 
 1   last_name                            971 non-null    object 
 2   gender                               1000 non-null   object 
 3   past_3_years_bike_related_purchases  1000 non-null   int64  
 4   wealth_segment                       1000 non-null   object 
 5   owns_car                             1000 non-null   object 
 6   tenure                               1000 non-null   int64  
 7   state                                1000 non-null   object 
 8   property_valuation                   1000 non-null   int64  
 9   age                                  1000 non-null   float64
dtypes: float64(1), int64(3), object(6)
memory usage: 78.2+ KB


#### Gender

In [67]:
df_old_customer_list['gender'].unique()

array(['F', 'Male', 'Female', 'U', 'Femal', 'M'], dtype=object)

In [68]:
df_new_customer_list['gender'].unique()

array(['Male', 'Female', 'U'], dtype=object)

We should make gender consistent with 'F', 'M', and 'U'

In [69]:
values = {'Male': 'M', 'Female': 'F', 'Femal': 'F'}
df_old_customer_list.replace(to_replace=values, inplace=True)
df_old_customer_list['gender'].unique()

array(['F', 'M', 'U'], dtype=object)

In [70]:
df_new_customer_list.replace(to_replace=values, inplace=True)
df_new_customer_list['gender'].unique()

array(['M', 'F', 'U'], dtype=object)

#### Wealth Segment

In [71]:
df_old_customer_list['wealth_segment'].unique()

array(['Mass Customer', 'Affluent Customer', 'High Net Worth'],
      dtype=object)

In [72]:
df_new_customer_list['wealth_segment'].unique()

array(['Mass Customer', 'Affluent Customer', 'High Net Worth'],
      dtype=object)

#### Owns Car

In [73]:
df_old_customer_list['owns_car'].unique()

array(['Yes', 'No'], dtype=object)

In [74]:
df_new_customer_list['owns_car'].unique()

array(['Yes', 'No'], dtype=object)

#### State

In [75]:
df_old_customer_list['state'].unique()

array(['New South Wales', 'NSW', 'QLD', 'VIC', 'Victoria'], dtype=object)

In [76]:
df_new_customer_list['state'].unique()

array(['QLD', 'NSW', 'VIC'], dtype=object)

We should make state consistent with "NSW", "QLD", and "VIC"

In [77]:
values = {'New South Wales': 'NSW', 'Victoria': 'VIC'}
df_old_customer_list.replace(to_replace=values, inplace=True)
df_old_customer_list['state'].unique()

array(['NSW', 'QLD', 'VIC'], dtype=object)

## Data Modeling

### One-Hot Encoding

First we concat the two dfs.

In [78]:
df_old_customer_list.sample()

Unnamed: 0,customer_id,first_name,last_name,gender,past_3_years_bike_related_purchases,wealth_segment,owns_car,tenure,state,property_valuation,Tier,age
2971,2972,Deena,Burnsides,F,11,High Net Worth,No,17.0,NSW,9.0,1.0,61.0


In [79]:
df_new_customer_list.sample()

Unnamed: 0,first_name,last_name,gender,past_3_years_bike_related_purchases,wealth_segment,owns_car,tenure,state,property_valuation,age
393,Packston,Wackett,M,10,Mass Customer,No,19,VIC,9,67.0


In [80]:
df_old_customer_list['new_customer'] = 0
df_old_customer_list.sample()

Unnamed: 0,customer_id,first_name,last_name,gender,past_3_years_bike_related_purchases,wealth_segment,owns_car,tenure,state,property_valuation,Tier,age,new_customer
505,506,Brett,Scrancher,F,43,Mass Customer,No,21.0,VIC,10.0,4.0,32.0,0


In [81]:
df_new_customer_list['new_customer'] = 1
df_new_customer_list.sample()

Unnamed: 0,first_name,last_name,gender,past_3_years_bike_related_purchases,wealth_segment,owns_car,tenure,state,property_valuation,age,new_customer
319,Zach,Hedman,M,87,Affluent Customer,Yes,4,NSW,9,36.0,1


In [82]:
df_old_customer_list.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3492 entries, 0 to 3499
Data columns (total 13 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   customer_id                          3492 non-null   int64  
 1   first_name                           3492 non-null   object 
 2   last_name                            3380 non-null   object 
 3   gender                               3492 non-null   object 
 4   past_3_years_bike_related_purchases  3492 non-null   int64  
 5   wealth_segment                       3492 non-null   object 
 6   owns_car                             3492 non-null   object 
 7   tenure                               3492 non-null   float64
 8   state                                3492 non-null   object 
 9   property_valuation                   3492 non-null   float64
 10  Tier                                 3492 non-null   float64
 11  age                           

In [83]:
df_new_customer_list.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 11 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   first_name                           1000 non-null   object 
 1   last_name                            971 non-null    object 
 2   gender                               1000 non-null   object 
 3   past_3_years_bike_related_purchases  1000 non-null   int64  
 4   wealth_segment                       1000 non-null   object 
 5   owns_car                             1000 non-null   object 
 6   tenure                               1000 non-null   int64  
 7   state                                1000 non-null   object 
 8   property_valuation                   1000 non-null   int64  
 9   age                                  1000 non-null   float64
 10  new_customer                         1000 non-null   int64  
dtypes: float64(1), int64(4), object

In [84]:
df_combine = pd.concat([df_old_customer_list, df_new_customer_list])
df_combine.sample(5)

Unnamed: 0,customer_id,first_name,last_name,gender,past_3_years_bike_related_purchases,wealth_segment,owns_car,tenure,state,property_valuation,Tier,age,new_customer
1563,1564.0,Sascha,Tander,F,42,Mass Customer,No,16.0,NSW,10.0,2.0,53.0,0
101,102.0,Langsdon,Tranfield,M,51,Affluent Customer,No,16.0,NSW,9.0,3.0,61.0,0
2427,2428.0,Pauly,Keightley,M,74,Mass Customer,Yes,9.0,VIC,7.0,4.0,49.0,0
474,,Laurie,,M,31,Mass Customer,Yes,15.0,NSW,10.0,,38.0,1
220,221.0,Mara,Bloore,F,14,Mass Customer,No,14.0,NSW,8.0,3.0,37.0,0


In [85]:
# df_combine.drop('state', axis=1, inplace=True)
df_combine.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4492 entries, 0 to 999
Data columns (total 13 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   customer_id                          3492 non-null   float64
 1   first_name                           4492 non-null   object 
 2   last_name                            4351 non-null   object 
 3   gender                               4492 non-null   object 
 4   past_3_years_bike_related_purchases  4492 non-null   int64  
 5   wealth_segment                       4492 non-null   object 
 6   owns_car                             4492 non-null   object 
 7   tenure                               4492 non-null   float64
 8   state                                4492 non-null   object 
 9   property_valuation                   4492 non-null   float64
 10  Tier                                 3492 non-null   float64
 11  age                            

In [86]:
df_combine.isnull().sum()

customer_id                            1000
first_name                                0
last_name                               141
gender                                    0
past_3_years_bike_related_purchases       0
wealth_segment                            0
owns_car                                  0
tenure                                    0
state                                     0
property_valuation                        0
Tier                                   1000
age                                       0
new_customer                              0
dtype: int64

Then we do one-hot encoding on the combined df.

In [87]:
# Get list of categorical variables
s = (df_combine.dtypes == 'object')
object_cols = list(s[s].index)
object_cols.remove('first_name')
object_cols.remove('last_name')

print("Categorical variables:")
print(object_cols)

Categorical variables:
['gender', 'wealth_segment', 'owns_car', 'state']


In [88]:
# Apply one-hot encoder to each column with categorical data
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
df_OH_cols = pd.DataFrame(OH_encoder.fit_transform(df_combine[object_cols]))

# One-hot encoding removed index; put it back
df_OH_cols.index = df_combine.index

# Remove categorical columns (will replace with one-hot encoding)
num_df_combined = df_combine.drop(object_cols, axis=1)

# Add one-hot encoded columns to numerical features
df_OH_combined = pd.concat([num_df_combined, df_OH_cols], axis=1)

df_OH_combined.sample(10)

Unnamed: 0,customer_id,first_name,last_name,past_3_years_bike_related_purchases,tenure,property_valuation,Tier,age,new_customer,0,1,2,3,4,5,6,7,8,9,10
668,669.0,Fleur,Whittlesea,73,3.0,7.0,4.0,37.0,0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0
2641,2642.0,Arabelle,Rentelll,36,10.683841,5.0,2.0,39.864754,0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
371,372.0,Moina,Thumim,19,4.0,8.0,3.0,30.0,0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
727,,Son,Varney,75,10.0,7.0,,24.0,1,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0
646,647.0,Stanislas,Baildon,61,10.0,9.0,4.0,60.0,0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
2346,2347.0,L;urette,Annott,79,11.0,12.0,3.0,60.0,0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
736,,Fancie,Woofendell,68,6.0,7.0,,51.0,1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
478,,Aloysius,Killingsworth,89,12.0,9.0,,60.0,1,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0
2311,2312.0,Ronnica,Grebner,72,18.0,2.0,3.0,38.0,0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0
1439,1440.0,Consuela,O'Logan,19,12.0,9.0,3.0,52.0,0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0


Seperate the combined df to old customer and new customer data.

In [89]:
df_old_customer_OH = df_OH_combined[df_OH_combined['new_customer'] == 0]
df_old_customer_OH.drop('new_customer', axis=1, inplace=True)
df_old_customer_OH

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


Unnamed: 0,customer_id,first_name,last_name,past_3_years_bike_related_purchases,tenure,property_valuation,Tier,age,0,1,2,3,4,5,6,7,8,9,10
0,1.0,Laraine,Medendorp,93,11.0,10.000000,4.0,64.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0
1,2.0,Eli,Bockman,81,16.0,10.000000,1.0,37.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0
2,3.0,Arlin,Dearle,61,15.0,7.517202,3.0,63.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0
3,4.0,Talbot,,33,7.0,9.000000,1.0,56.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0
4,5.0,Sheila-kathryn,Calton,56,8.0,4.000000,3.0,40.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3495,3496.0,Danya,Burnyeat,99,19.0,9.000000,1.0,31.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0
3496,3497.0,Thia,O'Day,73,18.0,5.000000,1.0,31.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
3497,3498.0,Lois,Abrahim,28,5.0,4.000000,2.0,22.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0
3498,3499.0,Shelton,Tewkesberrie,29,7.0,9.000000,3.0,38.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0


In [90]:
df_new_customer_OH = df_OH_combined[df_OH_combined['new_customer'] == 1]
df_new_customer_OH.drop(['customer_id', 'Tier', 'new_customer'], axis=1, inplace=True)
df_new_customer_OH

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


Unnamed: 0,first_name,last_name,past_3_years_bike_related_purchases,tenure,property_valuation,age,0,1,2,3,4,5,6,7,8,9,10
0,Chickie,Brister,86,14.0,6.0,60.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
1,Morly,Genery,69,16.0,11.0,47.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0
2,Ardelis,Forrester,10,10.0,5.0,43.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
3,Lucine,Stutt,64,5.0,1.0,38.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
4,Melinda,Hadlee,34,19.0,9.0,52.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,Ferdinand,Romanetti,60,9.0,7.0,58.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
996,Burk,Wortley,22,6.0,10.0,16.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0
997,Melloney,Temby,17,15.0,2.0,63.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
998,Dickie,Cubbini,30,19.0,2.0,65.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0


### Train Test Split

In [91]:
features = df_old_customer_OH.columns.tolist()
features.remove('first_name')
features.remove('last_name')
features.remove('customer_id')
features.remove('Tier')
features.remove('tenure')

features.remove('age')

features

['past_3_years_bike_related_purchases',
 'property_valuation',
 0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10]

In [92]:
X = df_old_customer_OH[features]
y = df_old_customer_OH['Tier']

pre_X = df_new_customer_OH[features]

train_X, val_X, train_y, val_y = train_test_split(X, y, test_size = 0.2, random_state = 2)

### Random Forest Model

In [93]:
forest_model = RandomForestRegressor(n_estimators=100, max_depth=7, random_state=1)
# forest_model = RandomForestRegressor(n_estimators=5000, max_depth=1, random_state=1)
forest_model.fit(train_X, train_y)
preds = forest_model.predict(val_X)
print("Mean Absolute Error: " + str(mean_absolute_error(val_y, preds)))

Mean Absolute Error: 0.9876917078699143


In [94]:
from sklearn.model_selection import cross_val_score

# Multiply by -1 since sklearn calculates *negative* MAE
scores = -1 * cross_val_score(forest_model, X, y,
                              cv=5,
                              scoring='neg_mean_absolute_error')

print("MAE scores:\n", scores)

MAE scores:
 [0.97087812 1.00133922 0.98282984 1.04100551 0.9802961 ]
