In [1]:
!pip install pandas
!pip install scikit-learn



In [2]:
import pandas as pd
from scipy.stats import zscore

## Read in data

In [3]:
# Load the customer purchase sample; the relative path is resolved against the
# notebook's current working directory.
df = pd.read_csv("customer_purchase_sample.csv")

## Fill in missing values

### Check for missing values

In [4]:
# Keep only the rows that contain at least one missing value, then display them.
df_na = df.loc[df.isna().any(axis=1)]
df_na

Unnamed: 0,age,income,location,device,referral_source,campaign,pages_viewed,time_on_site_sec,is_returning_user,prior_purchases,days_since_last_visit,emails_opened_30d,ad_clicks_30d,purchased
10,48,,South,desktop,search,spring_sale,10,7200.0,1,0,3,0,0,1
35,21,,West,mobile,search,none,3,256.0,1,0,22,2,2,0
42,57,57912.18,South,mobile,search,new_arrivals,9,,1,2,3,0,2,1
51,29,,Northeast,mobile,search,spring_sale,8,83.0,1,1,4,0,0,0
55,20,81440.85,,desktop,search,none,3,107.0,1,0,4,1,1,0
69,44,,Midwest,desktop,search,spring_sale,12,83.0,1,0,53,2,1,0
89,65,,South,desktop,direct,none,8,96.0,0,1,2,3,0,0
99,25,90297.2,,mobile,search,none,7,164.0,1,1,18,1,1,1
102,63,91846.85,South,mobile,search,loyalty_push,7,,0,0,4,1,1,1
104,63,,South,desktop,search,spring_sale,10,,1,1,11,0,1,0


In [5]:
# Preview the first 12 rows of the loaded data, before any imputation.
df.head(12)

Unnamed: 0,age,income,location,device,referral_source,campaign,pages_viewed,time_on_site_sec,is_returning_user,prior_purchases,days_since_last_visit,emails_opened_30d,ad_clicks_30d,purchased
0,23,125775.63,Northeast,mobile,email,new_arrivals,8,115.0,1,2,47,0,0,0
1,62,110053.34,Northeast,desktop,direct,none,9,249.0,1,1,4,6,1,1
2,55,95706.13,West,mobile,social,none,9,154.0,0,1,12,0,1,1
3,43,128899.09,Midwest,tablet,social,none,4,133.0,1,2,1,0,0,0
4,42,49337.11,Northeast,mobile,direct,none,6,306.0,0,0,5,0,0,0
5,66,65807.45,Northeast,mobile,search,none,6,85.0,1,0,14,0,0,0
6,22,57202.72,Midwest,desktop,search,spring_sale,8,108.0,0,0,45,1,1,0
7,57,73305.71,South,mobile,search,spring_sale,8,199.0,0,0,49,2,1,0
8,29,43699.42,West,mobile,email,none,6,215.0,1,0,0,1,0,1
9,23,104054.53,South,mobile,direct,spring_sale,5,224.0,1,0,14,0,0,0


Get the mode (most common value) of the `location` column

In [6]:
# Series.mode() returns every value tied for most frequent, in sorted order;
# [0] takes the first of them.
location_mode = df["location"].mode()[0]
location_mode

'Northeast'

Fill missing `income` and `time_on_site_sec` values with the column mean, and missing `location` values with the mode

In [7]:
# income and time_on_site_sec gaps take their own column mean;
# location gaps take the modal location computed earlier.
for mean_filled_col in ("income", "time_on_site_sec"):
    df[mean_filled_col] = df[mean_filled_col].fillna(df[mean_filled_col].mean())
df["location"] = df["location"].fillna(location_mode)

In [8]:
# Re-run the missing-value check: display any rows that still contain a NaN.
df.loc[df.isna().any(axis=1)]

Unnamed: 0,age,income,location,device,referral_source,campaign,pages_viewed,time_on_site_sec,is_returning_user,prior_purchases,days_since_last_visit,emails_opened_30d,ad_clicks_30d,purchased


## Scale numeric variables

In [9]:
# Replace each listed column with its z-score, one column at a time.
for scaled_col in (
    'income',
    'pages_viewed',
    'time_on_site_sec',
    'prior_purchases',
    'days_since_last_visit',
    'emails_opened_30d',
    'ad_clicks_30d',
):
    df[scaled_col] = zscore(df[scaled_col])

In [10]:
# Preview the first 12 rows after the z-score scaling step.
df.head(12)

Unnamed: 0,age,income,location,device,referral_source,campaign,pages_viewed,time_on_site_sec,is_returning_user,prior_purchases,days_since_last_visit,emails_opened_30d,ad_clicks_30d,purchased
0,23,1.254536,Northeast,mobile,email,new_arrivals,0.077357,-0.22235,1,1.685035,2.240592,-1.025078,-0.865744,0
1,62,0.7535704,Northeast,desktop,direct,none,0.245524,0.07471,1,0.427546,-0.724248,4.066371,0.285511,1
2,55,0.2964196,West,mobile,social,none,0.245524,-0.135892,0,0.427546,-0.17265,-1.025078,0.285511,1
3,43,1.35406,Midwest,tablet,social,none,-0.595312,-0.182446,1,1.685035,-0.931098,-1.025078,-0.865744,0
4,42,-1.181055,Northeast,mobile,direct,none,-0.258978,0.201071,0,-0.829943,-0.655299,-1.025078,-0.865744,0
5,66,-0.6562538,Northeast,mobile,search,none,-0.258978,-0.288855,1,-0.829943,-0.034751,-1.025078,-0.865744,0
6,22,-0.9304296,Midwest,desktop,search,spring_sale,0.077357,-0.237868,0,-0.829943,2.102693,-0.176504,0.285511,0
7,57,-0.4173338,South,mobile,search,spring_sale,0.077357,-0.036133,0,-0.829943,2.378492,0.672071,0.285511,0
8,29,-1.360691,West,mobile,email,none,-0.258978,-0.000663,1,-0.829943,-1.000048,-0.176504,-0.865744,1
9,23,0.562428,South,mobile,direct,spring_sale,-0.427145,0.019289,1,-0.829943,-0.034751,-1.025078,-0.865744,0


## One-hot encode categorical data

In [11]:
def one_hot(df, cols):
    """
    Append one-hot indicator columns for the given columns.

    Each column named in ``cols`` is expanded with ``pd.get_dummies`` into one
    indicator column per distinct value, named ``<column>_<value>``. The
    indicator columns are appended to the right of the existing columns, in
    the order given by ``cols``. The source columns themselves are kept, and
    the input frame is not modified.

    @param df pandas DataFrame
    @param cols a list of columns to encode
    @return a new DataFrame holding the original columns plus the indicators
    @raise KeyError if a name in ``cols`` is not a column of ``df``
    """
    # Build every indicator block first and concatenate once, instead of
    # re-copying the growing frame on each loop iteration.
    dummies = [
        pd.get_dummies(df[each], prefix=each, drop_first=False) for each in cols
    ]
    return pd.concat([df, *dummies], axis=1)

In [12]:
# one_hot returns a new frame and leaves df unchanged, so bind the result to
# its own name; otherwise the encoding is only displayed and then lost.
df_encoded = one_hot(df, ["location", "device", "referral_source", "campaign"])
df_encoded

Unnamed: 0,age,income,location,device,referral_source,campaign,pages_viewed,time_on_site_sec,is_returning_user,prior_purchases,...,device_tablet,referral_source_ads,referral_source_direct,referral_source_email,referral_source_search,referral_source_social,campaign_loyalty_push,campaign_new_arrivals,campaign_none,campaign_spring_sale
0,23,1.254536,Northeast,mobile,email,new_arrivals,0.077357,-0.222350,1,1.685035,...,False,False,False,True,False,False,False,True,False,False
1,62,0.753570,Northeast,desktop,direct,none,0.245524,0.074710,1,0.427546,...,False,False,True,False,False,False,False,False,True,False
2,55,0.296420,West,mobile,social,none,0.245524,-0.135892,0,0.427546,...,False,False,False,False,False,True,False,False,True,False
3,43,1.354060,Midwest,tablet,social,none,-0.595312,-0.182446,1,1.685035,...,True,False,False,False,False,True,False,False,True,False
4,42,-1.181055,Northeast,mobile,direct,none,-0.258978,0.201071,0,-0.829943,...,False,False,True,False,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
245,22,0.148671,South,desktop,search,spring_sale,-0.763479,-0.217916,1,-0.829943,...,False,False,False,False,True,False,False,False,False,True
246,61,0.762435,West,mobile,search,none,-0.258978,0.221023,0,-0.829943,...,False,False,False,False,True,False,False,False,True,False
247,41,-0.033792,West,tablet,search,none,-0.931647,0.070276,0,-0.829943,...,True,False,False,False,True,False,False,False,True,False
248,62,1.225642,Northeast,desktop,social,none,0.750026,0.050325,1,-0.829943,...,False,False,False,False,False,True,False,False,True,False
