In [1]:
import os

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
pd.set_option('display.max_columns', None),
pd.set_option('display.float_format', lambda x: '%.2f' % x)

# Import Data

In [3]:
df = pd.read_csv('../data/online_shoppers_intention.csv')

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12330 entries, 0 to 12329
Data columns (total 18 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Administrative           12330 non-null  int64  
 1   Administrative_Duration  12330 non-null  float64
 2   Informational            12330 non-null  int64  
 3   Informational_Duration   12330 non-null  float64
 4   ProductRelated           12330 non-null  int64  
 5   ProductRelated_Duration  12330 non-null  float64
 6   BounceRates              12330 non-null  float64
 7   ExitRates                12330 non-null  float64
 8   PageValues               12330 non-null  float64
 9   SpecialDay               12330 non-null  float64
 10  Month                    12330 non-null  object 
 11  OperatingSystems         12330 non-null  int64  
 12  Browser                  12330 non-null  int64  
 13  Region                   12330 non-null  int64  
 14  TrafficType           

In [5]:
# check target variable distribution
df.Revenue.value_counts()

False    10422
True      1908
Name: Revenue, dtype: int64

The target variable is highly unbalanced, so we will have to use some pre-processing techniques that will hopefully provide us with better performance in the machine learning models.
  
Helpful Links: 
* https://machinelearningmastery.com/what-is-imbalanced-classification/  
* https://machinelearningmastery.com/framework-for-imbalanced-classification-projects/


In [6]:
# Check for null values
df.isnull().sum()

Administrative             0
Administrative_Duration    0
Informational              0
Informational_Duration     0
ProductRelated             0
ProductRelated_Duration    0
BounceRates                0
ExitRates                  0
PageValues                 0
SpecialDay                 0
Month                      0
OperatingSystems           0
Browser                    0
Region                     0
TrafficType                0
VisitorType                0
Weekend                    0
Revenue                    0
dtype: int64

In [8]:
# force column names to lowercase
df.columns = df.columns.str.lower()

There are no null values in the dataframe. 

### Next Steps for machine learning preparation -- 
1) determine with columns are categorical, and 
2) which columns are numeric. 

In [9]:
df.sample(5)

Unnamed: 0,administrative,administrative_duration,informational,informational_duration,productrelated,productrelated_duration,bouncerates,exitrates,pagevalues,specialday,month,operatingsystems,browser,region,traffictype,visitortype,weekend,revenue
7511,4,174.8,0,0.0,16,237.5,0.0,0.01,0.0,0.0,Sep,3,2,4,2,New_Visitor,False,False
11544,0,0.0,0,0.0,13,342.92,0.0,0.01,0.0,0.0,Nov,3,2,3,2,New_Visitor,True,False
1918,1,44.0,0,0.0,25,773.47,0.0,0.03,10.36,0.0,Mar,2,2,1,2,Returning_Visitor,True,True
7740,0,0.0,0,0.0,11,1855.6,0.03,0.05,0.0,0.0,Sep,2,2,1,1,Returning_Visitor,False,False
10161,1,0.0,0,0.0,56,2627.76,0.02,0.03,0.0,0.0,Nov,3,2,1,2,Returning_Visitor,True,True


According to the UCI dataset page, the dataset structure is as follows:

**Categorical:**
* `month`
* `operatingsystems`
* `browser`
* `region`
* `traffictype`
* `visitortype`
* `weekend`
* `revenue` (Target Feature)
  
**Numeric:**
* `administrative` - number of admin pages visited
* `informational`
* `productrelated`
* `administrative_duration`
* `informational_duration`
* `productrelated_duration`
* `bouncerates` (percent)
* `exitrates` (percent)
* `pagevalues` - definition somewhat ambigous
* `specialday` - numeric representation of proximity to holiday


In [14]:
# convert desired fields to categorical
df[['month','operatingsystems','browser','region','traffictype','visitortype','weekend','revenue']] = df[['month','operatingsystems','browser','region','traffictype','visitortype','weekend','revenue']].apply(lambda x: x.astype('category'))

In [20]:
# convert some fields to integers
df[['administrative','informational','productrelated']] = df[['administrative','informational','productrelated']].apply(lambda x: x.astype('int'))

# convert remaining fields to floats
df[['administrative_duration','informational_duration','productrelated_duration','bouncerates','exitrates','pagevalues','specialday']] = df[['administrative_duration','informational_duration','productrelated_duration','bouncerates','exitrates','pagevalues','specialday']].apply(lambda x: x.astype('float'))

In [22]:
# verify output matches desired transformations
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12330 entries, 0 to 12329
Data columns (total 18 columns):
 #   Column                   Non-Null Count  Dtype   
---  ------                   --------------  -----   
 0   administrative           12330 non-null  int32   
 1   administrative_duration  12330 non-null  float64 
 2   informational            12330 non-null  int32   
 3   informational_duration   12330 non-null  float64 
 4   productrelated           12330 non-null  int32   
 5   productrelated_duration  12330 non-null  float64 
 6   bouncerates              12330 non-null  float64 
 7   exitrates                12330 non-null  float64 
 8   pagevalues               12330 non-null  float64 
 9   specialday               12330 non-null  float64 
 10  month                    12330 non-null  category
 11  operatingsystems         12330 non-null  category
 12  browser                  12330 non-null  category
 13  region                   12330 non-null  category
 14  traffi

In [24]:
df.sample(5)

Unnamed: 0,administrative,administrative_duration,informational,informational_duration,productrelated,productrelated_duration,bouncerates,exitrates,pagevalues,specialday,month,operatingsystems,browser,region,traffictype,visitortype,weekend,revenue
7643,4,94.23,2,72.0,145,2935.26,0.0,0.01,0.27,0.0,Aug,2,2,2,1,Returning_Visitor,False,False
6002,2,91.6,2,678.0,66,2101.31,0.0,0.01,48.04,0.0,Oct,2,2,3,2,Returning_Visitor,False,True
10170,0,0.0,2,26.5,149,6001.17,0.0,0.02,0.0,0.0,Nov,2,2,1,2,Returning_Visitor,False,True
3729,7,397.83,0,0.0,28,1249.63,0.0,0.02,0.0,0.0,May,3,2,7,6,Returning_Visitor,False,False
5917,15,169.03,0,0.0,223,2477.54,0.0,0.0,0.18,0.0,Oct,2,2,7,3,Returning_Visitor,True,False


### Exploratory Data Analysis