# Online shoppers data sample examples
## This notebook contains some examples using NumPy, pandas and Matplotlib


Data Source: https://archive.ics.uci.edu/ml/datasets/Online+Shoppers+Purchasing+Intention+Dataset
        
Description: <a href="../data/Online%20shoppers%20-%20Description.pdf">Online shoppers - Description.pdf</a>

In [None]:
#import the tools: numpy, pandas and matplotlib
%matplotlib inline
import matplotlib.lines as mlines
import matplotlib.patches as mpatches
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [None]:
# Read the data into a pandas DataFrame and show the first five rows.
# Forward slashes keep the relative path portable: the previous "..\data\..."
# form only resolves on Windows and relies on "\d" and "\o", which are invalid
# string escape sequences.
data = pd.read_csv("../data/online_shoppers.csv")
data.head()

In [None]:
# Display the type of the object returned by pd.read_csv
type(data)

In [None]:
# Display the column labels of the loaded data
data.columns

## Numpy examples

In [None]:
# Helper that prints an array's attributes and, optionally, its contents.
def show(the_array, show_data = 0):
    """Print a summary of an array's attributes.

    One line each is printed for ndim, size, shape, dtype, itemsize and the
    data buffer. The array contents follow only when show_data is truthy;
    that part is off by default so large arrays are not dumped to the output.
    """
    report = (
        ("  Dimension: {:}", "ndim"),
        ("       Size: {:}", "size"),
        ("      Shape: {:}", "shape"),
        ("  Data Type: {:}", "dtype"),
        ("  Item Size: {:}", "itemsize"),
        ("Data Buffer: {:}", "data"),
    )
    # Each attribute is looked up right before its line is printed
    for template, attribute in report:
        print(template.format(getattr(the_array, attribute)))
    if show_data:
        print(the_array)

### Show some statistics of the attribute [Administrative]
#### "Administrative" is the number of account-management pages visited by the shopper, so we want to check how shoppers use this kind of page

In [None]:
# Select the [Administrative] column as a pandas Series; the cells below
# compute their statistics from it
ad = data['Administrative']

In [None]:
# Summary statistics of [Administrative], each computed with a NumPy reduction.
# Every entry pairs the output template with the reduction that fills it in.
numpy_stats = (
    ("Max value in Administrative  : {:0.2f}  ", np.max),
    ("Min value in Administrative  : {:0.2f} ", np.min),
    ("Mean value in Administrative : {:0.2f} ", np.mean),
    ("The std in Administrative    : {:0.2f} ", np.std),
)
for template, reducer in numpy_stats:
    print(template.format(reducer(ad)))


In [None]:
# Number of shoppers with a non-zero [Administrative] page count
visited_once = np.count_nonzero(ad)
print("{} shoppers at least enter Administrative page once ".format(visited_once))
# Number of shoppers whose [Administrative] page count is greater than 2
visited_more_than_twice = np.count_nonzero(ad>2)
print("{} shoppers at least enter Administrative page more than twice ".format(visited_more_than_twice))

### Transform several columns into a NumPy array and show the array attributes
#### After converting the columns into a NumPy array, we can use the manipulation methods provided by NumPy arrays

In [None]:
# Convert the two rate columns into a NumPy array and print its attributes
rate_columns = ['BounceRates','ExitRates']
B_and_E = np.array(data[rate_columns])
show(B_and_E)

### Data slicing for Numpy array

In [None]:
# Slice rows with index 2 and 3 (the 3rd and 4th rows) and the first two
# columns, i.e. both [BounceRates] and [ExitRates]
slice1 = B_and_E[2:4, :2]
slice1

### We may also be interested in the attribute [ProductRelated], which is the number of product-related pages visited by the shopper
#### We will check how many shoppers visit product-related pages, the average number of such pages they visit, and the average time they spend on them

In [None]:
pdct = data['ProductRelated']
# rows where at least one product related page was visited
visited_product = pdct > 0

# total number of product related pages visited by all shoppers
pdct_sum = np.sum(pdct)
# number of shoppers with a non-zero product related page count
pdct_nzero = np.count_nonzero(pdct)
# total time spent on product related pages by those shoppers
d_pdct_nzero = np.sum(data.loc[visited_product, 'ProductRelated_Duration'])
# average time per shopper who visited such a page
avg = d_pdct_nzero/pdct_nzero
# average number of product related pages per shopper who visited such a page
pages_per_visitor = pdct_sum/pdct_nzero

# report the results
print("{} shoopers at least enter product related page once".format(pdct_nzero))
print("For those shoppers who at least visit product related page once, the average number of product related pages visited is {:f} ".format(pages_per_visitor))
print("{:0.2f} seconds those shoppers spend in the product related page".format(d_pdct_nzero))
print("The average time spend in the product related page if a shopper visit that page is {:0.2f} seconds".format(avg))

## Pandas examples

### Use Pandas describe function to see the statistics for all numerical attributes

In [None]:
# Display the summary statistics table produced by DataFrame.describe()
data.describe()

### Use pandas built-in methods to compute some statistics of the attribute [Administrative]
#### These should match the values computed with NumPy above (note that pandas' `std` defaults to the sample standard deviation, `ddof=1`, whereas NumPy's `std` defaults to `ddof=0`)

In [None]:
# Use the pandas built-in Series methods to compute the statistics of [Administrative]
print("Max value in Administrative  : {:0.2f} ".format(data['Administrative'].max()))
print("Min value in Administrative  : {:0.2f} ".format(data['Administrative'].min()))
print("Mean value in Administrative : {:0.2f} ".format(data['Administrative'].mean()))
# Series.std() defaults to ddof=1 (sample standard deviation) whereas the
# np.std() call used in the NumPy section of this notebook uses ddof=0.
# Pass ddof=0 so both sections report the same quantity.
print("The std in Administrative    : {:0.2f}".format(data['Administrative'].std(ddof=0)))

### Add a column to show the total time a shopper spends on all kinds of pages

In [None]:
# Add a column [Total_Duration] holding the total time a shopper spends:
# Administrative_Duration + Informational_Duration + ProductRelated_Duration
duration_columns = ['Administrative_Duration', 'Informational_Duration', 'ProductRelated_Duration']
A_D, I_D, P_D = (data[column] for column in duration_columns)
data['Total_Duration'] = A_D + I_D + P_D
data.head()

### Show the total time (hours) shoppers spend in each region
#### This helps us see in which region shoppers are more likely to shop online

In [None]:
# Sum [Total_Duration] per region, then divide by 3600 to express it in hours
region_total_duration = data.groupby('Region')['Total_Duration'].sum()
region_total_duration / 3600

#### Then we check whether shoppers in region 1 really have a higher percentage that ends up buying something

In [None]:
# Boolean row masks, computed once: shoppers located in region 1 and shoppers
# whose visit ended with a purchase.
in_region1 = data['Region'] == 1
bought = data['Revenue'] == True

# Counting True values in a mask gives the number of matching rows, so the
# counts do not depend on any particular column being present.
# The number of shoppers in region 1
region1_shopper_a = int(in_region1.sum())
# The number of shoppers who end up buying something in region 1
region1_shopper_b = int((in_region1 & bought).sum())
# The number of shoppers in other regions
regiono_shopper_a = int((~in_region1).sum())
# The number of shoppers in other regions who end up buying something
regiono_shopper_b = int((~in_region1 & bought).sum())

# The messages report a percentage, so scale the buyer/shopper ratio by 100
# (the raw ratio is only a fraction between 0 and 1).
print("There is {:0.2f} percent of shoppers in region1 buy products".format(100 * region1_shopper_b/region1_shopper_a))
print("There is {:0.2f} percent of shoppers in other regions(not region1) buy products".format(100 * regiono_shopper_b/regiono_shopper_a))

#### So we can find that there may be no particular difference between the shopper in region1 and other regions

### Show how many shoppers shop on weekends and on weekdays
#### We may want to see whether more shoppers shop on weekends

In [None]:
# Split the rows on the [Weekend] flag and count the [Index] entries on each side
weekend_rows = data['Weekend']==True
weekday_rows = data['Weekend']==False
weekend = data.loc[weekend_rows, 'Index'].count()
weekday = data.loc[weekday_rows, 'Index'].count()
print("{} shoppers shop in weekend".format(weekend))
print("{} shoppers shop in weekday".format(weekday))

### Show how many shoppers finally buy something on weekend and weekday

In [None]:
# Count the shoppers whose visit produced revenue, separately for weekend and weekday
purchased = data['Revenue']==True
weekend_buy = data.loc[(data['Weekend']==True) & purchased, 'Index'].count()
weekday_buy = data.loc[(data['Weekend']==False) & purchased, 'Index'].count()
print("{} shoppers buy something in weekend".format(weekend_buy))
print("{} shoppers buy something in weekday".format(weekday_buy))

### Show how many shoppers shop in each month
#### We may also be interested in how many shoppers shop in each month, so that we can decide which month would be good for a sales promotion

In [None]:
# Show how many shoppers shop in each month: group the rows by [Month] and
# count the [Index] entries in every group
data.groupby('Month')['Index'].count()

### Show the number of shoppers for each operating system and browser
#### From this result, we can find which browsers shoppers prefer on each operating system

In [None]:
# Show the number of shoppers for each operating system / browser pair:
# group by both columns and take the size of every group
data.groupby(['OperatingSystems','Browser'])['Index'].size()

### Show the mean and median of Total_Duration for each visitor type in each region
#### We are interested in whether the different visitor types behave differently across regions

In [None]:
# Do the visitor types behave similarly across regions? Compare the mean and
# median [Total_Duration] of every (VisitorType, Region) group.
duration_by_visitor_region = data.groupby(['VisitorType','Region'])['Total_Duration']
duration_by_visitor_region.agg(['mean','median'])

## Matplotlib examples

### Plot a histogram of the Region of shoppers

In [None]:
# Plot a histogram of the Region of shoppers.
# NOTE(review): Region is treated as integer codes here (other cells compare it
# with 1) - confirm against the dataset description.
Reg = data['Region']
# One unit-wide bin centred on every region code. The default of 10 equal-width
# bins does not line up with integer codes and leaves misleading gaps.
region_bins = np.arange(Reg.min(), Reg.max() + 2) - 0.5
plt.hist(Reg, bins=region_bins, rwidth=0.9)
plt.xticks(np.arange(Reg.min(), Reg.max() + 1))
# Title and axis labels so the figure can be read on its own
plt.title('Number of shoppers per region')
plt.xlabel('Region #')
plt.ylabel('Shoppers number');

### A scatter plot of the number of shoppers in different operatingsystem-browser combination

In [None]:
# Build a new dataframe with one row per (OperatingSystems, Browser) pair and
# the number of shoppers in that pair
new_df = data.groupby(['OperatingSystems','Browser'])['Index'].size().reset_index(name ='shopper numbers')
x = np.array(new_df['OperatingSystems']) # x-axis value
y = np.array(new_df['Browser']) # y-axis value
sizes = np.array(new_df['shopper numbers']) # marker area follows the shopper count
# One colour value per plotted point. The count is taken from the data rather
# than hard-coded as 42: plt.scatter raises a ValueError when the length of `c`
# differs from the number of points.
colors = np.arange(len(new_df))
plt.rcParams['figure.figsize'] = (15.0, 10.0) #figure size
plt.scatter(x, y, c=colors, s=sizes, alpha=0.9)
plt.colorbar()  # show color scale

plt.title('shopper numbers in different OperatingSystems and browsers',fontsize = 25)
plt.xlabel('Operating system #',fontsize = 20)
plt.ylabel('Browser #',fontsize = 20)
plt.xticks(fontsize=20)
plt.yticks(fontsize=15);

### A Pie chart of the number of shoppers on weekend and weekdays

In [None]:
# Data for both pies: one wedge per label, sized by the shopper counts
labels = ('Weekend', 'Weekday')
sizes = [weekend, weekday]

# Settings used for the figure and for the styled pie
plt.rcParams['figure.figsize'] = (20.0, 9.0) #figure size
colors = ['gold', 'yellowgreen']
explode = (0,0.1)

# Left panel: the plain pie, without the optional parameters
plt.subplot(1, 2, 1)
plt.pie(sizes, labels=labels)

# Right panel: the same pie with the optional parameters, for a clearer view
plt.subplot(1, 2, 2)
styled_pie_options = dict(
    explode=explode,
    labels=labels,
    colors=colors,
    autopct='%1.1f%%',
    shadow=True,
    startangle=30,
)
patches, texts, autotexts = plt.pie(sizes, **styled_pie_options)

###  A barplot of the shoppers number by month value

In [None]:
# basic barplot
# Number of shoppers per month (one row per distinct [Month] value)
month = data.groupby('Month')['Index'].size().reset_index(name = 'Shoppers number')

# Calendar position of each month label. Looking the number up by name avoids
# depending on the alphabetical order, and on the exact number, of the grouped
# rows, which a hard-coded list of positions would require.
# NOTE(review): assumes [Month] holds these abbreviations ('June' is accepted
# next to 'Jun'); labels missing from the table map to NaN and sort last.
month_numbers = {'Jan': 1, 'Feb': 2, 'Mar': 3, 'Apr': 4, 'May': 5, 'Jun': 6, 'June': 6,
                 'Jul': 7, 'Aug': 8, 'Sep': 9, 'Oct': 10, 'Nov': 11, 'Dec': 12}
month['Month_No'] = month['Month'].map(month_numbers)
month_ordered = month.sort_values(by='Month_No')

#plot the distribution
# Take the tick labels from the ordered data so they always match the bars
month_label = tuple(month_ordered['Month'])
fig,ax = plt.subplots(figsize=(20,10))
plt.bar(month_label,month_ordered['Shoppers number'])
plt.ylabel('Shoppers number',fontsize = 20)
plt.title('Number of shoppers in month distribution',fontsize = 20)
plt.xticks(fontsize=20)
# trailing semicolon hides the tick-object repr in the cell output
plt.yticks(fontsize=15);

In [None]:
# Same monthly bar chart as before, now with a horizontal line at the monthly mean
month_label = ('Feb','Mar','May','June','Jul','Aug','Sep','Oct','Nov','Dec')
shoppers_per_month = month_ordered['Shoppers number']

fig,ax = plt.subplots(figsize=(20,10))
ax.bar(month_label, shoppers_per_month)
ax.set_ylabel('Shoppers number', fontsize=20)
ax.set_title('Number of shoppers in month distribution', fontsize=20)
plt.xticks(fontsize=20)
plt.yticks(fontsize=15)

# draw the mean of the monthly counts as a red horizontal line
mean = shoppers_per_month.mean()
ax.axhline(mean,color = 'red')

In [None]:
#add a legend to the monthly number and the average monthly number
#plot the distribution
month_label = ('Feb','Mar','May','June','Jul','Aug','Sep','Oct','Nov','Dec')
fig,ax = plt.subplots(figsize=(20,10))
# Label the bars so their legend entry is drawn with the bars' real colour
plt.bar(month_label,month_ordered['Shoppers number'],label='The monthly numbers')
plt.ylabel('Shoppers number',fontsize = 20)
plt.title('Number of shoppers in month distribution',fontsize = 20)
plt.xticks(fontsize=20)
plt.yticks(fontsize=15)

#plot the mean value line
mean = month_ordered['Shoppers number'].mean()
ax.axhline(mean,color = 'red',label='monthly average')

#plot legend
# Build the legend from the labelled artists instead of hand-made proxy
# handles: a proxy with a hard-coded colour ('blue') does not match the
# default bar colour. loc=2 places the legend in the upper-left corner.
plt.legend(loc=2);