# Online shoppers data sample examples
## This notebook contains some examples using NumPy, pandas and Matplotlib


Data Source: https://archive.ics.uci.edu/ml/datasets/Online+Shoppers+Purchasing+Intention+Dataset
        
Description: <a href="../data/Online%20shoppers%20-%20Description.pdf">Online shoppers - Description.pdf</a>

In [2]:
#import the tools:numpy,pandas and matplotlib
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import matplotlib.lines as mlines

In [3]:
# Read the data into a pandas DataFrame and show the first five rows.
# Forward slashes keep the relative path portable (Windows accepts them too) and
# avoid the invalid "\d" / "\o" escape sequences of the backslash spelling.
data = pd.read_csv("../data/online_shoppers.csv")
data.head()

Unnamed: 0,Index,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
0,0,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,1,1,1,1,Returning_Visitor,False,False
1,1,0,0.0,0,0.0,2,64.0,0.0,0.1,0.0,0.0,Feb,2,2,1,2,Returning_Visitor,False,False
2,2,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,4,1,9,3,Returning_Visitor,False,False
3,3,0,0.0,0,0.0,2,2.666667,0.05,0.14,0.0,0.0,Feb,3,2,2,4,Returning_Visitor,False,False
4,4,0,0.0,0,0.0,10,627.5,0.02,0.05,0.0,0.0,Feb,3,3,1,4,Returning_Visitor,True,False


## Numpy examples

### show some statistics

In [4]:
# Summary statistics of the [Administrative] attribute, computed with NumPy reductions
ad = data['Administrative']
for template, reducer in (
    ("Max value in Administrative is {:0.2f}  ", np.max),
    ("Min value in Administrative is {:0.2f} ", np.min),
    ("Mean value in Administrative is {:0.2f} ", np.mean),
    ("The std in Administrative is {:0.2f} ", np.std),
):
    # Apply each reduction to the column and print it with two decimals
    print(template.format(reducer(ad)))

Max value in Administrative is 27.00  
Min value in Administrative is 0.00 
Mean value in Administrative is 2.32 
The std in Administrative is 3.32 


In [None]:
### 

In [None]:
# Number of shoppers with a non-zero Administrative page count (visited at least once)
visited_once = np.count_nonzero(ad)
print("{} shoppers at least enter Administrative page once ".format(visited_once))
# Number of shoppers whose Administrative page count is greater than 2
visited_over_two = np.count_nonzero(ad > 2)
print("{} shoppers at least enter Administrative page more than twice ".format(visited_over_two))

In [None]:
# Average ProductRelated_Duration among the shoppers who opened a product
# related page at least once.

pdct = data['ProductRelated']
# Boolean mask selecting the shoppers with at least one product related page view
visited = pdct > 0

# Number of shoppers with a non-zero product related page count
pdct_nzero = np.count_nonzero(pdct)
# Total time those shoppers spent on product related pages
d_pdct_nzero = np.sum(data.loc[visited, 'ProductRelated_Duration'])
# Average time per visiting shopper
avg = d_pdct_nzero / pdct_nzero

# Report the three figures
print("{} shoopers at least enter product related page once".format(pdct_nzero))
print("{:0.2f} seconds those shoppers spend in the product related page".format(d_pdct_nzero))
print("The average time spend in the product related page if a shopper visit that page is %.2f" % avg)

## Pandas examples

In [None]:
# Use the pandas describe() method to display summary statistics
# for all numerical attributes of the DataFrame
data.describe()

In [None]:
# Add a [Total_Duration] column holding the total time a shopper spent, i.e.
# Administrative_Duration + Informational_Duration + ProductRelated_Duration
A_D, I_D, P_D = (
    data[column]
    for column in ('Administrative_Duration',
                   'Informational_Duration',
                   'ProductRelated_Duration')
)
data['Total_Duration'] = A_D + I_D + P_D
# Preview the frame with the new column appended
data.head()

In [None]:
# The same [Administrative] statistics, this time via the Series' built-in methods
administrative = data['Administrative']
print("Max value in Administrative:  ", administrative.max())
print("Min value in Administrative:  ", administrative.min())
print("Mean value in Administrative:  ", administrative.mean())
print("The std in Administrative:  ", administrative.std())

In [None]:
# Count the shoppers whose session fell on a weekend and on a weekday
is_weekend = data['Weekend'] == True
is_weekday = data['Weekend'] == False
# .count() tallies the non-null [Index] values of the selected rows
weekend = data.loc[is_weekend, 'Index'].count()
weekday = data.loc[is_weekday, 'Index'].count()
print("{} shoppers shop in weekend".format(weekend))
print("{} shoppers shop in weekday".format(weekday))

In [None]:
# Count the shoppers who ended up buying (Revenue is True), split by weekend / weekday
bought = data['Revenue'] == True
weekend_buy = data.loc[(data['Weekend'] == True) & bought, 'Index'].count()
weekday_buy = data.loc[(data['Weekend'] == False) & bought, 'Index'].count()
print("{} shoppers buy something in weekend".format(weekend_buy))
print("{} shoppers buy something in weekday".format(weekday_buy))

In [None]:
# Show how many shoppers shopped in each month:
# group the rows by [Month] and count the [Index] values in every group
data.groupby('Month')['Index'].count()

In [None]:
# Total time shoppers spent per region, shown in hours
# (the summed Total_Duration is divided by 3600 — presumably seconds per hour)
region_total = data.groupby('Region')['Total_Duration'].sum()
region_total / 3600

In [None]:
# Show the number of shoppers for each (OperatingSystems, Browser) combination,
# i.e. the size of every group
data.groupby(['OperatingSystems','Browser'])['Index'].size()

In [None]:
# Show the mean and median of Total_Duration for each VisitorType
duration_by_visitor = data.groupby('VisitorType')['Total_Duration']
duration_by_visitor.agg(['mean', 'median'])

## Matplotlib examples

In [None]:
# Plot a histogram of the shoppers' Region codes.
# Region is treated as a discrete integer code (as in the data preview), so one
# unit-wide bin is centred on every code; the default 10 equal-width bins do not
# line up with integer codes and can leave misleading empty gaps between bars.
Reg = data['Region']
region_codes = np.arange(Reg.min(), Reg.max() + 1)
region_bins = np.arange(Reg.min(), Reg.max() + 2) - 0.5  # edges at code +/- 0.5
plt.hist(Reg, bins=region_bins, rwidth=0.9)
plt.xticks(region_codes)
# A figure should stand alone: give it a title and axis labels
plt.title('Number of shoppers per region')
plt.xlabel('Region #')
plt.ylabel('Shoppers number');

In [None]:
# Scatter plot of the shopper numbers for every OperatingSystems/Browser combination.
# Marker position = (operating system, browser), marker area = number of shoppers.

# Build a new DataFrame with one row per (OperatingSystems, Browser) pair
new_df = data.groupby(['OperatingSystems','Browser'])['Index'].size().reset_index(name ='shopper numbers')
x = np.array(new_df['OperatingSystems'])
y = np.array(new_df['Browser'])
sizes = np.array(new_df['shopper numbers'])
# One colour value per plotted point. Derived from the number of rows instead of
# a hard-coded 42 so that scatter() does not raise when the number of
# combinations in the data changes.
colors = np.arange(len(new_df))

plt.rcParams['figure.figsize'] = (15.0, 10.0) #figure size
plt.scatter(x, y, c=colors, s=sizes, alpha=0.9)
plt.colorbar()  # show color scale

plt.title('shopper numbers in different OperatingSystems and browsers',fontsize = 25)
plt.xlabel('Operating system #',fontsize = 20)
plt.ylabel('Browser #',fontsize = 20)
plt.xticks(fontsize=20)
plt.yticks(fontsize=15);

In [None]:
# Pie charts of the shopper numbers on weekends versus weekdays
labels = ('Weekend', 'Weekday')   # wedge labels
sizes = [weekend, weekday]        # number of shoppers behind each wedge

# Settings used for the figure and for the styled (right-hand) chart
colors = ['gold', 'yellowgreen']
explode = (0, 0.1)
plt.rcParams['figure.figsize'] = (20.0, 9.0)  # figure size

# Left panel: pie chart drawn with the default settings
plt.subplot(1, 2, 1)
plt.pie(sizes, labels=labels)

# Right panel: the same data with optional parameters for a nicer view
plt.subplot(1, 2, 2)
pie_options = dict(
    explode=explode,
    labels=labels,
    colors=colors,
    autopct='%1.1f%%',
    shadow=True,
    startangle=30,
)
patches, texts, autotexts = plt.pie(sizes, **pie_options)

In [None]:
# Show the bar plot of the number of shoppers per month, in calendar order.

# Calendar position of each month label. Both 'Jun' and 'June' are accepted.
# Mapping the labels replaces a hard-coded number list that silently depended
# on the alphabetical order of the groupby result and on the exact set of
# months present in the data.
month_number = {
    'Jan': 1, 'Feb': 2, 'Mar': 3, 'Apr': 4, 'May': 5, 'Jun': 6, 'June': 6,
    'Jul': 7, 'Aug': 8, 'Sep': 9, 'Oct': 10, 'Nov': 11, 'Dec': 12,
}

# One row per month with its number of shoppers
month = data.groupby('Month')['Index'].size().reset_index(name = 'Shoppers number')
month['Month_No'] = month['Month'].map(month_number)
month_ordered = month.sort_values(by='Month_No')

# Plot the distribution; the tick labels come from the data itself so they
# always match the bars
month_label = tuple(month_ordered['Month'])
fig,ax = plt.subplots(figsize=(20,10))
ax.bar(month_label,month_ordered['Shoppers number'])
ax.set_ylabel('Shoppers number',fontsize = 20)
ax.set_title('Number of shoppers in month distribution',fontsize = 20)
ax.tick_params(axis='x', labelsize=20)
ax.tick_params(axis='y', labelsize=15);

In [None]:
# Bar plot of the monthly shopper numbers plus a horizontal line at the monthly mean.

# Tick labels are read from month_ordered (already in calendar order) instead of
# a hard-coded tuple, so labels and bars cannot drift apart
month_label = tuple(month_ordered['Month'])
fig,ax = plt.subplots(figsize=(20,10))
ax.bar(month_label,month_ordered['Shoppers number'])
ax.set_ylabel('Shoppers number',fontsize = 20)
ax.set_title('Number of shoppers in month distribution',fontsize = 20)
ax.tick_params(axis='x', labelsize=20)
ax.tick_params(axis='y', labelsize=15)

# Plot the mean value line
mean = month_ordered['Shoppers number'].mean()
ax.axhline(mean,color = 'red');

In [None]:
# Bar plot of the monthly shopper numbers, the monthly mean line, and a legend
# identifying both.

# Tick labels are read from month_ordered (already in calendar order) instead of
# a hard-coded tuple, so labels and bars cannot drift apart
month_label = tuple(month_ordered['Month'])
fig,ax = plt.subplots(figsize=(20,10))
monthly_num = ax.bar(month_label,month_ordered['Shoppers number'],
                     label='The monthly numbers')
ax.set_ylabel('Shoppers number',fontsize = 20)
ax.set_title('Number of shoppers in month distribution',fontsize = 20)
ax.tick_params(axis='x', labelsize=20)
ax.tick_params(axis='y', labelsize=15)

# Plot the mean value line
mean = month_ordered['Shoppers number'].mean()
monthly_avg = ax.axhline(mean,color = 'red', label='monthly average')

# Plot the legend from the drawn artists themselves, so the legend swatches
# always show the real bar and line colours (a hand-made 'blue' proxy patch
# does not match Matplotlib's default bar colour)
ax.legend(handles=[monthly_avg,monthly_num],loc='upper left');