In [1]:
from ucimlrepo import fetch_ucirepo 
import pandas as pd
  
# Download the UCI ML Repository dataset with id 468
# (this performs a network request).
online_shoppers_purchasing_intention_dataset = fetch_ucirepo(id=468)

# Feature table and target table (as pandas dataframes), unpacked together.
X, y = (
    online_shoppers_purchasing_intention_dataset.data.features,
    online_shoppers_purchasing_intention_dataset.data.targets,
)

# Dataset-level metadata (name, URLs, abstract, target column, ...).
print(online_shoppers_purchasing_intention_dataset.metadata)

# Per-variable information.
print(online_shoppers_purchasing_intention_dataset.variables)


{'uci_id': 468, 'name': 'Online Shoppers Purchasing Intention Dataset', 'repository_url': 'https://archive.ics.uci.edu/dataset/468/online+shoppers+purchasing+intention+dataset', 'data_url': 'https://archive.ics.uci.edu/static/public/468/data.csv', 'abstract': 'Of the 12,330 sessions in the dataset,\n84.5% (10,422) were negative class samples that did not\nend with shopping, and the rest (1908) were positive class\nsamples ending with shopping.', 'area': 'Business', 'tasks': ['Classification', 'Clustering'], 'characteristics': ['Multivariate'], 'num_instances': 12330, 'num_features': 17, 'feature_types': ['Integer', 'Real'], 'demographics': [], 'target_col': ['Revenue'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 2018, 'last_updated': 'Thu Jan 11 2024', 'dataset_doi': '10.24432/C5F88Q', 'creators': ['C. Sakar', 'Yomi Kastro'], 'intro_paper': {'title': 'Real-time prediction of online shoppers’ purchasing intention using multi

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Combine features and target into one dataframe so that seaborn can refer to
# 'Revenue' and the feature columns by name from a single frame.
data = pd.concat([X, y], axis=1)

# 1. Feature Distribution
# One histogram (30 bins, KDE overlay) per key feature, laid out on a 3x2 grid.
distribution_columns = [
    'Administrative',
    'Informational',
    'ProductRelated',
    'BounceRates',
    'ExitRates',
    'PageValues',
]
fig, axes = plt.subplots(3, 2, figsize=(15, 10))
for ax, column in zip(axes.flat, distribution_columns):
    sns.histplot(data[column], bins=30, kde=True, ax=ax)
    ax.set_title(f'Distribution of {column}')
fig.tight_layout()
plt.show()

# 2. Revenue vs. Features
# One box plot per feature, split by the 'Revenue' target, on a 3x2 grid.
revenue_columns = [
    'Administrative_Duration',
    'Informational_Duration',
    'ProductRelated_Duration',
    'BounceRates',
    'ExitRates',
    'PageValues',
]
fig, axes = plt.subplots(3, 2, figsize=(15, 10))
for ax, column in zip(axes.flat, revenue_columns):
    sns.boxplot(x='Revenue', y=column, data=data, ax=ax)
    # Titles show the column name with underscores rendered as spaces.
    ax.set_title(f"{column.replace('_', ' ')} vs. Revenue")
fig.tight_layout()
plt.show()

# 3. Correlation Heatmap
# Correlate only numeric and boolean columns. Since pandas 2.0,
# DataFrame.corr() no longer drops non-numeric columns silently and raises on
# them instead. NOTE(review): this dataset is expected to contain
# string-valued columns (e.g. Month, VisitorType) -- confirm with data.dtypes.
fig, ax = plt.subplots(figsize=(15, 10))
corr = data.select_dtypes(include=['number', 'bool']).corr()
sns.heatmap(corr, annot=True, cmap='coolwarm', fmt='.2f', ax=ax)
ax.set_title('Correlation Heatmap of Features')
plt.show()


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Assuming 'data' is the DataFrame combining features and target; it is not
# created in this cell, so it must already exist in the kernel.
# NOTE(review): this cell draws the same three figures as the preceding cell;
# consider keeping only one of the two.

# (columns, plot kind) for the two 3x2 panels below.
histogram_features = (
    'Administrative', 'Informational', 'ProductRelated',
    'BounceRates', 'ExitRates', 'PageValues',
)
boxplot_features = (
    'Administrative_Duration', 'Informational_Duration',
    'ProductRelated_Duration', 'BounceRates', 'ExitRates', 'PageValues',
)

# 1. Feature Distribution
# Histogram with 30 bins and a KDE overlay for each key feature.
fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(15, 10))
for feature, ax in zip(histogram_features, axes.ravel()):
    sns.histplot(data[feature], bins=30, kde=True, ax=ax)
    ax.set_title('Distribution of ' + feature)
fig.tight_layout()
plt.show()

# 2. Revenue vs. Features
# Box plot of each feature grouped by the 'Revenue' column.
fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(15, 10))
for feature, ax in zip(boxplot_features, axes.ravel()):
    sns.boxplot(x='Revenue', y=feature, data=data, ax=ax)
    # Underscores in the column name become spaces in the title.
    ax.set_title(feature.replace('_', ' ') + ' vs. Revenue')
fig.tight_layout()
plt.show()

# 3. Correlation Heatmap
# Restrict to numeric/boolean columns before correlating: pandas >= 2.0 raises
# on non-numeric columns instead of dropping them. NOTE(review): the frame is
# expected to hold string-valued columns (e.g. Month, VisitorType) -- confirm.
fig, ax = plt.subplots(figsize=(15, 10))
corr = data.select_dtypes(include=['number', 'bool']).corr()
sns.heatmap(corr, annot=True, cmap='coolwarm', fmt='.2f', ax=ax)
ax.set_title('Correlation Heatmap of Features')
plt.show()
