# Libraries

In [1]:
import numpy as np # Numerical computation
import pandas as pd # Dataset manipulation

# Display option: None removes the cap on how many DataFrame columns are shown,
# so wide frames are rendered in full instead of being truncated.
pd.set_option('display.max_columns', None)

# Visualization tools
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Report the version of every core library so the environment that produced
# the outputs below can be recreated.
for label, lib in (
    ('Numpy ver:', np),
    ('Pandas ver:', pd),
    ('Matplotlib ver:', matplotlib),
    ('Seaborn ver:', sns),
):
    print(label, lib.__version__)

Numpy ver: 1.21.5
Pandas ver: 1.4.2
Matplotlib ver: 3.5.1
Seaborn ver: 0.11.2


# Import Dataset

In [3]:
# Load the dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('../dataset/online_shoppers_intention.csv')

# Overview: preview five randomly chosen rows. The fixed random_state makes
# the preview identical on every "Restart Kernel -> Run All", so the prose
# that follows always refers to the same displayed rows.
df.sample(5, random_state=42)

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
9520,3,57.425,3,55.0,11,281.425,0.004167,0.053125,0.0,0.0,Nov,2,2,4,2,Returning_Visitor,False,False
10104,0,0.0,1,14.0,19,1666.75,0.0,0.021053,0.0,0.0,Nov,8,13,9,20,Other,False,False
1829,3,23.0,0,0.0,30,465.416667,0.0,0.009375,50.664983,0.0,Mar,4,1,4,3,Returning_Visitor,False,False
1677,5,274.333333,0,0.0,13,717.6,0.0,0.001961,0.0,0.0,Mar,1,1,3,10,New_Visitor,False,False
8511,2,96.75,0,0.0,103,3216.920172,0.01219,0.030754,0.0,0.0,Dec,2,2,6,2,Returning_Visitor,True,False


# Exploratory Data Analysis

## Descriptive Statistics

In [4]:
# Dataset Info: print the number of entries plus, for every column,
# its non-null count and dtype.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12330 entries, 0 to 12329
Data columns (total 18 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Administrative           12330 non-null  int64  
 1   Administrative_Duration  12330 non-null  float64
 2   Informational            12330 non-null  int64  
 3   Informational_Duration   12330 non-null  float64
 4   ProductRelated           12330 non-null  int64  
 5   ProductRelated_Duration  12330 non-null  float64
 6   BounceRates              12330 non-null  float64
 7   ExitRates                12330 non-null  float64
 8   PageValues               12330 non-null  float64
 9   SpecialDay               12330 non-null  float64
 10  Month                    12330 non-null  object 
 11  OperatingSystems         12330 non-null  int64  
 12  Browser                  12330 non-null  int64  
 13  Region                   12330 non-null  int64  
 14  TrafficType           

Dataset terdiri dari 12330 observasi dan 18 field. Setiap nama kolom sudah merepresentasikan isinya. Setiap field sudah memiliki tipe data yang sesuai dengan konten yang dimuatnya.

In [5]:
# Missing Values: count of null entries in each column
df.isna().sum()

Administrative             0
Administrative_Duration    0
Informational              0
Informational_Duration     0
ProductRelated             0
ProductRelated_Duration    0
BounceRates                0
ExitRates                  0
PageValues                 0
SpecialDay                 0
Month                      0
OperatingSystems           0
Browser                    0
Region                     0
TrafficType                0
VisitorType                0
Weekend                    0
Revenue                    0
dtype: int64

Tidak ada nilai kosong (missing value) di setiap field.

In [6]:
# Numeric Fields Descriptive Statistics

# Integer-typed columns that are excluded from the numeric summary
# (presumably category codes rather than measurements, so mean/std would
# not be meaningful for them).
encoded_fields = ['OperatingSystems', 'Browser', 'Region', 'TrafficType']

# select_dtypes(include='number') keeps only int/float columns. The previous
# `dtypes != 'O'` test also let the True/False columns (Weekend, Revenue)
# into num_fields, even though describe() silently dropped them afterwards.
num_fields = [field for field in df.select_dtypes(include='number').columns
              if field not in encoded_fields]

df[num_fields].describe()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay
count,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0
mean,2.315166,80.818611,0.503569,34.472398,31.731468,1194.74622,0.022191,0.043073,5.889258,0.061427
std,3.321784,176.779107,1.270156,140.749294,44.475503,1913.669288,0.048488,0.048597,18.568437,0.198917
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,7.0,184.1375,0.0,0.014286,0.0,0.0
50%,1.0,7.5,0.0,0.0,18.0,598.936905,0.003112,0.025156,0.0,0.0
75%,4.0,93.25625,0.0,0.0,38.0,1464.157214,0.016813,0.05,0.0,0.0
max,27.0,3398.75,24.0,2549.375,705.0,63973.52223,0.2,0.2,361.763742,1.0


In [7]:
# Encoded Fields Descriptive Statistics


In [8]:
# Categorical Fields Descriptive Statistics


Summary Analisis Statistik Deskriptif

Fitur numerik

1. Semua fitur numerik *(Administrative, Administrative_Duration, Informational, Informational_Duration, ProductRelated, ProductRelated_Duration, BounceRates, ExitRates, PageValues, dan SpecialDay)* memiliki nilai **mean dan median yang berbeda**. Kemungkinan fitur-fitur tersebut memiliki **outlier** atau **tidak berdistribusi normal (skewed)**.
2. Semua fitur yang disebutkan pada poin 1 juga memiliki nilai **maksimum** yang cukup **jauh dari nilai Q3** yang mengindikasikan bahwa fitur-fitur tersebut memiliki **outlier**.
3. Beberapa fitur (Informational, Informational_Duration, PageValues, SpecialDay) terlihat **didominasi oleh nilai 0** dan memiliki nilai **maksimum** yang cukup **besar**. Indikasi kuat bahwa fitur-fitur tersebut memiliki **outlier**.

Fitur Kategorik

## Univariate Analysis

## Multivariate Analysis

## Business Insight