# Importing Libraries

In [1]:
# import sklearn
from sklearn.linear_model import LinearRegression

# Importing for Data Manipulation
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Importing For Data Visualization
import plotly.express as px
import plotly.graph_objs as go
from plotly.offline import iplot

# Load Dataset

In [2]:
%store -r df_surplus
df_surplus = df_surplus

In [3]:
# Sanity check: summary statistics of the numeric columns of the restored dataset
df_surplus.describe()

Unnamed: 0,Price,Quantity,CustomerNo
count,522601.0,522601.0,522601.0
mean,12.63716,10.667492,15226.311767
std,7.965974,157.54242,1716.555479
min,5.13,1.0,12004.0
25%,10.99,1.0,13804.0
50%,11.94,4.0,15152.0
75%,14.09,12.0,16729.0
max,660.62,80995.0,18287.0


In [4]:
# Inspect column dtypes and non-null counts before any type conversion
df_surplus.info()

<class 'pandas.core.frame.DataFrame'>
Index: 522601 entries, 0 to 536324
Data columns (total 8 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   TransactionNo  522601 non-null  object 
 1   Date           522601 non-null  object 
 2   ProductNo      522601 non-null  object 
 3   ProductName    522601 non-null  object 
 4   Price          522601 non-null  float64
 5   Quantity       522601 non-null  int64  
 6   CustomerNo     522601 non-null  float64
 7   Country        522601 non-null  object 
dtypes: float64(2), int64(1), object(5)
memory usage: 35.9+ MB


In [5]:
# Convert the 'Date' column to pandas datetime
# NOTE(review): no explicit format= is passed, so pandas infers the date format - confirm it is parsed as intended
df_surplus['Date'] = pd.to_datetime(df_surplus['Date'])

In [6]:
# Cast 'CustomerNo' and 'TransactionNo' to integers.
# astype(int) follows the platform's default integer width, so the resulting
# dtype can differ between machines; the width is pinned explicitly to 'int32'
# so the result is the same everywhere.
# NOTE(review): assumes every value is non-null, purely numeric and fits in 32 bits - confirm for new data.
df_surplus['CustomerNo'] = df_surplus['CustomerNo'].astype('int32')
df_surplus['TransactionNo'] = df_surplus['TransactionNo'].astype('int32')

In [7]:
# Re-check the column dtypes after the conversions above
df_surplus.info()

<class 'pandas.core.frame.DataFrame'>
Index: 522601 entries, 0 to 536324
Data columns (total 8 columns):
 #   Column         Non-Null Count   Dtype         
---  ------         --------------   -----         
 0   TransactionNo  522601 non-null  int32         
 1   Date           522601 non-null  datetime64[ns]
 2   ProductNo      522601 non-null  object        
 3   ProductName    522601 non-null  object        
 4   Price          522601 non-null  float64       
 5   Quantity       522601 non-null  int64         
 6   CustomerNo     522601 non-null  int32         
 7   Country        522601 non-null  object        
dtypes: datetime64[ns](1), float64(1), int32(2), int64(1), object(3)
memory usage: 31.9+ MB


# Exploratory Data Analysis (EDA)

First, we need to determine the time period covered by the dataset.

In [8]:
# Earliest and latest transaction dates, and the span between them
start_date = df_surplus['Date'].min()
end_date = df_surplus['Date'].max()
period = end_date - start_date

# Report the observation window, one value per line
print(f"Start: {start_date}\nEnd: {end_date}\nPeriod: {period}")

Start: 2018-12-01 00:00:00
End: 2019-12-09 00:00:00
Period: 373 days 00:00:00


In [9]:
# Count rows per country
# NOTE(review): this counts dataset rows, not distinct TransactionNo values -
# confirm that is the intended meaning of "orders" in the chart title.
countries = df_surplus['Country'].value_counts()

# Choropleth trace: locations are matched by country name and coloured by the count
data = {
    'type': 'choropleth',
    'locations': countries.index,
    'locationmode': 'country names',
    'z': countries,
    'text': countries.index,
    'colorbar': {'title': 'Order num.'},
}

# Figure layout: framed Mercator map, 1000 x 1000
layout = {
    'title': 'Total number of orders per country',
    'geo': {'showframe': True, 'projection': {'type': 'mercator'}},
    'width': 1000,
    'height': 1000,
}

# Build the figure and render it inline
choromap = go.Figure(data=[data], layout=layout)
iplot(choromap, validate=False)

In [10]:
# Create the Revenue column: Price multiplied by Quantity for each row
df_surplus['Revenue'] = df_surplus['Price'] * df_surplus['Quantity']

# Show one example row; a fixed seed keeps the displayed row the same on every re-run
df_surplus.sample(random_state=42)

Unnamed: 0,TransactionNo,Date,ProductNo,ProductName,Price,Quantity,CustomerNo,Country,Revenue
185551,567817,2019-09-22,23199,Jumbo Bag Apples,6.19,2,15005,United Kingdom,12.38


In [11]:
# Split the date into day, month and year, plus a 'YYYY-MM' label for monthly grouping.
# The vectorised .dt accessor replaces repeated DatetimeIndex construction and a
# per-row Python lambda; it requires 'Date' to already be a datetime column
# (converted in an earlier cell).
df_surplus['Day'] = df_surplus['Date'].dt.day
df_surplus['Month'] = df_surplus['Date'].dt.month
df_surplus['Year'] = df_surplus['Date'].dt.year
df_surplus['Month-Year'] = df_surplus['Date'].dt.strftime('%Y-%m')

In [12]:
# Summarise the 'Month-Year' labels: number of distinct months and the most frequent one
df_surplus['Month-Year'].describe()

count      522601
unique         13
top       2019-11
freq        81819
Name: Month-Year, dtype: object

In [13]:
# Monthly revenue: total Revenue per 'Month-Year', in chronological order
mountly_revenue = (
    df_surplus
    .groupby('Month-Year')['Revenue']
    .sum()
    .reset_index()
    .sort_values(by='Month-Year', ascending=True)
)

# Line chart of the monthly totals, with each value printed next to its point
fig1 = px.line(mountly_revenue, x="Month-Year", y="Revenue", text="Revenue")
fig1.update_traces(textposition="bottom right")
fig1.show()

In [14]:
# Find the top 5 months by revenue.
# Despite its name, top5_mountly keeps every month, ranked from highest to
# lowest revenue; only the first five rows are displayed here.
top5_mountly = (
    df_surplus
    .groupby('Month-Year')['Revenue']
    .sum()
    .reset_index()
    .sort_values(by='Revenue', ascending=False, ignore_index=True)
)
top5_mountly.head(5)

Unnamed: 0,Month-Year,Revenue
0,2019-11,7828489.53
1,2019-10,7212279.85
2,2019-09,6613772.79
3,2019-08,4749801.23
4,2019-07,4571494.88


In [15]:
# Bar chart of the five highest-revenue months
fig2 = px.bar(
    top5_mountly.head(5),
    x="Month-Year",
    y="Revenue",
    text="Revenue",
)
# Force a categorical x axis so the 'YYYY-MM' labels are shown as discrete bars
fig2.update_layout(xaxis_type="category")
fig2.show()

The highest monthly revenue was in November 2019 (2019-11).

In [16]:
# Find the bottom 5 months by revenue
# (top5_mountly is sorted by Revenue in descending order, so the tail holds the lowest months)

top5_mountly.tail(5)

Unnamed: 0,Month-Year,Revenue
8,2018-12,4397648.39
9,2019-03,4384669.82
10,2019-04,3579310.06
11,2019-02,3327342.64
12,2019-12,2512069.52


In [17]:
# Bar chart of the five lowest-revenue months
fig3 = px.bar(
    top5_mountly.tail(5),
    x="Month-Year",
    y="Revenue",
    text="Revenue",
)
# Force a categorical x axis so the 'YYYY-MM' labels are shown as discrete bars
fig3.update_layout(xaxis_type="category")
fig3.show()

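The lowest monthly revenue was in December 2019 (2019-12). Note that the data ends on 2019-12-09, so this month is only partially covered and is not directly comparable with full months.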

In [18]:
# Monthly revenue per country

# Total Revenue for every (Country, Month-Year) pair
test1 = df_surplus.groupby(['Country', 'Month-Year'])['Revenue'].sum().reset_index()

# One line per country. The previous update_traces(textposition=...) call was
# dropped: no text is plotted on these traces, so it had no visible effect.
fig3 = px.line(test1, x="Month-Year", y="Revenue", color="Country", title="Monthly Revenue by Country")
fig3.show()

In [19]:
# Stacked bar chart of monthly revenue, one colour per country.
# The title now describes the chart; "Wide-Form Input" was a leftover placeholder.
fig3 = px.bar(test1, x="Month-Year", y="Revenue", title="Monthly Revenue by Country", color="Country")
fig3.show()

In [20]:
# Total revenue per country, highest first
top_revenue = (
    df_surplus
    .groupby('Country')['Revenue']
    .sum()
    .reset_index()
    .sort_values(by='Revenue', ascending=False)
)

# Bar chart of the five countries with the highest revenue
fig = px.bar(
    top_revenue.head(5),
    x='Country',
    y='Revenue',
    color='Revenue',
    title='Highest Revenue Countries',
)
fig.update_layout(xaxis_title='Country', yaxis_title='Revenue')
fig.show()

As you can see, the country with the highest revenue in this dataset is the United Kingdom, with **52346795**.

In [21]:
# Report how many distinct product names the dataset contains
print(f"Total Product on this Database:\n{df_surplus['ProductName'].nunique()}")

Total Product on this Database:
3753


In [22]:
# Total quantity sold per product, highest first
top_product = (
    df_surplus
    .groupby('ProductName')['Quantity']
    .sum()
    .reset_index()
    .sort_values(by='Quantity', ascending=False)
)

# Bar chart of the ten products with the largest total quantity
fig = px.bar(
    top_product.head(10),
    x='ProductName',
    y='Quantity',
    color='Quantity',
    title='Top Product Based on Quantity',
)
fig.update_layout(xaxis_title='Product Name', yaxis_title='Total Quantity Sold')
fig.show()

In [23]:
# Total revenue per product, highest first
top_product = (
    df_surplus
    .groupby('ProductName')['Revenue']
    .sum()
    .reset_index()
    .sort_values(by='Revenue', ascending=False)
)

# Bar chart of the ten products with the largest total revenue
fig = px.bar(
    top_product.head(10),
    x='ProductName',
    y='Revenue',
    color='Revenue',
    title='Top Product Based on Revenue',
)
fig.update_layout(xaxis_title='Product Name', yaxis_title='Total Revenue')
fig.show()

In [24]:
# Report how many distinct customer numbers the dataset contains
print(f"Total Customer on this Database:\n{df_surplus['CustomerNo'].nunique()}")

Total Customer on this Database:
4718


In [25]:
# Total quantity purchased per customer, highest first
top_customer_by_quantity = (
    df_surplus
    .groupby('CustomerNo')['Quantity']
    .sum()
    .reset_index()
    .sort_values(by='Quantity', ascending=False)
)

# Horizontal bar chart of the ten customers with the largest total quantity
fig_customer_quantity = px.bar(
    top_customer_by_quantity.head(10),
    x="Quantity",
    y="CustomerNo",
    color="Quantity",
    orientation='h',
    title="Top Customers according to Product Quantity Sold",
)
fig_customer_quantity.update_xaxes(title="Quantity")
# Categorical y axis: customer numbers are identifiers, not a numeric scale
fig_customer_quantity.update_yaxes(title="CustomerNo", type='category')
fig_customer_quantity.show()

In [26]:
# Total revenue per customer, highest first
top_customer_by_revenue = (
    df_surplus
    .groupby('CustomerNo')['Revenue']
    .sum()
    .reset_index()
    .sort_values(by='Revenue', ascending=False)
)

# Horizontal bar chart of the ten customers with the largest total revenue
fig_customer_revenue = px.bar(
    top_customer_by_revenue.head(10),
    x="Revenue",
    y="CustomerNo",
    color="Revenue",
    orientation='h',
    title="Top Customers according to Revenue",
)
fig_customer_revenue.update_xaxes(title="Revenue")
# Categorical y axis: customer numbers are identifiers, not a numeric scale
fig_customer_revenue.update_yaxes(title="CustomerNo", type='category')
fig_customer_revenue.show()