# Importing Libraries

In [1]:
# import sklearn
from sklearn.linear_model import LinearRegression

# Importing for Data Manipulation
# import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Importing For Data Visualization
# import matplotlib.pyplot as plt # for data visualization
# import seaborn as sns
# from wordcloud import WordCloud
import plotly.express as px
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Reading The Dataset

In [2]:
%store -r df_cancel
df_cancel = df_cancel

In [3]:
# Checking the dataset again: summary statistics (count, mean, std, min,
# quartiles, max) of the numeric columns
df_cancel.describe()

Unnamed: 0,Price,Quantity,CustomerNo
count,8494.0,8494.0,8494.0
mean,14.66771,31.496586,14994.995526
std,25.241861,1197.896194,1706.171419
min,5.46,1.0,12346.0
25%,11.34,1.0,13520.75
50%,12.86,2.0,14901.5
75%,15.32,6.0,16393.0
max,660.62,80995.0,18282.0


In [4]:
# Print the index range, column names, non-null counts and dtypes of the frame
df_cancel.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8494 entries, 1616 to 536349
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   TransactionNo  8494 non-null   object 
 1   Date           8494 non-null   object 
 2   ProductNo      8494 non-null   object 
 3   ProductName    8494 non-null   object 
 4   Price          8494 non-null   float64
 5   Quantity       8494 non-null   int64  
 6   CustomerNo     8494 non-null   float64
 7   Country        8494 non-null   object 
dtypes: float64(2), int64(1), object(5)
memory usage: 597.2+ KB


In [5]:
# Parse the 'Date' column into pandas datetime values
date_column = 'Date'
df_cancel[date_column] = pd.to_datetime(df_cancel[date_column])

In [6]:
# Convert the identifier columns 'CustomerNo' and 'TransactionNo' to integers.
df_cancel['CustomerNo'] = df_cancel['CustomerNo'].astype(int)

# Strip every non-digit character from 'TransactionNo' before the integer cast.
# - The pattern is a raw string so that \D reaches the regex engine unchanged
#   instead of being parsed as an (invalid) string escape sequence.
# - astype(str) keeps the cell re-runnable: once the column is already numeric
#   the .str accessor would otherwise raise.
df_cancel['TransactionNo'] = (
    df_cancel['TransactionNo']
    .astype(str)
    .str.replace(r'\D', '', regex=True)
    .astype(int)
)

In [7]:
# Let's check it again: re-inspect the dtypes to confirm the conversions above
df_cancel.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8494 entries, 1616 to 536349
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   TransactionNo  8494 non-null   int32         
 1   Date           8494 non-null   datetime64[ns]
 2   ProductNo      8494 non-null   object        
 3   ProductName    8494 non-null   object        
 4   Price          8494 non-null   float64       
 5   Quantity       8494 non-null   int64         
 6   CustomerNo     8494 non-null   int32         
 7   Country        8494 non-null   object        
dtypes: datetime64[ns](1), float64(1), int32(2), int64(1), object(3)
memory usage: 530.9+ KB


# Exploratory Data Analysis (EDA)

First, we determine the time period covered by the dataset.

In [8]:
# Earliest and latest dates in the data, and the span between them
dates = df_cancel['Date']
start_date, end_date = dates.min(), dates.max()
period = end_date - start_date

# Report the three values, one per line
for label, value in (("Start:", start_date), ("End:", end_date), ("Period:", period)):
    print(label, value)

Start: 2018-12-01 00:00:00
End: 2019-12-09 00:00:00
Period: 373 days 00:00:00


In [9]:
# Number of rows recorded for each country
countries = df_cancel['Country'].value_counts()

# Trace specification for the choropleth: one location per country name,
# coloured by its count
data = {
    'type': 'choropleth',
    'locations': countries.index,
    'locationmode': 'country names',
    'z': countries,
    'text': countries.index,
    'colorbar': {'title': 'Order num.'},
}

# Figure layout: title, map frame and projection, canvas size
layout = {
    'title': 'Total number of orders per country',
    'geo': {
        'showframe': True,
        'projection': {'type': 'mercator'},
    },
    'width': 1000,
    'height': 1000,
}

# Assemble the figure from the trace and layout, then render it
choromap = go.Figure(data=[data], layout=layout)
iplot(choromap, validate=False)

In [10]:
# Add a per-row 'Revenue' column: unit price multiplied by quantity
df_cancel['Revenue'] = df_cancel['Price'].mul(df_cancel['Quantity'])

# Display one randomly chosen row as a spot check of the new column
df_cancel.sample(n=1)

Unnamed: 0,TransactionNo,Date,ProductNo,ProductName,Price,Quantity,CustomerNo,Country,Revenue
307783,557269,2019-06-19,22781,Gumball Magazine Rack,18.09,2,13097,United Kingdom,36.18


In [11]:
# Total revenue per country, largest first
top_revenue = (
    df_cancel
    .groupby('Country', as_index=False)['Revenue']
    .sum()
    .sort_values('Revenue', ascending=False)
)

# Bar chart of the five countries with the highest total revenue
fig = px.bar(
    top_revenue.head(5),
    x='Country',
    y='Revenue',
    color='Revenue',
    title='Highest Revenue Countries',
)
fig.update_xaxes(title_text='Country')
fig.update_yaxes(title_text='Revenue')
fig.show()

As the chart shows, the country with the highest revenue in this dataset is the United Kingdom, with **52346795**.

In [12]:
# Count the distinct product names in the data
product_count = df_cancel['ProductName'].nunique()
print(f'Total Product on this Database:\n{product_count}')

Total Product on this Database:
1907


In [13]:
# Total quantity per product, largest first
top_product = (
    df_cancel
    .groupby('ProductName', as_index=False)['Quantity']
    .sum()
    .sort_values('Quantity', ascending=False)
)

# Bar chart of the ten products with the highest total quantity
fig = px.bar(
    top_product.head(10),
    x='ProductName',
    y='Quantity',
    color='Quantity',
    title='Top Product Based on Quantity',
)
fig.update_xaxes(title_text='Product Name')
fig.update_yaxes(title_text='Total Quantity Sold')
fig.show()

In [14]:
# Total revenue per product, largest first
top_product = (
    df_cancel
    .groupby('ProductName', as_index=False)['Revenue']
    .sum()
    .sort_values('Revenue', ascending=False)
)

# Bar chart of the ten products with the highest total revenue
fig = px.bar(
    top_product.head(10),
    x='ProductName',
    y='Revenue',
    color='Revenue',
    title='Top Product Based on Revenue',
)
fig.update_xaxes(title_text='Product Name')
fig.update_yaxes(title_text='Total Revenue')
fig.show()

In [15]:
# Count the distinct customer numbers in the data
customer_count = df_cancel['CustomerNo'].nunique()
print(f'Total Customer on this Database:\n{customer_count}')

Total Customer on this Database:
1533


In [16]:
# Total quantity per customer, largest first
top_customer_by_quantity = (
    df_cancel
    .groupby('CustomerNo', as_index=False)['Quantity']
    .sum()
    .sort_values('Quantity', ascending=False)
)

# Horizontal bar chart of the ten customers with the highest total quantity
fig_customer_quantity = px.bar(
    top_customer_by_quantity.head(10),
    x="Quantity",
    y="CustomerNo",
    color="Quantity",
    orientation='h',
    title="Top Customers according to Product Quantity Sold",
)
fig_customer_quantity.update_xaxes(title_text="Quantity")
# Categorical axis so customer numbers are drawn as labels, not on a numeric scale
fig_customer_quantity.update_yaxes(title_text="CustomerNo", type='category')
fig_customer_quantity.show()

In [17]:
# Total revenue per customer, largest first
top_customer_by_revenue = (
    df_cancel
    .groupby('CustomerNo', as_index=False)['Revenue']
    .sum()
    .sort_values('Revenue', ascending=False)
)

# Horizontal bar chart of the ten customers with the highest total revenue
fig_customer_revenue = px.bar(
    top_customer_by_revenue.head(10),
    x="Revenue",
    y="CustomerNo",
    color="Revenue",
    orientation='h',
    title="Top Customers according to Revenue",
)
fig_customer_revenue.update_xaxes(title_text="Revenue")
# Categorical axis so customer numbers are drawn as labels, not on a numeric scale
fig_customer_revenue.update_yaxes(title_text="CustomerNo", type='category')
fig_customer_revenue.show()