In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules


In [None]:
# Read the transactions file from the working directory, decoding it with the
# 'unicode_escape' codec.
data = pd.read_csv(
    'data.csv',
    encoding = 'unicode_escape',
)

# Print the (rows, columns) shape of the loaded frame.
print(data.shape)

FileNotFoundError: ignored

In [None]:
# Preview the first rows of the loaded frame (DataFrame.head shows 5 rows by
# default).

data.head()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the frame's
# columns as selected by describe()'s defaults.

data.describe()

In [None]:
# Print a concise summary of the frame: column names, non-null counts, dtypes
# and memory usage.

data.info()

In [None]:
# Show the dtype pandas assigned to each column.

data.dtypes

In [None]:
# Count the non-null InvoiceNo entries per country and list the countries from
# the largest count to the smallest.
# The frame read from data.csv is bound to `data`; this notebook never defines
# a `df`, so using that name here raises NameError on a fresh kernel.
country_set = data[['Country', 'InvoiceNo']]
country_set = country_set.pivot_table(columns='Country', aggfunc='count')
country_set.sort_values('InvoiceNo', axis=1, ascending=False).T

In [None]:
# Distribution of the UnitPrice column.

plt.rcParams['figure.figsize'] = (15, 7)
plt.style.use('fivethirtyeight')
# sns.distplot is deprecated in seaborn; histplot(kde = True) is its documented
# replacement. A fixed bin count keeps the binning bounded if the column has
# outliers. The bars are per-bin counts, which is what the y label says.
sns.histplot(data['UnitPrice'], kde = True, bins = 50, color = 'lightblue')
plt.title('Distribution of Unit price', fontsize = 20)
plt.xlabel('Different Unit Price for different items')
plt.ylabel('count')
plt.show()

In [None]:
# The 20 countries with the most rows in the dataset (value_counts orders the
# counts from largest to smallest).

country_counts = data['Country'].value_counts()
country_counts.head(20)

In [None]:
# Horizontal bar chart of the 20 countries with the fewest rows in the dataset.

plt.rcParams['figure.figsize'] = (12, 10)
a = data['Country'].value_counts().tail(20)
sns.barplot(x = a.values, y = a.index, palette = 'inferno')
plt.title('Bottom 20 Countries having Online Retail Market', fontsize = 20)
# The counts are passed as x and the country names as y, so the axis labels
# follow that orientation.
plt.xlabel('Count')
plt.ylabel('Names of Countries')
plt.show()

In [None]:
# Total quantity sold per country, largest first, excluding the United Kingdom.

# Exclude the UK by label instead of slicing off the first row: the positional
# slice only removes the UK when it happens to rank first. errors = 'ignore'
# keeps the cell working if the label is absent.
a = (data['Quantity'].groupby(data['Country']).agg('sum')
     .drop('United Kingdom', errors = 'ignore')
     .sort_values(ascending = False))
print(a)

sns.barplot(x = a.values, y = a.index, palette = 'magma')
# The chart shows quantities, so the title says "Quantity" (it read "Quality").
plt.title('Quantity of Products sold in all the countries except UK')
plt.show()

In [None]:
# Bar chart of the 20 countries with the smallest total quantity sold.

# One viridis colour per bar.
color = plt.cm.viridis(np.linspace(0, 1, 20))

quantity_by_country = data['Quantity'].groupby(data['Country']).agg('sum')
bottom_quantities = quantity_by_country.sort_values(ascending = True).head(20)
bottom_quantities.plot.bar(figsize = (15, 7), color = color)

plt.title('Bottom 20 Countries according to Quantity Sold Online', fontsize = 20)
plt.xlabel('Names of the Countries')
plt.ylabel('Number of Items Sold')
plt.show()

In [None]:
from wordcloud import WordCloud
from wordcloud import STOPWORDS

stopwords = set(STOPWORDS)

# Build the text from every description. str(Series) only yields pandas'
# truncated repr (a handful of rows plus the name/dtype footer), so the cloud
# would be drawn from that summary instead of the data. Missing descriptions
# are skipped.
description_text = ' '.join(data['Description'].dropna().astype(str))

# Pass the stop-word set explicitly so the set built above is the one applied.
wordcloud = WordCloud(background_color = 'white', width = 900, height = 900,
                      stopwords = stopwords).generate(description_text)

plt.rcParams['figure.figsize'] = (12, 12)
plt.axis('off')
plt.imshow(wordcloud)
plt.title('Most Occuring word in the Description list', fontsize = 20)
plt.show()

In [None]:
# FEATURE ENGINEERING
# Add a Sales column to the dataset: unit price times quantity for each row.

data['Sales'] = data['UnitPrice'] * data['Quantity']

# Distribution of the per-row Sales values.
plt.rcParams['figure.figsize'] = (15, 5)
# sns.distplot is deprecated in seaborn; histplot(kde = True) is its documented
# replacement. A fixed bin count keeps the binning bounded if the column has
# outliers.
sns.histplot(data['Sales'], kde = True, bins = 50, color = 'crimson')
plt.title('Distribution of Sales in entire globe', fontsize = 20)
plt.xlabel('Sales in different Countries')
# The y axis is the number of rows per bin, not a sales amount.
plt.ylabel('count')
plt.show()

In [None]:
# Total sales per country, largest first, excluding the United Kingdom.

plt.rcParams['figure.figsize'] = (9, 12)
# Exclude the UK by label instead of slicing off the first row: the positional
# slice only removes the UK when it happens to rank first. errors = 'ignore'
# keeps the cell working if the label is absent.
a = (data['Sales'].groupby(data['Country']).agg('sum')
     .drop('United Kingdom', errors = 'ignore')
     .sort_values(ascending = False))
print(a)
sns.barplot(x = a.values, y = a.index, palette = 'inferno')
plt.title('Sales of all the Countries Except UK')
plt.show()

In [None]:
# Bar chart of the 20 countries with the smallest total sales.

sales_by_country = data['Sales'].groupby(data['Country']).agg('sum')
lowest_sales = sales_by_country.sort_values(ascending = True).head(20)
lowest_sales.plot.bar(figsize = (15, 7), color = 'pink')

plt.title('Bottom 20 Countries Sales wise', fontsize = 20)
plt.xlabel('Names of Countries')
plt.ylabel('Sales')
plt.show()

In [None]:
# Line plot of Sales against InvoiceDate for the whole dataset.
# NOTE(review): no visible cell converts InvoiceDate to a datetime dtype --
# confirm the column type, otherwise the x axis is not a true time axis.

plt.rcParams['figure.figsize'] = (15, 5)
data.plot.line(x = 'InvoiceDate', y = 'Sales')

plt.title("Time Series Analysis of Sales", fontsize = 20)
plt.xlabel('Date of Purchase')
plt.ylabel('Sales')
plt.show()

In [None]:
# Count the distinct customer IDs (nunique skips missing values by default).

x = data['CustomerID'].nunique()

# Report the count.
message = "There are {} number of different customers".format(x)
print(message)

In [None]:
# Count the distinct values in the Country column.

x = data['Country'].nunique()

# Report the count.
message = "There are {} number of different countries who do online retailing from UK".format(x)
print(message)

In [None]:
# Helper that draws the Sales-vs-InvoiceDate plot for a single country.

def time_series(country):
  """Plot Sales against InvoiceDate for the rows of `data` matching `country`.

  Reads the notebook-level `data` frame and returns nothing; the plot is
  drawn through DataFrame.plot.
  """
  is_country = data['Country'] == country
  data[is_country].plot(x = 'InvoiceDate', y = 'Sales')


# Draw the United Kingdom plot and give it a title.
time_series('United Kingdom')
plt.title('Time-Series plot for UK', fontsize = 20)

In [None]:
# time-series plot for Netherlands

# Call the time_series helper defined in an earlier cell instead of repeating
# its filter-and-plot code here.
time_series('Netherlands')
plt.title('Time-Series for Netherlands', fontsize = 20)
plt.xlabel('Date of Purchase')
plt.ylabel('Sales Amount')
plt.show()

In [None]:
# time-series plot for EIRE

# Call the time_series helper defined in an earlier cell instead of repeating
# its filter-and-plot code here.
time_series('EIRE')
plt.title('Time-Series for EIRE', fontsize = 20)
plt.xticks(rotation = -20)
plt.xlabel('Date of Purchase')
plt.ylabel('Sales Amount')
plt.show()

In [None]:
# time-series plot for Australia

# Call the time_series helper defined in an earlier cell instead of repeating
# its filter-and-plot code here.
time_series('Australia')
plt.title('Time-Series for Australia', fontsize = 20)
plt.xlabel('Date of Purchase')
plt.ylabel('Sales Amount')
plt.show()

In [None]:
# time-series plot for France

# Call the time_series helper defined in an earlier cell instead of repeating
# its filter-and-plot code here.
time_series('France')
plt.title('Time-Series for France', fontsize = 20)
plt.xticks(rotation = -20)
plt.xlabel('Date of Purchase')
plt.ylabel('Sales Amount')
plt.show()

In [None]:
# time-series plot for Germany

# Call the time_series helper defined in an earlier cell instead of repeating
# its filter-and-plot code here.
time_series('Germany')
plt.title('Time-Series for Germany', fontsize = 20)
plt.xlabel('Date of Purchase')
plt.ylabel('Sales Amount')
plt.show()

In [None]:
# time-series plot for Switzerland

# Call the time_series helper defined in an earlier cell instead of repeating
# its filter-and-plot code here.
time_series('Switzerland')
plt.title('Time-Series for Switzerland', fontsize = 20)
plt.xticks(rotation = -20)
plt.xlabel('Date of Purchase')
plt.ylabel('Sales Amount')
plt.show()

In [None]:
# Bar chart of the 20 most frequent stock codes in the dataset.

# One copper-map colour per bar.
color = plt.cm.copper(np.linspace(0, 1, 20))

top_stock_codes = data['StockCode'].value_counts().head(20)
top_stock_codes.plot.bar(color = color, figsize = (18, 10))

plt.title('Most Popular Stock codes', fontsize = 20)
plt.show()

In [None]:
# Display the rows ordered by Sales, smallest first (sort_values is ascending
# by default). The result is only shown; `data` itself is not reordered.

data.sort_values(by = 'Sales')

In [None]:
# Display the 20 rows with the highest UnitPrice (sorted in descending order).

data.sort_values(by = 'UnitPrice', ascending = False).head(20)

In [None]:
# Trim leading and trailing whitespace from the product descriptions.
data['Description'] = data['Description'].str.strip()

# Remove rows that have no invoice number, then store the number as text so
# the string test below can be applied to it.
data.dropna(subset = ['InvoiceNo'], inplace = True)
data['InvoiceNo'] = data['InvoiceNo'].astype(str)

# Keep only the rows whose invoice number does not contain 'C' (the ones this
# notebook treats as credit transactions).
has_credit_marker = data['InvoiceNo'].str.contains('C')
data = data[~has_credit_marker]

In [None]:
def _country_basket(frame, country):
    """Return the invoice-by-product quantity table for one country.

    Rows are InvoiceNo values, columns are Description values, and each cell
    holds the summed Quantity for that pair (0 where the invoice does not
    contain the product).
    """
    return (frame[frame['Country'] == country]
            .groupby(['InvoiceNo', 'Description'])['Quantity']
            .sum().unstack().reset_index().fillna(0)
            .set_index('InvoiceNo'))


# Transactions done in France
basket_France = _country_basket(data, "France")

# Transactions done in the United Kingdom
basket_UK = _country_basket(data, "United Kingdom")

# Transactions done in Portugal
basket_Por = _country_basket(data, "Portugal")

# Transactions done in Sweden
basket_Sweden = _country_basket(data, "Sweden")

In [None]:
def hot_encode(x):
    """Map a basket quantity to a 0/1 purchase flag.

    Returns 1 when ``x`` is at least 1 and 0 for every other input, so each
    cell always receives an explicit flag. A pair of separate ``x <= 0`` /
    ``x >= 1`` checks would fall through and return ``None`` for values
    strictly between 0 and 1 and for NaN.
    """
    return 1 if x >= 1 else 0

# Convert every basket's quantities to 0/1 flags by applying hot_encode to
# each cell. basket_encoded is rebound on every line and ends up holding the
# encoded Sweden table.
# NOTE(review): DataFrame.applymap is deprecated in recent pandas releases in
# favour of DataFrame.map -- confirm the target pandas version before switching.
basket_France = basket_encoded = basket_France.applymap(hot_encode)
basket_UK = basket_encoded = basket_UK.applymap(hot_encode)
basket_Por = basket_encoded = basket_Por.applymap(hot_encode)
basket_Sweden = basket_encoded = basket_Sweden.applymap(hot_encode)

In [None]:
# Mine the itemsets of the France basket with a minimum support of 0.05,
# keeping the product names as item labels.
frq_items = apriori(basket_France, min_support = 0.05, use_colnames = True)

# Derive association rules using lift (threshold 1) as the metric, then order
# them by confidence and lift, both descending, and show the first rows.
rules = (
    association_rules(frq_items, metric = "lift", min_threshold = 1)
    .sort_values(by = ['confidence', 'lift'], ascending = [False, False])
)
rules.head()