# Customer Segmentation - K-Means Clustering

## Libraries

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt #conda install -c conda-forge matplotlib
import seaborn as sns #conda install -c anaconda seaborn

## Feature Descriptions
- InvoiceNo: A unique identifier for each invoice generated for a transaction.
- StockCode: A code identifying the specific product in the inventory system.
- Description: The description of the product listed on the invoice.
- Quantity: The quantity of each product purchased in the transaction.
- InvoiceDate: The date and time when the transaction or purchase was made.
- UnitPrice: The price per unit of the product listed on the invoice.
- CustomerID: A unique identifier for the customer associated with the transaction.
- Country: The country where the customer who made the transaction is located.

## Reading Data

In [10]:
# Load the online-retail Excel workbook into the working DataFrame
source_file = 'online-retail.xlsx'
df = pd.read_excel(source_file)

In [11]:
# Preview the first five rows to sanity-check the load
df.head(n=5)

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


In [13]:
# Column labels of the loaded frame
df.columns

Index(['InvoiceNo', 'StockCode', 'Description', 'Quantity', 'InvoiceDate',
       'UnitPrice', 'CustomerID', 'Country'],
      dtype='object')

In [14]:
# (number of rows, number of columns)
df.shape

(541909, 8)

In [15]:
# Summary statistics, transposed so each described column becomes a row
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Quantity,541909.0,9.55225,218.081158,-80995.0,1.0,3.0,10.0,80995.0
UnitPrice,541909.0,4.611114,96.759853,-11062.06,1.25,2.08,4.13,38970.0
CustomerID,406829.0,15287.69057,1713.600303,12346.0,13953.0,15152.0,16791.0,18287.0


In [16]:
# Number of missing entries in each column
df.isnull().sum()

InvoiceNo           0
StockCode           0
Description      1454
Quantity            0
InvoiceDate         0
UnitPrice           0
CustomerID     135080
Country             0
dtype: int64

In [17]:
# Smallest Quantity value; a negative result means the data contains negative quantities
df.loc[:, "Quantity"].min()

-80995

In [20]:
# Smallest UnitPrice value; a negative result means the data contains negative prices
df.loc[:, "UnitPrice"].min()

-11062.06

In [21]:
# Count how many times each distinct Description value occurs
df.loc[:, 'Description'].value_counts()

WHITE HANGING HEART T-LIGHT HOLDER     2369
REGENCY CAKESTAND 3 TIER               2200
JUMBO BAG RED RETROSPOT                2159
PARTY BUNTING                          1727
LUNCH BAG RED RETROSPOT                1638
                                       ... 
Missing                                   1
historic computer difference?....se       1
DUSTY PINK CHRISTMAS TREE 30CM            1
WRAP BLUE RUSSIAN FOLKART                 1
PINK BERTIE MOBILE PHONE CHARM            1
Name: Description, Length: 4223, dtype: int64