## Dimensionality Reduction

In [1]:
# Make print() a function on Python 2 as well, so the notebook's print calls
# use the same syntax as Python 3
from __future__ import print_function
# Quick check that the function form of print works
print('Print is ready to serve.')

# NumPy for numerical computing

import numpy as np

# Pandas for DataFrames
import pandas as pd


# Matplotlib for visualization
import matplotlib.pyplot as plt

# display plots in the notebook
%matplotlib inline

# Seaborn for easier visualization
import seaborn as sns

Print is ready to serve.


**We have to build the cluster model based on the specific items each customer purchased**.

So we load the data from *cleaned_transactions.csv* rather than *analytical_base_table.csv*.

In [2]:
# Load the transaction-level data. StockCode mixes purely numeric codes
# (e.g. 22728) with alphanumeric ones (e.g. 15044A, POST), so pin it to str
# instead of relying on read_csv's type inference: the codes are turned into
# column labels further down and must all share one type.
df = pd.read_csv('./cleaned_transactions.csv', dtype={'StockCode': str})

In [3]:
# (number of rows, number of columns) of the loaded transactions frame
df.shape

(33698, 8)

In [4]:
# Column labels available in the transactions frame
df.columns

Index([u'InvoiceNo', u'StockCode', u'Description', u'Quantity', u'InvoiceDate',
       u'UnitPrice', u'CustomerID', u'Country'],
      dtype='object')

In [7]:
# "Specific items" are identified by their StockCode.
# Count the distinct StockCode values (a missing value, if any, counts once,
# exactly as it would in len(unique())).
print(df['StockCode'].nunique(dropna=False))

2574


In [9]:
# First five StockCode / Description pairs
df.loc[:, ['StockCode', 'Description']].head(5)

Unnamed: 0,StockCode,Description
0,22728,ALARM CLOCK BAKELIKE PINK
1,22727,ALARM CLOCK BAKELIKE RED
2,22726,ALARM CLOCK BAKELIKE GREEN
3,21724,PANDA AND BUNNIES STICKER SHEET
4,21883,STARS GIFT TAPE


## High Dimensionality

In [10]:
# One indicator column per distinct StockCode, one row per transaction line
item_dummies = pd.get_dummies(df['StockCode'])

In [11]:
# Attach the customer key (aligned on the shared row index) so the
# indicator rows can be grouped per customer
item_dummies['CustomerID'] = df['CustomerID']

In [12]:
# Peek at the first five rows of the indicator matrix
item_dummies.iloc[:5]

Unnamed: 0,10002,10120,10125,10133,10135,11001,15034,15036,15039,15044A,...,90201A,90201B,90201C,90201D,90202D,90204,C2,M,POST,CustomerID
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12583
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12583
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12583
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12583
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12583


In [13]:
# (rows, columns): one column per StockCode plus the CustomerID column
item_dummies.shape

(33698, 2575)

In [16]:
# Roll the transaction-level indicators up to one row per customer:
# each cell becomes the number of transaction lines for that StockCode
item_data = (
    item_dummies
    .groupby(by='CustomerID')
    .sum()
)

In [17]:
# First five customers of the aggregated item matrix
item_data.iloc[:5]

Unnamed: 0_level_0,10002,10120,10125,10133,10135,11001,15034,15036,15039,15044A,...,90192,90201A,90201B,90201C,90201D,90202D,90204,C2,M,POST
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12347,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12348,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
12349,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
12350,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
12352,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,5.0


In [19]:
# Total number of times each item was purchased, summed over all customers
item_data.sum(axis=0).astype(int)

10002        12
10120         1
10125        13
10133         5
10135         4
11001         8
15034         5
15036        19
15039         3
15044A        6
15044B        3
15044C        2
15044D        4
15056BL      50
15056N       35
15056P       24
15058A        9
15058B        8
15058C        4
15060B       12
16008        11
16011         3
16012         4
16014        10
16016        16
16045         8
16048         8
16054         2
16156L        6
16156S       12
           ... 
90098         1
90099         2
90108         1
90114         1
90120B        1
90145         2
90160A        1
90160B        1
90160C        1
90160D        1
90161B        1
90161C        1
90161D        1
90162A        1
90162B        1
90164A        1
90170         1
90173         1
90184B        1
90184C        1
90192         1
90201A        1
90201B        3
90201C        2
90201D        1
90202D        1
90204         1
C2            6
M            34
POST       1055
dtype: int64

In [21]:
# Persist the customer-level item counts.
# index=True (the default) writes the CustomerID index as the first column.
item_data.to_csv('./item_data.csv', index=True)

## Threshold

**Instead of all items, in order to reduce the dimensionality, let's take only the top 20 items**.

In [22]:
# Show the 20 most purchased items, ordered from least to most popular
item_data.sum().sort_values(ascending=True).tail(20)

22961      114.0
22630      115.0
22139      117.0
21080      122.0
85099B     123.0
20726      123.0
20719      128.0
20750      132.0
23084      140.0
20725      141.0
21212      143.0
22551      158.0
22629      160.0
22328      166.0
21731      169.0
22556      179.0
22554      197.0
22423      222.0
22326      271.0
POST      1055.0
dtype: float64

In [23]:
# StockCodes of the 20 most purchased items (ascending by purchase count)
top_20_items = (
    item_data
    .sum()
    .sort_values()
    .tail(20)
    .index
)

In [24]:
# Inspect the selected StockCodes
top_20_items

Index([u'22961', u'22630', u'22139', u'21080', u'85099B', u'20726', u'20719',
       u'20750', u'23084', u'20725', u'21212', u'22551', u'22629', u'22328',
       u'21731', u'22556', u'22554', u'22423', u'22326', u'POST'],
      dtype='object')

In [25]:
# Restrict the customer-level item matrix to the top-20 StockCode columns
top_20_items_data = item_data.loc[:, top_20_items]

# Dimensions of the reduced frame
top_20_items_data.shape

(414, 20)

In [26]:
# Persist the thresholded item matrix; the CustomerID index is written as
# the first column (index=True is the to_csv default)
top_20_items_data.to_csv('./threshold_item_data.csv', index=True)