In [1]:
# Import libraries necessary for this project
import numpy as np
import pandas as pd
from pandas.plotting import scatter_matrix
from IPython.display import display # Allows the use of display() for DataFrames

In [2]:
# Pretty display for notebooks
%matplotlib inline

Here we drop the sections region and channel as we plan on using only the remaining features
to predict customer segmentation

In [3]:
# Load the wholesale customers dataset
try:
    data = pd.read_csv("customers.csv")
    data.drop(['Region', 'Channel'], axis = 1, inplace = True)
    print("Wholesale customers dataset has {} samples with {} features each.".format(*data.shape))
except:
    print("Dataset could not be loaded. Is the dataset missing?")

Wholesale customers dataset has 440 samples with 6 features each.


At this step we viualise the stats of the data into a table to better understand it

In [4]:
# Display a description of the dataset using pandas describe command
display(data.describe())

Unnamed: 0,Fresh,Milk,Grocery,Frozen,Detergents_Paper,Delicatessen
count,440.0,440.0,440.0,440.0,440.0,440.0
mean,12000.297727,5796.265909,7951.277273,3071.931818,2881.493182,1524.870455
std,12647.328865,7380.377175,9503.162829,4854.673333,4767.854448,2820.105937
min,3.0,55.0,3.0,25.0,3.0,3.0
25%,3127.75,1533.0,2153.0,742.25,256.75,408.25
50%,8504.0,3627.0,4755.5,1526.0,816.5,965.5
75%,16933.75,7190.25,10655.75,3554.25,3922.0,1820.25
max,112151.0,73498.0,92780.0,60869.0,40827.0,47943.0


In [5]:
# Display the head of the dataset using pandas head command
data.head()

Unnamed: 0,Fresh,Milk,Grocery,Frozen,Detergents_Paper,Delicatessen
0,12669,9656,7561,214,2674,1338
1,7057,9810,9568,1762,3293,1776
2,6353,8808,7684,2405,3516,7844
3,13265,1196,4221,6404,507,1788
4,22615,5410,7198,3915,1777,5185


We can use a few customers to try to rationalize customer segmentation through them

For this purpose we will display only a few customers as a new data set derived from data

In [6]:
# Select three indices to sample from the dataset
indices = [85,181,338]

# Create a DataFrame of the chosen samples
samples = pd.DataFrame(data.loc[indices], columns = data.keys())
print("Chosen samples of wholesale customers dataset:")
display(samples)

Chosen samples of wholesale customers dataset:


Unnamed: 0,Fresh,Milk,Grocery,Frozen,Detergents_Paper,Delicatessen
85,16117,46197,92780,1026,40827,2944
181,112151,29627,18148,16745,4948,8550
338,3,333,7021,15601,15,550


mean values of items:
Fresh: 12000.2977
Milk: 5796.2
Grocery: 7951.2
Frozen items: 3071.9
Detergents_paper: 2881.4
Delicatessen: 1524.8

based on the mean values of items we can conclude that customer 85 is a retailer. It has hghest spending on detergents and paper and groceries, these are everyday products and shouldnt cost a lot. It has high spending on milk as well

Customer 181 is more than likely a large market hence highest spending on fresh items, and consistent above average spending.

Customer 338 is a Restaurant as its grocery is average and frozen item cost is high 

at this stage we train supervised models based on the catagories of items. This will let us find out if a customer buys one catagory of item will they purchase proportionate amount of other catagories. This is done using random forests regressor

In [7]:
# using the numpy 'drop' function, drop one feature, copy to another data frame
new_data = data.drop('Grocery', axis=1)

# Split the data into training and testing sets(0.25)
# use  the dropped feature as the target
# Set a random state.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(new_data, data.Grocery, test_size=0.25, random_state=42)

In [8]:
print(X_test)

     Fresh   Milk  Frozen  Detergents_Paper  Delicatessen
265   5909  23527   10155               830          3636
78   10766   1175    2096               301           167
347  27380   7184    2809              4621          1022
255  31614    489    3242               111           615
327    542    899     414                88           522
..     ...    ...     ...               ...           ...
378   3225   3294     282                68          1114
417   5065   5499     364              3485          1063
16    1020   8816     134              4508          1080
406  25066   5010    9806              1092           960
117   6990   3880    1647               319          1160

[110 rows x 5 columns]


In [9]:
# Use training set to train random forest regressor
from sklearn.tree import DecisionTreeRegressor
regressor = DecisionTreeRegressor()
regressor = regressor.fit(X_train, y_train)
prediction = regressor.predict(X_test)

# Report the score of the prediction using the testing set
from sklearn.metrics import r2_score
score = r2_score(y_test, prediction)
print("Prediction score is: {}".format(score))

Prediction score is: 0.6864696118661506


In [12]:
import pickle
# save the classifier
with open('simple_rand_forest.pkl', 'wb') as fid:
    pickle.dump(regressor, fid)    