# Capstone: Pre-processing & Training Data Development

## Import packages and data

In [1]:
import os
import pandas as pd
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns
from mlxtend.preprocessing import TransactionEncoder

df = pd.read_excel (r'Desktop\bread basket.xlsx')
print (df.head())

   Transaction           Item         date_time period_day weekday_weekend
0            1          Bread  30-10-2016 09:58    morning         weekend
1            2   Scandinavian  30-10-2016 10:05    morning         weekend
2            2   Scandinavian  30-10-2016 10:05    morning         weekend
3            3  Hot chocolate  30-10-2016 10:07    morning         weekend
4            3            Jam  30-10-2016 10:07    morning         weekend


## Clean Data

In [2]:
df=df[df.Item != 'Adjustment']

In [3]:
df=df[df.Item != 'Afternoon with the baker']

In [4]:
df.describe

<bound method NDFrame.describe of        Transaction           Item            date_time period_day  \
0                1          Bread     30-10-2016 09:58    morning   
1                2   Scandinavian     30-10-2016 10:05    morning   
2                2   Scandinavian     30-10-2016 10:05    morning   
3                3  Hot chocolate     30-10-2016 10:07    morning   
4                3            Jam     30-10-2016 10:07    morning   
...            ...            ...                  ...        ...   
20502         9682         Coffee  2017-09-04 14:32:00  afternoon   
20503         9682            Tea  2017-09-04 14:32:00  afternoon   
20504         9683         Coffee  2017-09-04 14:57:00  afternoon   
20505         9683         Pastry  2017-09-04 14:57:00  afternoon   
20506         9684      Smoothies  2017-09-04 15:04:00  afternoon   

      weekday_weekend  
0             weekend  
1             weekend  
2             weekend  
3             weekend  
4             wee

In [5]:
df.tail()

Unnamed: 0,Transaction,Item,date_time,period_day,weekday_weekend
20502,9682,Coffee,2017-09-04 14:32:00,afternoon,weekend
20503,9682,Tea,2017-09-04 14:32:00,afternoon,weekend
20504,9683,Coffee,2017-09-04 14:57:00,afternoon,weekend
20505,9683,Pastry,2017-09-04 14:57:00,afternoon,weekend
20506,9684,Smoothies,2017-09-04 15:04:00,afternoon,weekend


## EDA

In [6]:
df.Item.value_counts()

Coffee            5471
Bread             3325
Tea               1435
Cake              1025
Pastry             856
                  ... 
Polenta              1
Olum & polenta       1
Gift voucher         1
Chicken sand         1
Bacon                1
Name: Item, Length: 92, dtype: int64

## Association Model

In [7]:
#Only keep Transaction column and Item column
df_i = df[['Transaction', 'Item']]
df_i

Unnamed: 0,Transaction,Item
0,1,Bread
1,2,Scandinavian
2,2,Scandinavian
3,3,Hot chocolate
4,3,Jam
...,...,...
20502,9682,Coffee
20503,9682,Tea
20504,9683,Coffee
20505,9683,Pastry


In [8]:
#Create dummy variables and group them
df_i_dum = pd.get_dummies(df_i)
df_i_dum=df_i_dum.groupby(['Transaction']).max()
df_i_dum

Unnamed: 0_level_0,Item_Alfajores,Item_Argentina Night,Item_Art Tray,Item_Bacon,Item_Baguette,Item_Bakewell,Item_Bare Popcorn,Item_Basket,Item_Bowl Nic Pitt,Item_Bread,...,Item_The BART,Item_The Nomad,Item_Tiffin,Item_Toast,Item_Truffles,Item_Tshirt,Item_Valentine's card,Item_Vegan Feast,Item_Vegan mincepie,Item_Victorian Sponge
Transaction,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9680,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
9681,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
9682,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9683,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
#Create pair wise association model
from mlxtend.frequent_patterns import apriori

frequent_items = apriori(df_i_dum, min_support=.01, use_colnames=True)
frequent_items

Unnamed: 0,support,itemsets
0,0.036468,(Item_Alfajores)
1,0.016114,(Item_Baguette)
2,0.328315,(Item_Bread)
3,0.040178,(Item_Brownie)
4,0.104209,(Item_Cake)
...,...,...
56,0.023746,"(Item_Coffee, Item_Toast)"
57,0.014417,"(Item_Sandwich, Item_Tea)"
58,0.010071,"(Item_Bread, Item_Coffee, Item_Cake)"
59,0.011237,"(Item_Bread, Item_Coffee, Item_Pastry)"


In [10]:
from mlxtend.frequent_patterns import association_rules
a_rules = association_rules(frequent_items, metric='confidence', min_threshold=.2)
a_rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(Item_Alfajores),(Item_Bread),0.036468,0.328315,0.010389,0.284884,0.867713,-0.001584,0.939266
1,(Item_Alfajores),(Item_Coffee),0.036468,0.480017,0.019718,0.540698,1.126414,0.002213,1.132115
2,(Item_Brownie),(Item_Bread),0.040178,0.328315,0.010813,0.269129,0.819728,-0.002378,0.91902
3,(Item_Cake),(Item_Bread),0.104209,0.328315,0.023428,0.224822,0.684774,-0.010785,0.866491
4,(Item_Bread),(Item_Coffee),0.328315,0.480017,0.090321,0.275105,0.573115,-0.067276,0.717322
5,(Item_Cookies),(Item_Bread),0.054596,0.328315,0.014523,0.266019,0.810255,-0.003401,0.915126
6,(Item_Hot chocolate),(Item_Bread),0.058518,0.328315,0.013463,0.230072,0.700766,-0.005749,0.8724
7,(Item_Medialuna),(Item_Bread),0.062016,0.328315,0.016962,0.273504,0.833053,-0.003399,0.924554
8,(Item_Pastry),(Item_Bread),0.086399,0.328315,0.029259,0.33865,1.031478,0.000893,1.015627
9,(Item_Sandwich),(Item_Bread),0.072087,0.328315,0.017068,0.236765,0.72115,-0.0066,0.880049


In [11]:
#Keep important columns
a_rules = a_rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']]
a_rules

Unnamed: 0,antecedents,consequents,support,confidence,lift
0,(Item_Alfajores),(Item_Bread),0.010389,0.284884,0.867713
1,(Item_Alfajores),(Item_Coffee),0.019718,0.540698,1.126414
2,(Item_Brownie),(Item_Bread),0.010813,0.269129,0.819728
3,(Item_Cake),(Item_Bread),0.023428,0.224822,0.684774
4,(Item_Bread),(Item_Coffee),0.090321,0.275105,0.573115
5,(Item_Cookies),(Item_Bread),0.014523,0.266019,0.810255
6,(Item_Hot chocolate),(Item_Bread),0.013463,0.230072,0.700766
7,(Item_Medialuna),(Item_Bread),0.016962,0.273504,0.833053
8,(Item_Pastry),(Item_Bread),0.029259,0.33865,1.031478
9,(Item_Sandwich),(Item_Bread),0.017068,0.236765,0.72115


## Logistic Regression Model

### Part 1: Items predict when they are bought

In [12]:
print(df.period_day.value_counts())
df.weekday_weekend.value_counts()

afternoon    11550
morning       8394
evening        504
night           14
Name: period_day, dtype: int64


weekday    12788
weekend     7674
Name: weekday_weekend, dtype: int64

In [13]:
#Condense dataset into two columns
df1 = df[[ 'Item','period_day', 'weekday_weekend']]
df1['DayPart'] = df1['period_day'] + df1['weekday_weekend']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['DayPart'] = df1['period_day'] + df1['weekday_weekend']


In [14]:
df1

Unnamed: 0,Item,period_day,weekday_weekend,DayPart
0,Bread,morning,weekend,morningweekend
1,Scandinavian,morning,weekend,morningweekend
2,Scandinavian,morning,weekend,morningweekend
3,Hot chocolate,morning,weekend,morningweekend
4,Jam,morning,weekend,morningweekend
...,...,...,...,...
20502,Coffee,afternoon,weekend,afternoonweekend
20503,Tea,afternoon,weekend,afternoonweekend
20504,Coffee,afternoon,weekend,afternoonweekend
20505,Pastry,afternoon,weekend,afternoonweekend


In [15]:
df1 = df1.drop('weekday_weekend',1)
df1 = df1.drop('period_day',1)
df1

Unnamed: 0,Item,DayPart
0,Bread,morningweekend
1,Scandinavian,morningweekend
2,Scandinavian,morningweekend
3,Hot chocolate,morningweekend
4,Jam,morningweekend
...,...,...
20502,Coffee,afternoonweekend
20503,Tea,afternoonweekend
20504,Coffee,afternoonweekend
20505,Pastry,afternoonweekend


In [16]:
#import packages
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [17]:
#explore data
df1.DayPart.value_counts()

afternoonweekday    7260
morningweekday      5169
afternoonweekend    4290
morningweekend      3225
eveningweekday       355
eveningweekend       149
nightweekend          10
nightweekday           4
Name: DayPart, dtype: int64

In [18]:
#create y variable for first logistic regression model
y = df1.DayPart

In [19]:
#get data set ready for X variable 
df1 = df1.drop('DayPart',1)
df1

Unnamed: 0,Item
0,Bread
1,Scandinavian
2,Scandinavian
3,Hot chocolate
4,Jam
...,...
20502,Coffee
20503,Tea
20504,Coffee
20505,Pastry


In [20]:
#get dummy variables
df1 = pd.get_dummies(df1)

In [21]:
df1

Unnamed: 0,Item_Alfajores,Item_Argentina Night,Item_Art Tray,Item_Bacon,Item_Baguette,Item_Bakewell,Item_Bare Popcorn,Item_Basket,Item_Bowl Nic Pitt,Item_Bread,...,Item_The BART,Item_The Nomad,Item_Tiffin,Item_Toast,Item_Truffles,Item_Tshirt,Item_Valentine's card,Item_Vegan Feast,Item_Vegan mincepie,Item_Victorian Sponge
0,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20502,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20503,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20504,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20505,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [22]:
#Create X variable for first logistic regression model
X = df1

In [23]:
#Split data into 25% test data and 75% train data
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.25)

In [24]:
#First Logistic Regression Model
model= LogisticRegression()

In [25]:
#Train first logistic regression model
model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [26]:
#Make first predictions
y_pred=model.predict(X_test)

In [27]:
#Look at confusion matrics from predictions and test data
from sklearn import metrics
cnf_matrix = metrics.confusion_matrix(y_test, y_pred)
cnf_matrix

array([[1586,   81,    0,    0,  137,   21,    0,    0],
       [ 914,  102,    0,    0,   71,   18,    0,    0],
       [  64,    2,    0,    0,    9,    4,    0,    0],
       [  18,    0,    0,   11,    4,    0,    0,    0],
       [1014,   21,    0,    0,  215,   11,    0,    0],
       [ 586,   60,    0,    0,  136,   28,    0,    0],
       [   0,    2,    0,    0,    0,    0,    0,    0],
       [   0,    1,    0,    0,    0,    0,    0,    0]], dtype=int64)

In [33]:
#Calculate accuracy of first logistic regression model
from sklearn.metrics import accuracy_score

print('The accuracy of the model is ' + str(round(accuracy_score(model.predict(X_test), y_test),2)*100) + '%')

The accuracy of the model is 38.0%


In [67]:
#Code to see first logistic regression predictions

mc= model.classes_
#print(mc)

#model coefficients
cf = model.coef_
#print(cf)

my_list = list(model.classes_)
#print(my_list)
list2 = list(df_i['Item'])
#print(list2)

df2 = pd.DataFrame(cf)

#print(len(df1.columns))
#print(len(cf))

mc = list(mc)

df2=df2.T
df2.columns = mc
#print(mc)
#print(df2)


Unnamed: 0,Alfajores,Argentina Night,Art Tray,Bacon,Baguette,Bakewell,Bare Popcorn,Basket,Bowl Nic Pitt,Bread,...,The BART,The Nomad,Tiffin,Toast,Truffles,Tshirt,Valentine's card,Vegan Feast,Vegan mincepie,Victorian Sponge
0,0.088068,-0.317005,0.537556,-0.339244,0.137497,0.374422,0.736894,-0.760166,0.167576,0.331789,...,-0.346278,-0.872725,0.232484,0.054189,0.455682,-1.151311,-0.692692,-1.948477,-0.018888,-0.1924
1,0.108818,-0.157399,0.093983,0.502627,-0.734091,-0.204497,-0.647901,-0.655372,0.290917,0.055802,...,-0.289435,0.431239,0.145912,-0.29433,0.229784,-1.014073,-0.164406,-1.131156,-0.163638,0.394671
2,0.712587,-0.121925,0.248718,-0.038985,-0.328967,-0.457276,0.792187,-0.111534,-0.058053,-0.002985,...,-0.039778,-0.553142,0.205591,-1.045407,-0.006911,-0.225441,-0.184359,0.53609,0.049747,-0.100286
3,-0.399876,-0.078004,0.451678,-0.024073,-0.658832,-0.321564,-0.067872,-0.071321,-0.036259,-1.150186,...,-0.024576,0.300178,-0.732169,-0.83951,0.340373,4.07519,1.531638,0.325948,-0.412747,-0.06369
4,0.109623,-0.538826,-0.551925,-0.198588,0.80481,0.374391,-0.480092,-0.489348,-0.289605,1.022346,...,0.634018,-0.038793,-0.088484,1.558011,-0.295045,-0.804677,-0.753733,-1.108948,0.371069,0.200107
5,-0.236469,1.093932,-0.677543,-0.165174,0.968767,0.281719,-0.405881,1.991573,-0.241051,0.84575,...,-0.168551,0.838272,0.435323,0.818907,-0.470993,-0.701434,-0.646352,-0.934968,0.284053,-0.389934
6,-0.048875,-0.001232,-0.005518,-0.000371,-0.016898,-0.006121,-0.001067,-0.001129,-0.000559,-0.242665,...,-0.000378,-0.007995,-0.019907,-0.026253,-0.025779,-0.002518,0.989108,-0.006869,-0.008418,-0.000996
7,-0.163259,-0.00457,-0.020046,-0.001407,-0.060033,-0.022343,-0.00398,-0.004244,-0.002089,-0.619612,...,-0.001434,-0.028964,-0.070285,-0.091416,-0.08991,-0.009332,-0.007027,4.437334,-0.030499,-0.003692


In [46]:
#Continued code to see first logistic regression predictions
dc = pd.DataFrame(df1.columns)
dc.columns=['Items']
dc

Unnamed: 0,Items
0,Item_Alfajores
1,Item_Argentina Night
2,Item_Art Tray
3,Item_Bacon
4,Item_Baguette
...,...
87,Item_Tshirt
88,Item_Valentine's card
89,Item_Vegan Feast
90,Item_Vegan mincepie


In [47]:
#Total first logistic regression predictions
df3 = pd.concat([dc, df2], axis=1)
df3

Unnamed: 0,Items,afternoonweekday,afternoonweekend,eveningweekday,eveningweekend,morningweekday,morningweekend,nightweekday,nightweekend
0,Item_Alfajores,0.193300,0.133788,0.642164,-0.768083,0.204245,-0.150444,-0.066080,-0.188890
1,Item_Argentina Night,-0.403492,0.187611,-0.097295,-0.064626,-0.574464,0.958723,-0.001572,-0.004884
2,Item_Art Tray,0.380784,0.075288,0.247539,0.398073,-0.616551,-0.456039,-0.007198,-0.021895
3,Item_Bacon,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,Item_Baguette,0.189296,-0.515387,-0.354397,-0.658209,0.760727,0.675744,-0.025191,-0.072582
...,...,...,...,...,...,...,...,...,...
87,Item_Tshirt,-1.222186,-1.057729,-0.180299,4.080377,-0.861503,-0.746018,-0.003445,-0.009196
88,Item_Valentine's card,-0.942390,-0.141235,-0.200618,2.262165,-0.132887,-0.831288,-0.003167,-0.010580
89,Item_Vegan Feast,-1.658067,-1.179559,0.605221,0.313923,-1.169935,-1.012078,-0.006738,4.107234
90,Item_Vegan mincepie,0.073642,-0.277632,0.050170,-0.377257,0.286893,0.289461,-0.011112,-0.034166


In [49]:
#Here the Items predict when they are bought. These items predict they will be bought in the afternoon of a weekday.
#afternoonweekday model 
df_afternoonweekday = df3[['Items', 'afternoonweekday']]
df_afternoonweekday.sort_values('afternoonweekday',ascending=False, inplace=True)
df_afternoonweekday.head(10)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_afternoonweekday.sort_values('afternoonweekday',ascending=False, inplace=True)


Unnamed: 0,Items,afternoonweekday
76,Item_Soup,1.931811
16,Item_Chicken Stew,1.51908
71,Item_Sandwich,1.241694
44,Item_Hearty & Seasonal,1.174013
10,Item_Bread Pudding,1.159651
70,Item_Salad,0.951423
64,Item_Pick and Mix Bowls,0.859942
18,Item_Chimichurri Oil,0.754816
52,Item_Lemon and coconut,0.692715
6,Item_Bare Popcorn,0.631015


In [50]:
#Here the Items predict when they are bought. These items predict they will be bought in the afternoon of a weekend.
#afternoonweekend model 
df_afternoonweekend = df3[['Items', 'afternoonweekend']]
df_afternoonweekend.sort_values('afternoonweekend',ascending=False, inplace=True)
df_afternoonweekend.head(10)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_afternoonweekend.sort_values('afternoonweekend',ascending=False, inplace=True)


Unnamed: 0,Items,afternoonweekend
37,Item_Frittata,1.736435
79,Item_Tacos/Fajita,1.478964
25,Item_Crepes,0.942561
71,Item_Sandwich,0.922387
76,Item_Soup,0.907599
77,Item_Spanish Brunch,0.860888
44,Item_Hearty & Seasonal,0.839607
45,Item_Honey,0.658248
14,Item_Caramel bites,0.621015
70,Item_Salad,0.586258


In [51]:
#Here the Items predict when they are bought. These items predict they will be bought in the evening of a weekday.
#evening weekday model 
df_eveningweekday = df3[['Items', 'eveningweekday']]
df_eveningweekday.sort_values('eveningweekday',ascending=False, inplace=True)
df_eveningweekday.head(10)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_eveningweekday.sort_values('eveningweekday',ascending=False, inplace=True)


Unnamed: 0,Items,eveningweekday
31,Item_Ella's Kitchen Pouches,1.374905
38,Item_Fudge,1.265966
54,Item_Mighty Protein,1.228473
59,Item_My-5 Fruit Shoot,0.917093
24,Item_Cookies,0.843809
6,Item_Bare Popcorn,0.7035
0,Item_Alfajores,0.642164
32,Item_Empanadas,0.62718
89,Item_Vegan Feast,0.605221
16,Item_Chicken Stew,0.578227


In [52]:
#Here the Items predict when they are bought. These items predict they will be bought in the evening of a weekend.
#evening weekend model 
df_eveningweekend = df3[['Items', 'eveningweekend']]
df_eveningweekend.sort_values('eveningweekend',ascending=False, inplace=True)
df_eveningweekend.head(10)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_eveningweekend.sort_values('eveningweekend',ascending=False, inplace=True)


Unnamed: 0,Items,eveningweekend
87,Item_Tshirt,4.080377
65,Item_Pintxos,3.326117
67,Item_Postcard,3.073964
88,Item_Valentine's card,2.262165
23,Item_Coke,0.811053
56,Item_Mortimer,0.742527
60,Item_Nomad bag,0.723828
51,Item_Kids biscuit,0.64343
28,Item_Duck egg,0.62189
59,Item_My-5 Fruit Shoot,0.486618


In [54]:
#Here the Items predict when they are bought. These items predict they will be bought in the morning of a weekday.
#morning weekday model 
df_mornweekday = df3[['Items', 'morningweekday']]
df_mornweekday.sort_values('morningweekday',ascending=False, inplace=True)
df_mornweekday.head(10)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_mornweekday.sort_values('morningweekday',ascending=False, inplace=True)


Unnamed: 0,Items,morningweekday
50,Item_Keeping It Local,1.586332
85,Item_Toast,1.497732
63,Item_Pastry,1.363102
20,Item_Christmas common,1.151944
35,Item_Farm House,1.034813
21,Item_Coffee,1.011222
9,Item_Bread,0.941393
53,Item_Medialuna,0.884944
15,Item_Cherry me Dried fruit,0.825369
4,Item_Baguette,0.760727


In [55]:
#Here the Items predict when they are bought. These items predict they will be bought in the morning of a weekdend.
#morning weekend model 
df_mornweekend = df3[['Items', 'morningweekend']]
df_mornweekend.sort_values('morningweekend',ascending=False, inplace=True)
df_mornweekend.head(10)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_mornweekend.sort_values('morningweekend',ascending=False, inplace=True)


Unnamed: 0,Items,morningweekend
7,Item_Basket,1.676389
53,Item_Medialuna,1.02549
63,Item_Pastry,1.005785
73,Item_Scone,0.984382
1,Item_Argentina Night,0.958723
85,Item_Toast,0.787561
77,Item_Spanish Brunch,0.742996
9,Item_Bread,0.719798
4,Item_Baguette,0.675744
21,Item_Coffee,0.634614


In [57]:
#Here the Items predict when they are bought. These items predict they will be bought in the night of a weekday.
#nightweekday model 
df_nightweekday = df3[['Items', 'nightweekday']]
df_nightweekday.sort_values('nightweekday',ascending=False, inplace=True)
#Only two positive rows
df_nightweekday.head(2)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_nightweekday.sort_values('nightweekday',ascending=False, inplace=True)


Unnamed: 0,Items,nightweekday
55,Item_Mineral water,0.773902
49,Item_Juice,0.706812


In [59]:
#Here the Items predict when they are bought. These items predict they will be bought in the night of a weekend.
#nightweekend model 
df_nightweekend = df3[['Items', 'nightweekend']]
df_nightweekend.sort_values('nightweekend',ascending=False, inplace=True)
#Only three positive rows
df_nightweekend.head(3)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_nightweekend.sort_values('nightweekend',ascending=False, inplace=True)


Unnamed: 0,Items,nightweekend
89,Item_Vegan Feast,4.107234
46,Item_Hot chocolate,0.833133
72,Item_Scandinavian,0.541607


### Part 2: Time of day predicts the items that are bought

In [107]:
#Code to create dataset
df5 = df[[ 'Item','period_day', 'weekday_weekend']]
df5['DayPart'] = df5['period_day'] + df5['weekday_weekend']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df5['DayPart'] = df5['period_day'] + df5['weekday_weekend']


In [108]:
df5 = df5.drop('weekday_weekend',1)
df5 = df5.drop('period_day',1)
df5

Unnamed: 0,Item,DayPart
0,Bread,morningweekend
1,Scandinavian,morningweekend
2,Scandinavian,morningweekend
3,Hot chocolate,morningweekend
4,Jam,morningweekend
...,...,...
20502,Coffee,afternoonweekend
20503,Tea,afternoonweekend
20504,Coffee,afternoonweekend
20505,Pastry,afternoonweekend


In [109]:
#Creat y variable for second logistic regression
y=df5.Item
df5 = df5.drop('Item',1)
#Get dummy variables
df5 = pd.get_dummies(df5)
df5

Unnamed: 0,DayPart_afternoonweekday,DayPart_afternoonweekend,DayPart_eveningweekday,DayPart_eveningweekend,DayPart_morningweekday,DayPart_morningweekend,DayPart_nightweekday,DayPart_nightweekend
0,0,0,0,0,0,1,0,0
1,0,0,0,0,0,1,0,0
2,0,0,0,0,0,1,0,0
3,0,0,0,0,0,1,0,0
4,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...
20502,0,1,0,0,0,0,0,0
20503,0,1,0,0,0,0,0,0
20504,0,1,0,0,0,0,0,0
20505,0,1,0,0,0,0,0,0


In [110]:
#Create X variable for second logistic regression
X = df5
#Split data into 75% train data and 25% test data
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.25)
#Create second Logistic Regression model
model=LogisticRegression()
#Train second Logistic Regression model
model.fit(X_train, y_train)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [111]:
#Second Logistic Regression model predictions
y_pred=model.predict(X_test)
#Second Logistic Regression model confusion matrix. Test data vs predictions
cnf_matrix1 = metrics.confusion_matrix(y_test, y_pred)
cnf_matrix1

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [112]:
print('The accuracy of the model is ' + str(round(accuracy_score(model.predict(X_test), y_test),2)*100) + '%')

The accuracy of the model is 27.0%


In [113]:
#Code to see second logistic regression coefficients
print(model.classes_)
model.coef_

['Alfajores' 'Argentina Night' 'Art Tray' 'Bacon' 'Baguette' 'Bakewell'
 'Bare Popcorn' 'Basket' 'Bowl Nic Pitt' 'Bread' 'Bread Pudding'
 'Brioche and salami' 'Brownie' 'Cake' 'Caramel bites'
 'Cherry me Dried fruit' 'Chicken Stew' 'Chicken sand' 'Chimichurri Oil'
 'Chocolates' 'Christmas common' 'Coffee' 'Coffee granules ' 'Coke'
 'Cookies' 'Crepes' 'Crisps' 'Drinking chocolate spoons ' 'Duck egg'
 'Dulce de Leche' 'Eggs' "Ella's Kitchen Pouches" 'Empanadas'
 'Extra Salami or Feta' 'Fairy Doors' 'Farm House' 'Focaccia' 'Frittata'
 'Fudge' 'Gift voucher' 'Gingerbread syrup' 'Granola' 'Hack the stack'
 'Half slice Monster ' 'Hearty & Seasonal' 'Honey' 'Hot chocolate' 'Jam'
 'Jammie Dodgers' 'Juice' 'Keeping It Local' 'Kids biscuit'
 'Lemon and coconut' 'Medialuna' 'Mighty Protein' 'Mineral water'
 'Mortimer' 'Muesli' 'Muffin' 'My-5 Fruit Shoot' 'Nomad bag'
 'Olum & polenta' 'Panatone' 'Pastry' 'Pick and Mix Bowls' 'Pintxos'
 'Polenta' 'Postcard' 'Raspberry shortbread sandwich' 'Raw bars

array([[ 2.10822167e-01,  5.81491221e-02,  6.86990115e-01,
        -2.71084153e-01,  1.08352847e-01, -3.40498320e-01,
        -9.79677555e-02, -1.84077152e-01],
       [-3.01226195e-01,  2.64227372e-01, -1.20998238e-01,
        -6.86144686e-02, -5.73353022e-01,  7.04919790e-01,
        -2.65687682e-03, -5.48875109e-03],
       [ 4.47823923e-01, -9.66855073e-03,  1.83574038e-01,
         4.49879373e-01, -4.81379391e-01, -4.71673074e-01,
        -1.36186915e-02, -2.74657659e-02],
       [-3.27261160e-01,  5.08735685e-01, -3.79915392e-02,
        -2.08584097e-02, -2.10363517e-01, -1.66108301e-01,
        -7.79039878e-04, -1.67148654e-03],
       [ 3.73614362e-01, -4.75826027e-01, -8.28191241e-01,
        -5.88310156e-01,  8.12826015e-01,  9.04245528e-01,
        -3.51953305e-02, -6.98865161e-02],
       [ 3.43339728e-01, -5.81614728e-02, -4.56097350e-01,
        -2.89429564e-01,  2.79009389e-01,  2.45339866e-01,
        -1.32619608e-02, -2.69458725e-02],
       [ 5.91716818e-01, -5.956044

In [114]:
#Continued scratch work
df6 = pd.DataFrame(model.coef_)
df6

Unnamed: 0,0,1,2,3,4,5,6,7
0,0.210822,0.058149,0.686990,-0.271084,0.108353,-0.340498,-0.097968,-0.184077
1,-0.301226,0.264227,-0.120998,-0.068614,-0.573353,0.704920,-0.002657,-0.005489
2,0.447824,-0.009669,0.183574,0.449879,-0.481379,-0.471673,-0.013619,-0.027466
3,-0.327261,0.508736,-0.037992,-0.020858,-0.210364,-0.166108,-0.000779,-0.001671
4,0.373614,-0.475826,-0.828191,-0.588310,0.812826,0.904246,-0.035195,-0.069887
...,...,...,...,...,...,...,...,...
87,-1.148423,-1.018401,-0.228547,4.159189,-0.857369,-0.722792,-0.005564,-0.011573
88,-1.154872,-0.437638,-0.326781,1.288934,-0.170072,-0.990107,1.944398,-0.016446
89,-1.617934,-0.914266,0.054662,0.357378,-1.195837,-0.979509,-0.016069,4.465231
90,0.055529,-0.528583,0.484324,-0.382290,0.055596,0.447880,-0.018694,-0.037606


In [115]:
#Continued scratch work
df6.columns = list(df5.columns)
df6

Unnamed: 0,DayPart_afternoonweekday,DayPart_afternoonweekend,DayPart_eveningweekday,DayPart_eveningweekend,DayPart_morningweekday,DayPart_morningweekend,DayPart_nightweekday,DayPart_nightweekend
0,0.210822,0.058149,0.686990,-0.271084,0.108353,-0.340498,-0.097968,-0.184077
1,-0.301226,0.264227,-0.120998,-0.068614,-0.573353,0.704920,-0.002657,-0.005489
2,0.447824,-0.009669,0.183574,0.449879,-0.481379,-0.471673,-0.013619,-0.027466
3,-0.327261,0.508736,-0.037992,-0.020858,-0.210364,-0.166108,-0.000779,-0.001671
4,0.373614,-0.475826,-0.828191,-0.588310,0.812826,0.904246,-0.035195,-0.069887
...,...,...,...,...,...,...,...,...
87,-1.148423,-1.018401,-0.228547,4.159189,-0.857369,-0.722792,-0.005564,-0.011573
88,-1.154872,-0.437638,-0.326781,1.288934,-0.170072,-0.990107,1.944398,-0.016446
89,-1.617934,-0.914266,0.054662,0.357378,-1.195837,-0.979509,-0.016069,4.465231
90,0.055529,-0.528583,0.484324,-0.382290,0.055596,0.447880,-0.018694,-0.037606


In [116]:
#Continued scratch work
df7 = pd.concat([dc, df6], axis=1)
df7.tail(10)

Unnamed: 0,Items,DayPart_afternoonweekday,DayPart_afternoonweekend,DayPart_eveningweekday,DayPart_eveningweekend,DayPart_morningweekday,DayPart_morningweekend,DayPart_nightweekday,DayPart_nightweekend
82,Item_The BART,-0.332678,-0.281671,-0.038618,-0.021208,0.610866,-0.168845,-0.000792,-0.001697
83,Item_The Nomad,-0.45237,0.487821,-0.558251,-0.366109,0.217446,0.773687,-0.017899,-0.036188
84,Item_Tiffin,0.116135,0.115444,0.382253,-0.704873,0.064136,0.275464,-0.045875,-0.090042
85,Item_Toast,0.237545,-0.281874,-1.0612,-0.794198,1.470144,0.745005,-0.058052,-0.112733
86,Item_Truffles,0.452937,0.044064,0.196724,0.410778,-0.405975,-0.383774,-0.056883,-0.110446
87,Item_Tshirt,-1.148423,-1.018401,-0.228547,4.159189,-0.857369,-0.722792,-0.005564,-0.011573
88,Item_Valentine's card,-1.154872,-0.437638,-0.326781,1.288934,-0.170072,-0.990107,1.944398,-0.016446
89,Item_Vegan Feast,-1.617934,-0.914266,0.054662,0.357378,-1.195837,-0.979509,-0.016069,4.465231
90,Item_Vegan mincepie,0.055529,-0.528583,0.484324,-0.38229,0.055596,0.44788,-0.018694,-0.037606
91,Item_Victorian Sponge,-0.237819,0.658928,-0.112432,-0.063632,0.092449,-0.436214,-0.002458,-0.005099


In [117]:
#Here the time of day or DayPart predicts which items are bought. The afternoon of a weekday predicts these items.
#afternoonweekday model 
df8 = df7[['Items', 'DayPart_afternoonweekday']]
df8.sort_values('DayPart_afternoonweekday',ascending=False, inplace=True)
df8.head(10)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df8.sort_values('DayPart_afternoonweekday',ascending=False, inplace=True)


Unnamed: 0,Items,DayPart_afternoonweekday
76,Item_Soup,1.911903
16,Item_Chicken Stew,1.375729
70,Item_Salad,1.348277
71,Item_Sandwich,1.345893
52,Item_Lemon and coconut,1.123529
10,Item_Bread Pudding,1.123529
44,Item_Hearty & Seasonal,1.072277
18,Item_Chimichurri Oil,0.745073
6,Item_Bare Popcorn,0.591717
26,Item_Crisps,0.502344


In [118]:
#Here the time of day or DayPart predicts which items are bought. The afternoon of a weekend predicts these items.
#afternoonweekend model 
df9 = df7[['Items', 'DayPart_afternoonweekend']]
df9.sort_values('DayPart_afternoonweekend',ascending=False, inplace=True)
df9.head(10)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df9.sort_values('DayPart_afternoonweekend',ascending=False, inplace=True)


Unnamed: 0,Items,DayPart_afternoonweekend
79,Item_Tacos/Fajita,1.773049
37,Item_Frittata,1.506952
51,Item_Kids biscuit,1.085016
71,Item_Sandwich,0.974716
25,Item_Crepes,0.919544
64,Item_Pick and Mix Bowls,0.876783
44,Item_Hearty & Seasonal,0.845996
54,Item_Mighty Protein,0.843105
77,Item_Spanish Brunch,0.785987
76,Item_Soup,0.730776


In [119]:
#Here the time of day or DayPart predicts which items are bought. The evening of a weekday predicts these items.
#eveningweekday model 
df10 = df7[['Items', 'DayPart_eveningweekday']]
df10.sort_values('DayPart_eveningweekday',ascending=False, inplace=True)
df10.head(10)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df10.sort_values('DayPart_eveningweekday',ascending=False, inplace=True)


Unnamed: 0,Items,DayPart_eveningweekday
31,Item_Ella's Kitchen Pouches,1.506393
38,Item_Fudge,1.206549
29,Item_Dulce de Leche,1.102695
59,Item_My-5 Fruit Shoot,0.895597
42,Item_Hack the stack,0.871925
6,Item_Bare Popcorn,0.810581
16,Item_Chicken Stew,0.794507
56,Item_Mortimer,0.782192
32,Item_Empanadas,0.700313
19,Item_Chocolates,0.69023


In [120]:
#Here the time of day or DayPart predicts which items are bought. The evening of a weekend predicts these items.
#eveningweekend model 
df11 = df7[['Items', 'DayPart_eveningweekend']]
df11.sort_values('DayPart_eveningweekend',ascending=False, inplace=True)
df11.head(10)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df11.sort_values('DayPart_eveningweekend',ascending=False, inplace=True)


Unnamed: 0,Items,DayPart_eveningweekend
87,Item_Tshirt,4.159189
67,Item_Postcard,3.180445
65,Item_Pintxos,2.916509
23,Item_Coke,1.416702
88,Item_Valentine's card,1.288934
28,Item_Duck egg,0.790502
60,Item_Nomad bag,0.785441
29,Item_Dulce de Leche,0.681869
59,Item_My-5 Fruit Shoot,0.577314
2,Item_Art Tray,0.449879


In [121]:
#Here the time of day or DayPart predicts which items are bought. The morning of a weekday predicts these items.
#morningweekday model 
df12 = df7[['Items', 'DayPart_morningweekday']]
df12.sort_values('DayPart_morningweekday',ascending=False, inplace=True)
df12.head(10)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df12.sort_values('DayPart_morningweekday',ascending=False, inplace=True)


Unnamed: 0,Items,DayPart_morningweekday
50,Item_Keeping It Local,1.545728
85,Item_Toast,1.470144
63,Item_Pastry,1.341147
57,Item_Muesli,1.233742
21,Item_Coffee,1.03377
20,Item_Christmas common,0.97694
9,Item_Bread,0.960676
35,Item_Farm House,0.875301
40,Item_Gingerbread syrup,0.852472
4,Item_Baguette,0.812826


In [122]:
#Here the time of day or DayPart predicts which items are bought. The morning of a weekend predicts these items.
#morningweekend model 
df13 = df7[['Items', 'DayPart_morningweekend']]
df13.sort_values('DayPart_morningweekend',ascending=False, inplace=True)
df13.head(10)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df13.sort_values('DayPart_morningweekend',ascending=False, inplace=True)


Unnamed: 0,Items,DayPart_morningweekend
7,Item_Basket,1.841509
63,Item_Pastry,1.100087
53,Item_Medialuna,1.013087
73,Item_Scone,0.955558
4,Item_Baguette,0.904246
41,Item_Granola,0.868238
30,Item_Eggs,0.786108
83,Item_The Nomad,0.773687
9,Item_Bread,0.764307
85,Item_Toast,0.745005


In [124]:
#Here the time of day or DayPart predicts which items are bought. The night of a weekday predicts these items.
#nightweekday model 
df14 = df7[['Items', 'DayPart_nightweekday']]
df14.sort_values('DayPart_nightweekday',ascending=False, inplace=True)
#Only three positive rows
df14.head(3)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df14.sort_values('DayPart_nightweekday',ascending=False, inplace=True)


Unnamed: 0,Items,DayPart_nightweekday
88,Item_Valentine's card,1.944398
55,Item_Mineral water,0.908165
49,Item_Juice,0.766458


In [126]:
#Here the time of day or DayPart predicts which items are bought. The night of a weekend predicts these items.
#nightweekend model 
df15 = df7[['Items', 'DayPart_nightweekend']]
df15.sort_values('DayPart_nightweekend',ascending=False, inplace=True)
#Only three positive rows
df15.head(3)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df15.sort_values('DayPart_nightweekend',ascending=False, inplace=True)


Unnamed: 0,Items,DayPart_nightweekend
89,Item_Vegan Feast,4.465231
46,Item_Hot chocolate,0.973502
72,Item_Scandinavian,0.656527
