In [1]:
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

Load data

In [76]:
FARM_NAME = 'POS-data_Baechlihof.csv'
#FARM_NAME = 'POS-data_Juckerhof.csv'

df = pd.read_csv('./../data/' + FARM_NAME, sep=";", encoding="latin-1")


df['vat_percentage'] = df['vat_pos'] / df['price']

def label_restaurant(t):
    if t >= .05:
        return "restaurant"
    else:
        return "shop"
    
df['place'] = df.vat_percentage.apply(label_restaurant)
df['date'] = [d.date() for d in pd.to_datetime(df['timestamp_human'])]

In [356]:
def slice_df(df_, timeframe):
    if timeframe == "winter":
        return df_[df_['date'].apply(lambda d:d.month in [12, 1, 2])]
    elif timeframe == "spring":
        return df_[df_['date'].apply(lambda d:d.month in [3, 4, 5])]
    elif timeframe == "summer":
        return df_[df_['date'].apply(lambda d:d.month in [6, 7, 8])]
    elif timeframe == "fall":
        return df_[df_['date'].apply(lambda d:d.month in [9, 10, 11])]
    elif timeframe == "dayoff":
        return df_[df_['date'].apply(lambda d: (d.weekday() in [5,6]) | (d.month == 10 & d.day == 31) | (d.month == 12 & d.day == 24) | (d.month == 4 & d.day == 17))]
    else:
        return df_

In [77]:
TIMEFRAME = "winter"
df = slice_df(df, TIMEFRAME)

Focus on restaurant data

In [78]:
PLACE = 'restaurant'
df_ = df[df.place == PLACE]
df_transactions_full = df_.groupby('transaction_id')['article'].apply(list)

Applying LDA

In [79]:
from sklearn.feature_extraction.text import CountVectorizer
sep = "++++"
bow = CountVectorizer(vocabulary=set(df_.article.values), 
                      lowercase=False, 
                      tokenizer=lambda s: s.split(sep))
document_term_matrix = bow.fit_transform(df_transactions_full.apply(lambda s: sep.join(s)))

In [212]:
%%time
from sklearn.decomposition import LatentDirichletAllocation
lda = LatentDirichletAllocation(n_components=30,
                                learning_method='batch',
                                verbose=1, 
                                evaluate_every=1 , 
                                perp_tol=1e-2, 
                                max_iter=150)
lda.fit(document_term_matrix)

iteration: 1 of max_iter: 150, perplexity: 174.6578
iteration: 2 of max_iter: 150, perplexity: 111.0916
iteration: 3 of max_iter: 150, perplexity: 88.7409
iteration: 4 of max_iter: 150, perplexity: 79.7054
iteration: 5 of max_iter: 150, perplexity: 75.5981
iteration: 6 of max_iter: 150, perplexity: 74.0975
iteration: 7 of max_iter: 150, perplexity: 73.3517
iteration: 8 of max_iter: 150, perplexity: 73.1253
iteration: 9 of max_iter: 150, perplexity: 72.7778
iteration: 10 of max_iter: 150, perplexity: 71.0121
iteration: 11 of max_iter: 150, perplexity: 70.9087
iteration: 12 of max_iter: 150, perplexity: 70.9034
CPU times: user 17.7 s, sys: 81.3 ms, total: 17.8 s
Wall time: 17.8 s


In [213]:
lda.transform(document_term_matrix).shape

(11634, 30)

In [99]:
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        message += "  ;  ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
    print()

In [112]:
pd.DataFrame(lda.components_[4]).sum()

0    813.13334
dtype: float64

In [217]:
topic_word.shape

(30, 212)

In [242]:
input_ = ["Bier vom Hof AMBER 5dl", "Tee"]
input_2 = ["Tee", 'Apfelstrudel mit Vanillesauce']

In [244]:
if input_2:
    best_topics = np.argmax(lda.transform(bow.transform(input_2)), axis=1)
    print(best_topics)
    sol = []
    for best_topic in best_topics:
        print("best topic", best_topic)
        top5_idx = topic_word[best_topic].argsort()[-5:][::-1]
        top5_probas = sorted(topic_word[best_topic], reverse=True)[:5]
        best_match_words = [bow.get_feature_names()[b] for b in top5_idx]
        sol += list(zip(best_match_words, top5_probas))
    result = list(set([(w,p) for (w,p) in sol if w not in input_2]))
    result_sorted = sorted(result, key=lambda x: x[1], reverse=True)
    sum_ = sum([p for (_,p) in result_sorted])
    result_sorted = [(w, p/sum_) for (w,p) in result_sorted] 
    print(result_sorted)
else:
    pass

[5 5]
best topic 5
best topic 5
[('Schale', 0.71266991402690971), ('Buffet - Birchermüesli S', 0.27129887424585902), ('HofBäckerei Spezialpreis', 0.016031211727231143)]


In [372]:
def closest_products(lda, product_list):
    topic_word = lda.components_ / lda.components_.sum(axis=1)[:, np.newaxis]
    if product_list:
        best_topics = np.argmax(lda.transform(bow.transform(product_list)), axis=1)
        sol = []
        for best_topic in best_topics:
            top5_idx = topic_word[best_topic].argsort()[-5:][::-1] 
            top5_probas = sorted(topic_word[best_topic], reverse=True)[:5]
            best_match_words = [bow.get_feature_names()[b] for b in top5_idx]
            sol += list(zip(best_match_words, top5_probas))
        result = list(set([(w,p) for (w,p) in sol if w not in product_list]))
        result_sorted = sorted(result, key=lambda x: x[1], reverse=True)
        sum_ = sum([p for (_,p) in result_sorted])
        result_sorted = [(w, p/sum_) for (w,p) in result_sorted] # normalize the union of probas
        return result_sorted
    else:
        raise Exception("You must pass a product list")

# Global function

In [314]:
"""
timeframe; place; article_list; suggestions_list
winter
winter
...
summer
...

"""

'\ntimeframe; place; article_list; suggestions_list\nwinter\nwinter\n...\nsummer\n...\n\n'

In [398]:
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import itertools, random
from tqdm import tqdm
from itertools import chain

In [353]:
FARM_NAME = 'POS-data_Baechlihof.csv'
#FARM_NAME = 'POS-data_Juckerhof.csv'
df = pd.read_csv('./../data/' + FARM_NAME, sep=";", encoding="latin-1")

df['vat_percentage'] = df['vat_pos'] / df['price']

def label_restaurant(t):
    if t >= .05:
        return "restaurant"
    else:
        return "shop"
    
df['place'] = df.vat_percentage.apply(label_restaurant)
#df['date'] = [d.date() for d in pd.to_datetime(df['timestamp'], unit="s")]
df['date'] = df.timestamp.apply(lambda d: pd.to_datetime(d, unit='s'))

In [399]:
def pad_10(l):
    if len(l)>=10:
        return l[:10]
    else:
        return l + [pd.np.NaN] * (len(l)-10+1)

In [425]:
def generate_suggestions(df_input, timeframe, place):
    
    # slice on timeframe 
    df_input = slice_df(df_input, timeframe)

    # select restaurant or shop
    df_input = df_input[df_input['place'] == place]

    # group by transactions
    df_transactions_full = df_input.groupby('transaction_id')['article'].apply(list)

    # pre process the document term matrix
    sep = "++++"
    bow = CountVectorizer(vocabulary=set(df_.article.values), 
                          lowercase=False, 
                          tokenizer=lambda s: s.split(sep))
    document_term_matrix = bow.fit_transform(df_transactions_full.apply(lambda s: sep.join(s)))

    # apply LDA
    lda = LatentDirichletAllocation(n_components=20,
                                    learning_method='batch',
                                    verbose=1, 
                                    evaluate_every=2, 
                                    perp_tol=1e-2, 
                                    max_iter=100)
    # train model
    lda.fit(document_term_matrix)

    # predict for all combinations of products of size 2
    product_combinations = list(itertools.combinations(df_input.article.unique(), 2))
    random.shuffle(product_combinations)

    tmp = []
    for combination in tqdm(product_combinations):
        suggestions = closest_products(lda, combination)
        tmp.append({"timeframe" : timeframe,
                   "place": place,
                   "article_list": combination,
                   "suggestions_list": suggestions})

    final = pd.DataFrame(tmp)

    # reformat under columns for each prediction and proba

    l = []
    for (i, row) in enumerate(final.itertuples()):
        suggestions = row.suggestions_list
        flattened = list(chain(*suggestions))
        padded_flattened = pad_10(flattened)
        tmp = dict(zip(["pred1", "prob1", "pred2", "prob2", "pred3", "prob3", "pred4", "prob4", "pred5", "prob5"], padded_flattened))
        tmp.update(final.iloc[i].to_dict())
        l.append(tmp)

    res = pd.DataFrame(l)
    res['article_list'] = res.article_list.apply(lambda t: " & ".join(t))
    del res['suggestions_list']
    return res

In [414]:
generate_suggestions(df_input=df, timeframe="dayoff", place="shop")

iteration: 1 of max_iter: 100
iteration: 2 of max_iter: 100, perplexity: 26.7703
iteration: 3 of max_iter: 100
iteration: 4 of max_iter: 100, perplexity: 25.6822
iteration: 5 of max_iter: 100
iteration: 6 of max_iter: 100, perplexity: 25.6512
iteration: 7 of max_iter: 100
iteration: 8 of max_iter: 100, perplexity: 25.6544


100%|██████████| 22366/22366 [00:18<00:00, 1196.97it/s]


Unnamed: 0,article_list,place,pred1,pred2,pred3,pred4,pred5,prob1,prob2,prob3,prob4,prob5,timeframe
0,Bier vom Hof AMBER 3.3dl & Suure Moscht vom Ho...,shop,Tragtasche klein,Brunch bis 13 Uhr,Apfelschorli 5dl,Fladen mit Artischocken klein vom Holzofen,Buffet - Zmittag Salat/Warm Take away,0.200000,0.200000,0.200000,0.200000,0.200000,dayoff
1,Brunch bis 12 Uhr & Brotsack,shop,Tragtasche klein,Brunch bis 13 Uhr,Apfelschorli 5dl,Fladen mit Artischocken klein vom Holzofen,Buffet - Zmittag Salat/Warm Take away,0.200000,0.200000,0.200000,0.200000,0.200000,dayoff
2,Kaffee & Fruchtschorli 5dl,shop,Buffet - Dessert Take away,Fladen mit Käse vom Holzofen,Tragtasche klein,Brunch bis 13 Uhr,Apfelschorli 5dl,0.544019,0.423294,0.006236,0.006236,0.006236,dayoff
3,Mineral Passugger 5dl & Suppe spezial,shop,Tragtasche klein,Brunch bis 13 Uhr,Apfelschorli 5dl,Fladen mit Artischocken klein vom Holzofen,Buffet - Zmittag Salat/Warm Take away,0.200000,0.200000,0.200000,0.200000,0.200000,dayoff
4,Buffet - Zmorge L & Buffet - Birchermüesli L,shop,Tragtasche klein,Brunch bis 13 Uhr,Apfelschorli 5dl,Fladen mit Artischocken klein vom Holzofen,Buffet - Zmittag Salat/Warm Take away,0.200000,0.200000,0.200000,0.200000,0.200000,dayoff
5,Buffet - Zmittag Salat/Warm Take away & Sultan...,shop,Tragtasche klein,Brunch bis 13 Uhr,Apfelschorli 5dl,Fladen mit Artischocken klein vom Holzofen,Apfelschorli 5dl,0.202150,0.202150,0.202150,0.202150,0.178349,dayoff
6,Buffet - Dessert S & Buffet - Zmittag Salat/Wa...,shop,Tragtasche klein,Brunch bis 13 Uhr,Apfelschorli 5dl,Fladen mit Artischocken klein vom Holzofen,Apfelschorli 5dl,0.202150,0.202150,0.202150,0.202150,0.178349,dayoff
7,Cabernet Blanc Juckerhof 2016 7.5dl & Kafi Ueli,shop,Tragtasche klein,Brunch bis 13 Uhr,Apfelschorli 5dl,Fladen mit Artischocken klein vom Holzofen,Buffet - Zmittag Salat/Warm Take away,0.200000,0.200000,0.200000,0.200000,0.200000,dayoff
8,Süssmost 2dl & Fladen mit Salami klein vom Hol...,shop,Joghurt 500g Himbeer Rhabarber,Tröpfel Demi-sec 7.5dl,Tragtasche klein,Brunch bis 13 Uhr,Apfelschorli 5dl,0.741705,0.204724,0.009372,0.009372,0.009372,dayoff
9,Tee vom Hof 1l & Brotsack,shop,Tragtasche klein,Brunch bis 13 Uhr,Apfelschorli 5dl,Fladen mit Artischocken klein vom Holzofen,Buffet - Zmittag Salat/Warm Take away,0.200000,0.200000,0.200000,0.200000,0.200000,dayoff


In [426]:
l = []
for tf in ["winter", "summer", "spring", "fall", "dayoff"]:
    for plc in ["shop", "restaurant"]:
        l.append(generate_suggestions(df, tf, plc))

iteration: 1 of max_iter: 100
iteration: 2 of max_iter: 100, perplexity: 29.1735
iteration: 3 of max_iter: 100
iteration: 4 of max_iter: 100, perplexity: 28.3326
iteration: 5 of max_iter: 100
iteration: 6 of max_iter: 100, perplexity: 94.0592
iteration: 7 of max_iter: 100
iteration: 8 of max_iter: 100, perplexity: 93.2185
iteration: 9 of max_iter: 100
iteration: 10 of max_iter: 100, perplexity: 93.1707
iteration: 11 of max_iter: 100
iteration: 12 of max_iter: 100, perplexity: 93.1892
iteration: 13 of max_iter: 100
iteration: 14 of max_iter: 100, perplexity: 93.1673
iteration: 15 of max_iter: 100
iteration: 16 of max_iter: 100, perplexity: 93.1900
iteration: 17 of max_iter: 100
iteration: 18 of max_iter: 100, perplexity: 93.1695
iteration: 19 of max_iter: 100
iteration: 20 of max_iter: 100, perplexity: 27.5076
iteration: 21 of max_iter: 100
iteration: 22 of max_iter: 100, perplexity: 93.2040
iteration: 23 of max_iter: 100
iteration: 24 of max_iter: 100, perplexity: 93.1346
iteration: 25

100%|██████████| 135460/135460 [01:50<00:00, 1223.07it/s]


iteration: 1 of max_iter: 100
iteration: 2 of max_iter: 100, perplexity: 112.6237
iteration: 3 of max_iter: 100
iteration: 4 of max_iter: 100, perplexity: 81.8078
iteration: 5 of max_iter: 100
iteration: 6 of max_iter: 100, perplexity: 77.6658
iteration: 7 of max_iter: 100
iteration: 8 of max_iter: 100, perplexity: 76.2325
iteration: 9 of max_iter: 100
iteration: 10 of max_iter: 100, perplexity: 75.8128
iteration: 11 of max_iter: 100
iteration: 12 of max_iter: 100, perplexity: 75.6564
iteration: 13 of max_iter: 100
iteration: 14 of max_iter: 100, perplexity: 75.6213
iteration: 15 of max_iter: 100
iteration: 16 of max_iter: 100, perplexity: 75.6161


100%|██████████| 26335/26335 [00:22<00:00, 1177.22it/s]


iteration: 1 of max_iter: 100
iteration: 2 of max_iter: 100, perplexity: 35.6487
iteration: 3 of max_iter: 100
iteration: 4 of max_iter: 100, perplexity: 30.9451
iteration: 5 of max_iter: 100
iteration: 6 of max_iter: 100, perplexity: 30.7650
iteration: 7 of max_iter: 100
iteration: 8 of max_iter: 100, perplexity: 30.7650


100%|██████████| 149331/149331 [03:03<00:00, 811.85it/s]


iteration: 1 of max_iter: 100
iteration: 2 of max_iter: 100, perplexity: 96.0023
iteration: 3 of max_iter: 100
iteration: 4 of max_iter: 100, perplexity: 73.0043
iteration: 5 of max_iter: 100
iteration: 6 of max_iter: 100, perplexity: 69.8044
iteration: 7 of max_iter: 100
iteration: 8 of max_iter: 100, perplexity: 68.3027
iteration: 9 of max_iter: 100
iteration: 10 of max_iter: 100, perplexity: 68.0530
iteration: 11 of max_iter: 100
iteration: 12 of max_iter: 100, perplexity: 68.0179
iteration: 13 of max_iter: 100
iteration: 14 of max_iter: 100, perplexity: 68.0109


100%|██████████| 33153/33153 [00:33<00:00, 984.25it/s]


iteration: 1 of max_iter: 100
iteration: 2 of max_iter: 100, perplexity: 23.9482
iteration: 3 of max_iter: 100
iteration: 4 of max_iter: 100, perplexity: 41.6575
iteration: 5 of max_iter: 100
iteration: 6 of max_iter: 100, perplexity: 41.4839
iteration: 7 of max_iter: 100
iteration: 8 of max_iter: 100, perplexity: 41.4727
iteration: 9 of max_iter: 100
iteration: 10 of max_iter: 100, perplexity: 41.4722


100%|██████████| 141778/141778 [01:53<00:00, 1250.19it/s]


iteration: 1 of max_iter: 100
iteration: 2 of max_iter: 100, perplexity: 96.6458
iteration: 3 of max_iter: 100
iteration: 4 of max_iter: 100, perplexity: 75.3210
iteration: 5 of max_iter: 100
iteration: 6 of max_iter: 100, perplexity: 71.5175
iteration: 7 of max_iter: 100
iteration: 8 of max_iter: 100, perplexity: 71.0013
iteration: 9 of max_iter: 100
iteration: 10 of max_iter: 100, perplexity: 70.5452
iteration: 11 of max_iter: 100
iteration: 12 of max_iter: 100, perplexity: 70.3881
iteration: 13 of max_iter: 100
iteration: 14 of max_iter: 100, perplexity: 70.3662
iteration: 15 of max_iter: 100
iteration: 16 of max_iter: 100, perplexity: 70.3599


100%|██████████| 34191/34191 [00:29<00:00, 1167.49it/s]


iteration: 1 of max_iter: 100
iteration: 2 of max_iter: 100, perplexity: 26.3270
iteration: 3 of max_iter: 100
iteration: 4 of max_iter: 100, perplexity: 25.5104
iteration: 5 of max_iter: 100
iteration: 6 of max_iter: 100, perplexity: 183.8726
iteration: 7 of max_iter: 100
iteration: 8 of max_iter: 100, perplexity: 183.8678


100%|██████████| 125250/125250 [08:06<00:00, 257.52it/s]


iteration: 1 of max_iter: 100
iteration: 2 of max_iter: 100, perplexity: 90.1741
iteration: 3 of max_iter: 100
iteration: 4 of max_iter: 100, perplexity: 71.3688
iteration: 5 of max_iter: 100
iteration: 6 of max_iter: 100, perplexity: 66.6055
iteration: 7 of max_iter: 100
iteration: 8 of max_iter: 100, perplexity: 65.4376
iteration: 9 of max_iter: 100
iteration: 10 of max_iter: 100, perplexity: 65.2805
iteration: 11 of max_iter: 100
iteration: 12 of max_iter: 100, perplexity: 65.2505
iteration: 13 of max_iter: 100
iteration: 14 of max_iter: 100, perplexity: 65.2436


100%|██████████| 27730/27730 [00:24<00:00, 1136.68it/s]


iteration: 1 of max_iter: 100
iteration: 2 of max_iter: 100, perplexity: 25.6902
iteration: 3 of max_iter: 100
iteration: 4 of max_iter: 100, perplexity: 24.3521
iteration: 5 of max_iter: 100
iteration: 6 of max_iter: 100, perplexity: 53.2291
iteration: 7 of max_iter: 100
iteration: 8 of max_iter: 100, perplexity: 52.9716
iteration: 9 of max_iter: 100
iteration: 10 of max_iter: 100, perplexity: 52.9717


100%|██████████| 285390/285390 [04:13<00:00, 1125.08it/s]


iteration: 1 of max_iter: 100
iteration: 2 of max_iter: 100, perplexity: 102.3138
iteration: 3 of max_iter: 100
iteration: 4 of max_iter: 100, perplexity: 79.9350
iteration: 5 of max_iter: 100
iteration: 6 of max_iter: 100, perplexity: 74.6582
iteration: 7 of max_iter: 100
iteration: 8 of max_iter: 100, perplexity: 73.6013
iteration: 9 of max_iter: 100
iteration: 10 of max_iter: 100, perplexity: 73.4235
iteration: 11 of max_iter: 100
iteration: 12 of max_iter: 100, perplexity: 73.3983
iteration: 13 of max_iter: 100
iteration: 14 of max_iter: 100, perplexity: 73.3937


100%|██████████| 53301/53301 [00:49<00:00, 1076.69it/s]


In [427]:
concatenation = pd.concat(l)

In [428]:
concatenation.to_csv("bundles.csv")

In [431]:
concatenation.pred5.value_counts()

Joghurt 180g Himbeer Rhabarber           260341
Flammkuchen                              143588
Tragtasche klein                         142255
Buffet - Zmittag Salat/Warm Take away    135282
HofBäckerei Spezialpreis                 119726
Fladen mit Käse vom Holzofen              16115
Bier vom Hof HELL 5dl                     12891
Fruchtschorli 5dl                          5825
Brötli vom Hof                             5581
Fladen mit Schinken vom Holzofen           5384
Quöllfrisch 5dl                            5374
Quöllfrisch 3.3dl                          5176
Suure Moscht vom Hof mit Alkohol 5dl       4947
Brunch bis 13 Uhr                          4787
Apfelschorli 5dl                           4699
pa es 2.75dl                               4535
Knobliwurst                                4159
Suppe                                      3983
Buffet - Birchermüesli Take away           3848
Flamm kuchen Vom Garte                     3828
Grosi's Gugelhopf Schoggi               