In [8]:
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [77]:
df = pd.read_csv('./data/POS-data_Baechlihof.csv', sep=";", encoding="latin-1")
#df = pd.read_csv('./data/POS-data_Juckerhof.csv', sep=";", encoding="latin-1")

In [78]:
df.head()

Unnamed: 0,timestamp,timestamp_human,shop,transaction_id,cost_unit,position_id,article,count,unit,article_group0,article_group1,article_group2,vat_pos,price
0,1485957427,01.02.2017 13:57:07,Juckerhof Bächlihof,2638192,400,7194841,Buffet - Dessert M,0.06,KG,HofKonditorei,HofKonditorei>>Dessert,,0.15,2.05
1,1485963471,01.02.2017 15:37:51,Juckerhof Bächlihof,2638193,400,7194842,Kaffee,2.0,,Getränke,Getränke>>Heissgetränke,,0.56,7.6
2,1485964029,01.02.2017 15:47:09,Juckerhof Bächlihof,2638194,400,7194843,Apfel-Birnen 3.50,2.4,KG,Obst,Obst>>Äpfel/Birnen,,0.2,8.4
3,1485964084,01.02.2017 15:48:04,Juckerhof Bächlihof,2638195,400,7194844,Most past. 1 Liter,1.0,L,Getränke,Getränke>>Hoftee/Most,,0.14,5.9
4,1485964248,01.02.2017 15:50:48,Juckerhof Bächlihof,2638196,400,7194845,Brot vom Hof,1.0,STK,HofBäckerei,HofBäckerei>>Brot,,0.12,4.9


Average number of articles per basket

In [79]:
df.groupby('transaction_id').article.count().mean()

2.7108093018739683

Average basket price

In [80]:
df.groupby('transaction_id').price.mean().mean()

8.5324999605754783

Number of categories 

In [92]:
df.article_group0.unique()

array(['HofKonditorei', 'Getränke', 'Obst', 'HofBäckerei',
       'Frischprodukte', 'Gemüse', 'Trockenprodukte', 'Non-Food',
       'HofChuchi', 'Erlebnis'], dtype=object)

In [90]:
len(df.article_group0.unique()), len(df.article_group1.unique()), len(df.article_group2.unique())

(10, 49, 3)

In [91]:
df.article_group2.value_counts()

Fremd    118933
Eigen     60201
Name: article_group2, dtype: int64

Top 10 most sold products

In [81]:
df_count = df.groupby('article').count()[['count']]
df_count.sort_values(by="count", ascending=False).iloc[:10]

Unnamed: 0_level_0,count
article,Unnamed: 1_level_1
"Süssmost 1l, frisch gepresst",18479
PET Flasche,18251
Kuchen-/Gebäckstück 4 CHF,14492
Wähen süss,12412
Cappuccino,11814
Buffet - Zmittag Salat/Warm L,11788
Glace Kübeli,10206
Kaffee,10068
Apfelschorli 5dl,9914
Buffet - Fleisch/Spargeln L,9052


# Trying to find communities of articles that work together

First, group by transaction_id all the articles under a list (consider this as a tokenized document)

In [87]:
df_transactions_full = df.groupby('transaction_id')['article'].apply(list)

Remove the transactions with only one article ? 

In [85]:
len(df_transactions)

164784

Take a random subset

In [169]:
N_RANDOM = 20000
df_transactions = df_transactions_full.sample(frac=1).iloc[:N_RANDOM]

In [170]:
from gensim.models.ldamodel import LdaModel
from gensim import corpora

In [171]:
dictionary = corpora.Dictionary(df_transactions.values)
dictionary.save('./dictionary.dict')

In [172]:
doc_term_matrix = [dictionary.doc2bow(text) for text in df_transactions.values]
corpora.MmCorpus.serialize('./corpus.mm', doc_term_matrix)

In [173]:
lda = LdaModel(corpus, id2word=dictionary, num_topics=100)

  diff = np.log(self.expElogbeta)


In [174]:
print(lda.print_topics(num_topics=2, num_words=4))

[(55, '0.673*"Kaffee / Espresso" + 0.048*"Zitrone normal" + 0.044*"Glühwein" + 0.039*"Rüebli"'), (20, '0.302*"Balsamico weiss 5dl" + 0.245*"Baumnussen Frankreich " + 0.092*"Flamm kuchen Vom Garte" + 0.053*"HofChuchi Spezialpreis"')]


In [175]:
lda.save('./lda.model')
from gensim.models import LdaModel
loading = LdaModel.load('./lda.model')

Visualization

In [176]:
import pyLDAvis.gensim
import gensim
import operator
pyLDAvis.enable_notebook()

In [177]:
d = gensim.corpora.Dictionary.load('dictionary.dict')
c = gensim.corpora.MmCorpus('corpus.mm')
lda = gensim.models.LdaModel.load('lda.model')

From one or more articles, we want to find the next one that'd be nice to sell with.

<u> Method 1 : Handcrafting </u>

In [187]:
input_article = "Auberginen"
best_topics = dict(lda.get_term_topics(input_article, minimum_probability=.01))
sorted_best_topics = sorted(best_topics.items(), key=operator.itemgetter(1), reverse=True)
print("best topics:\n")
print(sorted_best_topics)
best_community_idx = lda.get_topic_terms(sorted_best_topics[0][0])
print([(dictionary.get(i), p) for (i,p) in best_community_idx])

best topics:

[(85, 0.15216638), (58, 0.035415795), (16, 0.030991629)]
[('Kaffee', 0.42478865), ('Auberginen', 0.15411475), ('HofChuchi Spezialpreis', 0.098582804), ('Peterli', 0.080427818), ('Gurke Nostrano', 0.043230966), ('Kuchenstück 4.-', 0.038418077), ('Trockenfleisch Eichenberger', 0.018223355), ('Kopfsalat', 0.016675303), ('Buffet - Zmittag Salat/Warm M', 0.011449457), ('Zierkürbismix in Kartonschale', 0.010535863)]


In [128]:
def handcraft_similarity(input_article):
    input_article = "Kaffee"
    best_topics = dict(lda.get_term_topics(input_article, minimum_probability=.2))
    sorted_best_topics = sorted(best_topics.items(), key=operator.itemgetter(1), reverse=True)
    best_community_idx = lda.get_topic_terms(sorted_best_topics[0][0])
    return [(dictionary.get(i), p) for (i,p) in best_community_idx]

<u> Similarity using topic embeddings </u>

In [129]:
def get_similar(date, articles_list, n_similar):
    """ from an articles list, find the n_similar other articles that'd
    best fit to be sold with it"""

In [185]:
dict(lda.get_term_topics("Kaffee", minimum_probability=.2))

{3: 0.23025934, 4: 0.2658352, 85: 0.42346159}

In [179]:
topics_terms = lda.state.get_lambda()
topics_terms_proba = np.apply_along_axis(lambda x: x/x.sum(),1,topics_terms)

In [182]:
topics_terms_proba.shape

(100, 884)

In [180]:
sum([p for (_,p) in lda.get_term_topics("Brötli vom Hof", minimum_probability=0.)])

ValueError: too many values to unpack (expected 2)

In [188]:
text = ["Brötli vom Hof"]
bow = dictionary.doc2bow(text)
lda.get_document_topics(bow, minimum_probability=0, minimum_phi_value=0);

In [189]:
dictionary.get(216)

'Quöllfrisch 3.3dl'

Autre méthode : prendre l'article dont l'embedding est le plus proche

In [190]:
data = pyLDAvis.gensim.prepare(lda, c, d)
data

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  topic_term_dists = topic_term_dists.ix[topic_order]
